Mercurial > code > home > repos > href
view delicious2mongo @ 0:cb3ea57da096
old version
Ignore-this: 7f8014adeb5faf9f9120648e55ad7607
author | Drew Perttula <drewp@bigasterisk.com> |
---|---|
date | Sun, 17 Feb 2013 00:55:39 -0800 |
parents | |
children | 7cecda055fae |
line wrap: on
line source
#!/usr/bin/env python
"""
Python script to copy a delicious XML file into a mongodb.

NOTE: this revision only parses the export and prints one dict per
bookmark; the mongodb connection is opened but nothing is written to it.

Based on this, but updated to use bookmark format:
https://gist.github.com/raw/744051/2cf68c82b9e320c9e89ff25dfa5093d3996a101b/import_to_mongo.py

sample input:

<DT>
<A HREF="http://blog.blprnt.com/blog/blprnt/just-landed-processing-twitter-metacarta-hidden-data" ADD_DATE="1291438624" PRIVATE="0" TAGS="twitter,processing,mapping,for:scotus">Just Landed: Processing, Twitter, MetaCarta & Hidden Data | blprnt.blg</A>
<DT>
<A HREF="http://techblog.netflix.com/2010/12/why-we-choose-html5-for-user.html" ADD_DATE="1291436802" PRIVATE="0" TAGS="netflix,html,webapp,for:charliedad">http://techblog.netflix.com/2010/12/why-we-choose-html5-for-user.html</A>
<DD>"Our PS3 UI was written entirely using HTML5"

Usage:
python delicious2mongo --mongo <connect string> --bookmarks <export file>

Requires:
- pymongo: http://api.mongodb.org/python/
- beautifulsoup: http://www.crummy.com/software/BeautifulSoup/
- lxml (lxml.html.soupparser is the parser front-end used here)
- python-dateutil (for the UTC tzinfo)
"""
import datetime
import optparse

from dateutil.tz import tzutc
import lxml.html.soupparser
import pymongo

# Options are parsed at module level, as in the original script, so that
# `parser`, `opts` and `args` remain available as module globals.
parser = optparse.OptionParser()
parser.add_option(
    "--mongo",
    help="connect string, like mongodb://user:pass@host:port/db (see http://www.mongodb.org/display/DOCS/Connections)")
parser.add_option(
    "--bookmarks",
    help="bookmarks file exported from delicious (see https://secure.delicious.com/settings/bookmarks/export)")
opts, args = parser.parse_args()


def _split_tags(tagString):
    """Split a comma-separated TAGS value into (tags, forUsers).

    Entries starting with 'for:' are returned in forUsers with that prefix
    removed; everything else goes to tags. Empty entries are dropped, so an
    empty TAGS attribute yields ([], []) rather than a single '' tag.
    """
    tags = []
    forUsers = []
    for t in tagString.split(','):
        if not t:
            # ''.split(',') == [''] -- don't record a bogus empty tag
            continue
        if t.startswith('for:'):
            forUsers.append(t[4:])
        else:
            tags.append(t)
    return tags, forUsers


def _note_for(dt):
    """Return the stripped text of the <dd> immediately following dt.

    Returns None when the next sibling is missing, is not a <dd>, or has
    no text.
    """
    nextTag = dt.getnext()
    if nextTag is None or nextTag.tag != 'dd':
        return None
    note = nextTag.text
    if note is not None:
        note = note.strip()
    return note


def _item_from_dt(dt):
    """Build the bookmark dict for one <dt> element.

    Returns None if the <dt> has no <a> child. Raises KeyError/ValueError
    if href, add_date or private are missing or malformed.
    """
    a = dt.find('a')
    if a is None:
        return None
    # A bookmark without a TAGS attribute is treated as having no tags.
    tags, forUsers = _split_tags(a.attrib.get('tags', ''))
    return {
        'link': a.attrib['href'],
        # add_date is a unix timestamp; produce a timezone-aware UTC datetime
        'addTime': datetime.datetime.fromtimestamp(
            int(a.attrib['add_date']), tzutc()),
        'private': int(a.attrib['private']),
        'tags': tags,
        'forUsers': forUsers,
        'note': _note_for(dt),
    }


if __name__ == '__main__':
    if not opts.bookmarks:
        # Fail with a usage message instead of a TypeError from open(None).
        parser.error("--bookmarks is required")

    # NOTE(review): pymongo.Connection/disconnect() are the legacy PyMongo
    # API (superseded by MongoClient/close()); kept as-is because the
    # installed PyMongo version is not visible from this file.
    conn = pymongo.Connection(opts.mongo)
    try:
        # Not used yet: this version only prints the parsed items.
        db = conn.links

        with open(opts.bookmarks) as f:
            root = lxml.html.soupparser.fromstring(f.read())

        for dt in root.findall('dl/dt'):
            item = _item_from_dt(dt)
            if item is None:
                continue
            # Parenthesised single argument prints identically on Python 2.
            print(item)
    finally:
        # Always release the connection, even if parsing fails part-way.
        conn.disconnect()