Mercurial > code > home > repos > href
changeset 0:cb3ea57da096
old version
Ignore-this: 7f8014adeb5faf9f9120648e55ad7607
author | Drew Perttula <drewp@bigasterisk.com> |
---|---|
date | Sun, 17 Feb 2013 00:55:39 -0800 |
parents | |
children | 7cecda055fae |
files | delicious2mongo |
diffstat | 1 files changed, 76 insertions(+), 0 deletions(-) [+] |
line wrap: on
line diff
#!/usr/bin/env python
# Recovered from the changeset diff that added b/delicious2mongo as a new
# file (diffed against /dev/null); the leading "+" diff markers are removed.

"""
Python script to read a delicious bookmarks export for loading into a mongodb.
Based on this, but updated to use bookmark format:
https://gist.github.com/raw/744051/2cf68c82b9e320c9e89ff25dfa5093d3996a101b/import_to_mongo.py

This version connects to mongodb and prints one dict per bookmark; it does
not write anything to the database.

sample input:

<DT>
  <A HREF="http://blog.blprnt.com/blog/blprnt/just-landed-processing-twitter-metacarta-hidden-data"
     ADD_DATE="1291438624"
     PRIVATE="0"
     TAGS="twitter,processing,mapping,for:scotus">Just Landed: Processing, Twitter, MetaCarta & Hidden Data | blprnt.blg</A>
<DT>
<A HREF="http://techblog.netflix.com/2010/12/why-we-choose-html5-for-user.html"
ADD_DATE="1291436802"
PRIVATE="0"
TAGS="netflix,html,webapp,for:charliedad">http://techblog.netflix.com/2010/12/why-we-choose-html5-for-user.html</A>
<DD>"Our PS3 UI was written entirely using HTML5"


Usage:

  python delicious2mongo --mongo mongodb://host/db --bookmarks export.html

Requires:

  - pymongo: http://api.mongodb.org/python/
  - beautifulsoup: http://www.crummy.com/software/BeautifulSoup/
  - lxml (for lxml.html.soupparser)
  - python-dateutil (for dateutil.tz)
"""

import datetime
import optparse

from dateutil.tz import tzutc
import lxml.html.soupparser
import pymongo

parser = optparse.OptionParser()
parser.add_option("--mongo", help="connect string, like mongodb://user:pass@host:port/db (see http://www.mongodb.org/display/DOCS/Connections)")
parser.add_option("--bookmarks", help="bookmarks file exported from delicious (see https://secure.delicious.com/settings/bookmarks/export)")


def _split_tags(raw):
    """Split a comma-separated TAGS value into (tags, forUsers).

    Entries starting with 'for:' go to forUsers with that prefix removed;
    everything else is a plain tag. Empty entries are dropped, so a missing
    or empty TAGS attribute yields two empty lists instead of [''].
    """
    tags = []
    for_users = []
    for t in (raw or '').split(','):
        if not t:
            continue
        if t.startswith('for:'):
            for_users.append(t[4:])
        else:
            tags.append(t)
    return tags, for_users


def _bookmark_item(dt):
    """Build the bookmark dict for one <dt> element, or None if it has no <a>.

    The note is the stripped text of a <dd> element immediately following
    the <dt>, or None when there is no such element or it has no text.
    Raises KeyError if the <a> lacks href, add_date or private.
    """
    a = dt.find('a')
    if a is None:
        # Not a bookmark entry; nothing to report.
        return None

    tags, for_users = _split_tags(a.attrib.get('tags'))

    note = None
    next_tag = dt.getnext()
    if next_tag is not None and next_tag.tag == 'dd':
        note = next_tag.text
        if note is not None:
            note = note.strip()

    return {
        'link': a.attrib['href'],
        # add_date is epoch seconds; store it as a timezone-aware UTC datetime.
        'addTime': datetime.datetime.fromtimestamp(
            int(a.attrib['add_date']), tzutc()),
        'private': int(a.attrib['private']),
        'tags': tags,
        'forUsers': for_users,
        'note': note,
    }


def main():
    """Parse options, read the bookmarks file and print each bookmark dict."""
    opts, _args = parser.parse_args()
    if not opts.bookmarks:
        # Without this, open(None) fails with an unhelpful TypeError.
        parser.error("--bookmarks is required")

    conn = pymongo.Connection(opts.mongo)
    try:
        with open(opts.bookmarks) as f:
            root = lxml.html.soupparser.fromstring(f.read())

        for dt in root.findall('dl/dt'):
            item = _bookmark_item(dt)
            if item is not None:
                # Single-argument print() behaves the same on Python 2 and 3.
                print(item)
    finally:
        # Release the connection even if reading or parsing fails.
        conn.disconnect()


if __name__ == '__main__':
    main()