view delicious2mongo @ 0:cb3ea57da096

old version Ignore-this: 7f8014adeb5faf9f9120648e55ad7607
author Drew Perttula <drewp@bigasterisk.com>
date Sun, 17 Feb 2013 00:55:39 -0800
parents
children 7cecda055fae
line wrap: on
line source

#!/usr/bin/env python

"""
Python script to copy a delicious XML file into a mongodb.
Based on this, but updated to use bookmark format:
https://gist.github.com/raw/744051/2cf68c82b9e320c9e89ff25dfa5093d3996a101b/import_to_mongo.py

sample input:

<DT>
  <A HREF="http://blog.blprnt.com/blog/blprnt/just-landed-processing-twitter-metacarta-hidden-data"
  ADD_DATE="1291438624"
  PRIVATE="0"
  TAGS="twitter,processing,mapping,for:scotus">Just Landed: Processing, Twitter, MetaCarta &amp; Hidden Data | blprnt.blg</A>
<DT>
<A HREF="http://techblog.netflix.com/2010/12/why-we-choose-html5-for-user.html"
ADD_DATE="1291436802"
PRIVATE="0"
TAGS="netflix,html,webapp,for:charliedad">http://techblog.netflix.com/2010/12/why-we-choose-html5-for-user.html</A>
<DD>&quot;Our PS3 UI was written entirely using HTML5&quot;


Usage:

    python delicious2mongo --mongo mongodb://user:pass@host:port/db --bookmarks exported_bookmarks.html

Requires:

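    - pymongo: http://api.mongodb.org/python/
    - beautifulsoup: http://www.crummy.com/software/BeautifulSoup/
    - lxml (for lxml.html.soupparser): http://lxml.de/
    - python-dateutil (for dateutil.tz): https://dateutil.readthedocs.io/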
"""

import datetime, optparse
from dateutil.tz import tzutc
import lxml.html.soupparser
import pymongo

parser = optparse.OptionParser()
parser.add_option("--mongo", help="connect string, like mongodb://user:pass@host:port/db (see http://www.mongodb.org/display/DOCS/Connections)")
parser.add_option("--bookmarks", help="bookmarks file exported from delicious (see https://secure.delicious.com/settings/bookmarks/export)")
opts, args = parser.parse_args()

if __name__ == '__main__':
    conn = pymongo.Connection(opts.mongo)
    db = conn.links

    root = lxml.html.soupparser.fromstring(open(opts.bookmarks).read())

    for dt in root.findall('dl/dt'):
        a = dt.find('a')
        forUsers = []
        tags = []
        for t in a.attrib['tags'].split(','):
            if t.startswith('for:'):
                forUsers.append(t[4:])
            else:
                tags.append(t)
        nextTag = dt.getnext()
        note = None
        if nextTag is not None and nextTag.tag == 'dd':
            note = nextTag.text
            if note is not None:
                note = note.strip()
        item = {
            'link' : a.attrib['href'],
            'addTime' : datetime.datetime.fromtimestamp(
                int(a.attrib['add_date']), tzutc()),
            'private' : int(a.attrib['private']),
            'tags' : tags,
            'forUsers' : forUsers,
            'note' : note,
            }
        
        print item 
    
    conn.disconnect()