comparison delicious2mongo @ 0:cb3ea57da096

old version Ignore-this: 7f8014adeb5faf9f9120648e55ad7607
author Drew Perttula <drewp@bigasterisk.com>
date Sun, 17 Feb 2013 00:55:39 -0800
parents
children 7cecda055fae
comparison
equal deleted inserted replaced
-1:000000000000 0:cb3ea57da096
1 #!/usr/bin/env python
2
3 """
4 Python script to copy a delicious XML file into a mongodb.
5 Based on this, but updated to use bookmark format:
6 https://gist.github.com/raw/744051/2cf68c82b9e320c9e89ff25dfa5093d3996a101b/import_to_mongo.py
7
8 sample input:
9
10 <DT>
11 <A HREF="http://blog.blprnt.com/blog/blprnt/just-landed-processing-twitter-metacarta-hidden-data"
12 ADD_DATE="1291438624"
13 PRIVATE="0"
14 TAGS="twitter,processing,mapping,for:scotus">Just Landed: Processing, Twitter, MetaCarta &amp; Hidden Data | blprnt.blg</A>
15 <DT>
16 <A HREF="http://techblog.netflix.com/2010/12/why-we-choose-html5-for-user.html"
17 ADD_DATE="1291436802"
18 PRIVATE="0"
19 TAGS="netflix,html,webapp,for:charliedad">http://techblog.netflix.com/2010/12/why-we-choose-html5-for-user.html</A>
20 <DD>&quot;Our PS3 UI was written entirely using HTML5&quot;
21
22
23 Usage:
24
25 python delicious2mongo --mongo mongodb://user:pass@host:port/db --bookmarks delicious.html
26
27 Requires:
28
29 - pymongo: http://api.mongodb.org/python/
30 - beautifulsoup: http://www.crummy.com/software/BeautifulSoup/ (used through lxml.html.soupparser; lxml and dateutil are imported as well)
31 """
32
33 import datetime, optparse
34 from dateutil.tz import tzutc
35 import lxml.html.soupparser
36 import pymongo
37
38 parser = optparse.OptionParser()
39 parser.add_option("--mongo", help="connect string, like mongodb://user:pass@host:port/db (see http://www.mongodb.org/display/DOCS/Connections)")
40 parser.add_option("--bookmarks", help="bookmarks file exported from delicious (see https://secure.delicious.com/settings/bookmarks/export)")
41 opts, args = parser.parse_args()
42
def splitTags(raw):
    """Split a comma-separated tag attribute into plain tags and recipients.

    Entries that start with 'for:' name a user the bookmark was shared with;
    the prefix is removed and the remainder goes into the second list.
    Surrounding whitespace is stripped and empty entries are dropped, so an
    empty attribute yields two empty lists rather than a single '' tag.

    Returns a (tags, forUsers) tuple of lists, both in input order.
    """
    tags = []
    forUsers = []
    for t in raw.split(','):
        t = t.strip()
        if not t:
            # ''.split(',') returns [''], and stray commas produce the same.
            continue
        if t.startswith('for:'):
            forUsers.append(t[4:])
        else:
            tags.append(t)
    return tags, forUsers


def itemFromDt(dt):
    """Build the bookmark dict for one <dt> element of the parsed export.

    dt is an element of the tree returned by lxml.html.soupparser. Returns
    None when the <dt> holds no <a> child, so the caller can skip it.

    The sample input in the module docstring shows upper-case attribute
    names, while the lookups here use lower case; presumably the soup parser
    normalises them -- TODO confirm.
    """
    a = dt.find('a')
    if a is None:
        return None

    # A bookmark without a tags attribute is treated as untagged.
    tags, forUsers = splitTags(a.attrib.get('tags', ''))

    # The optional note is the text of a <dd> that directly follows the <dt>.
    note = None
    nextTag = dt.getnext()
    if (nextTag is not None and nextTag.tag == 'dd'
            and nextTag.text is not None):
        note = nextTag.text.strip()

    return {
        'link': a.attrib['href'],
        # add_date is seconds since the epoch; store it as an aware UTC time.
        'addTime': datetime.datetime.fromtimestamp(
            int(a.attrib['add_date']), tzutc()),
        'private': int(a.attrib['private']),
        'tags': tags,
        'forUsers': forUsers,
        'note': note,
    }


if __name__ == '__main__':
    # Fail with a usage message instead of a TypeError from open(None).
    if not opts.bookmarks:
        parser.error("--bookmarks is required")

    conn = pymongo.Connection(opts.mongo)
    try:
        # NOTE(review): nothing below writes to `db` yet; each item is only
        # printed.
        db = conn.links

        # Read the whole export, making sure the file handle gets closed.
        with open(opts.bookmarks) as f:
            root = lxml.html.soupparser.fromstring(f.read())

        for dt in root.findall('dl/dt'):
            item = itemFromDt(dt)
            if item is None:
                continue

            # Parenthesised form prints the same on Python 2 and also parses
            # on Python 3.
            print(item)
    finally:
        # Release the connection even if parsing or conversion fails.
        conn.disconnect()