Mercurial > code > home > repos > href
comparison delicious2mongo @ 0:cb3ea57da096
old version
Ignore-this: 7f8014adeb5faf9f9120648e55ad7607
author | Drew Perttula <drewp@bigasterisk.com> |
---|---|
date | Sun, 17 Feb 2013 00:55:39 -0800 |
parents | |
children | 7cecda055fae |
comparison
equal
deleted
inserted
replaced
-1:000000000000 | 0:cb3ea57da096 |
---|---|
1 #!/usr/bin/env python | |
2 | |
3 """ | |
4 Python script to copy a delicious XML file into a mongodb. | |
5 Based on this, but updated to use bookmark format: | |
6 https://gist.github.com/raw/744051/2cf68c82b9e320c9e89ff25dfa5093d3996a101b/import_to_mongo.py | |
7 | |
8 sample input: | |
9 | |
10 <DT> | |
11 <A HREF="http://blog.blprnt.com/blog/blprnt/just-landed-processing-twitter-metacarta-hidden-data" | |
12 ADD_DATE="1291438624" | |
13 PRIVATE="0" | |
14 TAGS="twitter,processing,mapping,for:scotus">Just Landed: Processing, Twitter, MetaCarta & Hidden Data | blprnt.blg</A> | |
15 <DT> | |
16 <A HREF="http://techblog.netflix.com/2010/12/why-we-choose-html5-for-user.html" | |
17 ADD_DATE="1291436802" | |
18 PRIVATE="0" | |
19 TAGS="netflix,html,webapp,for:charliedad">http://techblog.netflix.com/2010/12/why-we-choose-html5-for-user.html</A> | |
20 <DD>"Our PS3 UI was written entirely using HTML5" | |
21 | |
22 | |
23 Usage: | |
24 | |
25 python delicious2mongo --mongo mongodb://user:pass@host:port/db --bookmarks delicious.html | |
26 | |
27 Requires: | |
28 | |
29 - pymongo: http://api.mongodb.org/python/ | |
30 - beautifulsoup: http://www.crummy.com/software/BeautifulSoup/ | |
31 """ | |
32 | |
33 import datetime, optparse | |
34 from dateutil.tz import tzutc | |
35 import lxml.html.soupparser | |
36 import pymongo | |
37 | |
# Building the parser has no side effects, so it stays at module level;
# the actual parse of sys.argv happens inside main().
parser = optparse.OptionParser()
parser.add_option("--mongo", help="connect string, like mongodb://user:pass@host:port/db (see http://www.mongodb.org/display/DOCS/Connections)")
parser.add_option("--bookmarks", help="bookmarks file exported from delicious (see https://secure.delicious.com/settings/bookmarks/export)")


def split_tags(raw):
    """Split a comma-separated TAGS attribute value.

    Tags beginning with 'for:' name a recipient; the prefix is removed and
    the remainder is collected separately from the ordinary tags.

    Returns a (tags, forUsers) pair of lists, each in input order. Empty
    segments (as produced by an empty attribute or a stray comma) are
    dropped instead of being recorded as a '' tag.
    """
    tags = []
    forUsers = []
    for t in raw.split(','):
        if not t:
            continue
        if t.startswith('for:'):
            forUsers.append(t[4:])
        else:
            tags.append(t)
    return tags, forUsers


def note_for(dt):
    """Return the stripped text of the <dd> directly following dt, or None.

    None is returned when dt has no next sibling, when that sibling is not
    a 'dd' element, or when the 'dd' element has no text.
    """
    nextTag = dt.getnext()
    if nextTag is None or nextTag.tag != 'dd':
        return None
    note = nextTag.text
    if note is None:
        return None
    return note.strip()


def bookmark_item(dt):
    """Build the bookmark dict for one <dt> element.

    Returns None when the <dt> has no <a> child. Raises KeyError if the
    anchor lacks 'href', 'add_date' or 'private', and ValueError if
    'add_date' or 'private' is not an integer string.
    """
    a = dt.find('a')
    if a is None:
        # Nothing to import from a <dt> that carries no link.
        return None
    tags, forUsers = split_tags(a.attrib.get('tags', ''))
    return {
        'link' : a.attrib['href'],
        # add_date is converted as epoch seconds into a UTC-aware datetime.
        'addTime' : datetime.datetime.fromtimestamp(
            int(a.attrib['add_date']), tzutc()),
        'private' : int(a.attrib['private']),
        'tags' : tags,
        'forUsers' : forUsers,
        'note' : note_for(dt),
    }


def main(argv=None):
    """Parse the bookmarks export and print one dict per bookmark.

    argv is the list of command-line arguments to parse; None means
    sys.argv[1:] (optparse's default).
    """
    opts, args = parser.parse_args(argv)
    if not opts.bookmarks:
        # Fail with a usage message rather than an opaque open(None) error.
        parser.error("--bookmarks is required")

    conn = pymongo.Connection(opts.mongo)
    try:
        # NOTE(review): db is never used below -- items are only printed,
        # not written to mongo. Confirm whether an insert is intended.
        db = conn.links

        with open(opts.bookmarks) as f:
            root = lxml.html.soupparser.fromstring(f.read())

        for dt in root.findall('dl/dt'):
            item = bookmark_item(dt)
            if item is None:
                continue
            # Parenthesised form prints identically on Python 2 and 3.
            print(item)
    finally:
        # Release the connection even if reading or parsing fails.
        conn.disconnect()


if __name__ == '__main__':
    main()