Mercurial > code > home > repos > href
annotate delicious2mongo @ 42:530650b3bc40 default tip
something changed in pom to break pyjwt. switched to jwskate
author | drewp@bigasterisk.com |
---|---|
date | Wed, 14 Dec 2022 22:07:19 -0800 |
parents | 8a5feb8d383d |
children |
rev | line source |
---|---|
0 | 1 #!/usr/bin/env python |
1
7cecda055fae
redo delicious importer
Drew Perttula <drewp@bigasterisk.com>
parents:
0
diff
changeset
|
2 # -*- coding: utf-8 -*- |
0 | 3 """ |
1
7cecda055fae
redo delicious importer
Drew Perttula <drewp@bigasterisk.com>
parents:
0
diff
changeset
|
4 Python script to read from delicious.com into mongodb. |
0 | 5 |
6 Usage: | |
7 | |
1
7cecda055fae
redo delicious importer
Drew Perttula <drewp@bigasterisk.com>
parents:
0
diff
changeset
|
8 python delicious2mongo --mongo localhost --login user:password --hrefuser user |
0 | 9 """ |
10 | |
1
7cecda055fae
redo delicious importer
Drew Perttula <drewp@bigasterisk.com>
parents:
0
diff
changeset
|
11 import datetime, optparse, requests |
7cecda055fae
redo delicious importer
Drew Perttula <drewp@bigasterisk.com>
parents:
0
diff
changeset
|
12 from requests.auth import HTTPBasicAuth |
7cecda055fae
redo delicious importer
Drew Perttula <drewp@bigasterisk.com>
parents:
0
diff
changeset
|
13 |
0 | 14 from dateutil.tz import tzutc |
1
7cecda055fae
redo delicious importer
Drew Perttula <drewp@bigasterisk.com>
parents:
0
diff
changeset
|
15 from dateutil.parser import parse |
0 | 16 import pymongo |
1
7cecda055fae
redo delicious importer
Drew Perttula <drewp@bigasterisk.com>
parents:
0
diff
changeset
|
17 import lxml.objectify |
7cecda055fae
redo delicious importer
Drew Perttula <drewp@bigasterisk.com>
parents:
0
diff
changeset
|
18 |
7cecda055fae
redo delicious importer
Drew Perttula <drewp@bigasterisk.com>
parents:
0
diff
changeset
|
def addPost(db, user, xmlNode):
    """
    Build a link document from one post node and upsert it into
    db['links'].

    db: object indexed by collection name; only db['links'].update(...)
        is called on it.
    user: value stored in the document's 'user' field.
    xmlNode: object whose .get(name) returns the post attributes read
        here: description, extended, href, private, shared, tag, time.

    The raw 'tag' attribute is stored unchanged. A parsed copy goes
    under 'extracted': words starting with 'for:' are listed (prefix
    removed) in forUsers, every other word in tags.
    """
    x = xmlNode
    out = dict(
        user=user,
        description=x.get('description'),
        extended=x.get('extended'),
        href=x.get('href'),
        private=x.get('private') == 'yes',
        shared=x.get('shared') == 'yes',
        tag=x.get('tag'),
        t=parse(x.get('time')),
    )

    forUsers = []
    tags = []
    # split() with no argument drops empty strings, so an untagged post
    # gives tags=[] instead of [''], and repeated spaces do not create
    # empty tags.  A missing attribute is treated like an empty one.
    for word in (x.get('tag') or '').split():
        if word.startswith('for:'):
            forUsers.append(word[4:])
        else:
            tags.append(word)
    out['extracted'] = dict(tags=tags, forUsers=forUsers)

    # href is the only match key for the upsert.
    db['links'].update(dict(href=out['href']), out, upsert=True)
0 | 42 |
43 parser = optparse.OptionParser() | |
1
7cecda055fae
redo delicious importer
Drew Perttula <drewp@bigasterisk.com>
parents:
0
diff
changeset
|
44 parser.add_option("--mongo", help="host") |
7cecda055fae
redo delicious importer
Drew Perttula <drewp@bigasterisk.com>
parents:
0
diff
changeset
|
45 parser.add_option("--login", help="user:passwd") |
14
8a5feb8d383d
d2m takes an arbitrary username for the href side
Drew Perttula <drewp@bigasterisk.com>
parents:
1
diff
changeset
|
46 parser.add_option("--hrefuser", help="username in href") |
0 | 47 opts, args = parser.parse_args() |
48 | |
if __name__ == '__main__':
    # Validate --login up front: it must be given and must contain a
    # ':'.  Split on the first ':' only, so a password that itself
    # contains ':' is passed through intact.
    if not opts.login:
        parser.error("--login user:passwd is required")
    auth = tuple(opts.login.split(':', 1))
    if len(auth) != 2:
        parser.error("--login must look like user:passwd")

    # NOTE(review): pymongo.Connection is the legacy client class (newer
    # pymongo releases provide MongoClient instead).  Left unchanged so
    # this stays matched with the legacy update() call in addPost;
    # confirm the installed pymongo version.
    conn = pymongo.Connection(opts.mongo)
    db = conn['href']

    offset = 0
    knownTotal = '?'
    # Page through the account: each request starts at the number of
    # posts already stored, and the loop ends on the first empty page.
    while True:
        # https://delicious.com/developers/fetchall
        # Single parenthesized argument: prints the same text whether
        # print is a statement (Python 2) or a function (Python 3).
        print("get [%s/%s]" % (offset, knownTotal))
        resp = requests.get("https://api.delicious.com/v1/posts/all",
                            params={'start': offset},
                            auth=auth)
        # Surface HTTP failures (e.g. rejected credentials) here rather
        # than handing an error body to the XML parser.
        resp.raise_for_status()

        posts = lxml.objectify.fromstring(resp.content)
        knownTotal = posts.get('total')

        children = posts.getchildren()
        if not children:
            break
        for post in children:
            addPost(db, opts.hrefuser, post)

        offset += len(children)