annotate delicious2mongo @ 42:530650b3bc40 default tip

something changed in pom to break pyjwt. switched to jwskate
author drewp@bigasterisk.com
date Wed, 14 Dec 2022 22:07:19 -0800
parents 8a5feb8d383d
children
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
0
cb3ea57da096 old version
Drew Perttula <drewp@bigasterisk.com>
parents:
diff changeset
1 #!/usr/bin/env python
1
7cecda055fae redo delicious importer
Drew Perttula <drewp@bigasterisk.com>
parents: 0
diff changeset
2 # -*- coding: utf-8 -*-
0
cb3ea57da096 old version
Drew Perttula <drewp@bigasterisk.com>
parents:
diff changeset
3 """
1
7cecda055fae redo delicious importer
Drew Perttula <drewp@bigasterisk.com>
parents: 0
diff changeset
4 Python script to read from delicious.com into mongodb.
0
cb3ea57da096 old version
Drew Perttula <drewp@bigasterisk.com>
parents:
diff changeset
5
cb3ea57da096 old version
Drew Perttula <drewp@bigasterisk.com>
parents:
diff changeset
6 Usage:
cb3ea57da096 old version
Drew Perttula <drewp@bigasterisk.com>
parents:
diff changeset
7
1
7cecda055fae redo delicious importer
Drew Perttula <drewp@bigasterisk.com>
parents: 0
diff changeset
8 python delicious2mongo --mongo localhost --login user:password --hrefuser user
0
cb3ea57da096 old version
Drew Perttula <drewp@bigasterisk.com>
parents:
diff changeset
9 """
cb3ea57da096 old version
Drew Perttula <drewp@bigasterisk.com>
parents:
diff changeset
10
1
7cecda055fae redo delicious importer
Drew Perttula <drewp@bigasterisk.com>
parents: 0
diff changeset
11 import datetime, optparse, requests
7cecda055fae redo delicious importer
Drew Perttula <drewp@bigasterisk.com>
parents: 0
diff changeset
12 from requests.auth import HTTPBasicAuth
7cecda055fae redo delicious importer
Drew Perttula <drewp@bigasterisk.com>
parents: 0
diff changeset
13
0
cb3ea57da096 old version
Drew Perttula <drewp@bigasterisk.com>
parents:
diff changeset
14 from dateutil.tz import tzutc
1
7cecda055fae redo delicious importer
Drew Perttula <drewp@bigasterisk.com>
parents: 0
diff changeset
15 from dateutil.parser import parse
0
cb3ea57da096 old version
Drew Perttula <drewp@bigasterisk.com>
parents:
diff changeset
16 import pymongo
1
7cecda055fae redo delicious importer
Drew Perttula <drewp@bigasterisk.com>
parents: 0
diff changeset
17 import lxml.objectify
7cecda055fae redo delicious importer
Drew Perttula <drewp@bigasterisk.com>
parents: 0
diff changeset
18
7cecda055fae redo delicious importer
Drew Perttula <drewp@bigasterisk.com>
parents: 0
diff changeset
def addPost(db, user, xmlNode):
    """Upsert one delicious post element into the 'links' collection.

    db: mapping-style database handle; db['links'] must provide the
        update(spec, document, upsert=True) call used below.
    user: value stored under the 'user' key of the saved document.
    xmlNode: element-like object whose get(name) returns the attributes
        'description', 'extended', 'href', 'private', 'shared', 'tag'
        and 'time'.

    The raw 'tag' attribute is stored unchanged under 'tag'. Its
    whitespace-separated words are additionally split into recipients
    (words starting with 'for:', prefix removed) and plain tags, saved
    under 'extracted'.
    """
    rawTags = xmlNode.get('tag')
    out = dict(
        user=user,
        description=xmlNode.get('description'),
        extended=xmlNode.get('extended'),
        href=xmlNode.get('href'),
        private=xmlNode.get('private') == 'yes',
        shared=xmlNode.get('shared') == 'yes',
        tag=rawTags,
        t=parse(xmlNode.get('time')),
    )

    forUsers = []
    tags = []
    # Argument-less split() drops the empty strings that split(' ') produced
    # for an empty attribute or repeated spaces; a missing attribute (None)
    # is treated as "no tags" instead of raising AttributeError.
    for word in (rawTags or '').split():
        if word.startswith('for:'):
            forUsers.append(word[4:])
        else:
            tags.append(word)
    out['extracted'] = dict(tags=tags, forUsers=forUsers)

    # NOTE(review): the match is on href alone, so a document stored earlier
    # for the same href (possibly under another user) is replaced -- confirm
    # that this is intended.
    db['links'].update(dict(href=out['href']), out, upsert=True)
0
cb3ea57da096 old version
Drew Perttula <drewp@bigasterisk.com>
parents:
diff changeset
42
cb3ea57da096 old version
Drew Perttula <drewp@bigasterisk.com>
parents:
diff changeset
# Command-line options are parsed at import time; `opts` is read by the
# __main__ section below.
parser = optparse.OptionParser()
parser.add_option("--mongo", help="host")
parser.add_option("--login", help="user:passwd")
parser.add_option("--hrefuser", help="username in href")
opts, args = parser.parse_args()

if __name__ == '__main__':
    # Fail with a usage message rather than an AttributeError/bad auth tuple
    # when --login is missing or malformed.
    if not opts.login or ':' not in opts.login:
        parser.error("--login user:passwd is required")
    # Split on the first ':' only so a password containing ':' stays intact.
    auth = tuple(opts.login.split(':', 1))

    # NOTE(review): pymongo.Connection is the legacy client class; confirm
    # the installed pymongo version still provides it.
    conn = pymongo.Connection(opts.mongo)
    db = conn['href']

    # Page through the account's posts: each request starts at `offset`,
    # and the loop ends when a response contains no child elements.
    offset = 0
    knownTotal = '?'
    while True:
        # https://delicious.com/developers/fetchall
        # Single-argument print() behaves the same on Python 2 and 3.
        print("get [%s/%s]" % (offset, knownTotal))
        resp = requests.get("https://api.delicious.com/v1/posts/all",
                            params={'start': offset},
                            auth=auth)
        # Surface HTTP errors (e.g. bad credentials) instead of trying to
        # parse an error page as the posts document.
        resp.raise_for_status()

        posts = lxml.objectify.fromstring(resp.content)
        knownTotal = posts.get('total')

        children = posts.getchildren()
        if not children:
            break
        for post in children:
            addPost(db, opts.hrefuser, post)

        offset += len(children)