view delicious2mongo @ 31:9fd62089fbf3 master

ignore
author drewp@bigasterisk.com
date Sun, 12 Jul 2020 13:35:41 -0700
parents 8a5feb8d383d
children
line wrap: on
line source

#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""
Python script to read from delicious.com into mongodb.

Usage:

    python delicious2mongo --mongo localhost --login user:password --hrefuser user
"""

import datetime, optparse, requests
from requests.auth import HTTPBasicAuth

from dateutil.tz import tzutc
from dateutil.parser import parse
import pymongo
import lxml.objectify

def addPost(db, user, xmlNode):
    """
    Store one post element in the 'links' collection of db.

    db: database handle; db['links'] must provide
        update(spec, document, upsert=True).
    user: stored unchanged as the document's 'user' field.
    xmlNode: the post element; its attributes are read with .get().

    The document holds the post's attributes plus an 'extracted'
    sub-document in which the space-separated 'tag' attribute is split
    into plain tags and the recipients of 'for:<name>' tags.  The write
    is an upsert matched on 'href'.

    A missing 'time' attribute is passed to the date parser as None;
    whatever that raises propagates to the caller.
    """
    x = xmlNode
    rawTags = x.get('tag')
    out = dict(
        user=user,
        description=x.get('description'),
        extended=x.get('extended'),
        href=x.get('href'),
        private=x.get('private') == 'yes',
        shared=x.get('shared') == 'yes',
        tag=rawTags,  # kept exactly as received, even when None
        t=parse(x.get('time')),
        )

    forUsers = []
    tags = []
    # A post may have no tag attribute at all, and an empty or
    # doubly-spaced tag string splits into '' tokens; skip both so that
    # 'extracted' only ever contains real tag names.
    for t in (rawTags or '').split(' '):
        if not t:
            continue
        if t.startswith('for:'):
            # 'for:alice' addresses the link to alice; keep only the name.
            forUsers.append(t[4:])
        else:
            tags.append(t)
    out['extracted'] = dict(tags=tags, forUsers=forUsers)

    db['links'].update(dict(href=out['href']), out, upsert=True)

# Command-line options.  These are parsed at module level (not only when
# run as a script), so `parser`, `opts` and `args` stay importable names.
parser = optparse.OptionParser()
parser.add_option("--mongo", help="host")
parser.add_option("--login", help="user:passwd")
parser.add_option("--hrefuser", help="username in href")
opts, args = parser.parse_args()

if __name__ == '__main__':
    # Fail with a usage message instead of an AttributeError when the
    # credentials are missing or malformed.
    if not opts.login or ':' not in opts.login:
        parser.error("--login user:passwd is required")
    # Split on the first colon only so the password may contain ':'.
    loginUser, loginPassword = opts.login.split(':', 1)
    auth = HTTPBasicAuth(loginUser, loginPassword)

    # NOTE(review): pymongo.Connection is the legacy client class; confirm
    # that the installed pymongo still provides it.
    conn = pymongo.Connection(opts.mongo)
    db = conn['href']

    # Page through the account's posts: each request starts at `offset`,
    # and the loop ends when a response contains no child elements.
    offset = 0
    knownTotal = '?'
    while True:
        # https://delicious.com/developers/fetchall
        # Single-argument print(...) behaves the same on Python 2 and 3.
        print("get [%s/%s]" % (offset, knownTotal))
        resp = requests.get("https://api.delicious.com/v1/posts/all",
                            params={'start' : offset},
                            auth=auth)
        # Surface HTTP errors (e.g. bad credentials) directly rather than
        # as a confusing XML parse failure on the error body.
        resp.raise_for_status()

        posts = lxml.objectify.fromstring(resp.content)
        knownTotal = posts.get('total')

        children = posts.getchildren()
        if not children:
            break
        for post in children:
            addPost(db, opts.hrefuser, post)

        offset += len(children)