view pagetitle.py @ 20:a8887fb93676

hide share data from the public. bug in links filter box Ignore-this: 2a390b207b8e9c8d430acd268b6d765d
author Drew Perttula <drewp@bigasterisk.com>
date Sun, 17 Mar 2013 01:03:43 -0700
parents f8c4c7ce5f4a
children adb79e44323a
line wrap: on
line source

import lxml.html.soupparser
import datetime, socket
from dateutil.tz import tzlocal
import restkit

class CantGetTitle(ValueError):
    """Raised when a page title could not be fetched from the remote site.

    The exception message is a short parenthesized explanation; the
    PageTitle.pageTitle method in this file returns str(e) to its caller
    in place of a real title.
    """

class PageTitle(object):
    """Look up web page titles, caching results in the 'pageTitle' collection.

    Each cached document has the page uri as its '_id', the fetched
    'title', and a 'getTime' timestamp of when it was fetched.
    """

    def __init__(self, db):
        """Keep a handle on db['pageTitle'], where fetched titles are stored."""
        self.coll = db['pageTitle']

    def getPageTitleNow(self, uri):
        """Fetch uri over HTTP and return the text of its first <title>.

        Returns None when no <title> element is found in the page.

        Raises CantGetTitle when the response status does not start
        with '2' or when the request itself fails.
        """
        try:
            response = restkit.request(uri, timeout=1, follow_redirect=True,
                                headers={
                                    'user-agent':
                                    'link title checker - drewp@bigasterisk.com'
                                })
            if not response.status.startswith('2'):
                raise CantGetTitle("(got %s)" % response.status)
            root = lxml.html.soupparser.fromstring(
                response.body_string())
        except (restkit.RequestError, socket.error):
            # socket.error is caught as well because, with a 1 second
            # timeout, low-level socket failures may surface here without
            # being wrapped in restkit.RequestError -- NOTE(review):
            # confirm against the installed restkit version.
            raise CantGetTitle("(error requesting title from site)")

        # Only the first <title> element matters.
        for title in root.cssselect("title"):
            return title.text
        # No <title> element at all: the caller stores this None as the title.
        return None

    def pageTitle(self, uri):
        """page title from our db or by getting a new load from the page

        On a cache miss the page is fetched and the result is inserted
        into the collection. If the fetch raises CantGetTitle, the
        exception's message is returned instead and nothing is stored.
        """
        doc = self.coll.find_one({'_id' : uri})
        if doc is None:
            try:
                title = self.getPageTitleNow(uri)
            except CantGetTitle as e:
                # Failures are reported to the caller as text, not cached.
                return str(e)
            doc = {'_id': uri, 'title' : title,
                   'getTime':datetime.datetime.now(tzlocal())}
            self.coll.insert(doc, safe=True)
        return doc['title']