Mercurial > code > home > repos > href
diff pagetitle.py @ 5:f8c4c7ce5f4a
lots of href additions: add/edit, nav fixes
Ignore-this: 863335c4680ac9bcc6a7fc5867638d61
author | Drew Perttula <drewp@bigasterisk.com> |
---|---|
date | Thu, 21 Feb 2013 01:39:01 -0800 |
parents | |
children | adb79e44323a |
line wrap: on
line diff
# pagetitle.py -- new file in this changeset (diff hunk: /dev/null -> b/pagetitle.py, @@ -0,0 +1,41 @@)
"""Fetch the title of a web page and cache it in a database collection."""

import lxml.html.soupparser
import datetime
import socket
from dateutil.tz import tzlocal
import restkit


class CantGetTitle(ValueError):
    """Raised when a page title cannot be fetched.

    The message is a short parenthesized reason; pageTitle() returns that
    message to its caller in place of a real title.
    """
    pass


class PageTitle(object):
    """Page-title lookup backed by the 'pageTitle' collection of a database."""

    def __init__(self, db):
        """Keep a handle on db['pageTitle'], where fetched titles are stored."""
        self.coll = db['pageTitle']

    def getPageTitleNow(self, uri):
        """Request ``uri`` right now and return the text of its title element.

        Only the first element matched by the "title" CSS selector is used.
        Returns None when the parsed document has no such element.

        Raises CantGetTitle when the response status does not start with
        '2', or when restkit raises RequestError.
        """
        try:
            response = restkit.request(
                uri, timeout=1, follow_redirect=True,
                headers={
                    'user-agent':
                    'link title checker - drewp@bigasterisk.com'
                })
            # response.status is treated as a string here (e.g. "200 OK"),
            # so any status outside the 2xx family is reported as a failure.
            if not response.status.startswith('2'):
                raise CantGetTitle("(got %s)" % response.status)
            root = lxml.html.soupparser.fromstring(
                response.body_string())

            for title in root.cssselect("title"):
                return title.text
            # No title element in the document; callers receive None.
            return None
        except restkit.RequestError:
            raise CantGetTitle("(error requesting title from site)")

    def pageTitle(self, uri):
        """page title from our db or by getting a new load from the page

        On a cache miss the page is fetched and the result is inserted
        under ``_id`` = uri together with the fetch time. If the fetch
        fails, the CantGetTitle message is returned as the title and
        nothing is stored, so a later call will try the page again.
        """
        doc = self.coll.find_one({'_id': uri})
        if doc is None:
            try:
                title = self.getPageTitleNow(uri)
            except CantGetTitle as e:
                # "as" form is valid on Python 2.6+ and Python 3; the older
                # "except X, e" spelling is a syntax error on Python 3.
                return str(e)
            doc = {'_id': uri, 'title': title,
                   'getTime': datetime.datetime.now(tzlocal())}
            self.coll.insert(doc, safe=True)
        return doc['title']