comparison pagetitle.py @ 5:f8c4c7ce5f4a

lots of href additions: add/edit, nav fixes Ignore-this: 863335c4680ac9bcc6a7fc5867638d61
author Drew Perttula <drewp@bigasterisk.com>
date Thu, 21 Feb 2013 01:39:01 -0800
parents
children adb79e44323a
comparison
equal deleted inserted replaced
4:409da49c148d 5:f8c4c7ce5f4a
1 import lxml.html.soupparser
2 import datetime, socket
3 from dateutil.tz import tzlocal
4 import restkit
5
class CantGetTitle(ValueError):
    """Raised when a page's title could not be fetched.

    The exception message is a short parenthesized note; pageTitle()
    returns that message to its caller in place of a real title.
    """
8
class PageTitle(object):
    """Look up page titles for URIs, caching results in db['pageTitle']."""

    def __init__(self, db):
        """Remember the 'pageTitle' collection of db as the title cache.

        db only needs to support db['pageTitle']; the resulting collection
        is used through find_one() and insert().
        NOTE(review): presumably a pymongo database -- confirm with caller.
        """
        self.coll = db['pageTitle']

    def getPageTitleNow(self, uri):
        """Fetch uri over HTTP and return the text of its first <title>.

        Returns None when the parsed page contains no <title> element
        (the element's text itself may also be None).

        Raises CantGetTitle when the response status does not start with
        '2' or when restkit raises RequestError. The exception message is
        a short parenthesized note describing the failure.
        """
        try:
            response = restkit.request(
                uri, timeout=1, follow_redirect=True,
                headers={
                    'user-agent':
                    'link title checker - drewp@bigasterisk.com',
                })
            # status is treated as a string here, so any 2xx status passes.
            if not response.status.startswith('2'):
                raise CantGetTitle("(got %s)" % response.status)
            root = lxml.html.soupparser.fromstring(
                response.body_string())

            titles = root.cssselect("title")
            if titles:
                # Only the first <title> in document order is used.
                return titles[0].text
            # No <title> at all: make the previously implicit None explicit.
            return None
        except restkit.RequestError:
            raise CantGetTitle("(error requesting title from site)")

    def pageTitle(self, uri):
        """Return the title for uri from our db, or by loading the page.

        On a cache miss the page is fetched with getPageTitleNow() and the
        result is stored (with the fetch time) before being returned. If
        the fetch fails with CantGetTitle, the exception's message is
        returned instead and nothing is stored, so the next call retries.
        """
        doc = self.coll.find_one({'_id': uri})
        if doc is not None:
            return doc['title']

        try:
            title = self.getPageTitleNow(uri)
        except CantGetTitle as e:
            # 'as' form is valid on Python 2.6+ and on Python 3.
            return str(e)

        doc = {
            '_id': uri,
            'title': title,
            'getTime': datetime.datetime.now(tzlocal()),
        }
        self.coll.insert(doc, safe=True)
        return doc['title']