Mercurial > code > home > repos > href
comparison pagetitle.py @ 5:f8c4c7ce5f4a
lots of href additions: add/edit, nav fixes
Ignore-this: 863335c4680ac9bcc6a7fc5867638d61
author | Drew Perttula <drewp@bigasterisk.com> |
---|---|
date | Thu, 21 Feb 2013 01:39:01 -0800 |
parents | |
children | adb79e44323a |
comparison
equal
deleted
inserted
replaced
4:409da49c148d | 5:f8c4c7ce5f4a |
---|---|
1 import lxml.html.soupparser | |
2 import datetime, socket | |
3 from dateutil.tz import tzlocal | |
4 import restkit | |
5 | |
class CantGetTitle(ValueError):
    """Raised when a page's title cannot be fetched.

    The exception message is a short parenthesized note describing the
    failure; PageTitle.pageTitle returns that note in place of a title.
    """
8 | |
class PageTitle(object):
    """Page-title lookup backed by the 'pageTitle' collection of a db."""

    def __init__(self, db):
        """Keep a handle on db['pageTitle'], which is used as the title cache."""
        self.coll = db['pageTitle']

    def getPageTitleNow(self, uri):
        """Fetch uri over HTTP and return the text of its first <title>.

        Returns None when the parsed page has no <title> element (the
        returned text may also be None if the element is empty).

        Raises CantGetTitle when the request fails with
        restkit.RequestError or when the response status does not start
        with '2'.
        """
        # Only the network operations live inside the try; parsing the
        # body cannot raise restkit.RequestError.
        try:
            response = restkit.request(
                uri, timeout=1, follow_redirect=True,
                headers={
                    'user-agent':
                    'link title checker - drewp@bigasterisk.com'
                })
            if not response.status.startswith('2'):
                raise CantGetTitle("(got %s)" % response.status)
            body = response.body_string()
        except restkit.RequestError:
            raise CantGetTitle("(error requesting title from site)")

        root = lxml.html.soupparser.fromstring(body)
        for title in root.cssselect("title"):
            # Only the first <title> element matters.
            return title.text
        # No <title> at all: make the previously implicit None explicit.
        return None

    def pageTitle(self, uri):
        """Return the page title from our db, or by loading the page now.

        On a cache miss the page is fetched with getPageTitleNow and the
        result is stored under _id=uri together with a timezone-aware
        'getTime' stamp. If the fetch raises CantGetTitle, the exception
        message is returned instead of a title and nothing is stored.
        """
        doc = self.coll.find_one({'_id': uri})
        if doc is None:
            try:
                title = self.getPageTitleNow(uri)
            except CantGetTitle as e:
                # 'as' form works on Python 2.6+ and 3; the old
                # 'except X, e' spelling is a syntax error on Python 3.
                return str(e)
            doc = {'_id': uri, 'title': title,
                   'getTime': datetime.datetime.now(tzlocal())}
            self.coll.insert(doc, safe=True)
        return doc['title']