annotate pagetitle.py @ 5:f8c4c7ce5f4a

lots of href additions: add/edit, nav fixes Ignore-this: 863335c4680ac9bcc6a7fc5867638d61
author Drew Perttula <drewp@bigasterisk.com>
date Thu, 21 Feb 2013 01:39:01 -0800
parents
children adb79e44323a
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
5
f8c4c7ce5f4a lots of href additions: add/edit, nav fixes
Drew Perttula <drewp@bigasterisk.com>
parents:
diff changeset
1 import lxml.html.soupparser
f8c4c7ce5f4a lots of href additions: add/edit, nav fixes
Drew Perttula <drewp@bigasterisk.com>
parents:
diff changeset
2 import datetime, socket
f8c4c7ce5f4a lots of href additions: add/edit, nav fixes
Drew Perttula <drewp@bigasterisk.com>
parents:
diff changeset
3 from dateutil.tz import tzlocal
f8c4c7ce5f4a lots of href additions: add/edit, nav fixes
Drew Perttula <drewp@bigasterisk.com>
parents:
diff changeset
4 import restkit
f8c4c7ce5f4a lots of href additions: add/edit, nav fixes
Drew Perttula <drewp@bigasterisk.com>
parents:
diff changeset
5
f8c4c7ce5f4a lots of href additions: add/edit, nav fixes
Drew Perttula <drewp@bigasterisk.com>
parents:
diff changeset
6 class CantGetTitle(ValueError):
f8c4c7ce5f4a lots of href additions: add/edit, nav fixes
Drew Perttula <drewp@bigasterisk.com>
parents:
diff changeset
7 pass
f8c4c7ce5f4a lots of href additions: add/edit, nav fixes
Drew Perttula <drewp@bigasterisk.com>
parents:
diff changeset
8
f8c4c7ce5f4a lots of href additions: add/edit, nav fixes
Drew Perttula <drewp@bigasterisk.com>
parents:
diff changeset
9 class PageTitle(object):
f8c4c7ce5f4a lots of href additions: add/edit, nav fixes
Drew Perttula <drewp@bigasterisk.com>
parents:
diff changeset
10 def __init__(self, db):
f8c4c7ce5f4a lots of href additions: add/edit, nav fixes
Drew Perttula <drewp@bigasterisk.com>
parents:
diff changeset
11 self.coll = db['pageTitle']
f8c4c7ce5f4a lots of href additions: add/edit, nav fixes
Drew Perttula <drewp@bigasterisk.com>
parents:
diff changeset
12
f8c4c7ce5f4a lots of href additions: add/edit, nav fixes
Drew Perttula <drewp@bigasterisk.com>
parents:
diff changeset
13 def getPageTitleNow(self, uri):
f8c4c7ce5f4a lots of href additions: add/edit, nav fixes
Drew Perttula <drewp@bigasterisk.com>
parents:
diff changeset
14 try:
f8c4c7ce5f4a lots of href additions: add/edit, nav fixes
Drew Perttula <drewp@bigasterisk.com>
parents:
diff changeset
15 response = restkit.request(uri, timeout=1, follow_redirect=True,
f8c4c7ce5f4a lots of href additions: add/edit, nav fixes
Drew Perttula <drewp@bigasterisk.com>
parents:
diff changeset
16 headers={
f8c4c7ce5f4a lots of href additions: add/edit, nav fixes
Drew Perttula <drewp@bigasterisk.com>
parents:
diff changeset
17 'user-agent':
f8c4c7ce5f4a lots of href additions: add/edit, nav fixes
Drew Perttula <drewp@bigasterisk.com>
parents:
diff changeset
18 'link title checker - drewp@bigasterisk.com'
f8c4c7ce5f4a lots of href additions: add/edit, nav fixes
Drew Perttula <drewp@bigasterisk.com>
parents:
diff changeset
19 })
f8c4c7ce5f4a lots of href additions: add/edit, nav fixes
Drew Perttula <drewp@bigasterisk.com>
parents:
diff changeset
20 if not response.status.startswith('2'):
f8c4c7ce5f4a lots of href additions: add/edit, nav fixes
Drew Perttula <drewp@bigasterisk.com>
parents:
diff changeset
21 raise CantGetTitle("(got %s)" % response.status)
f8c4c7ce5f4a lots of href additions: add/edit, nav fixes
Drew Perttula <drewp@bigasterisk.com>
parents:
diff changeset
22 root = lxml.html.soupparser.fromstring(
f8c4c7ce5f4a lots of href additions: add/edit, nav fixes
Drew Perttula <drewp@bigasterisk.com>
parents:
diff changeset
23 response.body_string())
f8c4c7ce5f4a lots of href additions: add/edit, nav fixes
Drew Perttula <drewp@bigasterisk.com>
parents:
diff changeset
24
f8c4c7ce5f4a lots of href additions: add/edit, nav fixes
Drew Perttula <drewp@bigasterisk.com>
parents:
diff changeset
25 for title in root.cssselect("title"):
f8c4c7ce5f4a lots of href additions: add/edit, nav fixes
Drew Perttula <drewp@bigasterisk.com>
parents:
diff changeset
26 return title.text
f8c4c7ce5f4a lots of href additions: add/edit, nav fixes
Drew Perttula <drewp@bigasterisk.com>
parents:
diff changeset
27 except restkit.RequestError:
f8c4c7ce5f4a lots of href additions: add/edit, nav fixes
Drew Perttula <drewp@bigasterisk.com>
parents:
diff changeset
28 raise CantGetTitle("(error requesting title from site)")
f8c4c7ce5f4a lots of href additions: add/edit, nav fixes
Drew Perttula <drewp@bigasterisk.com>
parents:
diff changeset
29
f8c4c7ce5f4a lots of href additions: add/edit, nav fixes
Drew Perttula <drewp@bigasterisk.com>
parents:
diff changeset
30 def pageTitle(self, uri):
f8c4c7ce5f4a lots of href additions: add/edit, nav fixes
Drew Perttula <drewp@bigasterisk.com>
parents:
diff changeset
31 """page title from our db or by getting a new load from the page"""
f8c4c7ce5f4a lots of href additions: add/edit, nav fixes
Drew Perttula <drewp@bigasterisk.com>
parents:
diff changeset
32 doc = self.coll.find_one({'_id' : uri})
f8c4c7ce5f4a lots of href additions: add/edit, nav fixes
Drew Perttula <drewp@bigasterisk.com>
parents:
diff changeset
33 if doc is None:
f8c4c7ce5f4a lots of href additions: add/edit, nav fixes
Drew Perttula <drewp@bigasterisk.com>
parents:
diff changeset
34 try:
f8c4c7ce5f4a lots of href additions: add/edit, nav fixes
Drew Perttula <drewp@bigasterisk.com>
parents:
diff changeset
35 title = self.getPageTitleNow(uri)
f8c4c7ce5f4a lots of href additions: add/edit, nav fixes
Drew Perttula <drewp@bigasterisk.com>
parents:
diff changeset
36 except CantGetTitle, e:
f8c4c7ce5f4a lots of href additions: add/edit, nav fixes
Drew Perttula <drewp@bigasterisk.com>
parents:
diff changeset
37 return str(e)
f8c4c7ce5f4a lots of href additions: add/edit, nav fixes
Drew Perttula <drewp@bigasterisk.com>
parents:
diff changeset
38 doc = {'_id': uri, 'title' : title,
f8c4c7ce5f4a lots of href additions: add/edit, nav fixes
Drew Perttula <drewp@bigasterisk.com>
parents:
diff changeset
39 'getTime':datetime.datetime.now(tzlocal())}
f8c4c7ce5f4a lots of href additions: add/edit, nav fixes
Drew Perttula <drewp@bigasterisk.com>
parents:
diff changeset
40 self.coll.insert(doc, safe=True)
f8c4c7ce5f4a lots of href additions: add/edit, nav fixes
Drew Perttula <drewp@bigasterisk.com>
parents:
diff changeset
41 return doc['title']