Mercurial > code > home > repos > href
annotate pagetitle.py @ 5:f8c4c7ce5f4a
lots of href additions: add/edit, nav fixes
Ignore-this: 863335c4680ac9bcc6a7fc5867638d61
author | Drew Perttula <drewp@bigasterisk.com> |
---|---|
date | Thu, 21 Feb 2013 01:39:01 -0800 |
parents | |
children | adb79e44323a |
rev | line source |
---|---|
5
f8c4c7ce5f4a
lots of href additions: add/edit, nav fixes
Drew Perttula <drewp@bigasterisk.com>
parents:
diff
changeset
|
1 import lxml.html.soupparser |
f8c4c7ce5f4a
lots of href additions: add/edit, nav fixes
Drew Perttula <drewp@bigasterisk.com>
parents:
diff
changeset
|
2 import datetime, socket |
f8c4c7ce5f4a
lots of href additions: add/edit, nav fixes
Drew Perttula <drewp@bigasterisk.com>
parents:
diff
changeset
|
3 from dateutil.tz import tzlocal |
f8c4c7ce5f4a
lots of href additions: add/edit, nav fixes
Drew Perttula <drewp@bigasterisk.com>
parents:
diff
changeset
|
4 import restkit |
f8c4c7ce5f4a
lots of href additions: add/edit, nav fixes
Drew Perttula <drewp@bigasterisk.com>
parents:
diff
changeset
|
5 |
f8c4c7ce5f4a
lots of href additions: add/edit, nav fixes
Drew Perttula <drewp@bigasterisk.com>
parents:
diff
changeset
|
6 class CantGetTitle(ValueError): |
f8c4c7ce5f4a
lots of href additions: add/edit, nav fixes
Drew Perttula <drewp@bigasterisk.com>
parents:
diff
changeset
|
7 pass |
f8c4c7ce5f4a
lots of href additions: add/edit, nav fixes
Drew Perttula <drewp@bigasterisk.com>
parents:
diff
changeset
|
8 |
f8c4c7ce5f4a
lots of href additions: add/edit, nav fixes
Drew Perttula <drewp@bigasterisk.com>
parents:
diff
changeset
|
9 class PageTitle(object): |
f8c4c7ce5f4a
lots of href additions: add/edit, nav fixes
Drew Perttula <drewp@bigasterisk.com>
parents:
diff
changeset
|
10 def __init__(self, db): |
f8c4c7ce5f4a
lots of href additions: add/edit, nav fixes
Drew Perttula <drewp@bigasterisk.com>
parents:
diff
changeset
|
11 self.coll = db['pageTitle'] |
f8c4c7ce5f4a
lots of href additions: add/edit, nav fixes
Drew Perttula <drewp@bigasterisk.com>
parents:
diff
changeset
|
12 |
f8c4c7ce5f4a
lots of href additions: add/edit, nav fixes
Drew Perttula <drewp@bigasterisk.com>
parents:
diff
changeset
|
13 def getPageTitleNow(self, uri): |
f8c4c7ce5f4a
lots of href additions: add/edit, nav fixes
Drew Perttula <drewp@bigasterisk.com>
parents:
diff
changeset
|
14 try: |
f8c4c7ce5f4a
lots of href additions: add/edit, nav fixes
Drew Perttula <drewp@bigasterisk.com>
parents:
diff
changeset
|
15 response = restkit.request(uri, timeout=1, follow_redirect=True, |
f8c4c7ce5f4a
lots of href additions: add/edit, nav fixes
Drew Perttula <drewp@bigasterisk.com>
parents:
diff
changeset
|
16 headers={ |
f8c4c7ce5f4a
lots of href additions: add/edit, nav fixes
Drew Perttula <drewp@bigasterisk.com>
parents:
diff
changeset
|
17 'user-agent': |
f8c4c7ce5f4a
lots of href additions: add/edit, nav fixes
Drew Perttula <drewp@bigasterisk.com>
parents:
diff
changeset
|
18 'link title checker - drewp@bigasterisk.com' |
f8c4c7ce5f4a
lots of href additions: add/edit, nav fixes
Drew Perttula <drewp@bigasterisk.com>
parents:
diff
changeset
|
19 }) |
f8c4c7ce5f4a
lots of href additions: add/edit, nav fixes
Drew Perttula <drewp@bigasterisk.com>
parents:
diff
changeset
|
20 if not response.status.startswith('2'): |
f8c4c7ce5f4a
lots of href additions: add/edit, nav fixes
Drew Perttula <drewp@bigasterisk.com>
parents:
diff
changeset
|
21 raise CantGetTitle("(got %s)" % response.status) |
f8c4c7ce5f4a
lots of href additions: add/edit, nav fixes
Drew Perttula <drewp@bigasterisk.com>
parents:
diff
changeset
|
22 root = lxml.html.soupparser.fromstring( |
f8c4c7ce5f4a
lots of href additions: add/edit, nav fixes
Drew Perttula <drewp@bigasterisk.com>
parents:
diff
changeset
|
23 response.body_string()) |
f8c4c7ce5f4a
lots of href additions: add/edit, nav fixes
Drew Perttula <drewp@bigasterisk.com>
parents:
diff
changeset
|
24 |
f8c4c7ce5f4a
lots of href additions: add/edit, nav fixes
Drew Perttula <drewp@bigasterisk.com>
parents:
diff
changeset
|
25 for title in root.cssselect("title"): |
f8c4c7ce5f4a
lots of href additions: add/edit, nav fixes
Drew Perttula <drewp@bigasterisk.com>
parents:
diff
changeset
|
26 return title.text |
f8c4c7ce5f4a
lots of href additions: add/edit, nav fixes
Drew Perttula <drewp@bigasterisk.com>
parents:
diff
changeset
|
27 except restkit.RequestError: |
f8c4c7ce5f4a
lots of href additions: add/edit, nav fixes
Drew Perttula <drewp@bigasterisk.com>
parents:
diff
changeset
|
28 raise CantGetTitle("(error requesting title from site)") |
f8c4c7ce5f4a
lots of href additions: add/edit, nav fixes
Drew Perttula <drewp@bigasterisk.com>
parents:
diff
changeset
|
29 |
f8c4c7ce5f4a
lots of href additions: add/edit, nav fixes
Drew Perttula <drewp@bigasterisk.com>
parents:
diff
changeset
|
30 def pageTitle(self, uri): |
f8c4c7ce5f4a
lots of href additions: add/edit, nav fixes
Drew Perttula <drewp@bigasterisk.com>
parents:
diff
changeset
|
31 """page title from our db or by getting a new load from the page""" |
f8c4c7ce5f4a
lots of href additions: add/edit, nav fixes
Drew Perttula <drewp@bigasterisk.com>
parents:
diff
changeset
|
32 doc = self.coll.find_one({'_id' : uri}) |
f8c4c7ce5f4a
lots of href additions: add/edit, nav fixes
Drew Perttula <drewp@bigasterisk.com>
parents:
diff
changeset
|
33 if doc is None: |
f8c4c7ce5f4a
lots of href additions: add/edit, nav fixes
Drew Perttula <drewp@bigasterisk.com>
parents:
diff
changeset
|
34 try: |
f8c4c7ce5f4a
lots of href additions: add/edit, nav fixes
Drew Perttula <drewp@bigasterisk.com>
parents:
diff
changeset
|
35 title = self.getPageTitleNow(uri) |
f8c4c7ce5f4a
lots of href additions: add/edit, nav fixes
Drew Perttula <drewp@bigasterisk.com>
parents:
diff
changeset
|
36 except CantGetTitle, e: |
f8c4c7ce5f4a
lots of href additions: add/edit, nav fixes
Drew Perttula <drewp@bigasterisk.com>
parents:
diff
changeset
|
37 return str(e) |
f8c4c7ce5f4a
lots of href additions: add/edit, nav fixes
Drew Perttula <drewp@bigasterisk.com>
parents:
diff
changeset
|
38 doc = {'_id': uri, 'title' : title, |
f8c4c7ce5f4a
lots of href additions: add/edit, nav fixes
Drew Perttula <drewp@bigasterisk.com>
parents:
diff
changeset
|
39 'getTime':datetime.datetime.now(tzlocal())} |
f8c4c7ce5f4a
lots of href additions: add/edit, nav fixes
Drew Perttula <drewp@bigasterisk.com>
parents:
diff
changeset
|
40 self.coll.insert(doc, safe=True) |
f8c4c7ce5f4a
lots of href additions: add/edit, nav fixes
Drew Perttula <drewp@bigasterisk.com>
parents:
diff
changeset
|
41 return doc['title'] |