Mercurial > code > home > repos > href
annotate pagetitle.py @ 27:3d9dc1571ade
whitespace
Ignore-this: cd0c37de5a0635101fc9ea7a18653101
author | drewp@bigasterisk.com |
---|---|
date | Sat, 24 Dec 2016 20:19:23 -0800 |
parents | adb79e44323a |
children | 7c82ffbca5d0 |
rev | line source |
---|---|
5
f8c4c7ce5f4a
lots of href additions: add/edit, nav fixes
Drew Perttula <drewp@bigasterisk.com>
parents:
diff
changeset
|
1 import lxml.html.soupparser |
f8c4c7ce5f4a
lots of href additions: add/edit, nav fixes
Drew Perttula <drewp@bigasterisk.com>
parents:
diff
changeset
|
2 import datetime, socket |
f8c4c7ce5f4a
lots of href additions: add/edit, nav fixes
Drew Perttula <drewp@bigasterisk.com>
parents:
diff
changeset
|
3 from dateutil.tz import tzlocal |
26
adb79e44323a
switch to 'requests' lib to fix SSL errors
drewp@bigasterisk.com
parents:
5
diff
changeset
|
4 import requests |
adb79e44323a
switch to 'requests' lib to fix SSL errors
drewp@bigasterisk.com
parents:
5
diff
changeset
|
5 import traceback |
5
f8c4c7ce5f4a
lots of href additions: add/edit, nav fixes
Drew Perttula <drewp@bigasterisk.com>
parents:
diff
changeset
|
6 |
f8c4c7ce5f4a
lots of href additions: add/edit, nav fixes
Drew Perttula <drewp@bigasterisk.com>
parents:
diff
changeset
|
class CantGetTitle(ValueError):
    """Raised when a page title cannot be retrieved.

    The exception message is a short parenthesized note, e.g. "(got 404)",
    which PageTitle.pageTitle returns to its caller in place of a title.
    """
f8c4c7ce5f4a
lots of href additions: add/edit, nav fixes
Drew Perttula <drewp@bigasterisk.com>
parents:
diff
changeset
|
9 |
f8c4c7ce5f4a
lots of href additions: add/edit, nav fixes
Drew Perttula <drewp@bigasterisk.com>
parents:
diff
changeset
|
class PageTitle(object):
    """Look up HTML page titles, caching them in the db's 'pageTitle' collection."""

    # Seconds to wait on the remote site before giving up on the request.
    requestTimeout = 1
    # Sent as the user-agent header so site operators can identify the checker.
    userAgent = 'link title checker - drewp@bigasterisk.com'

    def __init__(self, db):
        """Keep a handle on db['pageTitle'], the collection used as the cache."""
        self.coll = db['pageTitle']

    def getPageTitleNow(self, uri):
        """Fetch uri over HTTP and return the text of its first <title> element.

        Returns None when the parsed page has no <title> element (the text of
        an existing element is returned as-is, whatever the parser gives).

        Raises CantGetTitle with "(got <status>)" when the response status is
        not 2xx, and CantGetTitle with a generic message for any other
        failure (network error, timeout, parse error); the underlying
        traceback is printed in that case.
        """
        try:
            response = requests.get(
                uri,
                timeout=self.requestTimeout,
                allow_redirects=True,
                headers={'user-agent': self.userAgent})
            if response.status_code // 100 != 2:
                raise CantGetTitle("(got %s)" % response.status_code)

            root = lxml.html.soupparser.fromstring(response.text)
            titles = root.cssselect("title")
            if titles:
                return titles[0].text
            return None
        except CantGetTitle:
            # Our own status-code error must reach the caller unchanged;
            # without this clause the blanket handler below would replace
            # "(got 404)" with the generic message.
            raise
        except Exception:
            traceback.print_exc()
            raise CantGetTitle("(error requesting title from site)")

    def pageTitle(self, uri):
        """page title from our db or by getting a new load from the page

        On a cache miss the page is fetched and the result is stored under
        _id=uri together with the fetch time. If the fetch fails, the
        CantGetTitle message is returned instead of a title and nothing is
        stored.
        """
        doc = self.coll.find_one({'_id': uri})
        if doc is not None:
            return doc['title']

        try:
            title = self.getPageTitleNow(uri)
        except CantGetTitle as e:
            return str(e)

        doc = {'_id': uri,
               'title': title,
               'getTime': datetime.datetime.now(tzlocal())}
        # NOTE(review): insert(..., safe=True) looks like the legacy pymongo
        # acknowledged-write API - confirm against the installed driver.
        self.coll.insert(doc, safe=True)
        return doc['title']