Mercurial > code > home > repos > href
comparison pagetitle.py @ 26:adb79e44323a
switch to 'requests' lib to fix SSL errors
Ignore-this: 749393b44686715dc27b2c9bb8717e82
author | drewp@bigasterisk.com |
---|---|
date | Sat, 24 Dec 2016 20:18:45 -0800 |
parents | f8c4c7ce5f4a |
children | 3d9dc1571ade |
comparison
equal
deleted
inserted
replaced
25:e02fc021ab89 | 26:adb79e44323a |
---|---|
1 import lxml.html.soupparser | 1 import lxml.html.soupparser |
2 import datetime, socket | 2 import datetime, socket |
3 from dateutil.tz import tzlocal | 3 from dateutil.tz import tzlocal |
4 import restkit | 4 import requests |
| | 5 import traceback |
5 | 6 |
6 class CantGetTitle(ValueError): | 7 class CantGetTitle(ValueError): |
7 pass | 8 pass |
8 | 9 |
9 class PageTitle(object): | 10 class PageTitle(object): |
10 def __init__(self, db): | 11 def __init__(self, db): |
11 self.coll = db['pageTitle'] | 12 self.coll = db['pageTitle'] |
12 | 13 |
13 def getPageTitleNow(self, uri): | 14 def getPageTitleNow(self, uri): |
14 try: | 15 try: |
15 response = restkit.request(uri, timeout=1, follow_redirect=True, | 16 response = requests.get(uri, timeout=1, allow_redirects=True, |
16 headers={ | 17 headers={ |
17 'user-agent': | 18 'user-agent': |
18 'link title checker - drewp@bigasterisk.com' | 19 'link title checker - drewp@bigasterisk.com' |
19 }) | 20 }) |
20 if not response.status.startswith('2'): | 21 if not str(response.status_code).startswith('2'): |
21 raise CantGetTitle("(got %s)" % response.status) | 22 raise CantGetTitle("(got %s)" % response.status_code) |
22 root = lxml.html.soupparser.fromstring( | 23 root = lxml.html.soupparser.fromstring( |
23 response.body_string()) | 24 response.text) |
24 | 25 |
25 for title in root.cssselect("title"): | 26 for title in root.cssselect("title"): |
26 return title.text | 27 return title.text |
27 except restkit.RequestError: | 28 except Exception: |
| | 29 traceback.print_exc() |
28 raise CantGetTitle("(error requesting title from site)") | 30 raise CantGetTitle("(error requesting title from site)") |
29 | 31 |
30 def pageTitle(self, uri): | 32 def pageTitle(self, uri): |
31 """page title from our db or by getting a new load from the page""" | 33 """page title from our db or by getting a new load from the page""" |
32 doc = self.coll.find_one({'_id' : uri}) | 34 doc = self.coll.find_one({'_id' : uri}) |