comparison pagetitle.py @ 26:adb79e44323a

switch to 'requests' lib to fix SSL errors Ignore-this: 749393b44686715dc27b2c9bb8717e82
author drewp@bigasterisk.com
date Sat, 24 Dec 2016 20:18:45 -0800
parents f8c4c7ce5f4a
children 3d9dc1571ade
comparison
equal deleted inserted replaced
25:e02fc021ab89 26:adb79e44323a
1 import lxml.html.soupparser 1 import lxml.html.soupparser
2 import datetime, socket 2 import datetime, socket
3 from dateutil.tz import tzlocal 3 from dateutil.tz import tzlocal
4 import restkit 4 import requests
5 import traceback
5 6
6 class CantGetTitle(ValueError): 7 class CantGetTitle(ValueError):
7 pass 8 pass
8 9
9 class PageTitle(object): 10 class PageTitle(object):
10 def __init__(self, db): 11 def __init__(self, db):
11 self.coll = db['pageTitle'] 12 self.coll = db['pageTitle']
12 13
13 def getPageTitleNow(self, uri): 14 def getPageTitleNow(self, uri):
14 try: 15 try:
15 response = restkit.request(uri, timeout=1, follow_redirect=True, 16 response = requests.get(uri, timeout=1, allow_redirects=True,
16 headers={ 17 headers={
17 'user-agent': 18 'user-agent':
18 'link title checker - drewp@bigasterisk.com' 19 'link title checker - drewp@bigasterisk.com'
19 }) 20 })
20 if not response.status.startswith('2'): 21 if not str(response.status_code).startswith('2'):
21 raise CantGetTitle("(got %s)" % response.status) 22 raise CantGetTitle("(got %s)" % response.status_code)
22 root = lxml.html.soupparser.fromstring( 23 root = lxml.html.soupparser.fromstring(
23 response.body_string()) 24 response.text)
24 25
25 for title in root.cssselect("title"): 26 for title in root.cssselect("title"):
26 return title.text 27 return title.text
27 except restkit.RequestError: 28 except Exception:
29 traceback.print_exc()
28 raise CantGetTitle("(error requesting title from site)") 30 raise CantGetTitle("(error requesting title from site)")
29 31
30 def pageTitle(self, uri): 32 def pageTitle(self, uri):
31 """page title from our db or by getting a new load from the page""" 33 """page title from our db or by getting a new load from the page"""
32 doc = self.coll.find_one({'_id' : uri}) 34 doc = self.coll.find_one({'_id' : uri})