annotate pagetitle.py @ 27:3d9dc1571ade

whitespace Ignore-this: cd0c37de5a0635101fc9ea7a18653101
author drewp@bigasterisk.com
date Sat, 24 Dec 2016 20:19:23 -0800
parents adb79e44323a
children 7c82ffbca5d0
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
5
f8c4c7ce5f4a lots of href additions: add/edit, nav fixes
Drew Perttula <drewp@bigasterisk.com>
parents:
diff changeset
1 import lxml.html.soupparser
f8c4c7ce5f4a lots of href additions: add/edit, nav fixes
Drew Perttula <drewp@bigasterisk.com>
parents:
diff changeset
2 import datetime, socket
f8c4c7ce5f4a lots of href additions: add/edit, nav fixes
Drew Perttula <drewp@bigasterisk.com>
parents:
diff changeset
3 from dateutil.tz import tzlocal
26
adb79e44323a switch to 'requests' lib to fix SSL errors
drewp@bigasterisk.com
parents: 5
diff changeset
4 import requests
adb79e44323a switch to 'requests' lib to fix SSL errors
drewp@bigasterisk.com
parents: 5
diff changeset
5 import traceback
5
f8c4c7ce5f4a lots of href additions: add/edit, nav fixes
Drew Perttula <drewp@bigasterisk.com>
parents:
diff changeset
6
f8c4c7ce5f4a lots of href additions: add/edit, nav fixes
Drew Perttula <drewp@bigasterisk.com>
parents:
diff changeset
class CantGetTitle(ValueError):
    """Raised when the title of a page could not be obtained.

    The exception message is a short parenthesized explanation;
    PageTitle.pageTitle returns it (via str()) in place of a real title.
    """
f8c4c7ce5f4a lots of href additions: add/edit, nav fixes
Drew Perttula <drewp@bigasterisk.com>
parents:
diff changeset
9
f8c4c7ce5f4a lots of href additions: add/edit, nav fixes
Drew Perttula <drewp@bigasterisk.com>
parents:
diff changeset
class PageTitle(object):
    """Look up HTML page titles, caching results in the 'pageTitle' collection."""

    def __init__(self, db):
        """Keep a handle on db['pageTitle'], where fetched titles are stored.

        The collection must provide find_one() and insert() as used by
        pageTitle() (presumably a pymongo collection -- confirm with caller).
        """
        self.coll = db['pageTitle']

    def getPageTitleNow(self, uri):
        """Fetch uri over HTTP and return the text of its first <title>.

        Returns None when the parsed page contains no <title> element.

        Raises CantGetTitle:
          - "(got <status>)" when the response status is not 2xx;
          - "(error requesting title from site)" for any other failure
            (request error, timeout, parse error); the original traceback
            is printed for diagnosis.
        """
        try:
            response = requests.get(
                uri, timeout=1, allow_redirects=True,
                headers={
                    'user-agent':
                    'link title checker - drewp@bigasterisk.com'
                })
            if not 200 <= response.status_code < 300:
                raise CantGetTitle("(got %s)" % response.status_code)
            root = lxml.html.soupparser.fromstring(response.text)

            for title in root.cssselect("title"):
                return title.text
        except CantGetTitle:
            # Our own status-code error already carries a useful message;
            # let it through instead of masking it with the generic one below.
            raise
        except Exception:
            traceback.print_exc()
            raise CantGetTitle("(error requesting title from site)")
        # No <title> element was found.
        return None

    def pageTitle(self, uri):
        """page title from our db or by getting a new load from the page

        On a cache miss the title is fetched with getPageTitleNow() and
        stored together with the fetch time. If fetching fails, the
        CantGetTitle message is returned as the title text and nothing is
        stored, so a later call will try the site again.
        """
        doc = self.coll.find_one({'_id': uri})
        if doc is None:
            try:
                title = self.getPageTitleNow(uri)
            except CantGetTitle as e:
                return str(e)
            doc = {'_id': uri, 'title': title,
                   'getTime': datetime.datetime.now(tzlocal())}
            # NOTE(review): insert(..., safe=True) looks like the legacy
            # pymongo write API; confirm against the installed version
            # before upgrading the driver.
            self.coll.insert(doc, safe=True)
        return doc['title']