annotate pagetitle.py @ 30:e86642cf7393

style and requirements.txt cleanup
author drewp@bigasterisk.com
date Sun, 12 Jul 2020 13:33:54 -0700
parents 7c82ffbca5d0
children f3a15a724483
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
5
f8c4c7ce5f4a lots of href additions: add/edit, nav fixes
Drew Perttula <drewp@bigasterisk.com>
parents:
diff changeset
1 import lxml.html.soupparser
30
e86642cf7393 style and requirements.txt cleanup
drewp@bigasterisk.com
parents: 28
diff changeset
2 import datetime
5
f8c4c7ce5f4a lots of href additions: add/edit, nav fixes
Drew Perttula <drewp@bigasterisk.com>
parents:
diff changeset
3 from dateutil.tz import tzlocal
26
adb79e44323a switch to 'requests' lib to fix SSL errors
drewp@bigasterisk.com
parents: 5
diff changeset
4 import requests
adb79e44323a switch to 'requests' lib to fix SSL errors
drewp@bigasterisk.com
parents: 5
diff changeset
5 import traceback
5
f8c4c7ce5f4a lots of href additions: add/edit, nav fixes
Drew Perttula <drewp@bigasterisk.com>
parents:
diff changeset
6
30
e86642cf7393 style and requirements.txt cleanup
drewp@bigasterisk.com
parents: 28
diff changeset
7
5
f8c4c7ce5f4a lots of href additions: add/edit, nav fixes
Drew Perttula <drewp@bigasterisk.com>
parents:
diff changeset
class CantGetTitle(ValueError):
    """Raised when a page's title cannot be fetched.

    The message is a short parenthesized string; PageTitle.pageTitle returns
    that string to its caller in place of a real title.
    """


class PageTitle(object):
    """Look up page titles, keeping fetched ones in db['pageTitle']."""

    def __init__(self, db):
        """Keep a handle on the 'pageTitle' collection of db.

        Documents in that collection are keyed by uri ('_id') and carry
        'title' and 'getTime' fields (see pageTitle).
        """
        self.coll = db['pageTitle']

    def getPageTitleNow(self, uri):
        """Fetch uri over HTTP and return the text of its first <title>.

        Returns None when the parsed document has no <title> element.

        Raises:
            CantGetTitle: with "(got <status>)" when the response status
                does not start with '2', or with
                "(error requesting title from site)" for any other failure
                (in which case the traceback is printed).
        """
        try:
            response = requests.get(
                uri,
                timeout=1,
                allow_redirects=True,
                headers={
                    'user-agent': 'link title checker - drewp@bigasterisk.com'
                })
            if not str(response.status_code).startswith('2'):
                raise CantGetTitle("(got %s)" % response.status_code)
            root = lxml.html.soupparser.fromstring(response.text)

            for title in root.cssselect("title"):
                return title.text
            # No <title> element found.
            return None
        except CantGetTitle:
            # Already carries a specific message (e.g. the HTTP status).
            # Without this clause the generic handler below would catch it,
            # print a traceback and replace the message.
            raise
        except Exception:
            traceback.print_exc()
            raise CantGetTitle("(error requesting title from site)")

    def pageTitle(self, uri):
        """page title from our db or by getting a new load from the page

        On a miss the title is fetched with getPageTitleNow and stored along
        with the fetch time. If the fetch raises CantGetTitle, the exception
        message is returned instead and nothing is stored.
        """
        doc = self.coll.find_one({'_id': uri})
        if doc is not None:
            return doc['title']

        try:
            title = self.getPageTitleNow(uri)
        except CantGetTitle as e:
            return str(e)

        doc = {
            '_id': uri,
            'title': title,
            'getTime': datetime.datetime.now(tzlocal())
        }
        # NOTE(review): insert(..., safe=True) looks like the legacy PyMongo
        # API; confirm it is still supported by the driver version in use.
        self.coll.insert(doc, safe=True)
        return doc['title']