annotate pagetitle.py @ 42:530650b3bc40 default tip

something changed in pom to break pyjwt. switched to jwskate
author drewp@bigasterisk.com
date Wed, 14 Dec 2022 22:07:19 -0800
parents 293a694304b8
children
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
41
293a694304b8 reformat
drewp@bigasterisk.com
parents: 40
diff changeset
1 import datetime
293a694304b8 reformat
drewp@bigasterisk.com
parents: 40
diff changeset
2 import traceback
293a694304b8 reformat
drewp@bigasterisk.com
parents: 40
diff changeset
3
5
f8c4c7ce5f4a lots of href additions: add/edit, nav fixes
Drew Perttula <drewp@bigasterisk.com>
parents:
diff changeset
4 import lxml.html.soupparser
41
293a694304b8 reformat
drewp@bigasterisk.com
parents: 40
diff changeset
5 import requests
5
f8c4c7ce5f4a lots of href additions: add/edit, nav fixes
Drew Perttula <drewp@bigasterisk.com>
parents:
diff changeset
6 from dateutil.tz import tzlocal
f8c4c7ce5f4a lots of href additions: add/edit, nav fixes
Drew Perttula <drewp@bigasterisk.com>
parents:
diff changeset
7
30
e86642cf7393 style and requirements.txt cleanup
drewp@bigasterisk.com
parents: 28
diff changeset
8
5
f8c4c7ce5f4a lots of href additions: add/edit, nav fixes
Drew Perttula <drewp@bigasterisk.com>
parents:
diff changeset
class CantGetTitle(ValueError):
    """Raised when a page title cannot be fetched.

    The exception message is a short, parenthesized, human-readable reason
    (for example ``"(got 404)"``); ``PageTitle.pageTitle`` returns that
    message to its caller in place of a title.
    """
f8c4c7ce5f4a lots of href additions: add/edit, nav fixes
Drew Perttula <drewp@bigasterisk.com>
parents:
diff changeset
11
30
e86642cf7393 style and requirements.txt cleanup
drewp@bigasterisk.com
parents: 28
diff changeset
12
5
f8c4c7ce5f4a lots of href additions: add/edit, nav fixes
Drew Perttula <drewp@bigasterisk.com>
parents:
diff changeset
class PageTitle(object):
    """Look up the HTML <title> of a URI, caching results in a collection.

    Titles are stored as documents of the form
    ``{'_id': uri, 'title': ..., 'getTime': ...}`` in ``db['pageTitle']``.
    """

    def __init__(self, db):
        """Keep a handle to ``db['pageTitle']``, which is used as the cache.

        The collection object must provide ``find_one`` and ``insert``.
        """
        self.coll = db['pageTitle']

    def getPageTitleNow(self, uri):
        """Fetch ``uri`` over HTTP and return the text of its first <title>.

        Returns None when the page has no <title> element, or when the
        element has no text.

        Raises:
            CantGetTitle: with message "(got <status>)" when the response
                status is not 2xx, or "(error requesting title from site)"
                for any other failure (network error, parse error, ...).
        """
        try:
            response = requests.get(
                uri,
                timeout=3,
                allow_redirects=True,
                headers={'user-agent': 'link title checker - drewp@bigasterisk.com'})
            if not 200 <= response.status_code < 300:
                raise CantGetTitle("(got %s)" % response.status_code)
            root = lxml.html.soupparser.fromstring(response.text)

            for title in root.cssselect("title"):
                # Only the first <title> element is used.
                return title.text
            return None
        except CantGetTitle:
            # Let our own, more specific message (e.g. "(got 404)") through
            # instead of having the broad handler below replace it.
            raise
        except Exception as err:
            traceback.print_exc()
            raise CantGetTitle("(error requesting title from site)") from err

    def pageTitle(self, uri):
        """page title from our db or by getting a new load from the page

        On a cache miss the title is fetched and stored together with the
        fetch time. If fetching fails, the CantGetTitle message is returned
        as the result and nothing is stored, so a later call retries.
        """
        doc = self.coll.find_one({'_id': uri})
        if doc is None:
            try:
                title = self.getPageTitleNow(uri)
            except CantGetTitle as e:
                return str(e)
            doc = {'_id': uri, 'title': title, 'getTime': datetime.datetime.now(tzlocal())}
            # NOTE(review): if self.coll is a PyMongo collection, insert() was
            # removed in PyMongo 4 in favor of insert_one() -- confirm which
            # driver/version is in use before changing this call.
            self.coll.insert(doc)
        return doc['title']