Mercurial > code > home > repos > href
annotate pagetitle.py @ 42:530650b3bc40 default tip
something changed in pom to break pyjwt. switched to jwskate
author | drewp@bigasterisk.com |
---|---|
date | Wed, 14 Dec 2022 22:07:19 -0800 |
parents | 293a694304b8 |
children |
rev | line source |
---|---|
41 | 1 import datetime |
2 import traceback | |
3 | |
5
f8c4c7ce5f4a
lots of href additions: add/edit, nav fixes
Drew Perttula <drewp@bigasterisk.com>
parents:
diff
changeset
|
4 import lxml.html.soupparser |
41 | 5 import requests |
5
f8c4c7ce5f4a
lots of href additions: add/edit, nav fixes
Drew Perttula <drewp@bigasterisk.com>
parents:
diff
changeset
|
6 from dateutil.tz import tzlocal |
f8c4c7ce5f4a
lots of href additions: add/edit, nav fixes
Drew Perttula <drewp@bigasterisk.com>
parents:
diff
changeset
|
7 |
30 | 8 |
5
f8c4c7ce5f4a
lots of href additions: add/edit, nav fixes
Drew Perttula <drewp@bigasterisk.com>
parents:
diff
changeset
|
class CantGetTitle(ValueError):
    """Raised when a page's title could not be fetched.

    The exception message is a short parenthesized description such as
    "(got 404)"; PageTitle.pageTitle returns that message text to its
    caller in place of a title.
    """
f8c4c7ce5f4a
lots of href additions: add/edit, nav fixes
Drew Perttula <drewp@bigasterisk.com>
parents:
diff
changeset
|
11 |
30 | 12 |
5
f8c4c7ce5f4a
lots of href additions: add/edit, nav fixes
Drew Perttula <drewp@bigasterisk.com>
parents:
diff
changeset
|
class PageTitle(object):
    """Look up page titles, keeping fetched titles in the 'pageTitle' collection."""

    def __init__(self, db):
        """Use db['pageTitle'] as the backing collection.

        The collection must provide find_one() and insert().
        """
        self.coll = db['pageTitle']

    def getPageTitleNow(self, uri):
        """Fetch uri over HTTP and return the text of its first <title>.

        Returns None when the parsed page has no <title> element (and
        whatever `title.text` is, possibly None, when it has one).

        Raises CantGetTitle with "(got <status>)" for a non-2xx response,
        or with "(error requesting title from site)" for any other
        failure while requesting or parsing the page.
        """
        try:
            response = requests.get(uri,
                                    timeout=3,
                                    allow_redirects=True,
                                    headers={'user-agent': 'link title checker - drewp@bigasterisk.com'})
            if not 200 <= response.status_code < 300:
                raise CantGetTitle("(got %s)" % response.status_code)
            root = lxml.html.soupparser.fromstring(response.text)

            for title in root.cssselect("title"):
                # Only the first <title> element matters.
                return title.text
        except CantGetTitle:
            # Let the status-specific message through; without this clause
            # the catch-all below would replace it with the generic one.
            raise
        except Exception:
            # Best-effort boundary: log the real cause, report a short
            # message to the caller.
            traceback.print_exc()
            raise CantGetTitle("(error requesting title from site)")
        return None

    def pageTitle(self, uri):
        """page title from our db or by getting a new load from the page

        On a cache miss the title is fetched with getPageTitleNow() and
        stored as {'_id': uri, 'title': ..., 'getTime': <aware local now>}.
        If the fetch raises CantGetTitle, the exception's message string is
        returned instead and nothing is stored.
        """
        doc = self.coll.find_one({'_id': uri})
        if doc is None:
            try:
                title = self.getPageTitleNow(uri)
            except CantGetTitle as e:
                return str(e)
            doc = {'_id': uri, 'title': title, 'getTime': datetime.datetime.now(tzlocal())}
            # NOTE(review): if self.coll is a PyMongo collection, `insert`
            # was removed in PyMongo 4 in favor of `insert_one` -- confirm
            # the driver version before upgrading.
            self.coll.insert(doc)
        return doc['title']