Mercurial > code > home > repos > href
annotate pagetitle.py @ 30:e86642cf7393
style and requirements.txt cleanup
author | drewp@bigasterisk.com |
---|---|
date | Sun, 12 Jul 2020 13:33:54 -0700 |
parents | 7c82ffbca5d0 |
children | f3a15a724483 |
rev | line source |
---|---|
5
f8c4c7ce5f4a
lots of href additions: add/edit, nav fixes
Drew Perttula <drewp@bigasterisk.com>
parents:
diff
changeset
|
1 import lxml.html.soupparser |
30 | 2 import datetime |
5
f8c4c7ce5f4a
lots of href additions: add/edit, nav fixes
Drew Perttula <drewp@bigasterisk.com>
parents:
diff
changeset
|
3 from dateutil.tz import tzlocal |
26
adb79e44323a
switch to 'requests' lib to fix SSL errors
drewp@bigasterisk.com
parents:
5
diff
changeset
|
4 import requests |
adb79e44323a
switch to 'requests' lib to fix SSL errors
drewp@bigasterisk.com
parents:
5
diff
changeset
|
5 import traceback |
5
f8c4c7ce5f4a
lots of href additions: add/edit, nav fixes
Drew Perttula <drewp@bigasterisk.com>
parents:
diff
changeset
|
6 |
30 | 7 |
5
f8c4c7ce5f4a
lots of href additions: add/edit, nav fixes
Drew Perttula <drewp@bigasterisk.com>
parents:
diff
changeset
|
class CantGetTitle(ValueError):
    """Signals that a page title could not be obtained.

    The exception message is a short parenthesised explanation; it is what
    PageTitle.pageTitle hands back in place of a real title.
    """
f8c4c7ce5f4a
lots of href additions: add/edit, nav fixes
Drew Perttula <drewp@bigasterisk.com>
parents:
diff
changeset
|
10 |
30 | 11 |
5
f8c4c7ce5f4a
lots of href additions: add/edit, nav fixes
Drew Perttula <drewp@bigasterisk.com>
parents:
diff
changeset
|
class PageTitle(object):
    """Page titles for URIs, fetched over HTTP and cached in a db collection."""

    def __init__(self, db):
        """Keep a handle on the 'pageTitle' collection of ``db``.

        ``db`` only needs to support ``db['pageTitle']``; the resulting
        collection is used through ``find_one`` and ``insert``
        (presumably a pymongo database -- confirm against the caller).
        """
        self.coll = db['pageTitle']

    def getPageTitleNow(self, uri):
        """Fetch ``uri`` right now and return the text of its first <title>.

        The request uses a 1 second timeout, follows redirects, and sends
        an identifying user-agent header.

        Returns ``title.text`` of the first element matched by the
        ``title`` selector. If the parsed page has no such element the
        loop body never runs and the method returns None.

        Raises:
            CantGetTitle: "(got <status>)" when the response status does
                not start with '2'; "(error requesting title from site)"
                when the request or the parsing fails for any other
                reason (the traceback is printed first).
        """
        try:
            response = requests.get(
                uri,
                timeout=1,
                allow_redirects=True,
                headers={
                    'user-agent': 'link title checker - drewp@bigasterisk.com'
                })
            if not str(response.status_code).startswith('2'):
                raise CantGetTitle("(got %s)" % response.status_code)
            root = lxml.html.soupparser.fromstring(response.text)

            for title in root.cssselect("title"):
                # Only the first match matters.
                return title.text
        except CantGetTitle:
            # CantGetTitle is itself an Exception subclass; without this
            # clause the handler below would swallow the status-specific
            # "(got NNN)" message and replace it with the generic one.
            raise
        except Exception:
            traceback.print_exc()
            raise CantGetTitle("(error requesting title from site)")

    def pageTitle(self, uri):
        """page title from our db or by getting a new load from the page

        Looks ``uri`` up by ``_id`` in the collection. On a miss the page
        is fetched with getPageTitleNow; a successful result is stored
        together with the local, timezone-aware fetch time and returned.
        When the fetch raises CantGetTitle, the exception message is
        returned instead and nothing is stored.
        """
        doc = self.coll.find_one({'_id': uri})
        if doc is not None:
            return doc['title']

        try:
            title = self.getPageTitleNow(uri)
        except CantGetTitle as e:
            return str(e)

        doc = {
            '_id': uri,
            'title': title,
            'getTime': datetime.datetime.now(tzlocal())
        }
        # NOTE(review): insert(..., safe=True) looks like the legacy pymongo
        # write API -- confirm it matches the installed driver version.
        self.coll.insert(doc, safe=True)
        return doc['title']