Mercurial > code > home > repos > href
comparison pagetitle.py @ 41:293a694304b8
reformat
author | drewp@bigasterisk.com |
---|---|
date | Sat, 19 Nov 2022 17:18:55 -0800 |
parents | 94181d521d6d |
children |
comparison
equal
deleted
inserted
replaced
40:94181d521d6d | 41:293a694304b8 |
---|---|
1 import datetime | |
2 import traceback | |
3 | |
1 import lxml.html.soupparser | 4 import lxml.html.soupparser |
2 import datetime | 5 import requests |
3 from dateutil.tz import tzlocal | 6 from dateutil.tz import tzlocal |
4 import requests | |
5 import traceback | |
6 | 7 |
7 | 8 |
8 class CantGetTitle(ValueError): | 9 class CantGetTitle(ValueError): |
9 pass | 10 pass |
10 | 11 |
11 | 12 |
12 class PageTitle(object): | 13 class PageTitle(object): |
14 | |
13 def __init__(self, db): | 15 def __init__(self, db): |
14 self.coll = db['pageTitle'] | 16 self.coll = db['pageTitle'] |
15 | 17 |
16 def getPageTitleNow(self, uri): | 18 def getPageTitleNow(self, uri): |
17 try: | 19 try: |
18 response = requests.get( | 20 response = requests.get(uri, |
19 uri, | 21 timeout=3, |
20 timeout=3, | 22 allow_redirects=True, |
21 allow_redirects=True, | 23 headers={'user-agent': 'link title checker - drewp@bigasterisk.com'}) |
22 headers={ | |
23 'user-agent': 'link title checker - drewp@bigasterisk.com' | |
24 }) | |
25 if not str(response.status_code).startswith('2'): | 24 if not str(response.status_code).startswith('2'): |
26 raise CantGetTitle("(got %s)" % response.status_code) | 25 raise CantGetTitle("(got %s)" % response.status_code) |
27 root = lxml.html.soupparser.fromstring(response.text) | 26 root = lxml.html.soupparser.fromstring(response.text) |
28 | 27 |
29 for title in root.cssselect("title"): | 28 for title in root.cssselect("title"): |
38 if doc is None: | 37 if doc is None: |
39 try: | 38 try: |
40 title = self.getPageTitleNow(uri) | 39 title = self.getPageTitleNow(uri) |
41 except CantGetTitle as e: | 40 except CantGetTitle as e: |
42 return str(e) | 41 return str(e) |
43 doc = { | 42 doc = {'_id': uri, 'title': title, 'getTime': datetime.datetime.now(tzlocal())} |
44 '_id': uri, | |
45 'title': title, | |
46 'getTime': datetime.datetime.now(tzlocal()) | |
47 } | |
48 self.coll.insert(doc) | 43 self.coll.insert(doc) |
49 return doc['title'] | 44 return doc['title'] |