comparison pagetitle.py @ 41:293a694304b8

reformat
author drewp@bigasterisk.com
date Sat, 19 Nov 2022 17:18:55 -0800
parents 94181d521d6d
children
comparison
equal deleted inserted replaced
40:94181d521d6d 41:293a694304b8
1 import datetime
2 import traceback
3
1 import lxml.html.soupparser 4 import lxml.html.soupparser
2 import datetime 5 import requests
3 from dateutil.tz import tzlocal 6 from dateutil.tz import tzlocal
4 import requests
5 import traceback
6 7
7 8
8 class CantGetTitle(ValueError): 9 class CantGetTitle(ValueError):
9 pass 10 pass
10 11
11 12
12 class PageTitle(object): 13 class PageTitle(object):
14
13 def __init__(self, db): 15 def __init__(self, db):
14 self.coll = db['pageTitle'] 16 self.coll = db['pageTitle']
15 17
16 def getPageTitleNow(self, uri): 18 def getPageTitleNow(self, uri):
17 try: 19 try:
18 response = requests.get( 20 response = requests.get(uri,
19 uri, 21 timeout=3,
20 timeout=3, 22 allow_redirects=True,
21 allow_redirects=True, 23 headers={'user-agent': 'link title checker - drewp@bigasterisk.com'})
22 headers={
23 'user-agent': 'link title checker - drewp@bigasterisk.com'
24 })
25 if not str(response.status_code).startswith('2'): 24 if not str(response.status_code).startswith('2'):
26 raise CantGetTitle("(got %s)" % response.status_code) 25 raise CantGetTitle("(got %s)" % response.status_code)
27 root = lxml.html.soupparser.fromstring(response.text) 26 root = lxml.html.soupparser.fromstring(response.text)
28 27
29 for title in root.cssselect("title"): 28 for title in root.cssselect("title"):
38 if doc is None: 37 if doc is None:
39 try: 38 try:
40 title = self.getPageTitleNow(uri) 39 title = self.getPageTitleNow(uri)
41 except CantGetTitle as e: 40 except CantGetTitle as e:
42 return str(e) 41 return str(e)
43 doc = { 42 doc = {'_id': uri, 'title': title, 'getTime': datetime.datetime.now(tzlocal())}
44 '_id': uri,
45 'title': title,
46 'getTime': datetime.datetime.now(tzlocal())
47 }
48 self.coll.insert(doc) 43 self.coll.insert(doc)
49 return doc['title'] 44 return doc['title']