Mercurial > code > home > repos > href
diff pagetitle.py @ 30:e86642cf7393
style and requirements.txt cleanup
author | drewp@bigasterisk.com |
---|---|
date | Sun, 12 Jul 2020 13:33:54 -0700 |
parents | 7c82ffbca5d0 |
children | f3a15a724483 |
line wrap: on
line diff
--- a/pagetitle.py Sun Jul 12 13:17:13 2020 -0700 +++ b/pagetitle.py Sun Jul 12 13:33:54 2020 -0700 @@ -1,43 +1,49 @@ import lxml.html.soupparser -import datetime, socket +import datetime from dateutil.tz import tzlocal import requests import traceback + class CantGetTitle(ValueError): pass + class PageTitle(object): def __init__(self, db): self.coll = db['pageTitle'] def getPageTitleNow(self, uri): try: - response = requests.get(uri, timeout=1, allow_redirects=True, - headers={ - 'user-agent': - 'link title checker - drewp@bigasterisk.com' - }) + response = requests.get( + uri, + timeout=1, + allow_redirects=True, + headers={ + 'user-agent': 'link title checker - drewp@bigasterisk.com' + }) if not str(response.status_code).startswith('2'): raise CantGetTitle("(got %s)" % response.status_code) - root = lxml.html.soupparser.fromstring( - response.text) + root = lxml.html.soupparser.fromstring(response.text) for title in root.cssselect("title"): return title.text except Exception: traceback.print_exc() raise CantGetTitle("(error requesting title from site)") - + def pageTitle(self, uri): """page title from our db or by getting a new load from the page""" - doc = self.coll.find_one({'_id' : uri}) + doc = self.coll.find_one({'_id': uri}) if doc is None: try: title = self.getPageTitleNow(uri) except CantGetTitle as e: return str(e) - doc = {'_id': uri, 'title' : title, - 'getTime':datetime.datetime.now(tzlocal())} + doc = { + '_id': uri, + 'title': title, + 'getTime': datetime.datetime.now(tzlocal()) + } self.coll.insert(doc, safe=True) return doc['title']