diff pagetitle.py @ 5:f8c4c7ce5f4a

lots of href additions: add/edit, nav fixes Ignore-this: 863335c4680ac9bcc6a7fc5867638d61
author Drew Perttula <drewp@bigasterisk.com>
date Thu, 21 Feb 2013 01:39:01 -0800
parents
children adb79e44323a
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/pagetitle.py	Thu Feb 21 01:39:01 2013 -0800
@@ -0,0 +1,41 @@
+import lxml.html.soupparser
+import datetime, socket
+from dateutil.tz import tzlocal
+import restkit
+
+class CantGetTitle(ValueError):
+    """Raised when a page title cannot be fetched.
+
+    The exception message is a short parenthesized note such as
+    "(got 404 ...)"; PageTitle.pageTitle returns str(e) to its caller in
+    place of a real title.
+    """
+
+class PageTitle(object):
+    """Look up web page titles, caching them in the 'pageTitle' collection.
+
+    Cached documents have the form
+    {'_id': <uri>, 'title': <title>, 'getTime': <datetime of the fetch>}.
+    """
+
+    def __init__(self, db):
+        """db: database handle; db['pageTitle'] is used as the title cache."""
+        self.coll = db['pageTitle']
+
+    def getPageTitleNow(self, uri):
+        """Fetch uri over HTTP and return the text of its first <title>.
+
+        Returns None when the parsed page contains no <title> element;
+        otherwise returns that element's .text unchanged.
+
+        Raises CantGetTitle when the response status does not start with
+        '2', or when restkit raises RequestError during the request.
+        """
+        # NOTE(review): only restkit.RequestError is translated into
+        # CantGetTitle; any other failure (e.g. a socket-level timeout or a
+        # parse error) propagates to the caller -- confirm that is intended.
+        try:
+            response = restkit.request(
+                uri, timeout=1, follow_redirect=True,
+                headers={
+                    'user-agent':
+                    'link title checker - drewp@bigasterisk.com'
+                })
+            # status is compared as a string, so any "2xx ..." passes
+            if not response.status.startswith('2'):
+                raise CantGetTitle("(got %s)" % response.status)
+            root = lxml.html.soupparser.fromstring(
+                response.body_string())
+        except restkit.RequestError:
+            raise CantGetTitle("(error requesting title from site)")
+
+        titles = root.cssselect("title")
+        if not titles:
+            # no <title> in the document; callers get None
+            return None
+        return titles[0].text
+
+    def pageTitle(self, uri):
+        """page title from our db or by getting a new load from the page
+
+        On a cache miss the page is fetched with getPageTitleNow and the
+        result is stored with the current local time before being returned.
+        If the fetch raises CantGetTitle, the exception message is returned
+        instead and nothing is stored.
+        """
+        doc = self.coll.find_one({'_id': uri})
+        if doc is not None:
+            return doc['title']
+
+        try:
+            title = self.getPageTitleNow(uri)
+        except CantGetTitle as e:
+            # failures are reported in place of the title and not cached,
+            # so the next call tries the fetch again
+            return str(e)
+
+        # NOTE: a None title (page without <title>) is cached like any other
+        doc = {'_id': uri, 'title': title,
+               'getTime': datetime.datetime.now(tzlocal())}
+        self.coll.insert(doc, safe=True)
+        return doc['title']