diff flow/build_index.py @ 10:13438795d896

rewrite with prefect flows and whoosh search, but it's in a nested pdm env
author drewp@bigasterisk.com
date Thu, 11 Jul 2024 17:35:31 -0700
parents
children
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/flow/build_index.py	Thu Jul 11 17:35:31 2024 -0700
@@ -0,0 +1,81 @@
+from pathlib import Path
+from typing import Iterable, cast
+
+import search_index
+from download import getCityMutableJson, getCityPermanent
+from local_types import MeetingRow, Url
+from lxml.html import fromstring
+from prefect import flow, task
+from prefect.logging import get_run_logger
+from search_index import SearchIndex
+
+log = None  # placeholder; buildIndex() rebinds this to the run logger
+
+
+@task()
+def meetingListUrls() -> Iterable[Url]:
+    """Return the primegov API urls to read: the 2024 archive list, then upcoming."""
+    archived = "https://albanyca.primegov.com/api/v2/PublicPortal/ListArchivedMeetings?year=2024"
+    upcoming = "https://albanyca.primegov.com/api/v2/PublicPortal/ListUpcomingMeetings"
+    return [archived, upcoming]
+
+
+def meetingAgendaUrl(mtg: MeetingRow) -> Url:
+    """Return the portal url for mtg's first 'HTML Agenda Packet'; ValueError if it has none."""
+    packet = next((d for d in mtg.get('documentList', []) if d['templateName'] == 'HTML Agenda Packet'), None)
+    if packet is None:
+        raise ValueError(f"no agenda doc found for {mtg['id']=}")
+    return f"https://albanyca.primegov.com/Portal/Meeting?meetingTemplateId={packet['templateId']}"
+
+
+def extractMeetingText(mhtml: str) -> list[str]:
+    """Return the non-blank text chunks found inside div#meetingSection of `mhtml`.
+
+    Raises IndexError when the page has no such div."""
+    el = fromstring(mhtml)
+    m = el.cssselect('div#meetingSection')[0]
+    for st in m.cssselect('style'):
+        # Blank the CSS but keep the tail: a bare clear() also drops the text after </style>.
+        st.clear(keep_tail=True)
+    return [chunk for chunk in m.xpath('.//text()', namespaces=el.nsmap) if chunk.strip()]
+
+
+def addMeeting(index: SearchIndex, mtg: MeetingRow):
+    """Add one meeting to `index`: its agenda text now, its video some day.
+
+    A meeting without an agenda packet is skipped quietly; download and
+    parse errors are not caught here.
+    """
+    try:
+        agendaUrl = meetingAgendaUrl(mtg)
+    except ValueError:
+        agendaUrl = None  # this meeting has no agenda document
+    if agendaUrl is not None:
+        page = getCityPermanent(agendaUrl)
+        chunks = extractMeetingText(page)
+        # todo group phrases
+        index.addDoc(title=f'{mtg["date"]} {mtg["title"]}', content=chunks)
+
+    # Video indexing is still a stub: only detect a usable url for now.
+    if mtg.get('videoUrl'):
+        pass  # todo: transcribe and index video...
+
+
+@flow(persist_result=True)
+def buildIndex():
+    """Flow: index every listed meeting into /tmp/scoindex, then commit."""
+    global log
+    # fetched here, inside the flow, and shared with the search_index module
+    log = search_index.log = get_run_logger()
+
+    idx = SearchIndex(Path('/tmp/scoindex'))
+    for listUrl in meetingListUrls():
+        mtgs = cast(list[MeetingRow], getCityMutableJson(listUrl))
+        log.info(f'got {len(mtgs)=}')  # the name `mtgs` is part of the message
+        for mtg in mtgs:
+            addMeeting(idx, mtg)
+    idx.commit()
+
+
+if __name__ == '__main__':  # only when executed as a script, not on import
+    buildIndex.serve()  # NOTE(review): presumably blocks, serving runs of the flow; confirm in prefect docs