Mercurial > code > home > repos > sco-bot
diff flow/build_index.py @ 10:13438795d896
rewrite with prefect flows and whoosh search, but it's in a nested pdm env
author | drewp@bigasterisk.com |
---|---|
date | Thu, 11 Jul 2024 17:35:31 -0700 |
parents | |
children |
line wrap: on
line diff
"""Prefect flow that builds the meeting search index.

Fetches the meeting lists from the Albany, CA PrimeGov portal, downloads each
meeting's HTML agenda packet, extracts its text, and adds it to a
``SearchIndex``.
"""
from pathlib import Path
from typing import Iterable, cast

import search_index
from download import getCityMutableJson, getCityPermanent
from local_types import MeetingRow, Url
from lxml.html import fromstring
from prefect import flow, task
from prefect.logging import get_run_logger
from search_index import SearchIndex

# Replaced with the Prefect run logger when buildIndex() starts.
log = None


@task()
def meetingListUrls() -> Iterable[Url]:
    """Return the PrimeGov API URLs that list archived and upcoming meetings."""
    return [
        "https://albanyca.primegov.com/api/v2/PublicPortal/ListArchivedMeetings?year=2024",
        "https://albanyca.primegov.com/api/v2/PublicPortal/ListUpcomingMeetings",
    ]


def meetingAgendaUrl(mtg: MeetingRow) -> Url:
    """Return the portal URL of the meeting's 'HTML Agenda Packet' document.

    Raises:
        ValueError: if the meeting has no document with that template name.
    """
    for doc in mtg.get('documentList', []):
        if doc['templateName'] == 'HTML Agenda Packet':
            tid = doc['templateId']
            return f'https://albanyca.primegov.com/Portal/Meeting?meetingTemplateId={tid}'
    raise ValueError(f"no agenda doc found for {mtg['id']=}")


def extractMeetingText(mhtml: str) -> list[str]:
    """Return the non-blank text nodes inside ``div#meetingSection``.

    The chunks are returned as found (not stripped); only chunks that are
    entirely whitespace are dropped.

    Raises:
        IndexError: if the page has no ``div#meetingSection`` element.
    """
    el = fromstring(mhtml)
    m = el.cssselect('div#meetingSection')[0]
    for st in m.cssselect('style'):
        # Empty the stylesheet so CSS source is not indexed. keep_tail=True is
        # required: a plain clear() also discards the element's tail, i.e. any
        # agenda text that immediately follows the <style> tag.
        st.clear(keep_tail=True)
    return [
        chunk for chunk in m.xpath('.//text()', namespaces=el.nsmap)
        if chunk.strip()
    ]


def addMeeting(index: SearchIndex, mtg: MeetingRow) -> None:
    """Add one meeting's agenda text to ``index``.

    Meetings with no HTML agenda packet are skipped. The document title is
    the meeting's date followed by its title.
    """
    try:
        agendaUrl = meetingAgendaUrl(mtg)
    except ValueError:
        # No HTML agenda published for this meeting: nothing to index.
        pass
    else:
        html = getCityPermanent(agendaUrl)
        text = extractMeetingText(html)
        # todo group phrases
        index.addDoc(title=f'{mtg["date"]} {mtg["title"]}', content=text)

    if mtg.get('videoUrl'):
        # TODO: transcribe and index video...
        pass


@flow(persist_result=True)
def buildIndex() -> None:
    """Rebuild the search index at /tmp/scoindex from every listed meeting."""
    global log
    log = get_run_logger()
    # Share the run logger with the search_index module.
    search_index.log = log

    index = SearchIndex(Path('/tmp/scoindex'))
    for url in meetingListUrls():
        mtgs = cast(list[MeetingRow], getCityMutableJson(url))
        log.info(f'got {len(mtgs)=}')

        for mtg in mtgs:
            addMeeting(index, mtg)
    # NOTE(review): the collapsed diff lost indentation; the commit is assumed
    # to run once after all URLs are processed -- confirm against the repo.
    index.commit()


if __name__ == '__main__':
    buildIndex.serve()