Mercurial > code > home > repos > sco-bot
view flow/build_index.py @ 10:13438795d896
rewrite with prefect flows and whoosh search, but it's in a nested pdm env
author | drewp@bigasterisk.com |
---|---|
date | Thu, 11 Jul 2024 17:35:31 -0700 |
parents | |
children |
line wrap: on
line source
"""Prefect flow that indexes meeting agendas from albanyca.primegov.com."""
from pathlib import Path
from typing import Iterable, cast

import search_index
from download import getCityMutableJson, getCityPermanent
from local_types import MeetingRow, Url
from lxml.html import fromstring
from prefect import flow, task
from prefect.logging import get_run_logger
from search_index import SearchIndex

# Placeholder; buildIndex() rebinds this to the Prefect run logger.
log = None


@task()
def meetingListUrls() -> Iterable[Url]:
    """Return the PrimeGov API URLs whose responses list the meetings to index.

    NOTE(review): the archive URL hard-codes ``year=2024``; presumably
    archives from other years are never fetched -- confirm that is intended.
    """
    return [
        "https://albanyca.primegov.com/api/v2/PublicPortal/ListArchivedMeetings?year=2024",
        "https://albanyca.primegov.com/api/v2/PublicPortal/ListUpcomingMeetings",
    ]


def meetingAgendaUrl(mtg: MeetingRow) -> Url:
    """Return the portal URL of the meeting's HTML agenda packet.

    The first entry of ``mtg['documentList']`` whose ``templateName`` is
    'HTML Agenda Packet' supplies the ``templateId`` used in the URL.

    Raises:
        ValueError: if no such document exists (including when the meeting
            has no 'documentList' key at all).
    """
    for doc in mtg.get('documentList', []):
        if doc['templateName'] == 'HTML Agenda Packet':
            tid = doc['templateId']
            return f'https://albanyca.primegov.com/Portal/Meeting?meetingTemplateId={tid}'
    raise ValueError(f"no agenda doc found for {mtg['id']=}")


def extractMeetingText(mhtml: str) -> list[str]:
    """Return the non-blank text chunks found inside ``div#meetingSection``.

    Chunks are returned unstripped; chunks consisting only of whitespace are
    dropped. The contents of <style> elements are excluded.

    Raises:
        IndexError: if the document has no ``div#meetingSection`` element.
    """
    el = fromstring(mhtml)
    m = el.cssselect('div#meetingSection')[0]
    for st in m.cssselect('style'):
        # Remove the CSS so it is not indexed, but keep the tail: a plain
        # clear() also resets .tail, which would silently discard any
        # document text that directly follows the closing </style> tag.
        st.clear(keep_tail=True)
    return [
        chunk for chunk in m.xpath('.//text()', namespaces=el.nsmap)
        if chunk.strip()
    ]


def addMeeting(index: SearchIndex, mtg: MeetingRow):
    """Add one meeting's agenda text to ``index``.

    Meetings without an HTML agenda packet are skipped without error.
    Video transcription is not implemented yet, so 'videoUrl' is only
    checked, never used.
    """
    try:
        agendaUrl = meetingAgendaUrl(mtg)
    except ValueError:
        # No agenda document for this meeting: nothing to index.
        pass
    else:
        html = getCityPermanent(agendaUrl)
        text = extractMeetingText(html)
        # todo group phrases
        index.addDoc(title=f'{mtg["date"]} {mtg["title"]}', content=text)

    # A missing key and an empty/None value both mean "no video".
    if mtg.get('videoUrl'):
        # TODO: transcribe and index video...
        pass


@flow(persist_result=True)
def buildIndex():
    """Fetch each meeting listing, index every meeting in it, then commit.

    Also installs the Prefect run logger as this module's ``log`` and as
    ``search_index.log``. The index is opened at /tmp/scoindex.
    """
    global log
    log = get_run_logger()
    search_index.log = log

    index = SearchIndex(Path('/tmp/scoindex'))
    for url in meetingListUrls():
        mtgs = cast(list[MeetingRow], getCityMutableJson(url))
        log.info(f'got {len(mtgs)=}')

        for mtg in mtgs:
            addMeeting(index, mtg)
    index.commit()


if __name__ == '__main__':
    buildIndex.serve()