Mercurial > code > home > repos > sco-bot
diff scobot/index/build_index_flow.py @ 11:6622bacb0b84
first pass at reorg
author | drewp@bigasterisk.com |
---|---|
date | Thu, 11 Jul 2024 18:15:44 -0700 |
parents | flow/build_index.py@13438795d896 |
children | 403eff4a16c8 |
line wrap: on
line diff
"""Prefect flow that builds the sco-bot search index from city meeting data.

Meeting listings are fetched from the albanyca.primegov.com public portal
API; for each meeting that has an HTML agenda packet, the agenda text is
extracted and added to a SearchIndex.
"""
import json
import re
from pathlib import Path
from typing import Iterable, cast

import lxml.html
import nltk
from prefect import flow, task
from prefect.logging import get_run_logger

import scobot.index.access
from scobot.index.access import SearchIndex
from scobot.index.download_tasks import getCityMutableJson, getCityPermanent
from scobot.local_types import MeetingRow, Url

# Replaced by the Prefect run logger when buildIndex() starts; None until then.
log = None

# Compiled once at import time; both are applied to every sentence.
_WHITESPACE_RUN = re.compile(r'\s+')
_FIVE_WORD_CHARS = re.compile(r'\w\w\w\w\w')


@task()
def meetingListUrls() -> Iterable[Url]:
    """Return the portal API URLs that list archived and upcoming meetings."""
    return [
        "https://albanyca.primegov.com/api/v2/PublicPortal/ListArchivedMeetings?year=2024",
        "https://albanyca.primegov.com/api/v2/PublicPortal/ListUpcomingMeetings",
    ]


def meetingAgendaUrl(mtg: MeetingRow) -> Url:
    """Return the portal URL of the meeting's HTML agenda packet.

    Raises:
        ValueError: if no document in the meeting's 'documentList' has the
            template name 'HTML Agenda Packet'.
    """
    # `or []` also covers a row whose documentList is present but null.
    for doc in mtg.get('documentList') or []:
        if doc['templateName'] == 'HTML Agenda Packet':
            tid = doc['templateId']
            return f'https://albanyca.primegov.com/Portal/Meeting?meetingTemplateId={tid}'
    raise ValueError(f"no agenda doc found for {mtg['id']=}")


def extractMeetingText(mhtml: str) -> list[str]:
    """Return the non-blank text chunks inside div#meetingSection of the page.

    The contents of <style> elements are excluded.

    Raises:
        IndexError: if the page has no div#meetingSection element.
    """
    el = lxml.html.fromstring(mhtml)
    m = el.cssselect('div#meetingSection')[0]
    for st in m.cssselect('style'):
        # drop_tree() removes the element and its CSS text but keeps the tail
        # text that follows it; Element.clear() would also discard the tail.
        st.drop_tree()
    return [
        chunk for chunk in m.xpath('.//text()', namespaces=el.nsmap)
        if chunk.strip()
    ]


def phrasesFromFile(p: Path) -> Iterable[dict]:
    """Yield one index document per usable sentence in a meeting JSON file.

    The file must contain a 'phrases' list of strings and a 'mtg' object with
    'id', 'date' and 'title' keys. The phrases are joined, split into
    sentences with nltk, whitespace-normalized, and filtered.

    Each yielded dict has the keys id, title, sourceFile, posJson and phrase.
    The id embeds the sentence's position in the tokenized text, so ids are
    unique within one file.
    """
    mtg = json.loads(p.read_text())
    print(f' has {len(mtg["phrases"])} phrases')
    text = ' '.join(mtg['phrases'])

    for i, sentence in enumerate(nltk.sent_tokenize(text)):
        sentence = _WHITESPACE_RUN.sub(' ', sentence).strip()
        # Requiring five consecutive word characters drops fragments such as
        # bare numbering or punctuation; it also implies len(sentence) >= 5.
        if not _FIVE_WORD_CHARS.search(sentence):
            continue

        yield dict(id=f"{mtg['mtg']['id']}_sentence{i}",
                   title=f"{mtg['mtg']['date']} {mtg['mtg']['title']}",
                   sourceFile=str(p),
                   posJson="[]",
                   phrase=sentence)


def addMeeting(index: SearchIndex, mtg: MeetingRow):
    """Add the meeting's agenda text to the index, if it has an agenda.

    Meetings without an HTML agenda packet are skipped rather than treated as
    errors. Video transcription is not implemented yet.
    """
    try:
        agendaUrl = meetingAgendaUrl(mtg)
    except ValueError as err:
        # Best effort: not every meeting has a published agenda.
        if log is not None:
            log.info(f'skipping agenda: {err}')
    else:
        html = getCityPermanent(agendaUrl)
        text = extractMeetingText(html)
        # todo group phrases phrasesFromFile
        index.addDoc(title=f'{mtg["date"]} {mtg["title"]}', content=text)

    if mtg.get('videoUrl'):
        # TODO: transcribe and index video...
        pass


@flow(persist_result=True)
def buildIndex():
    """Fetch every meeting listing and index each meeting's agenda.

    Installs the Prefect run logger as this module's `log` and as
    scobot.index.access.log, then writes the index under /tmp/scoindex.
    """
    global log
    log = get_run_logger()
    scobot.index.access.log = log

    index = SearchIndex(Path('/tmp/scoindex'))
    for url in meetingListUrls():
        mtgs = cast(list[MeetingRow], getCityMutableJson(url))
        log.info(f'got {len(mtgs)=}')

        for mtg in mtgs:
            addMeeting(index, mtg)
    # NOTE(review): committing once after all listings are processed; the
    # collapsed diff does not show the original indentation of this call.
    index.commit()


if __name__ == '__main__':
    buildIndex.serve()