Mercurial > code > home > repos > sco-bot
diff scobot/index/build_index_flow.py @ 16:7a87ba2f00d9
reformat, fix some types, make more async
| | |
|---|---|
| author | drewp@bigasterisk.com |
| date | Fri, 19 Jul 2024 00:49:38 -0700 |
| parents | 6ed25bcaaf1f |
| children | |
line wrap: on
line diff
--- a/scobot/index/build_index_flow.py Fri Jul 19 00:30:47 2024 -0700 +++ b/scobot/index/build_index_flow.py Fri Jul 19 00:49:38 2024 -0700 @@ -19,8 +19,10 @@ @task() def meetingListUrls() -> Iterable[Url]: return [ - "https://albanyca.primegov.com/api/v2/PublicPortal/ListArchivedMeetings?year=2024", - "https://albanyca.primegov.com/api/v2/PublicPortal/ListUpcomingMeetings", + Url("https://albanyca.primegov.com/api/v2/PublicPortal/ListArchivedMeetings?year=2024" + ), + Url("https://albanyca.primegov.com/api/v2/PublicPortal/ListUpcomingMeetings" + ), ] @@ -28,7 +30,9 @@ for doc in mtg.get('documentList', []): if doc['templateName'] == 'HTML Agenda Packet': tid = doc['templateId'] - return f'https://albanyca.primegov.com/Portal/Meeting?meetingTemplateId={tid}' + return Url( + f'https://albanyca.primegov.com/Portal/Meeting?meetingTemplateId={tid}' + ) raise ValueError(f"no agenda doc found for {mtg['id']=}") @@ -64,16 +68,17 @@ phrase=sentence) -def addMeeting(index: SearchIndex, mtg: MeetingRow): +async def addMeeting(index: SearchIndex, mtg: MeetingRow): try: agendaUrl = meetingAgendaUrl(mtg) except ValueError: pass else: - html = getCityPermanent(agendaUrl) + html = await getCityPermanent(agendaUrl) texts = extractMeetingText(html) - for se in nltk.sent_tokenize(' '.join(texts)): - index.addDoc(sourceTitle=f'{mtg["date"]} {mtg["title"]}', phrase=se) + for sentence in nltk.sent_tokenize(' '.join(texts)): + index.addDoc(sourceTitle=f'{mtg["date"]} {mtg["title"]}', + phrase=sentence) try: videoUrl = mtg['videoUrl'] @@ -86,19 +91,20 @@ @flow(persist_result=True) -def buildIndex(): +async def buildIndex(): global log log = get_run_logger() scobot.index.access.log = log index = SearchIndex(Path('data/build/index0')) for url in meetingListUrls(): - mtgs = cast(list[MeetingRow], getCityMutableJson(url)) + mtgs = cast(list[MeetingRow], await getCityMutableJson(url)) log.info(f'got {len(mtgs)=}') for mtg in mtgs: - addMeeting(index, mtg) + await addMeeting(index, mtg) 
index.commit() + # todo: kill search to restart it if __name__ == '__main__':