diff scobot/index/build_index_flow.py @ 16:7a87ba2f00d9

reformat, fix some types, make more async
author drewp@bigasterisk.com
date Fri, 19 Jul 2024 00:49:38 -0700
parents 6ed25bcaaf1f
children
line wrap: on
line diff
--- a/scobot/index/build_index_flow.py	Fri Jul 19 00:30:47 2024 -0700
+++ b/scobot/index/build_index_flow.py	Fri Jul 19 00:49:38 2024 -0700
@@ -19,8 +19,10 @@
 @task()
 def meetingListUrls() -> Iterable[Url]:
     return [
-        "https://albanyca.primegov.com/api/v2/PublicPortal/ListArchivedMeetings?year=2024",
-        "https://albanyca.primegov.com/api/v2/PublicPortal/ListUpcomingMeetings",
+        Url("https://albanyca.primegov.com/api/v2/PublicPortal/ListArchivedMeetings?year=2024"
+            ),
+        Url("https://albanyca.primegov.com/api/v2/PublicPortal/ListUpcomingMeetings"
+            ),
     ]
 
 
@@ -28,7 +30,9 @@
     for doc in mtg.get('documentList', []):
         if doc['templateName'] == 'HTML Agenda Packet':
             tid = doc['templateId']
-            return f'https://albanyca.primegov.com/Portal/Meeting?meetingTemplateId={tid}'
+            return Url(
+                f'https://albanyca.primegov.com/Portal/Meeting?meetingTemplateId={tid}'
+            )
     raise ValueError(f"no agenda doc found for {mtg['id']=}")
 
 
@@ -64,16 +68,17 @@
                    phrase=sentence)
 
 
-def addMeeting(index: SearchIndex, mtg: MeetingRow):
+async def addMeeting(index: SearchIndex, mtg: MeetingRow):
     try:
         agendaUrl = meetingAgendaUrl(mtg)
     except ValueError:
         pass
     else:
-        html = getCityPermanent(agendaUrl)
+        html = await getCityPermanent(agendaUrl)
         texts = extractMeetingText(html)
-        for se in nltk.sent_tokenize(' '.join(texts)):
-            index.addDoc(sourceTitle=f'{mtg["date"]} {mtg["title"]}', phrase=se)
+        for sentence in nltk.sent_tokenize(' '.join(texts)):
+            index.addDoc(sourceTitle=f'{mtg["date"]} {mtg["title"]}',
+                         phrase=sentence)
 
     try:
         videoUrl = mtg['videoUrl']
@@ -86,19 +91,20 @@
 
 
 @flow(persist_result=True)
-def buildIndex():
+async def buildIndex():
     global log
     log = get_run_logger()
     scobot.index.access.log = log
 
     index = SearchIndex(Path('data/build/index0'))
     for url in meetingListUrls():
-        mtgs = cast(list[MeetingRow], getCityMutableJson(url))
+        mtgs = cast(list[MeetingRow], await getCityMutableJson(url))
         log.info(f'got {len(mtgs)=}')
 
         for mtg in mtgs:
-            addMeeting(index, mtg)
+            await addMeeting(index, mtg)
     index.commit()
+    # todo: kill search to restart it
 
 
 if __name__ == '__main__':