comparison scobot/index/build_index_flow.py @ 16:7a87ba2f00d9

reformat, fix some types, make more async
author drewp@bigasterisk.com
date Fri, 19 Jul 2024 00:49:38 -0700
parents 6ed25bcaaf1f
children
comparison
equal deleted inserted replaced
15:6ed25bcaaf1f 16:7a87ba2f00d9
17 17
18 18
19 @task() 19 @task()
20 def meetingListUrls() -> Iterable[Url]: 20 def meetingListUrls() -> Iterable[Url]:
21 return [ 21 return [
22 "https://albanyca.primegov.com/api/v2/PublicPortal/ListArchivedMeetings?year=2024", 22 Url("https://albanyca.primegov.com/api/v2/PublicPortal/ListArchivedMeetings?year=2024"
23 "https://albanyca.primegov.com/api/v2/PublicPortal/ListUpcomingMeetings", 23 ),
24 Url("https://albanyca.primegov.com/api/v2/PublicPortal/ListUpcomingMeetings"
25 ),
24 ] 26 ]
25 27
26 28
27 def meetingAgendaUrl(mtg: MeetingRow) -> Url: 29 def meetingAgendaUrl(mtg: MeetingRow) -> Url:
28 for doc in mtg.get('documentList', []): 30 for doc in mtg.get('documentList', []):
29 if doc['templateName'] == 'HTML Agenda Packet': 31 if doc['templateName'] == 'HTML Agenda Packet':
30 tid = doc['templateId'] 32 tid = doc['templateId']
31 return f'https://albanyca.primegov.com/Portal/Meeting?meetingTemplateId={tid}' 33 return Url(
34 f'https://albanyca.primegov.com/Portal/Meeting?meetingTemplateId={tid}'
35 )
32 raise ValueError(f"no agenda doc found for {mtg['id']=}") 36 raise ValueError(f"no agenda doc found for {mtg['id']=}")
33 37
34 38
35 def extractMeetingText(mhtml: str) -> list[str]: 39 def extractMeetingText(mhtml: str) -> list[str]:
36 el = lxml.html.fromstring(mhtml) 40 el = lxml.html.fromstring(mhtml)
62 sourceFile=str(p), 66 sourceFile=str(p),
63 posJson="[]", 67 posJson="[]",
64 phrase=sentence) 68 phrase=sentence)
65 69
66 70
67 def addMeeting(index: SearchIndex, mtg: MeetingRow): 71 async def addMeeting(index: SearchIndex, mtg: MeetingRow):
68 try: 72 try:
69 agendaUrl = meetingAgendaUrl(mtg) 73 agendaUrl = meetingAgendaUrl(mtg)
70 except ValueError: 74 except ValueError:
71 pass 75 pass
72 else: 76 else:
73 html = getCityPermanent(agendaUrl) 77 html = await getCityPermanent(agendaUrl)
74 texts = extractMeetingText(html) 78 texts = extractMeetingText(html)
75 for se in nltk.sent_tokenize(' '.join(texts)): 79 for sentence in nltk.sent_tokenize(' '.join(texts)):
76 index.addDoc(sourceTitle=f'{mtg["date"]} {mtg["title"]}', phrase=se) 80 index.addDoc(sourceTitle=f'{mtg["date"]} {mtg["title"]}',
81 phrase=sentence)
77 82
78 try: 83 try:
79 videoUrl = mtg['videoUrl'] 84 videoUrl = mtg['videoUrl']
80 if not videoUrl: 85 if not videoUrl:
81 raise KeyError 86 raise KeyError
84 else: 89 else:
85 '''transcribe and index video...''' 90 '''transcribe and index video...'''
86 91
87 92
88 @flow(persist_result=True) 93 @flow(persist_result=True)
89 def buildIndex(): 94 async def buildIndex():
90 global log 95 global log
91 log = get_run_logger() 96 log = get_run_logger()
92 scobot.index.access.log = log 97 scobot.index.access.log = log
93 98
94 index = SearchIndex(Path('data/build/index0')) 99 index = SearchIndex(Path('data/build/index0'))
95 for url in meetingListUrls(): 100 for url in meetingListUrls():
96 mtgs = cast(list[MeetingRow], getCityMutableJson(url)) 101 mtgs = cast(list[MeetingRow], await getCityMutableJson(url))
97 log.info(f'got {len(mtgs)=}') 102 log.info(f'got {len(mtgs)=}')
98 103
99 for mtg in mtgs: 104 for mtg in mtgs:
100 addMeeting(index, mtg) 105 await addMeeting(index, mtg)
101 index.commit() 106 index.commit()
107 # todo: kill search to restart it
102 108
103 109
104 if __name__ == '__main__': 110 if __name__ == '__main__':
105 buildIndex.serve(name='buildIndex') 111 buildIndex.serve(name='buildIndex')