Mercurial > code > home > repos > sco-bot
comparison scobot/index/build_index_flow.py @ 16:7a87ba2f00d9
reformat, fix some types, make more async
author | drewp@bigasterisk.com |
---|---|
date | Fri, 19 Jul 2024 00:49:38 -0700 |
parents | 6ed25bcaaf1f |
children |
comparison
equal
deleted
inserted
replaced
15:6ed25bcaaf1f | 16:7a87ba2f00d9 |
---|---|
17 | 17 |
18 | 18 |
19 @task() | 19 @task() |
20 def meetingListUrls() -> Iterable[Url]: | 20 def meetingListUrls() -> Iterable[Url]: |
21 return [ | 21 return [ |
22 "https://albanyca.primegov.com/api/v2/PublicPortal/ListArchivedMeetings?year=2024", | 22 Url("https://albanyca.primegov.com/api/v2/PublicPortal/ListArchivedMeetings?year=2024" |
23 "https://albanyca.primegov.com/api/v2/PublicPortal/ListUpcomingMeetings", | 23 ), |
| | 24 Url("https://albanyca.primegov.com/api/v2/PublicPortal/ListUpcomingMeetings" |
| | 25 ), |
24 ] | 26 ] |
25 | 27 |
26 | 28 |
27 def meetingAgendaUrl(mtg: MeetingRow) -> Url: | 29 def meetingAgendaUrl(mtg: MeetingRow) -> Url: |
28 for doc in mtg.get('documentList', []): | 30 for doc in mtg.get('documentList', []): |
29 if doc['templateName'] == 'HTML Agenda Packet': | 31 if doc['templateName'] == 'HTML Agenda Packet': |
30 tid = doc['templateId'] | 32 tid = doc['templateId'] |
31 return f'https://albanyca.primegov.com/Portal/Meeting?meetingTemplateId={tid}' | 33 return Url( |
| | 34 f'https://albanyca.primegov.com/Portal/Meeting?meetingTemplateId={tid}' |
| | 35 ) |
32 raise ValueError(f"no agenda doc found for {mtg['id']=}") | 36 raise ValueError(f"no agenda doc found for {mtg['id']=}") |
33 | 37 |
34 | 38 |
35 def extractMeetingText(mhtml: str) -> list[str]: | 39 def extractMeetingText(mhtml: str) -> list[str]: |
36 el = lxml.html.fromstring(mhtml) | 40 el = lxml.html.fromstring(mhtml) |
62 sourceFile=str(p), | 66 sourceFile=str(p), |
63 posJson="[]", | 67 posJson="[]", |
64 phrase=sentence) | 68 phrase=sentence) |
65 | 69 |
66 | 70 |
67 def addMeeting(index: SearchIndex, mtg: MeetingRow): | 71 async def addMeeting(index: SearchIndex, mtg: MeetingRow): |
68 try: | 72 try: |
69 agendaUrl = meetingAgendaUrl(mtg) | 73 agendaUrl = meetingAgendaUrl(mtg) |
70 except ValueError: | 74 except ValueError: |
71 pass | 75 pass |
72 else: | 76 else: |
73 html = getCityPermanent(agendaUrl) | 77 html = await getCityPermanent(agendaUrl) |
74 texts = extractMeetingText(html) | 78 texts = extractMeetingText(html) |
75 for se in nltk.sent_tokenize(' '.join(texts)): | 79 for sentence in nltk.sent_tokenize(' '.join(texts)): |
76 index.addDoc(sourceTitle=f'{mtg["date"]} {mtg["title"]}', phrase=se) | 80 index.addDoc(sourceTitle=f'{mtg["date"]} {mtg["title"]}', |
| | 81 phrase=sentence) |
77 | 82 |
78 try: | 83 try: |
79 videoUrl = mtg['videoUrl'] | 84 videoUrl = mtg['videoUrl'] |
80 if not videoUrl: | 85 if not videoUrl: |
81 raise KeyError | 86 raise KeyError |
84 else: | 89 else: |
85 '''transcribe and index video...''' | 90 '''transcribe and index video...''' |
86 | 91 |
87 | 92 |
88 @flow(persist_result=True) | 93 @flow(persist_result=True) |
89 def buildIndex(): | 94 async def buildIndex(): |
90 global log | 95 global log |
91 log = get_run_logger() | 96 log = get_run_logger() |
92 scobot.index.access.log = log | 97 scobot.index.access.log = log |
93 | 98 |
94 index = SearchIndex(Path('data/build/index0')) | 99 index = SearchIndex(Path('data/build/index0')) |
95 for url in meetingListUrls(): | 100 for url in meetingListUrls(): |
96 mtgs = cast(list[MeetingRow], getCityMutableJson(url)) | 101 mtgs = cast(list[MeetingRow], await getCityMutableJson(url)) |
97 log.info(f'got {len(mtgs)=}') | 102 log.info(f'got {len(mtgs)=}') |
98 | 103 |
99 for mtg in mtgs: | 104 for mtg in mtgs: |
100 addMeeting(index, mtg) | 105 await addMeeting(index, mtg) |
101 index.commit() | 106 index.commit() |
| | 107 # todo: kill search to restart it |
102 | 108 |
103 | 109 |
104 if __name__ == '__main__': | 110 if __name__ == '__main__': |
105 buildIndex.serve(name='buildIndex') | 111 buildIndex.serve(name='buildIndex') |