Mercurial > code > home > repos > sco-bot
comparison flow/build_index.py @ 10:13438795d896
rewrite with prefect flows and whoosh search, but it's in a nested pdm env
author | drewp@bigasterisk.com |
---|---|
date | Thu, 11 Jul 2024 17:35:31 -0700 |
parents | |
children |
comparison
equal
deleted
inserted
replaced
9:d1b54241a731 | 10:13438795d896 |
---|---|
1 from pathlib import Path | |
2 from typing import Iterable, cast | |
3 | |
4 import search_index | |
5 from download import getCityMutableJson, getCityPermanent | |
6 from local_types import MeetingRow, Url | |
7 from lxml.html import fromstring | |
8 from prefect import flow, task | |
9 from prefect.logging import get_run_logger | |
10 from search_index import SearchIndex | |
11 | |
# Module-wide logger handle. It is None until buildIndex() runs and rebinds
# it to the Prefect run logger.
log = None
13 | |
14 | |
@task()
def meetingListUrls() -> Iterable[Url]:
    """Return the albanyca.primegov.com API URLs that list meetings.

    Two fixed endpoints are returned, in this order: the archived-meetings
    listing for year 2024, then the upcoming-meetings listing.
    """
    archivedUrl = "https://albanyca.primegov.com/api/v2/PublicPortal/ListArchivedMeetings?year=2024"
    upcomingUrl = "https://albanyca.primegov.com/api/v2/PublicPortal/ListUpcomingMeetings"
    return [archivedUrl, upcomingUrl]
21 | |
22 | |
def meetingAgendaUrl(mtg: MeetingRow) -> Url:
    """Return the portal URL of the meeting's HTML agenda packet.

    Scans ``mtg['documentList']`` for the first document whose
    ``templateName`` is ``'HTML Agenda Packet'`` and builds the URL from that
    document's ``templateId``.

    A missing or null ``documentList`` is treated as an empty list, and
    documents that carry no ``templateName`` are skipped, so every
    "no agenda" situation ends in the same ``ValueError``.

    Raises:
        ValueError: if the meeting has no 'HTML Agenda Packet' document.
    """
    # `or []` covers a key that is present but holds None, which
    # .get(key, []) alone would pass straight to the for loop.
    for doc in mtg.get('documentList') or []:
        if doc.get('templateName') == 'HTML Agenda Packet':
            tid = doc['templateId']
            return f'https://albanyca.primegov.com/Portal/Meeting?meetingTemplateId={tid}'
    raise ValueError(f"no agenda doc found for {mtg['id']=}")
29 | |
30 | |
def extractMeetingText(mhtml: str) -> list[str]:
    """Return the non-blank text chunks inside the page's ``div#meetingSection``.

    The HTML is parsed, the first ``div#meetingSection`` element is selected,
    the contents of any ``<style>`` elements below it are emptied, and every
    remaining text node that is not pure whitespace is returned in document
    order. Chunks are returned as found (not stripped).

    Raises:
        IndexError: if the page contains no ``div#meetingSection`` element.
    """
    el = fromstring(mhtml)
    m = el.cssselect('div#meetingSection')[0]
    for st in m.cssselect('style'):
        # A plain clear() also resets the element's tail, which would drop
        # any real page text that directly follows the <style> tag.
        st.clear(keep_tail=True)
    return [
        chunk for chunk in m.xpath('.//text()', namespaces=el.nsmap)
        if chunk.strip()
    ]
41 | |
42 | |
def addMeeting(index: SearchIndex, mtg: MeetingRow):
    """Add one meeting's agenda text to the search index, when it has one.

    If ``meetingAgendaUrl`` finds an agenda for ``mtg``, the agenda page is
    fetched with ``getCityPermanent``, its text is extracted, and a document
    titled ``"<date> <title>"`` is added to ``index``. Meetings without an
    agenda are skipped silently. Video handling is not implemented yet.
    """
    try:
        agendaUrl = meetingAgendaUrl(mtg)
    except ValueError:
        # Deliberate best-effort: no agenda document means nothing to index.
        pass
    else:
        html = getCityPermanent(agendaUrl)
        text = extractMeetingText(html)
        # todo group phrases
        index.addDoc(title=f'{mtg["date"]} {mtg["title"]}', content=text)

    # A missing key and an empty value both mean "no video".
    videoUrl = mtg.get('videoUrl')
    if videoUrl:
        # TODO: transcribe and index video
        pass
62 | |
63 | |
@flow(persist_result=True)
def buildIndex():
    """Prefect flow: rebuild the search index at /tmp/scoindex.

    Binds the Prefect run logger to this module's ``log`` and to
    ``search_index.log``, then downloads each meeting list returned by
    ``meetingListUrls`` and passes every meeting in it to ``addMeeting``.
    """
    global log
    runLogger = get_run_logger()
    log = runLogger
    search_index.log = runLogger

    searchIndex = SearchIndex(Path('/tmp/scoindex'))
    for listUrl in meetingListUrls():
        # The local must stay named `mtgs`: the name is part of the log text.
        mtgs = cast(list[MeetingRow], getCityMutableJson(listUrl))
        log.info(f'got {len(mtgs)=}')

        for meeting in mtgs:
            addMeeting(searchIndex, meeting)
    # NOTE(review): commit is placed once, after all lists are processed;
    # the original indentation of this call was not recoverable - confirm.
    searchIndex.commit()
78 | |
79 | |
if __name__ == '__main__':
    # When run as a script, call the flow's serve() entry point.
    buildIndex.serve()