comparison flow/build_index.py @ 10:13438795d896

rewrite with prefect flows and whoosh search, but it's in a nested pdm env
author drewp@bigasterisk.com
date Thu, 11 Jul 2024 17:35:31 -0700
parents
children
comparison
equal deleted inserted replaced
9:d1b54241a731 10:13438795d896
1 from pathlib import Path
2 from typing import Iterable, cast
3
4 import search_index
5 from download import getCityMutableJson, getCityPermanent
6 from local_types import MeetingRow, Url
7 from lxml.html import fromstring
8 from prefect import flow, task
9 from prefect.logging import get_run_logger
10 from search_index import SearchIndex
11
# Module-level logger slot; buildIndex() rebinds it to the run logger when the flow starts.
log = None
13
14
@task()
def meetingListUrls() -> Iterable[Url]:
    """Return the PrimeGov portal API URLs whose meeting lists get indexed.

    Two endpoints on albanyca.primegov.com are returned, in this order: the
    archived-meetings listing (the year is fixed to 2024 in the query string)
    and the upcoming-meetings listing.
    """
    archivedMeetings = "https://albanyca.primegov.com/api/v2/PublicPortal/ListArchivedMeetings?year=2024"
    upcomingMeetings = "https://albanyca.primegov.com/api/v2/PublicPortal/ListUpcomingMeetings"
    return [archivedMeetings, upcomingMeetings]
21
22
def meetingAgendaUrl(mtg: MeetingRow) -> Url:
    """Return the portal URL of a meeting's HTML agenda packet.

    Scans ``mtg['documentList']`` for the first document whose
    ``templateName`` is ``'HTML Agenda Packet'`` and builds the meeting page
    URL from that document's ``templateId``.

    Raises:
        ValueError: if no such document exists, including when
            ``documentList`` is missing, empty, or null.
    """
    # `or []` also covers an explicit null documentList, which the default
    # argument of .get() would not replace.
    for doc in mtg.get('documentList') or []:
        # .get(): a document without a templateName is simply not a match,
        # so the caller still sees the documented ValueError, not a KeyError.
        if doc.get('templateName') == 'HTML Agenda Packet':
            tid = doc['templateId']
            return f'https://albanyca.primegov.com/Portal/Meeting?meetingTemplateId={tid}'
    raise ValueError(f"no agenda doc found for {mtg['id']=}")
29
30
def extractMeetingText(mhtml: str) -> list[str]:
    """Extract the non-blank text chunks from a meeting agenda page.

    Parses ``mhtml``, takes the first ``div#meetingSection`` element, empties
    any ``<style>`` elements inside it so their CSS is not indexed, and
    returns every text node under that div that is not whitespace-only. The
    chunks themselves are returned unstripped.

    Raises:
        IndexError: if the page contains no ``div#meetingSection``.
    """
    el = fromstring(mhtml)
    m = el.cssselect('div#meetingSection')[0]
    for st in m.cssselect('style'):
        # Remove the CSS but keep the tail: a plain clear() also resets the
        # element's tail, which would drop page text that directly follows
        # the closing </style> tag.
        st.clear(keep_tail=True)
    return [
        chunk for chunk in m.xpath('.//text()', namespaces=el.nsmap)
        if chunk.strip()
    ]
41
42
def addMeeting(index: SearchIndex, mtg: MeetingRow):
    """Add one meeting's agenda text to the search index.

    When the meeting has an HTML agenda packet, the agenda page is fetched
    with getCityPermanent, reduced to text chunks, and added to ``index``
    under a title of the form ``"<date> <title>"``. Meetings without an
    agenda packet are skipped silently. Video handling is still a stub.
    """
    try:
        agendaUrl = meetingAgendaUrl(mtg)
    except ValueError:
        # No agenda packet for this meeting, so there is nothing to index.
        agendaUrl = None

    if agendaUrl is not None:
        pageHtml = getCityPermanent(agendaUrl)
        chunks = extractMeetingText(pageHtml)
        # todo group phrases
        docTitle = f'{mtg["date"]} {mtg["title"]}'
        index.addDoc(title=docTitle, content=chunks)

    if mtg.get('videoUrl'):
        # todo: transcribe and index video...
        pass
62
63
@flow(persist_result=True)
def buildIndex():
    """Prefect flow that builds the meeting search index.

    Installs the flow's run logger as this module's ``log`` and as
    ``search_index.log``, opens a SearchIndex at ``/tmp/scoindex``, downloads
    each meeting list from meetingListUrls(), passes every meeting to
    addMeeting(), and commits the index once all lists are processed.
    """
    global log
    runLogger = get_run_logger()
    log = runLogger
    search_index.log = runLogger

    searchIndex = SearchIndex(Path('/tmp/scoindex'))
    for listUrl in meetingListUrls():
        # `mtgs` keeps its name: the `=` f-string specifier prints it in the log line.
        mtgs = cast(list[MeetingRow], getCityMutableJson(listUrl))
        runLogger.info(f'got {len(mtgs)=}')

        for meeting in mtgs:
            addMeeting(searchIndex, meeting)
    searchIndex.commit()
78
79
# Script entry point: running this module directly calls serve() on the buildIndex flow.
if __name__ == '__main__':
    buildIndex.serve()