comparison scobot/index/build_index_flow.py @ 11:6622bacb0b84

first pass at reorg
author drewp@bigasterisk.com
date Thu, 11 Jul 2024 18:15:44 -0700
parents flow/build_index.py@13438795d896
children 403eff4a16c8
comparison
equal deleted inserted replaced
10:13438795d896 11:6622bacb0b84
1 import json
2 import re
3 from pathlib import Path
4 from typing import Iterable, cast
5
6 import lxml.html
7 import nltk
8 from prefect import flow, task
9 from prefect.logging import get_run_logger
10
11 import scobot.index.access
12 from scobot.index.access import SearchIndex
13 from scobot.index.download_tasks import getCityMutableJson, getCityPermanent
14 from scobot.local_types import MeetingRow, Url
15
# Module-wide logger handle. It stays None until buildIndex() replaces it
# with the logger returned by get_run_logger().
log = None
17
18
@task()
def meetingListUrls() -> Iterable[Url]:
    """Return the albanyca.primegov.com PublicPortal URLs that list meetings.

    Two endpoints are returned, in this order: the archived-meetings listing
    for year 2024, then the upcoming-meetings listing.
    """
    archived = "https://albanyca.primegov.com/api/v2/PublicPortal/ListArchivedMeetings?year=2024"
    upcoming = "https://albanyca.primegov.com/api/v2/PublicPortal/ListUpcomingMeetings"
    return [archived, upcoming]
25
26
def meetingAgendaUrl(mtg: MeetingRow) -> Url:
    """Build the portal URL of a meeting's HTML agenda packet.

    Scans mtg's 'documentList' (treated as empty when the key is absent) and
    uses the 'templateId' of the first entry whose 'templateName' is
    'HTML Agenda Packet'.

    Raises:
        ValueError: if no entry has that template name.
    """
    # Lazy on purpose: entries after the first match are never inspected.
    agendaDocs = (doc for doc in mtg.get('documentList', [])
                  if doc['templateName'] == 'HTML Agenda Packet')
    try:
        agendaDoc = next(agendaDocs)
    except StopIteration:
        raise ValueError(f"no agenda doc found for {mtg['id']=}") from None
    tid = agendaDoc['templateId']
    return f'https://albanyca.primegov.com/Portal/Meeting?meetingTemplateId={tid}'
33
34
def extractMeetingText(mhtml: str) -> list[str]:
    """Pull the non-blank text chunks out of a meeting page's HTML.

    Parses mhtml, takes the first <div id="meetingSection">, empties every
    <style> element under it so CSS source is not reported as meeting text,
    and returns each text node under that div that is not whitespace-only.
    The returned chunks are not trimmed.

    Raises:
        IndexError: if the document has no div#meetingSection.
    """
    el = lxml.html.fromstring(mhtml)
    m = el.cssselect('div#meetingSection')[0]
    for st in m.cssselect('style'):
        # keep_tail=True: a plain clear() also wipes the element's tail, i.e.
        # the page text that directly follows </style>, which is real content.
        st.clear(keep_tail=True)
    return [
        chunk for chunk in m.xpath('.//text()', namespaces=el.nsmap)
        if chunk.strip()
    ]
45
46
def phrasesFromFile(p: Path) -> Iterable[dict]:
    """Yield one index row per usable sentence in a saved meeting JSON file.

    The file at p must hold a JSON object with a 'phrases' list of strings
    and a 'mtg' object carrying 'id', 'date' and 'title'. The phrases are
    joined with spaces, split into sentences by nltk.sent_tokenize, and
    whitespace-normalized. Sentences shorter than 5 characters, or with no
    run of 5 consecutive word characters, are dropped.

    Each yielded dict has the keys id, title, sourceFile, posJson (always
    "[]") and phrase. The id is '<meeting id>_sentence<i>', where i numbers
    the yielded sentences from 0.
    """
    mtg = json.loads(p.read_text())
    print(f' has {len(mtg["phrases"])} phrases')
    text = ' '.join(mtg['phrases'])

    i = 0
    for sentence in nltk.sent_tokenize(text):
        # Collapse newlines/tabs/runs of spaces so each phrase is one line.
        sentence = re.sub(r'\s+', ' ', sentence).strip()
        if len(sentence) < 5:
            continue
        # Skip fragments (numbering, punctuation) with no 5-char word run.
        if not re.search(r'\w\w\w\w\w', sentence):
            continue

        yield dict(id=f"{mtg['mtg']['id']}_sentence{i}",
                   title=f"{mtg['mtg']['date']} {mtg['mtg']['title']}",
                   sourceFile=str(p),
                   posJson="[]",
                   phrase=sentence)
        # Advance once per emitted sentence so ids stay unique within the file.
        i += 1
65
66
def addMeeting(index: SearchIndex, mtg: MeetingRow):
    """Add one meeting's agenda text to the search index.

    If meetingAgendaUrl() finds an agenda for mtg, the agenda URL is passed
    to getCityPermanent(), the result is run through extractMeetingText(),
    and the text is added to index under the title '<date> <title>'.
    Meetings without an agenda document are skipped silently.

    The video branch is a placeholder: a non-empty mtg['videoUrl'] is
    detected but nothing is done with it yet.
    """
    agendaUrl = None
    try:
        agendaUrl = meetingAgendaUrl(mtg)
    except ValueError:
        # No agenda document on this meeting; leave agendaUrl unset.
        pass

    if agendaUrl is not None:
        html = getCityPermanent(agendaUrl)
        text = extractMeetingText(html)
        # todo group phrases phrasesFromFile
        index.addDoc(title=f'{mtg["date"]} {mtg["title"]}', content=text)

    try:
        hasVideo = bool(mtg['videoUrl'])
    except KeyError:
        hasVideo = False

    if hasVideo:
        '''transcribe and index video...'''
86
87
@flow(persist_result=True)
def buildIndex():
    """Flow entry point: index every meeting from every listing URL.

    Installs the run logger as this module's `log` and as
    scobot.index.access.log, opens a SearchIndex at /tmp/scoindex, fetches
    each listing from meetingListUrls() via getCityMutableJson(), passes
    every meeting row to addMeeting(), and commits the index.
    """
    global log
    # One logger object shared by this module and the index access module.
    log = scobot.index.access.log = get_run_logger()

    index = SearchIndex(Path('/tmp/scoindex'))
    for listingUrl in meetingListUrls():
        mtgs = cast(list[MeetingRow], getCityMutableJson(listingUrl))
        log.info(f'got {len(mtgs)=}')
        for mtg in mtgs:
            addMeeting(index, mtg)
    # NOTE(review): written as a single commit after all listings are
    # processed -- confirm against the original file's indentation.
    index.commit()
102
103
if __name__ == '__main__':
    # Executed as a script (not imported): call the flow's serve() method.
    buildIndex.serve()