Mercurial > code > home > repos > sco-bot
comparison scobot/index/build_index_flow.py @ 11:6622bacb0b84
first pass at reorg
author | drewp@bigasterisk.com |
---|---|
date | Thu, 11 Jul 2024 18:15:44 -0700 |
parents | flow/build_index.py@13438795d896 |
children | 403eff4a16c8 |
comparison
equal
deleted
inserted
replaced
10:13438795d896 | 11:6622bacb0b84 |
---|---|
1 import json | |
2 import re | |
3 from pathlib import Path | |
4 from typing import Iterable, cast | |
5 | |
6 import lxml.html | |
7 import nltk | |
8 from prefect import flow, task | |
9 from prefect.logging import get_run_logger | |
10 | |
11 import scobot.index.access | |
12 from scobot.index.access import SearchIndex | |
13 from scobot.index.download_tasks import getCityMutableJson, getCityPermanent | |
14 from scobot.local_types import MeetingRow, Url | |
15 | |
# Module-level logger slot; it stays None until buildIndex() stores the
# Prefect run logger here.
log = None
17 | |
18 | |
# Prefect task returning the albanyca.primegov.com API endpoints that list
# meetings.
@task()
def meetingListUrls() -> Iterable[Url]:
    # The archived listing is requested for one hard-coded year (2024); the
    # upcoming listing takes no query parameters.
    archived = "https://albanyca.primegov.com/api/v2/PublicPortal/ListArchivedMeetings?year=2024"
    upcoming = "https://albanyca.primegov.com/api/v2/PublicPortal/ListUpcomingMeetings"
    return [archived, upcoming]
25 | |
26 | |
def meetingAgendaUrl(mtg: MeetingRow) -> Url:
    """Return the portal URL of the meeting's HTML agenda packet.

    The first entry of ``mtg['documentList']`` whose ``templateName`` is
    'HTML Agenda Packet' is used; a missing ``documentList`` is treated as
    an empty list.

    Raises:
        ValueError: if no entry in the document list is an HTML agenda packet.
    """
    packets = (
        candidate
        for candidate in mtg.get('documentList', [])
        if candidate['templateName'] == 'HTML Agenda Packet'
    )
    packet = next(packets, None)
    if packet is None:
        raise ValueError(f"no agenda doc found for {mtg['id']=}")

    tid = packet['templateId']
    return f'https://albanyca.primegov.com/Portal/Meeting?meetingTemplateId={tid}'
33 | |
34 | |
def extractMeetingText(mhtml: str) -> list[str]:
    """Return the non-blank text chunks inside the page's meeting section.

    Args:
        mhtml: HTML source of a meeting agenda page.

    Returns:
        Every text node under ``div#meetingSection`` that contains something
        other than whitespace. Chunks are returned as found (not stripped).

    Raises:
        IndexError: if the page has no ``div#meetingSection`` element.
    """
    el = lxml.html.fromstring(mhtml)
    m = el.cssselect('div#meetingSection')[0]

    # Remove <style> elements so their CSS is not returned as meeting text.
    # drop_tree() deletes the element but keeps the tail text that follows
    # it; clear() would reset that tail as well and silently lose any text
    # placed directly after a <style> block.
    for st in m.cssselect('style'):
        st.drop_tree()

    return [
        chunk for chunk in m.xpath('.//text()', namespaces=el.nsmap)
        if chunk.strip()
    ]
45 | |
46 | |
def phrasesFromFile(p: Path) -> Iterable[dict]:
    """Yield one record per usable sentence found in a meeting JSON file.

    The file at *p* is read as JSON and must provide a ``phrases`` list of
    strings plus a ``mtg`` object with ``id``, ``date`` and ``title``. The
    phrases are joined with spaces, split into sentences with
    ``nltk.sent_tokenize``, whitespace-normalized, and filtered: a sentence
    is kept only if it contains a run of at least five word characters.

    Each yielded dict has the keys ``id``, ``title``, ``sourceFile``,
    ``posJson`` (always "[]") and ``phrase``. The ``id`` ends in a counter
    that increases with every yielded sentence, so ids are unique within a
    file.
    """
    mtg = json.loads(p.read_text())
    print(f' has {len(mtg["phrases"])} phrases')
    text = ' '.join(mtg['phrases'])

    # Compiled once; a match means the sentence holds a 5+ character "word".
    hasWord = re.compile(r'\w\w\w\w\w')

    i = 0
    for sentence in nltk.sent_tokenize(text):
        sentence = re.sub(r'\s+', ' ', sentence).strip()
        if len(sentence) < 5:
            continue
        if not hasWord.search(sentence):
            continue

        yield dict(id=f"{mtg['mtg']['id']}_sentence{i}",
                   title=f"{mtg['mtg']['date']} {mtg['mtg']['title']}",
                   sourceFile=str(p),
                   posJson="[]",
                   phrase=sentence)
        # Advance the counter so the next record gets a distinct id
        # (previously every sentence was emitted as "<id>_sentence0").
        i += 1
65 | |
66 | |
def addMeeting(index: SearchIndex, mtg: MeetingRow):
    """Add the indexable content of one meeting to *index*.

    When the meeting has an HTML agenda packet, that page is fetched with
    getCityPermanent, reduced to text chunks by extractMeetingText, and
    stored through ``index.addDoc`` under a "<date> <title>" title.
    Meetings without an agenda packet are skipped without error. Handling
    of the meeting video is only a placeholder so far.
    """
    agendaUrl = None
    try:
        agendaUrl = meetingAgendaUrl(mtg)
    except ValueError:
        # No agenda packet for this meeting; nothing to index from it.
        pass

    if agendaUrl is not None:
        pageHtml = getCityPermanent(agendaUrl)
        chunks = extractMeetingText(pageHtml)
        # todo group phrases phrasesFromFile
        index.addDoc(title=f'{mtg["date"]} {mtg["title"]}', content=chunks)

    try:
        videoUrl = mtg['videoUrl']
    except KeyError:
        videoUrl = None

    if videoUrl:
        # transcribe and index video...
        pass
86 | |
87 | |
# Prefect flow: index every meeting returned by the meeting-list endpoints
# into the search index stored at /tmp/scoindex.
@flow(persist_result=True)
def buildIndex():
    # Install the flow-run logger both in this module and in
    # scobot.index.access, which reads its own module-level `log`.
    global log
    runLogger = get_run_logger()
    log = runLogger
    scobot.index.access.log = runLogger

    index = SearchIndex(Path('/tmp/scoindex'))
    for listUrl in meetingListUrls():
        mtgs = cast(list[MeetingRow], getCityMutableJson(listUrl))
        runLogger.info(f'got {len(mtgs)=}')
        for mtg in mtgs:
            addMeeting(index, mtg)

    # NOTE(review): the source view lost its indentation; a single commit
    # after all listings are processed is assumed here. Confirm.
    index.commit()
102 | |
103 | |
# Script entry point: when run directly (not imported), call serve() on the
# buildIndex flow object.
if __name__ == '__main__':
    buildIndex.serve()