annotate scobot/index/build_index_flow.py @ 11:6622bacb0b84

first pass at reorg
author drewp@bigasterisk.com
date Thu, 11 Jul 2024 18:15:44 -0700
parents flow/build_index.py@13438795d896
children 403eff4a16c8
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
11
6622bacb0b84 first pass at reorg
drewp@bigasterisk.com
parents: 10
diff changeset
1 import json
6622bacb0b84 first pass at reorg
drewp@bigasterisk.com
parents: 10
diff changeset
2 import re
10
13438795d896 rewrite with prefect flows and whoosh search, but it's in a nested pdm env
drewp@bigasterisk.com
parents:
diff changeset
3 from pathlib import Path
13438795d896 rewrite with prefect flows and whoosh search, but it's in a nested pdm env
drewp@bigasterisk.com
parents:
diff changeset
4 from typing import Iterable, cast
13438795d896 rewrite with prefect flows and whoosh search, but it's in a nested pdm env
drewp@bigasterisk.com
parents:
diff changeset
5
11
6622bacb0b84 first pass at reorg
drewp@bigasterisk.com
parents: 10
diff changeset
6 import lxml.html
6622bacb0b84 first pass at reorg
drewp@bigasterisk.com
parents: 10
diff changeset
7 import nltk
10
13438795d896 rewrite with prefect flows and whoosh search, but it's in a nested pdm env
drewp@bigasterisk.com
parents:
diff changeset
8 from prefect import flow, task
13438795d896 rewrite with prefect flows and whoosh search, but it's in a nested pdm env
drewp@bigasterisk.com
parents:
diff changeset
9 from prefect.logging import get_run_logger
11
6622bacb0b84 first pass at reorg
drewp@bigasterisk.com
parents: 10
diff changeset
10
6622bacb0b84 first pass at reorg
drewp@bigasterisk.com
parents: 10
diff changeset
11 import scobot.index.access
6622bacb0b84 first pass at reorg
drewp@bigasterisk.com
parents: 10
diff changeset
12 from scobot.index.access import SearchIndex
6622bacb0b84 first pass at reorg
drewp@bigasterisk.com
parents: 10
diff changeset
13 from scobot.index.download_tasks import getCityMutableJson, getCityPermanent
6622bacb0b84 first pass at reorg
drewp@bigasterisk.com
parents: 10
diff changeset
14 from scobot.local_types import MeetingRow, Url
10
13438795d896 rewrite with prefect flows and whoosh search, but it's in a nested pdm env
drewp@bigasterisk.com
parents:
diff changeset
15
13438795d896 rewrite with prefect flows and whoosh search, but it's in a nested pdm env
drewp@bigasterisk.com
parents:
diff changeset
# Module-level logger slot. It stays None until buildIndex() runs and stores
# the Prefect run logger here.
log = None
13438795d896 rewrite with prefect flows and whoosh search, but it's in a nested pdm env
drewp@bigasterisk.com
parents:
diff changeset
17
13438795d896 rewrite with prefect flows and whoosh search, but it's in a nested pdm env
drewp@bigasterisk.com
parents:
diff changeset
18
13438795d896 rewrite with prefect flows and whoosh search, but it's in a nested pdm env
drewp@bigasterisk.com
parents:
diff changeset
@task()
def meetingListUrls() -> Iterable[Url]:
    # Returns the two PrimeGov public-portal endpoints that this module reads
    # meeting rows from: the archived meetings for 2024, and the upcoming
    # meetings. (Kept as comments rather than a docstring so the task's
    # docstring-derived metadata is unchanged.)
    archivedMeetings = "https://albanyca.primegov.com/api/v2/PublicPortal/ListArchivedMeetings?year=2024"
    upcomingMeetings = "https://albanyca.primegov.com/api/v2/PublicPortal/ListUpcomingMeetings"
    return [archivedMeetings, upcomingMeetings]
13438795d896 rewrite with prefect flows and whoosh search, but it's in a nested pdm env
drewp@bigasterisk.com
parents:
diff changeset
25
13438795d896 rewrite with prefect flows and whoosh search, but it's in a nested pdm env
drewp@bigasterisk.com
parents:
diff changeset
26
13438795d896 rewrite with prefect flows and whoosh search, but it's in a nested pdm env
drewp@bigasterisk.com
parents:
diff changeset
def meetingAgendaUrl(mtg: MeetingRow) -> Url:
    """Return the portal URL of the meeting's HTML agenda packet.

    Scans ``mtg['documentList']`` for the first document whose
    ``templateName`` is ``'HTML Agenda Packet'`` and builds the URL from that
    document's ``templateId``.

    A missing, ``None`` or empty ``documentList``, and documents that carry no
    ``templateName``, are treated as "no agenda" instead of raising
    ``TypeError``/``KeyError``.

    Raises:
        ValueError: if no HTML agenda packet is listed for the meeting.
    """
    # `or []` also covers a key that is present but None, which
    # `.get(key, [])` alone would hand straight to the for-loop.
    for doc in mtg.get('documentList') or []:
        if doc.get('templateName') == 'HTML Agenda Packet':
            tid = doc['templateId']
            return f'https://albanyca.primegov.com/Portal/Meeting?meetingTemplateId={tid}'
    raise ValueError(f"no agenda doc found for {mtg['id']=}")
13438795d896 rewrite with prefect flows and whoosh search, but it's in a nested pdm env
drewp@bigasterisk.com
parents:
diff changeset
33
13438795d896 rewrite with prefect flows and whoosh search, but it's in a nested pdm env
drewp@bigasterisk.com
parents:
diff changeset
34
13438795d896 rewrite with prefect flows and whoosh search, but it's in a nested pdm env
drewp@bigasterisk.com
parents:
diff changeset
def extractMeetingText(mhtml: str) -> list[str]:
    """Return the non-blank text chunks of the page's meeting section.

    Parses ``mhtml``, takes the first ``div#meetingSection`` element, empties
    every ``<style>`` element inside it so CSS source is not returned as
    meeting text, and collects each remaining text node that is not
    whitespace-only. Chunks are returned as found (not stripped).

    Raises:
        IndexError: if the page has no ``div#meetingSection`` element.
    """
    el = lxml.html.fromstring(mhtml)
    m = el.cssselect('div#meetingSection')[0]
    for st in m.cssselect('style'):
        # keep_tail=True: a bare clear() also resets the element's tail, i.e.
        # the document text that follows </style>, which would silently drop
        # real agenda text along with the CSS.
        st.clear(keep_tail=True)
    meetingText = [
        chunk for chunk in m.xpath('.//text()', namespaces=el.nsmap)
        if chunk.strip()
    ]
    return meetingText
13438795d896 rewrite with prefect flows and whoosh search, but it's in a nested pdm env
drewp@bigasterisk.com
parents:
diff changeset
45
13438795d896 rewrite with prefect flows and whoosh search, but it's in a nested pdm env
drewp@bigasterisk.com
parents:
diff changeset
46
11
6622bacb0b84 first pass at reorg
drewp@bigasterisk.com
parents: 10
diff changeset
def phrasesFromFile(p: Path) -> Iterable[dict]:
    """Yield one index row per usable sentence in a meeting JSON file.

    ``p`` is read as JSON and must contain a ``phrases`` list of strings and
    a ``mtg`` object with ``id``, ``date`` and ``title``. The phrases are
    joined with spaces, split into sentences with ``nltk.sent_tokenize`` and
    whitespace-normalized. Sentences shorter than 5 characters, or without a
    run of 5 consecutive word characters, are skipped.

    Each yielded dict has the keys ``id``, ``title``, ``sourceFile``,
    ``posJson`` and ``phrase``. ``id`` is ``<meeting id>_sentence<n>``, where
    ``n`` counts the rows already yielded from this file, so ids are unique
    within a file.
    """
    mtg = json.loads(p.read_text())
    print(f' has {len(mtg["phrases"])} phrases')
    text = ' '.join(mtg['phrases'])

    i = 0
    for sentence in nltk.sent_tokenize(text):
        # Collapse runs of whitespace (including newlines) to single spaces.
        sentence = re.sub(r'\s+', ' ', sentence).strip()
        if len(sentence) < 5:
            continue
        # Require at least one 5-character run of word characters, to drop
        # fragments that are only punctuation, numbering or short tokens.
        if not re.search(r'\w\w\w\w\w', sentence):
            continue

        yield dict(id=f"{mtg['mtg']['id']}_sentence{i}",
                   title=f"{mtg['mtg']['date']} {mtg['mtg']['title']}",
                   sourceFile=str(p),
                   posJson="[]",
                   phrase=sentence)
        # Advance the counter; without this every row shared the id
        # "<meeting id>_sentence0".
        i += 1
6622bacb0b84 first pass at reorg
drewp@bigasterisk.com
parents: 10
diff changeset
65
6622bacb0b84 first pass at reorg
drewp@bigasterisk.com
parents: 10
diff changeset
66
10
13438795d896 rewrite with prefect flows and whoosh search, but it's in a nested pdm env
drewp@bigasterisk.com
parents:
diff changeset
def addMeeting(index: SearchIndex, mtg: MeetingRow):
    """Index one meeting's agenda text, when the meeting has an HTML agenda.

    Looks up the agenda URL with ``meetingAgendaUrl``; if that raises
    ``ValueError`` the meeting is not added to ``index``. Otherwise the
    agenda page is fetched with ``getCityPermanent``, reduced to text with
    ``extractMeetingText`` and stored through ``index.addDoc`` under the
    title ``"<date> <title>"``.

    A non-empty ``mtg['videoUrl']`` is detected but nothing is done with it
    yet.
    """
    agendaUrl = None
    try:
        agendaUrl = meetingAgendaUrl(mtg)
    except ValueError:
        # No agenda document for this meeting: skip the indexing step.
        pass

    if agendaUrl is not None:
        agendaHtml = getCityPermanent(agendaUrl)
        agendaText = extractMeetingText(agendaHtml)
        # todo group phrases phrasesFromFile
        index.addDoc(title=f'{mtg["date"]} {mtg["title"]}', content=agendaText)

    if mtg.get('videoUrl'):
        # transcribe and index video...
        pass
13438795d896 rewrite with prefect flows and whoosh search, but it's in a nested pdm env
drewp@bigasterisk.com
parents:
diff changeset
86
13438795d896 rewrite with prefect flows and whoosh search, but it's in a nested pdm env
drewp@bigasterisk.com
parents:
diff changeset
87
13438795d896 rewrite with prefect flows and whoosh search, but it's in a nested pdm env
drewp@bigasterisk.com
parents:
diff changeset
@flow(persist_result=True)
def buildIndex():
    # Prefect flow that rebuilds the search index: for every meeting-list URL
    # it fetches the meeting rows, feeds each row to addMeeting(), and then
    # commits the index.
    global log
    # Publish the run logger both here and in scobot.index.access, which
    # reads its own module-level `log`.
    log = get_run_logger()
    scobot.index.access.log = log

    # The index location is hard-coded to /tmp/scoindex.
    index = SearchIndex(Path('/tmp/scoindex'))
    for url in meetingListUrls():
        # cast() is for the type checker only; the JSON is not validated here.
        mtgs = cast(list[MeetingRow], getCityMutableJson(url))
        log.info(f'got {len(mtgs)=}')

        for mtg in mtgs:
            addMeeting(index, mtg)
    # NOTE(review): the indentation of this line was lost in the annotate
    # view; a single commit after all URLs are processed is assumed - confirm
    # against the repository.
    index.commit()
13438795d896 rewrite with prefect flows and whoosh search, but it's in a nested pdm env
drewp@bigasterisk.com
parents:
diff changeset
102
13438795d896 rewrite with prefect flows and whoosh search, but it's in a nested pdm env
drewp@bigasterisk.com
parents:
diff changeset
103
13438795d896 rewrite with prefect flows and whoosh search, but it's in a nested pdm env
drewp@bigasterisk.com
parents:
diff changeset
# Script entry point: when run directly, hand the flow to Prefect's serve().
# NOTE(review): presumably this registers a deployment and blocks waiting for
# runs rather than running the flow once - confirm for the installed Prefect.
if __name__ == '__main__':
    buildIndex.serve()