annotate scobot/index/build_index_flow.py @ 16:7a87ba2f00d9

reformat, fix some types, make more async
author drewp@bigasterisk.com
date Fri, 19 Jul 2024 00:49:38 -0700
parents 6ed25bcaaf1f
children
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
11
6622bacb0b84 first pass at reorg
drewp@bigasterisk.com
parents: 10
diff changeset
1 import json
6622bacb0b84 first pass at reorg
drewp@bigasterisk.com
parents: 10
diff changeset
2 import re
10
13438795d896 rewrite with prefect flows and whoosh search, but it's in a nested pdm env
drewp@bigasterisk.com
parents:
diff changeset
3 from pathlib import Path
13438795d896 rewrite with prefect flows and whoosh search, but it's in a nested pdm env
drewp@bigasterisk.com
parents:
diff changeset
4 from typing import Iterable, cast
13438795d896 rewrite with prefect flows and whoosh search, but it's in a nested pdm env
drewp@bigasterisk.com
parents:
diff changeset
5
11
6622bacb0b84 first pass at reorg
drewp@bigasterisk.com
parents: 10
diff changeset
6 import lxml.html
6622bacb0b84 first pass at reorg
drewp@bigasterisk.com
parents: 10
diff changeset
7 import nltk
10
13438795d896 rewrite with prefect flows and whoosh search, but it's in a nested pdm env
drewp@bigasterisk.com
parents:
diff changeset
8 from prefect import flow, task
13438795d896 rewrite with prefect flows and whoosh search, but it's in a nested pdm env
drewp@bigasterisk.com
parents:
diff changeset
9 from prefect.logging import get_run_logger
11
6622bacb0b84 first pass at reorg
drewp@bigasterisk.com
parents: 10
diff changeset
10
6622bacb0b84 first pass at reorg
drewp@bigasterisk.com
parents: 10
diff changeset
11 import scobot.index.access
6622bacb0b84 first pass at reorg
drewp@bigasterisk.com
parents: 10
diff changeset
12 from scobot.index.access import SearchIndex
6622bacb0b84 first pass at reorg
drewp@bigasterisk.com
parents: 10
diff changeset
13 from scobot.index.download_tasks import getCityMutableJson, getCityPermanent
6622bacb0b84 first pass at reorg
drewp@bigasterisk.com
parents: 10
diff changeset
14 from scobot.local_types import MeetingRow, Url
10
13438795d896 rewrite with prefect flows and whoosh search, but it's in a nested pdm env
drewp@bigasterisk.com
parents:
diff changeset
15
13438795d896 rewrite with prefect flows and whoosh search, but it's in a nested pdm env
drewp@bigasterisk.com
parents:
diff changeset
# Module-level logger slot; buildIndex() rebinds it to the Prefect run logger
# when the flow starts, so it is None until then.
log = None
13438795d896 rewrite with prefect flows and whoosh search, but it's in a nested pdm env
drewp@bigasterisk.com
parents:
diff changeset
17
13438795d896 rewrite with prefect flows and whoosh search, but it's in a nested pdm env
drewp@bigasterisk.com
parents:
diff changeset
18
13438795d896 rewrite with prefect flows and whoosh search, but it's in a nested pdm env
drewp@bigasterisk.com
parents:
diff changeset
@task()
def meetingListUrls() -> Iterable[Url]:
    """Return the portal API endpoints whose responses list meetings.

    Two endpoints on albanyca.primegov.com are returned, in this order:
    the archived-meetings list for year 2024, then the upcoming-meetings
    list.
    """
    archivedMeetings = Url(
        "https://albanyca.primegov.com/api/v2/PublicPortal/ListArchivedMeetings?year=2024"
    )
    upcomingMeetings = Url(
        "https://albanyca.primegov.com/api/v2/PublicPortal/ListUpcomingMeetings"
    )
    return [archivedMeetings, upcomingMeetings]
13438795d896 rewrite with prefect flows and whoosh search, but it's in a nested pdm env
drewp@bigasterisk.com
parents:
diff changeset
27
13438795d896 rewrite with prefect flows and whoosh search, but it's in a nested pdm env
drewp@bigasterisk.com
parents:
diff changeset
28
13438795d896 rewrite with prefect flows and whoosh search, but it's in a nested pdm env
drewp@bigasterisk.com
parents:
diff changeset
def meetingAgendaUrl(mtg: MeetingRow) -> Url:
    """Build the portal URL of a meeting's HTML agenda packet.

    Scans ``mtg['documentList']`` (treated as empty when the key is absent)
    and uses the ``templateId`` of the first document whose ``templateName``
    is ``'HTML Agenda Packet'``.

    Raises:
        ValueError: if no document in the list is an HTML agenda packet.
    """
    # Sentinel rather than None so that any templateId value, even a falsy
    # one, still counts as a match.
    notFound = object()
    tid = next(
        (doc['templateId']
         for doc in mtg.get('documentList', [])
         if doc['templateName'] == 'HTML Agenda Packet'),
        notFound,
    )
    if tid is notFound:
        raise ValueError(f"no agenda doc found for {mtg['id']=}")
    return Url(
        f'https://albanyca.primegov.com/Portal/Meeting?meetingTemplateId={tid}'
    )
13438795d896 rewrite with prefect flows and whoosh search, but it's in a nested pdm env
drewp@bigasterisk.com
parents:
diff changeset
37
13438795d896 rewrite with prefect flows and whoosh search, but it's in a nested pdm env
drewp@bigasterisk.com
parents:
diff changeset
38
13438795d896 rewrite with prefect flows and whoosh search, but it's in a nested pdm env
drewp@bigasterisk.com
parents:
diff changeset
def extractMeetingText(mhtml: str) -> list[str]:
    """Extract the visible text chunks from a meeting page.

    Args:
        mhtml: HTML source of a meeting page. It must contain a
            ``div#meetingSection`` element.

    Returns:
        The text nodes found under ``div#meetingSection``, skipping
        whitespace-only nodes and the contents of ``<style>`` elements.
        Chunks are returned unstripped.

    Raises:
        IndexError: if the page has no ``div#meetingSection``.
    """
    el = lxml.html.fromstring(mhtml)
    m = el.cssselect('div#meetingSection')[0]
    for st in m.cssselect('style'):
        # Element.clear() wipes the tail as well as the text, and the tail is
        # the real page text that follows </style>. Save and restore it so
        # only the CSS itself is dropped.
        tail = st.tail
        st.clear()
        st.tail = tail
    meetingText = [
        chunk for chunk in m.xpath('.//text()', namespaces=el.nsmap)
        if chunk.strip()
    ]
    return meetingText
13438795d896 rewrite with prefect flows and whoosh search, but it's in a nested pdm env
drewp@bigasterisk.com
parents:
diff changeset
49
13438795d896 rewrite with prefect flows and whoosh search, but it's in a nested pdm env
drewp@bigasterisk.com
parents:
diff changeset
50
11
6622bacb0b84 first pass at reorg
drewp@bigasterisk.com
parents: 10
diff changeset
def phrasesFromFile(p: Path) -> Iterable[dict]:
    """Yield one index document per usable sentence in a saved meeting file.

    The file at ``p`` is read as JSON with a ``phrases`` list of strings and
    a ``mtg`` object carrying ``id``, ``date`` and ``title``. The phrases are
    joined with spaces and split into sentences with ``nltk.sent_tokenize``.
    Runs of whitespace in each sentence are collapsed to single spaces.
    Sentences shorter than 5 characters, or without a run of 5 consecutive
    word characters, are skipped.

    Args:
        p: path of the JSON file to read.

    Yields:
        Dicts with keys ``id``, ``title``, ``sourceFile``, ``posJson`` and
        ``phrase``. ``id`` is ``"<meeting id>_sentence<n>"`` where ``n`` is
        the sentence's position in the tokenizer output, so ids are unique
        within a file.
    """
    mtg = json.loads(p.read_text())
    print(f' has {len(mtg["phrases"])} phrases')
    text = ' '.join(mtg['phrases'])

    # enumerate() supplies the per-sentence number; the previous counter was
    # never advanced, so every document got the same "..._sentence0" id.
    for i, sentence in enumerate(nltk.sent_tokenize(text)):
        sentence = re.sub(r'\s+', ' ', sentence).strip()
        if len(sentence) < 5:
            continue
        if not re.search(r'\w\w\w\w\w', sentence):
            continue

        yield dict(id=f"{mtg['mtg']['id']}_sentence{i}",
                   title=f"{mtg['mtg']['date']} {mtg['mtg']['title']}",
                   sourceFile=str(p),
                   posJson="[]",
                   phrase=sentence)
6622bacb0b84 first pass at reorg
drewp@bigasterisk.com
parents: 10
diff changeset
69
6622bacb0b84 first pass at reorg
drewp@bigasterisk.com
parents: 10
diff changeset
70
16
7a87ba2f00d9 reformat, fix some types, make more async
drewp@bigasterisk.com
parents: 15
diff changeset
async def addMeeting(index: SearchIndex, mtg: MeetingRow):
    """Add one meeting's agenda text to the search index, sentence by sentence.

    If the meeting has an HTML agenda packet, its page is fetched with
    ``getCityPermanent``, the text is extracted and split into sentences,
    and each sentence is added to ``index`` under the title
    ``"<date> <title>"``. A meeting without an agenda packet adds nothing.

    The meeting's ``videoUrl`` is looked up as well, but video handling is
    still a placeholder and has no effect.
    """
    try:
        agendaUrl = meetingAgendaUrl(mtg)
    except ValueError:
        # No HTML agenda packet for this meeting; nothing to index.
        agendaUrl = None

    if agendaUrl is not None:
        pageHtml = await getCityPermanent(agendaUrl)
        agendaText = ' '.join(extractMeetingText(pageHtml))
        for sentence in nltk.sent_tokenize(agendaText):
            index.addDoc(sourceTitle=f'{mtg["date"]} {mtg["title"]}',
                         phrase=sentence)

    try:
        videoUrl = mtg['videoUrl']
    except KeyError:
        videoUrl = None

    if videoUrl:
        # Placeholder for a missing step: transcribe and index video...
        pass
13438795d896 rewrite with prefect flows and whoosh search, but it's in a nested pdm env
drewp@bigasterisk.com
parents:
diff changeset
91
13438795d896 rewrite with prefect flows and whoosh search, but it's in a nested pdm env
drewp@bigasterisk.com
parents:
diff changeset
92
13438795d896 rewrite with prefect flows and whoosh search, but it's in a nested pdm env
drewp@bigasterisk.com
parents:
diff changeset
@flow(persist_result=True)
async def buildIndex():
    """Prefect flow: rebuild the search index from the meeting lists.

    Binds the Prefect run logger to this module's ``log`` and to
    ``scobot.index.access.log``, opens the index at ``data/build/index0``,
    fetches every meeting list named by ``meetingListUrls``, passes each
    meeting to ``addMeeting``, and then commits the index.
    """
    global log
    # One run logger shared by this module and the index access module.
    log = scobot.index.access.log = get_run_logger()

    indexDir = Path('data/build/index0')
    index = SearchIndex(indexDir)
    for listUrl in meetingListUrls():
        mtgs = cast(list[MeetingRow], await getCityMutableJson(listUrl))
        log.info(f'got {len(mtgs)=}')

        for mtg in mtgs:
            await addMeeting(index, mtg)
    index.commit()
    # todo: kill search to restart it
10
13438795d896 rewrite with prefect flows and whoosh search, but it's in a nested pdm env
drewp@bigasterisk.com
parents:
diff changeset
108
13438795d896 rewrite with prefect flows and whoosh search, but it's in a nested pdm env
drewp@bigasterisk.com
parents:
diff changeset
109
13438795d896 rewrite with prefect flows and whoosh search, but it's in a nested pdm env
drewp@bigasterisk.com
parents:
diff changeset
if __name__ == '__main__':
    # Run as a script: serve the flow under the deployment name 'buildIndex'.
    buildIndex.serve(name='buildIndex')