Mercurial > code > home > repos > sco-bot
annotate scobot/index/build_index_flow.py @ 16:7a87ba2f00d9
reformat, fix some types, make more async
author | drewp@bigasterisk.com |
---|---|
date | Fri, 19 Jul 2024 00:49:38 -0700 |
parents | 6ed25bcaaf1f |
children |
rev | line source |
---|---|
11 | 1 import json |
2 import re | |
10
13438795d896
rewrite with prefect flows and whoosh search, but it's in a nested pdm env
drewp@bigasterisk.com
parents:
diff
changeset
|
3 from pathlib import Path |
13438795d896
rewrite with prefect flows and whoosh search, but it's in a nested pdm env
drewp@bigasterisk.com
parents:
diff
changeset
|
4 from typing import Iterable, cast |
13438795d896
rewrite with prefect flows and whoosh search, but it's in a nested pdm env
drewp@bigasterisk.com
parents:
diff
changeset
|
5 |
11 | 6 import lxml.html |
7 import nltk | |
10
13438795d896
rewrite with prefect flows and whoosh search, but it's in a nested pdm env
drewp@bigasterisk.com
parents:
diff
changeset
|
8 from prefect import flow, task |
13438795d896
rewrite with prefect flows and whoosh search, but it's in a nested pdm env
drewp@bigasterisk.com
parents:
diff
changeset
|
9 from prefect.logging import get_run_logger |
11 | 10 |
11 import scobot.index.access | |
12 from scobot.index.access import SearchIndex | |
13 from scobot.index.download_tasks import getCityMutableJson, getCityPermanent | |
14 from scobot.local_types import MeetingRow, Url | |
10
13438795d896
rewrite with prefect flows and whoosh search, but it's in a nested pdm env
drewp@bigasterisk.com
parents:
diff
changeset
|
15 |
13438795d896
rewrite with prefect flows and whoosh search, but it's in a nested pdm env
drewp@bigasterisk.com
parents:
diff
changeset
|
# Module-level logger slot; stays None until buildIndex() installs the
# Prefect run logger via `global log`.
log = None
13438795d896
rewrite with prefect flows and whoosh search, but it's in a nested pdm env
drewp@bigasterisk.com
parents:
diff
changeset
|
17 |
13438795d896
rewrite with prefect flows and whoosh search, but it's in a nested pdm env
drewp@bigasterisk.com
parents:
diff
changeset
|
18 |
13438795d896
rewrite with prefect flows and whoosh search, but it's in a nested pdm env
drewp@bigasterisk.com
parents:
diff
changeset
|
@task()
def meetingListUrls() -> Iterable[Url]:
    """Return the PrimeGov portal endpoints that list meetings to index.

    One endpoint lists archived meetings and one lists upcoming meetings.
    The archive query is pinned to year=2024.
    """
    endpoints = (
        "https://albanyca.primegov.com/api/v2/PublicPortal/ListArchivedMeetings?year=2024",
        "https://albanyca.primegov.com/api/v2/PublicPortal/ListUpcomingMeetings",
    )
    return [Url(endpoint) for endpoint in endpoints]
13438795d896
rewrite with prefect flows and whoosh search, but it's in a nested pdm env
drewp@bigasterisk.com
parents:
diff
changeset
|
27 |
13438795d896
rewrite with prefect flows and whoosh search, but it's in a nested pdm env
drewp@bigasterisk.com
parents:
diff
changeset
|
28 |
13438795d896
rewrite with prefect flows and whoosh search, but it's in a nested pdm env
drewp@bigasterisk.com
parents:
diff
changeset
|
def meetingAgendaUrl(mtg: MeetingRow) -> Url:
    """Return the portal URL of the meeting's HTML agenda packet.

    Scans mtg['documentList'] (treated as empty when the key is absent) and
    uses the first entry whose templateName is 'HTML Agenda Packet'.

    Raises:
        ValueError: if no entry in the document list is an HTML agenda packet.
    """
    for entry in mtg.get('documentList', []):
        if entry['templateName'] != 'HTML Agenda Packet':
            continue
        tid = entry['templateId']
        return Url(
            f'https://albanyca.primegov.com/Portal/Meeting?meetingTemplateId={tid}'
        )
    raise ValueError(f"no agenda doc found for {mtg['id']=}")
13438795d896
rewrite with prefect flows and whoosh search, but it's in a nested pdm env
drewp@bigasterisk.com
parents:
diff
changeset
|
37 |
13438795d896
rewrite with prefect flows and whoosh search, but it's in a nested pdm env
drewp@bigasterisk.com
parents:
diff
changeset
|
38 |
13438795d896
rewrite with prefect flows and whoosh search, but it's in a nested pdm env
drewp@bigasterisk.com
parents:
diff
changeset
|
def extractMeetingText(mhtml: str) -> list[str]:
    """Return the non-blank text nodes found inside div#meetingSection.

    The page is parsed with lxml, the contents of any <style> elements inside
    the meeting section are discarded, and every remaining text node that is
    not purely whitespace is returned in document order.

    Raises:
        IndexError: if the page contains no div#meetingSection element.
    """
    el = lxml.html.fromstring(mhtml)
    m = el.cssselect('div#meetingSection')[0]
    for st in m.cssselect('style'):
        # Discard the CSS source only. A plain clear() also resets the
        # element's tail, which would silently drop any document text that
        # directly follows the closing </style> tag.
        # keep_tail requires lxml >= 4.4.
        st.clear(keep_tail=True)
    return [
        chunk for chunk in m.xpath('.//text()', namespaces=el.nsmap)
        if chunk.strip()
    ]
13438795d896
rewrite with prefect flows and whoosh search, but it's in a nested pdm env
drewp@bigasterisk.com
parents:
diff
changeset
|
49 |
13438795d896
rewrite with prefect flows and whoosh search, but it's in a nested pdm env
drewp@bigasterisk.com
parents:
diff
changeset
|
50 |
def phrasesFromFile(p: Path) -> Iterable[dict]:
    """Yield one index-document dict per usable sentence in a meeting file.

    `p` is read as JSON. The code reads a 'phrases' list of strings and a
    'mtg' object with 'id', 'date' and 'title' keys from it. The phrases are
    joined with spaces and re-split into sentences with nltk; each sentence
    has its whitespace runs collapsed to single spaces and is skipped when it
    is shorter than 5 characters or has no run of 5 word characters.

    Each yielded dict has the keys id, title, sourceFile, posJson and phrase.
    The id is '<meeting id>_sentence<N>', where N counts the sentences
    emitted so far for this file.
    """
    mtg = json.loads(p.read_text())
    print(f' has {len(mtg["phrases"])} phrases')
    text = ' '.join(mtg['phrases'])

    i = 0
    for sentence in nltk.sent_tokenize(text):
        sentence = re.sub(r'\s+', ' ', sentence).strip()
        if len(sentence) < 5:
            continue
        if not re.search(r'\w\w\w\w\w', sentence):
            continue

        yield dict(id=f"{mtg['mtg']['id']}_sentence{i}",
                   title=f"{mtg['mtg']['date']} {mtg['mtg']['title']}",
                   sourceFile=str(p),
                   posJson="[]",
                   phrase=sentence)
        # Advance the counter per emitted sentence; without this every
        # document from the file shared the id '<meeting id>_sentence0'.
        i += 1
69 | |
70 | |
16
7a87ba2f00d9
reformat, fix some types, make more async
drewp@bigasterisk.com
parents:
15
diff
changeset
|
async def addMeeting(index: SearchIndex, mtg: MeetingRow):
    """Add the sentences of one meeting's HTML agenda to the search index.

    When the meeting has an HTML agenda packet, the page is fetched with
    getCityPermanent, its text is extracted and split into sentences with
    nltk, and each sentence is passed to index.addDoc under the title
    '<date> <title>'. Meetings without an agenda packet are skipped silently.

    The video branch is a placeholder: a non-empty mtg['videoUrl'] is
    detected but nothing is done with it yet.
    """
    try:
        agendaUrl = meetingAgendaUrl(mtg)
    except ValueError:
        # No HTML agenda packet is listed for this meeting.
        agendaUrl = None

    if agendaUrl is not None:
        pageHtml = await getCityPermanent(agendaUrl)
        agendaText = ' '.join(extractMeetingText(pageHtml))
        for sentence in nltk.sent_tokenize(agendaText):
            index.addDoc(sourceTitle=f'{mtg["date"]} {mtg["title"]}',
                         phrase=sentence)

    # A missing key and an empty value both mean "no video".
    try:
        hasVideo = bool(mtg['videoUrl'])
    except KeyError:
        hasVideo = False

    if hasVideo:
        '''transcribe and index video...'''
13438795d896
rewrite with prefect flows and whoosh search, but it's in a nested pdm env
drewp@bigasterisk.com
parents:
diff
changeset
|
91 |
13438795d896
rewrite with prefect flows and whoosh search, but it's in a nested pdm env
drewp@bigasterisk.com
parents:
diff
changeset
|
92 |
13438795d896
rewrite with prefect flows and whoosh search, but it's in a nested pdm env
drewp@bigasterisk.com
parents:
diff
changeset
|
@flow(persist_result=True)
async def buildIndex():
    """Rebuild the search index at data/build/index0 from the meeting lists.

    Installs the Prefect run logger as this module's `log` and as
    scobot.index.access.log, then fetches each meeting list, passes every
    meeting row to addMeeting, and commits the index.
    """
    global log
    log = scobot.index.access.log = get_run_logger()

    index = SearchIndex(Path('data/build/index0'))
    for listUrl in meetingListUrls():
        mtgs = cast(list[MeetingRow], await getCityMutableJson(listUrl))
        log.info(f'got {len(mtgs)=}')

        for row in mtgs:
            await addMeeting(index, row)

    # NOTE(review): written as a single commit after all lists are processed;
    # confirm this matches the intended nesting.
    index.commit()
    # todo: kill search to restart it
10
13438795d896
rewrite with prefect flows and whoosh search, but it's in a nested pdm env
drewp@bigasterisk.com
parents:
diff
changeset
|
108 |
13438795d896
rewrite with prefect flows and whoosh search, but it's in a nested pdm env
drewp@bigasterisk.com
parents:
diff
changeset
|
109 |
13438795d896
rewrite with prefect flows and whoosh search, but it's in a nested pdm env
drewp@bigasterisk.com
parents:
diff
changeset
|
if __name__ == '__main__':
    # Script entry point: hand the flow to Prefect's serve() under the
    # name 'buildIndex'.
    buildIndex.serve(name='buildIndex')