Mercurial > code > home > repos > sco-bot
annotate scobot/index/build_index_flow.py @ 11:6622bacb0b84
first pass at reorg
author | drewp@bigasterisk.com |
---|---|
date | Thu, 11 Jul 2024 18:15:44 -0700 |
parents | flow/build_index.py@13438795d896 |
children | 403eff4a16c8 |
rev | line source |
---|---|
11 | 1 import json |
2 import re | |
10
13438795d896
rewrite with prefect flows and whoosh search, but it's in a nested pdm env
drewp@bigasterisk.com
parents:
diff
changeset
|
3 from pathlib import Path |
13438795d896
rewrite with prefect flows and whoosh search, but it's in a nested pdm env
drewp@bigasterisk.com
parents:
diff
changeset
|
4 from typing import Iterable, cast |
13438795d896
rewrite with prefect flows and whoosh search, but it's in a nested pdm env
drewp@bigasterisk.com
parents:
diff
changeset
|
5 |
11 | 6 import lxml.html |
7 import nltk | |
10
13438795d896
rewrite with prefect flows and whoosh search, but it's in a nested pdm env
drewp@bigasterisk.com
parents:
diff
changeset
|
8 from prefect import flow, task |
13438795d896
rewrite with prefect flows and whoosh search, but it's in a nested pdm env
drewp@bigasterisk.com
parents:
diff
changeset
|
9 from prefect.logging import get_run_logger |
11 | 10 |
11 import scobot.index.access | |
12 from scobot.index.access import SearchIndex | |
13 from scobot.index.download_tasks import getCityMutableJson, getCityPermanent | |
14 from scobot.local_types import MeetingRow, Url | |
10
13438795d896
rewrite with prefect flows and whoosh search, but it's in a nested pdm env
drewp@bigasterisk.com
parents:
diff
changeset
|
15 |
13438795d896
rewrite with prefect flows and whoosh search, but it's in a nested pdm env
drewp@bigasterisk.com
parents:
diff
changeset
|
# Module-level logger slot. It starts out unset; buildIndex() rebinds it to the
# logger returned by get_run_logger() when the flow runs.
log = None
13438795d896
rewrite with prefect flows and whoosh search, but it's in a nested pdm env
drewp@bigasterisk.com
parents:
diff
changeset
|
17 |
13438795d896
rewrite with prefect flows and whoosh search, but it's in a nested pdm env
drewp@bigasterisk.com
parents:
diff
changeset
|
18 |
13438795d896
rewrite with prefect flows and whoosh search, but it's in a nested pdm env
drewp@bigasterisk.com
parents:
diff
changeset
|
@task()
def meetingListUrls() -> Iterable[Url]:
    """Return the PrimeGov public-portal URLs that list meetings.

    Two listings are returned, in this order: the archived meetings for
    2024 and the upcoming meetings.
    """
    portalApi = "https://albanyca.primegov.com/api/v2/PublicPortal"
    archived2024 = f"{portalApi}/ListArchivedMeetings?year=2024"
    upcoming = f"{portalApi}/ListUpcomingMeetings"
    return [archived2024, upcoming]
13438795d896
rewrite with prefect flows and whoosh search, but it's in a nested pdm env
drewp@bigasterisk.com
parents:
diff
changeset
|
25 |
13438795d896
rewrite with prefect flows and whoosh search, but it's in a nested pdm env
drewp@bigasterisk.com
parents:
diff
changeset
|
26 |
13438795d896
rewrite with prefect flows and whoosh search, but it's in a nested pdm env
drewp@bigasterisk.com
parents:
diff
changeset
|
def meetingAgendaUrl(mtg: MeetingRow) -> Url:
    """Build the portal URL of a meeting's HTML agenda packet.

    The first entry of ``mtg['documentList']`` whose ``templateName`` is
    ``'HTML Agenda Packet'`` supplies the ``templateId`` used in the URL.

    Raises:
        ValueError: if the meeting has no such document (including when
            ``documentList`` is absent).
    """
    notFound = object()
    # Lazy scan: stops at the first matching document, later ones are ignored.
    agendaIds = (
        entry['templateId']
        for entry in mtg.get('documentList', [])
        if entry['templateName'] == 'HTML Agenda Packet'
    )
    tid = next(agendaIds, notFound)
    if tid is notFound:
        raise ValueError(f"no agenda doc found for {mtg['id']=}")
    return f'https://albanyca.primegov.com/Portal/Meeting?meetingTemplateId={tid}'
13438795d896
rewrite with prefect flows and whoosh search, but it's in a nested pdm env
drewp@bigasterisk.com
parents:
diff
changeset
|
33 |
13438795d896
rewrite with prefect flows and whoosh search, but it's in a nested pdm env
drewp@bigasterisk.com
parents:
diff
changeset
|
34 |
13438795d896
rewrite with prefect flows and whoosh search, but it's in a nested pdm env
drewp@bigasterisk.com
parents:
diff
changeset
|
def extractMeetingText(mhtml: str) -> list[str]:
    """Extract the non-blank text chunks from a meeting agenda page.

    Parses ``mhtml``, takes the first ``div#meetingSection`` element, blanks
    out any ``<style>`` elements inside it, and returns every text node under
    that section that is not purely whitespace (chunks are returned
    unstripped, in document order).

    Raises:
        IndexError: if the page has no ``div#meetingSection`` element.
    """
    el = lxml.html.fromstring(mhtml)
    m = el.cssselect('div#meetingSection')[0]
    for st in m.cssselect('style'):
        # Drop the CSS source so it is not indexed as meeting text.
        # keep_tail=True matters: a plain clear() also resets the element's
        # tail, which would discard real agenda text that directly follows
        # the <style> tag.
        st.clear(keep_tail=True)
    return [
        chunk for chunk in m.xpath('.//text()', namespaces=el.nsmap)
        if chunk.strip()
    ]
13438795d896
rewrite with prefect flows and whoosh search, but it's in a nested pdm env
drewp@bigasterisk.com
parents:
diff
changeset
|
45 |
13438795d896
rewrite with prefect flows and whoosh search, but it's in a nested pdm env
drewp@bigasterisk.com
parents:
diff
changeset
|
46 |
def phrasesFromFile(p: Path) -> Iterable[dict]:
    """Yield one index record per usable sentence found in a meeting JSON file.

    The file at ``p`` is read as JSON with a ``phrases`` list and a ``mtg``
    object (``id``, ``date``, ``title``). The phrases are joined with spaces,
    split into sentences with ``nltk.sent_tokenize``, whitespace-normalized,
    and filtered: sentences shorter than 5 characters or without a run of
    five word characters are skipped.

    Each yielded dict has the keys ``id``, ``title``, ``sourceFile``,
    ``posJson`` and ``phrase``. ``id`` is ``<meeting id>_sentence<i>`` where
    ``i`` counts the sentences yielded so far, so ids are unique within a
    file.
    """
    mtg = json.loads(p.read_text())
    print(f' has {len(mtg["phrases"])} phrases')
    text = ' '.join(mtg['phrases'])

    i = 0
    for sentence in nltk.sent_tokenize(text):
        sentence = re.sub(r'\s+', ' ', sentence).strip()
        if len(sentence) < 5:
            continue
        if not re.search(r'\w\w\w\w\w', sentence):
            continue

        yield dict(id=f"{mtg['mtg']['id']}_sentence{i}",
                   title=f"{mtg['mtg']['date']} {mtg['mtg']['title']}",
                   sourceFile=str(p),
                   posJson="[]",
                   phrase=sentence)
        # Advance the counter; without this every record shared the id
        # "<mtgid>_sentence0".
        i += 1
65 | |
66 | |
10
13438795d896
rewrite with prefect flows and whoosh search, but it's in a nested pdm env
drewp@bigasterisk.com
parents:
diff
changeset
|
def addMeeting(index: SearchIndex, mtg: MeetingRow):
    """Add one meeting's agenda text to the search index.

    If the meeting has an HTML agenda packet, its page is fetched with
    getCityPermanent, reduced to text chunks with extractMeetingText, and
    added to ``index`` under the title ``"<date> <title>"``. Meetings without
    an agenda document are skipped silently. Video handling is a placeholder
    that does nothing yet.
    """
    try:
        agendaUrl = meetingAgendaUrl(mtg)
    except ValueError:
        # No agenda packet for this meeting: nothing to index from the agenda.
        agendaUrl = None

    if agendaUrl is not None:
        agendaHtml = getCityPermanent(agendaUrl)
        chunks = extractMeetingText(agendaHtml)
        # todo group phrases phrasesFromFile
        index.addDoc(title=f'{mtg["date"]} {mtg["title"]}', content=chunks)

    # A missing or empty videoUrl both mean "no video".
    if mtg.get('videoUrl'):
        # transcribe and index video... (placeholder, not implemented)
        pass
13438795d896
rewrite with prefect flows and whoosh search, but it's in a nested pdm env
drewp@bigasterisk.com
parents:
diff
changeset
|
86 |
13438795d896
rewrite with prefect flows and whoosh search, but it's in a nested pdm env
drewp@bigasterisk.com
parents:
diff
changeset
|
87 |
13438795d896
rewrite with prefect flows and whoosh search, but it's in a nested pdm env
drewp@bigasterisk.com
parents:
diff
changeset
|
@flow(persist_result=True)
def buildIndex():
    """Flow entry point: rebuild the search index from the meeting listings.

    Installs the run logger as this module's ``log`` and as
    ``scobot.index.access.log``, opens a SearchIndex at ``/tmp/scoindex``,
    feeds every meeting from every URL returned by meetingListUrls() through
    addMeeting, and commits the index once at the end.
    """
    global log
    # Same logger object is shared with the index-access module.
    log = scobot.index.access.log = get_run_logger()

    searchIndex = SearchIndex(Path('/tmp/scoindex'))
    for listingUrl in meetingListUrls():
        # `mtgs` keeps its name because it appears verbatim in the log output.
        mtgs = cast(list[MeetingRow], getCityMutableJson(listingUrl))
        log.info(f'got {len(mtgs)=}')

        for row in mtgs:
            addMeeting(searchIndex, row)
    searchIndex.commit()
13438795d896
rewrite with prefect flows and whoosh search, but it's in a nested pdm env
drewp@bigasterisk.com
parents:
diff
changeset
|
102 |
13438795d896
rewrite with prefect flows and whoosh search, but it's in a nested pdm env
drewp@bigasterisk.com
parents:
diff
changeset
|
103 |
13438795d896
rewrite with prefect flows and whoosh search, but it's in a nested pdm env
drewp@bigasterisk.com
parents:
diff
changeset
|
# Script entry point: when this module is executed directly, call serve() on
# the buildIndex flow object instead of invoking the flow function itself.
if __name__ == '__main__':
    buildIndex.serve()