Mercurial > code > home > repos > sco-bot
comparison search/meeting_docs.py @ 9:d1b54241a731
rewrite meeting fetcher
author | drewp@bigasterisk.com |
---|---|
date | Wed, 10 Jul 2024 12:25:06 -0700 |
parents | |
children | 13438795d896 |
comparison
equal
deleted
inserted
replaced
8:f23b21bd0fce | 9:d1b54241a731 |
---|---|
1 import json | |
2 from pathlib import Path | |
3 from pprint import pprint | |
4 import time | |
5 import requests | |
6 from lxml.html import fromstring | |
7 | |
8 | |
def getMeetingText(meetingUrl) -> list[str]:
    """Fetch a meeting page and return the non-blank text chunks of its agenda.

    The page at `meetingUrl` is downloaded and parsed as HTML. Every text node
    under the first `div#meetingSection` element is collected in document
    order, except for the contents of `<style>` elements and chunks that are
    only whitespace. Chunks are returned as found (not stripped).

    Raises:
        requests.HTTPError: the server answered with an error status.
        IndexError: the page has no `div#meetingSection` element.
    """
    # A timeout keeps a stalled server from hanging the whole run.
    resp = requests.get(meetingUrl, timeout=60)
    # Fail with the HTTP status instead of trying to parse an error page.
    resp.raise_for_status()

    el = fromstring(resp.text)
    m = el.cssselect('div#meetingSection')[0]
    for st in m.cssselect('style'):
        # Drop the CSS, but keep the tail: in lxml the tail is the page text
        # that follows </style>, and a plain clear() would discard it too.
        st.clear(keep_tail=True)
    meetingText = [
        chunk for chunk in m.xpath('.//text()', namespaces=el.nsmap)
        if chunk.strip()
    ]
    return meetingText
20 | |
21 | |
def gatherMtgs(mtg):
    """Write `data/albany/meetingId/<id>/agenda.json` for one meeting record.

    `mtg` is a dict that must have 'id' and 'videoUrl' keys and may have a
    'documentList' of dicts carrying 'templateName' (and 'templateId' for the
    agenda packet). If the output file already exists nothing is done.
    Otherwise the meeting's 'HTML Agenda Packet' document, when present, is
    fetched through getMeetingText and its text chunks are stored as
    'phrases'; without such a document 'phrases' is an empty list.

    Returns None. Raises KeyError if a required key is missing.
    """
    meetingDir = Path(f'data/albany/meetingId/{mtg["id"]}')
    meetingDir.mkdir(parents=True, exist_ok=True)
    agendaPath = meetingDir / 'agenda.json'
    if agendaPath.exists():
        # Already gathered on an earlier run.
        return

    # Scan every document; when several agenda packets exist the last one wins.
    meetingUrl = None
    for doc in mtg.get('documentList', []):
        if doc['templateName'] != 'HTML Agenda Packet':
            continue
        meetingUrl = f'https://albanyca.primegov.com/Portal/Meeting?meetingTemplateId={doc["templateId"]}'

    # Values are computed in the same order as the keys appear in the output.
    created = time.time()
    videoUrl = mtg['videoUrl']
    if meetingUrl:
        phrases = getMeetingText(meetingUrl)
    else:
        phrases = []

    row = {
        'created': created,
        'mtg': mtg,
        'videoUrl': videoUrl,
        'meetingUrl': meetingUrl,
        'phrases': phrases,
    }
    agendaPath.write_text(json.dumps(row, indent=2))
    print(f'wrote {agendaPath}')
43 | |
44 | |
# Gather every meeting returned by the archived-meetings listing for 2024.
archivedMeetings = requests.get(
    "https://albanyca.primegov.com/api/v2/PublicPortal/ListArchivedMeetings?year=2024"
).json()
for mtg in archivedMeetings:
    gatherMtgs(mtg)

# Then look at the upcoming-meetings listing.
upcomingMeetings = requests.get(
    "https://albanyca.primegov.com/api/v2/PublicPortal/ListUpcomingMeetings"
).json()
for mtg in upcomingMeetings:
    gatherMtgs(mtg)
    # Only the first upcoming meeting is gathered; the loop stops after one pass.
    # NOTE(review): presumably a debugging leftover - confirm before removing.
    break