9
|
1 import json
|
|
2 from pathlib import Path
|
|
3 from pprint import pprint
|
|
4 import time
|
|
5 import requests
|
|
6 from lxml.html import fromstring
|
|
7
|
|
8
|
|
def getMeetingText(meetingUrl, timeout: float = 30.0) -> list[str]:
    """Download a meeting page and return its visible text fragments.

    Fetches ``meetingUrl``, takes the first ``div#meetingSection`` element,
    empties every ``<style>`` element inside it, and collects the text nodes
    that remain.

    Args:
        meetingUrl: URL of the meeting page to fetch.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        The non-blank text nodes under the meeting section, in document
        order. Fragments are not stripped; only all-whitespace ones are
        dropped.

    Raises:
        requests.RequestException: The request failed, timed out, or
            returned an HTTP error status.
        IndexError: The page contains no ``div#meetingSection`` element.
    """
    # Without a timeout requests will wait indefinitely on a stalled server.
    response = requests.get(meetingUrl, timeout=timeout)
    # Fail here rather than parsing an error page as if it were the agenda.
    response.raise_for_status()

    page = fromstring(response.text)
    sections = page.cssselect('div#meetingSection')
    if not sections:
        raise IndexError(f'no div#meetingSection found at {meetingUrl}')
    section = sections[0]

    for style in section.cssselect('style'):
        # clear() also resets .tail, which holds the text that FOLLOWS the
        # element; restore it so only the CSS itself is discarded.
        tail = style.tail
        style.clear()
        style.tail = tail

    # str() turns lxml's string results into plain strings.
    return [
        str(chunk)
        for chunk in section.xpath('.//text()')
        if chunk.strip()
    ]
|
|
20
|
|
21
|
|
def gatherMtgs(mtg) -> None:
    """Cache one meeting record, plus its agenda text, as ``agenda.json``.

    The output path is ``data/albany/meetingId/<id>/agenda.json`` relative
    to the current working directory. If that file already exists the
    function returns immediately without fetching or rewriting anything.

    Args:
        mtg: A meeting record (dict). It must contain ``"id"``. The keys
            ``"documentList"`` and ``"videoUrl"`` are optional and may be
            ``None``.

    Raises:
        KeyError: ``mtg`` has no ``"id"``, or an agenda document entry has
            no ``"templateId"``.
    """
    outDir = Path(f'data/albany/meetingId/{mtg["id"]}')
    outFile = outDir / 'agenda.json'
    if outFile.exists():
        return

    meetingUrl = None
    # `or []` covers a documentList that is present but null.
    for doc in mtg.get('documentList') or []:
        if doc.get('templateName') == 'HTML Agenda Packet':
            # No break: when several documents match, the last one wins.
            tid = doc['templateId']
            meetingUrl = f'https://albanyca.primegov.com/Portal/Meeting?meetingTemplateId={tid}'

    row = {
        'created': time.time(),
        'mtg': mtg,
        'videoUrl': mtg.get('videoUrl'),
        'meetingUrl': meetingUrl,
        'phrases': getMeetingText(meetingUrl) if meetingUrl else [],
    }

    # Create the directory only once the row has been built, so a failed
    # agenda fetch does not leave an empty directory behind.
    outDir.mkdir(parents=True, exist_ok=True)
    outFile.write_text(json.dumps(row, indent=2))
    print(f'wrote {outFile}')
|
|
43
|
|
44
|
|
# Cache every archived 2024 meeting. gatherMtgs() skips meetings whose
# agenda.json already exists, so re-running only fetches new ones.
archivedResp = requests.get(
    "https://albanyca.primegov.com/api/v2/PublicPortal/ListArchivedMeetings?year=2024",
    timeout=30,  # never hang forever on a stalled connection
)
# Surface HTTP errors directly instead of as a JSON decode failure.
archivedResp.raise_for_status()
for mtg in archivedResp.json():
    gatherMtgs(mtg)

upcomingResp = requests.get(
    "https://albanyca.primegov.com/api/v2/PublicPortal/ListUpcomingMeetings",
    timeout=30,
)
upcomingResp.raise_for_status()
for mtg in upcomingResp.json():
    gatherMtgs(mtg)
    # Only the first entry of the upcoming list is processed.
    # NOTE(review): confirm this is intentional and not a debugging leftover.
    break
|