comparison search/meeting_docs.py @ 9:d1b54241a731

rewrite meeting fetcher
author drewp@bigasterisk.com
date Wed, 10 Jul 2024 12:25:06 -0700
parents
children 13438795d896
comparison
equal deleted inserted replaced
8:f23b21bd0fce 9:d1b54241a731
1 import json
2 from pathlib import Path
3 from pprint import pprint
4 import time
5 import requests
6 from lxml.html import fromstring
7
8
def getMeetingText(meetingUrl) -> list[str]:
    """Fetch a meeting page and return its non-blank text fragments.

    Downloads ``meetingUrl``, takes the first ``div#meetingSection`` element,
    empties every ``<style>`` element inside it, and returns each text node
    under that div that is not whitespace-only, in document order. Fragments
    are returned as found (not stripped).

    Raises:
        requests.RequestException: on network failure, timeout, or an HTTP
            error status.
        IndexError: if the page contains no ``div#meetingSection`` element.
    """
    # Without a timeout a stalled server would block the crawl indefinitely.
    resp = requests.get(meetingUrl, timeout=30)
    # Fail here rather than parsing an error page as if it were the meeting.
    resp.raise_for_status()

    page = fromstring(resp.text)
    sections = page.cssselect('div#meetingSection')
    if not sections:
        raise IndexError(f'no div#meetingSection found in {meetingUrl}')
    section = sections[0]

    for styleEl in section.cssselect('style'):
        # A plain clear() also discards the element's tail, i.e. the page
        # text that directly follows </style>; keep that text.
        styleEl.clear(keep_tail=True)

    return [chunk for chunk in section.xpath('.//text()') if chunk.strip()]
20
21
def gatherMtgs(mtg):
    """Write ``agenda.json`` for one meeting record unless it already exists.

    ``mtg`` is a single meeting dict. It must contain ``id`` and ``videoUrl``;
    ``documentList`` is optional and may be missing or ``None``. If one of its
    documents has ``templateName == 'HTML Agenda Packet'``, the corresponding
    meeting page is fetched with ``getMeetingText`` and its text is stored
    under ``phrases``; otherwise ``phrases`` is an empty list. When several
    documents match, the last one wins.

    The result is written to ``data/albany/meetingId/<id>/agenda.json``
    (relative to the current directory). An existing file is never
    overwritten, so each meeting is fetched at most once.

    Returns:
        None.
    """
    outFile = Path(f'data/albany/meetingId/{mtg["id"]}') / 'agenda.json'
    if outFile.exists():
        return
    outFile.parent.mkdir(parents=True, exist_ok=True)

    meetingUrl = None
    # `or []` also covers an explicit null documentList, which a
    # .get(..., []) default would pass through as None.
    for doc in mtg.get('documentList') or []:
        if doc['templateName'] == 'HTML Agenda Packet':
            tid = doc['templateId']
            meetingUrl = f'https://albanyca.primegov.com/Portal/Meeting?meetingTemplateId={tid}'

    row = {
        'created': time.time(),
        'mtg': mtg,
        'videoUrl': mtg['videoUrl'],
        'meetingUrl': meetingUrl,
        'phrases': getMeetingText(meetingUrl) if meetingUrl else [],
    }

    # The file's existence is the "already fetched" marker, so publish it with
    # a rename: an interrupted write must not leave a truncated agenda.json
    # that would be skipped on every later run.
    tmpFile = outFile.with_name(outFile.name + '.tmp')
    tmpFile.write_text(json.dumps(row, indent=2))
    tmpFile.replace(outFile)
    print(f'wrote {outFile}')
43
44
def _fetchMeetings(url):
    """GET ``url`` and return its decoded JSON body.

    Raises ``requests.RequestException`` on network failure, timeout, or an
    HTTP error status, instead of trying to decode an error response.
    """
    # Without a timeout a stalled server would block the script indefinitely.
    resp = requests.get(url, timeout=30)
    resp.raise_for_status()
    return resp.json()


# Store an agenda file for every archived 2024 meeting.
for mtg in _fetchMeetings(
        "https://albanyca.primegov.com/api/v2/PublicPortal/ListArchivedMeetings?year=2024"
):
    gatherMtgs(mtg)

for mtg in _fetchMeetings(
        "https://albanyca.primegov.com/api/v2/PublicPortal/ListUpcomingMeetings"
):
    gatherMtgs(mtg)
    # NOTE(review): only the first upcoming meeting is processed; confirm
    # whether this break is intentional or a debugging leftover.
    break