Mercurial > code > home > repos > sco-bot
diff search/meeting_docs.py @ 9:d1b54241a731
rewrite meeting fetcher
author | drewp@bigasterisk.com |
---|---|
date | Wed, 10 Jul 2024 12:25:06 -0700 |
parents | |
children | 13438795d896 |
line wrap: on
line diff
"""Fetch meeting agendas from albanyca.primegov.com and cache them as JSON.

Running this module lists the archived 2024 meetings and the upcoming
meetings, and writes ``data/albany/meetingId/<id>/agenda.json`` for each
processed meeting that does not already have one.
"""
import json
from pathlib import Path
from pprint import pprint
import time

import requests
from lxml.html import fromstring

# requests has no default timeout; without one a stalled connection would
# block the whole run indefinitely.
REQUEST_TIMEOUT_SECS = 30


def getMeetingText(meetingUrl) -> list[str]:
    """Return the non-blank text chunks found in a meeting page.

    Downloads ``meetingUrl``, takes the first ``div#meetingSection`` element,
    discards its ``<style>`` elements, and returns every text node under it
    whose content is not only whitespace (chunks are returned unstripped, in
    document order).

    Raises:
        requests.HTTPError: the server answered with an error status.
        IndexError: the page has no ``div#meetingSection`` element.
    """
    resp = requests.get(meetingUrl, timeout=REQUEST_TIMEOUT_SECS)
    # Fail here with the real HTTP error instead of an IndexError further
    # down when an error page lacks the expected markup.
    resp.raise_for_status()
    el = fromstring(resp.text)
    m = el.cssselect('div#meetingSection')[0]
    for st in m.cssselect('style'):
        # drop_tree() removes the element (and its CSS text) but keeps the
        # tail text that follows it; clear() would also wipe that tail and
        # lose page content sitting right after a <style> tag.
        st.drop_tree()
    meetingText = [
        chunk for chunk in m.xpath('.//text()', namespaces=el.nsmap)
        if chunk.strip()
    ]
    return meetingText


def gatherMtgs(mtg):
    """Write ``agenda.json`` for one meeting record unless it already exists.

    ``mtg`` is one entry of a meeting listing. This function reads
    ``mtg['id']``, ``mtg['videoUrl']`` and the optional ``mtg['documentList']``
    (entries with ``templateName`` and ``templateId``). If a document named
    'HTML Agenda Packet' is present, its page is fetched and its text stored
    under ``phrases``; otherwise ``phrases`` is empty and ``meetingUrl`` is
    null. When several documents match, the last one is used.
    """
    outDir = Path(f'data/albany/meetingId/{mtg["id"]}')
    outDir.mkdir(parents=True, exist_ok=True)
    outFile = outDir / 'agenda.json'
    if outFile.exists():
        # Already fetched on an earlier run; do not hit the server again.
        return
    meetingUrl = None
    for doc in mtg.get('documentList', []):
        if doc['templateName'] == 'HTML Agenda Packet':
            tid = doc['templateId']
            meetingUrl = f'https://albanyca.primegov.com/Portal/Meeting?meetingTemplateId={tid}'

    row = {
        'created': time.time(),
        'mtg': mtg,
        'videoUrl': mtg['videoUrl'],
        'meetingUrl': meetingUrl,
        'phrases': getMeetingText(meetingUrl) if meetingUrl else [],
    }
    outFile.write_text(json.dumps(row, indent=2))
    print(f'wrote {outFile}')


def _listMeetings(apiUrl):
    """Fetch a meeting-list API endpoint and return its decoded JSON body."""
    resp = requests.get(apiUrl, timeout=REQUEST_TIMEOUT_SECS)
    resp.raise_for_status()
    return resp.json()


for mtg in _listMeetings(
        "https://albanyca.primegov.com/api/v2/PublicPortal/ListArchivedMeetings?year=2024"
):
    gatherMtgs(mtg)

for mtg in _listMeetings(
        "https://albanyca.primegov.com/api/v2/PublicPortal/ListUpcomingMeetings"
):
    gatherMtgs(mtg)
    # NOTE(review): only the first upcoming meeting is processed because of
    # this break -- confirm whether that is intentional.
    break