Mercurial > code > home > repos > sco-bot
view search/meeting_docs.py @ 9:d1b54241a731
rewrite meeting fetcher
author | drewp@bigasterisk.com |
---|---|
date | Wed, 10 Jul 2024 12:25:06 -0700 |
parents | |
children | 13438795d896 |
line wrap: on
line source
import json
from pathlib import Path
from pprint import pprint
import time

import requests
from lxml.html import fromstring

# Seconds to wait on any single HTTP request. Without an explicit timeout,
# requests waits indefinitely on a stalled server.
REQUEST_TIMEOUT = 30


def _fetchJson(url):
    """GET `url` and return its decoded JSON body, raising on HTTP errors."""
    resp = requests.get(url, timeout=REQUEST_TIMEOUT)
    resp.raise_for_status()
    return resp.json()


def getMeetingText(meetingUrl) -> list[str]:
    """Fetch a meeting page and return its non-blank text chunks.

    Downloads `meetingUrl`, takes the first `div#meetingSection` element,
    empties every `<style>` element inside it, and returns each text node
    under that div that is not purely whitespace. Chunks are returned
    unstripped.

    Raises:
        requests.HTTPError: the server answered with a 4xx/5xx status.
        IndexError: the page contains no `div#meetingSection` element.
    """
    resp = requests.get(meetingUrl, timeout=REQUEST_TIMEOUT)
    resp.raise_for_status()
    el = fromstring(resp.text)

    sections = el.cssselect('div#meetingSection')
    if not sections:
        # Same exception type as the bare `[0]` lookup would raise, but with
        # enough context to tell which page was malformed.
        raise IndexError(f'no div#meetingSection found in {meetingUrl}')
    m = sections[0]

    for st in m.cssselect('style'):
        # Drop the CSS text but keep the tail: lxml's clear() defaults to
        # keep_tail=False, which would also discard any page text that
        # immediately follows the closing </style> tag.
        st.clear(keep_tail=True)

    meetingText = [
        chunk for chunk in m.xpath('.//text()', namespaces=el.nsmap)
        if chunk.strip()
    ]
    return meetingText


def gatherMtgs(mtg):
    """Write `data/albany/meetingId/<id>/agenda.json` for one meeting record.

    `mtg` is one element of the JSON list returned by the PrimeGov meeting
    listing endpoints; this function reads its 'id', 'videoUrl' and optional
    'documentList' keys. If the output file already exists the meeting is
    skipped, so nothing is re-fetched or overwritten.

    When the meeting has a document whose 'templateName' is
    'HTML Agenda Packet', that document's page is fetched and its text is
    stored under 'phrases'; otherwise 'meetingUrl' is None and 'phrases' is
    an empty list. If several documents match, the last one is used.
    """
    outDir = Path(f'data/albany/meetingId/{mtg["id"]}')
    outDir.mkdir(parents=True, exist_ok=True)
    outFile = outDir / 'agenda.json'
    if outFile.exists():
        return

    meetingUrl = None
    for doc in mtg.get('documentList', []):
        if doc['templateName'] == 'HTML Agenda Packet':
            tid = doc['templateId']
            meetingUrl = f'https://albanyca.primegov.com/Portal/Meeting?meetingTemplateId={tid}'

    row = {
        'created': time.time(),
        'mtg': mtg,
        'videoUrl': mtg['videoUrl'],
        'meetingUrl': meetingUrl,
        'phrases': getMeetingText(meetingUrl) if meetingUrl else [],
    }
    outFile.write_text(json.dumps(row, indent=2))
    print(f'wrote {outFile}')


# Archive every 2024 meeting that has not been saved yet.
for mtg in _fetchJson(
        "https://albanyca.primegov.com/api/v2/PublicPortal/ListArchivedMeetings?year=2024"
):
    gatherMtgs(mtg)

for mtg in _fetchJson(
        "https://albanyca.primegov.com/api/v2/PublicPortal/ListUpcomingMeetings"
):
    gatherMtgs(mtg)
    # NOTE(review): this break means only the first upcoming meeting is
    # processed. Kept as in the original -- confirm whether it is intentional
    # or left over from debugging.
    break