Mercurial > code > home > repos > sco-bot
diff search/meeting_docs.py @ 10:13438795d896
rewrite with prefect flows and whoosh search, but it's in a nested pdm env
author | drewp@bigasterisk.com |
---|---|
date | Thu, 11 Jul 2024 17:35:31 -0700 |
parents | d1b54241a731 |
children |
line wrap: on
line diff
--- a/search/meeting_docs.py Wed Jul 10 12:25:06 2024 -0700
+++ b/search/meeting_docs.py Thu Jul 11 17:35:31 2024 -0700
@@ -1,54 +0,0 @@
-import json
-from pathlib import Path
-from pprint import pprint
-import time
-import requests
-from lxml.html import fromstring
-
-
-def getMeetingText(meetingUrl) -> list[str]:
-    mhtml = requests.get(meetingUrl).text
-    el = fromstring(mhtml)
-    m = el.cssselect('div#meetingSection')[0]
-    for st in m.cssselect('style'):
-        st.clear()
-    meetingText = [
-        chunk for chunk in m.xpath('.//text()', namespaces=el.nsmap)
-        if chunk.strip()
-    ]
-    return meetingText
-
-
-def gatherMtgs(mtg):
-    outDir = Path(f'data/albany/meetingId/{mtg["id"]}')
-    outDir.mkdir(parents=True, exist_ok=True)
-    outFile = outDir / 'agenda.json'
-    if outFile.exists():
-        return
-    meetingUrl = None
-    for doc in mtg.get('documentList', []):
-        if doc['templateName'] == 'HTML Agenda Packet':
-            tid = doc['templateId']
-            meetingUrl = f'https://albanyca.primegov.com/Portal/Meeting?meetingTemplateId={tid}'
-
-    row = {
-        'created': time.time(),
-        'mtg': mtg,
-        'videoUrl': mtg['videoUrl'],
-        'meetingUrl': meetingUrl,
-        'phrases': getMeetingText(meetingUrl) if meetingUrl else [],
-    }
-    outFile.write_text(json.dumps(row, indent=2))
-    print(f'wrote {outFile}')
-
-
-for mtg in (requests.get(
-    "https://albanyca.primegov.com/api/v2/PublicPortal/ListArchivedMeetings?year=2024"
-).json()):
-    gatherMtgs(mtg)
-
-for mtg in (requests.get(
-    "https://albanyca.primegov.com/api/v2/PublicPortal/ListUpcomingMeetings"
-).json()):
-    gatherMtgs(mtg)
-    break