diff search/meeting_docs.py @ 9:d1b54241a731

rewrite meeting fetcher
author drewp@bigasterisk.com
date Wed, 10 Jul 2024 12:25:06 -0700
parents
children 13438795d896
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/search/meeting_docs.py	Wed Jul 10 12:25:06 2024 -0700
@@ -0,0 +1,98 @@
+"""Fetch meeting agendas from albanyca.primegov.com and cache them as JSON.
+
+Running this module lists the 2024 archived meetings and the upcoming
+meetings, and writes one ``agenda.json`` per gathered meeting under
+``data/albany/meetingId/<id>/``.
+"""
+import json
+from pathlib import Path
+from pprint import pprint
+import time
+import requests
+from lxml.html import fromstring
+
+# Seconds to wait on any single HTTP request; requests has no default
+# timeout, so without one a stalled connection would hang the script.
+REQUEST_TIMEOUT = 30
+
+
+def getMeetingText(meetingUrl) -> list[str]:
+    """Return the non-blank text chunks of a meeting's HTML agenda page.
+
+    Downloads ``meetingUrl``, takes the first ``div#meetingSection`` element
+    and collects every text node under it, skipping the contents of
+    ``<style>`` elements. Chunks are returned unstripped.
+
+    Raises:
+        requests.RequestException: the download failed, timed out, or the
+            server answered with an error status.
+        IndexError: the page has no ``div#meetingSection`` element.
+    """
+    response = requests.get(meetingUrl, timeout=REQUEST_TIMEOUT)
+    # Fail loudly on 4xx/5xx instead of parsing an error page as an agenda.
+    response.raise_for_status()
+    el = fromstring(response.text)
+    sections = el.cssselect('div#meetingSection')
+    if not sections:
+        raise IndexError(f'no div#meetingSection found in {meetingUrl}')
+    m = sections[0]
+    for st in m.cssselect('style'):
+        # Drop the CSS source but keep the tail: a plain clear() would also
+        # wipe the document text that follows the <style> element.
+        st.clear(keep_tail=True)
+    # The expression uses no prefixes, so no namespace map is passed.
+    return [str(chunk) for chunk in m.xpath('.//text()') if chunk.strip()]
+
+
+def gatherMtgs(mtg) -> None:
+    """Write ``agenda.json`` for one meeting record unless it already exists.
+
+    ``mtg`` is one decoded meeting dict from the meeting-list API; it must
+    have an ``id``. The output goes to
+    ``data/albany/meetingId/<id>/agenda.json``. An existing file is left
+    untouched, so reruns only fetch meetings that are not cached yet.
+    """
+    outDir = Path(f'data/albany/meetingId/{mtg["id"]}')
+    outDir.mkdir(parents=True, exist_ok=True)
+    outFile = outDir / 'agenda.json'
+    if outFile.exists():
+        return
+    meetingUrl = None
+    # Tolerate a missing or null documentList. If several documents match,
+    # the last one wins.
+    for doc in mtg.get('documentList') or []:
+        if doc.get('templateName') == 'HTML Agenda Packet':
+            tid = doc['templateId']
+            meetingUrl = f'https://albanyca.primegov.com/Portal/Meeting?meetingTemplateId={tid}'
+
+    row = {
+        'created': time.time(),
+        'mtg': mtg,
+        # .get: a record without a video link is stored with null here.
+        'videoUrl': mtg.get('videoUrl'),
+        'meetingUrl': meetingUrl,
+        'phrases': getMeetingText(meetingUrl) if meetingUrl else [],
+    }
+    outFile.write_text(json.dumps(row, indent=2))
+    print(f'wrote {outFile}')
+
+
+def _fetchMeetings(url) -> list:
+    """Return the decoded JSON meeting list served at ``url``."""
+    response = requests.get(url, timeout=REQUEST_TIMEOUT)
+    response.raise_for_status()
+    return response.json()
+
+
+for mtg in _fetchMeetings(
+        "https://albanyca.primegov.com/api/v2/PublicPortal/ListArchivedMeetings?year=2024"
+):
+    gatherMtgs(mtg)
+
+for mtg in _fetchMeetings(
+        "https://albanyca.primegov.com/api/v2/PublicPortal/ListUpcomingMeetings"
+):
+    gatherMtgs(mtg)
+    # NOTE(review): only the first upcoming meeting is gathered; confirm this
+    # break is intentional and not a debugging leftover.
+    break