diff search/meeting_docs.py @ 10:13438795d896

rewrite with prefect flows and whoosh search, but it's in a nested pdm env
author drewp@bigasterisk.com
date Thu, 11 Jul 2024 17:35:31 -0700
parents d1b54241a731
children
line wrap: on
line diff
--- a/search/meeting_docs.py	Wed Jul 10 12:25:06 2024 -0700
+++ b/search/meeting_docs.py	Thu Jul 11 17:35:31 2024 -0700
@@ -1,54 +0,0 @@
-import json
-from pathlib import Path
-from pprint import pprint
-import time
-import requests
-from lxml.html import fromstring
-
-
-def getMeetingText(meetingUrl) -> list[str]:
-    mhtml = requests.get(meetingUrl).text
-    el = fromstring(mhtml)
-    m = el.cssselect('div#meetingSection')[0]
-    for st in m.cssselect('style'):
-        st.clear()
-    meetingText = [
-        chunk for chunk in m.xpath('.//text()', namespaces=el.nsmap)
-        if chunk.strip()
-    ]
-    return meetingText
-
-
-def gatherMtgs(mtg):
-    outDir = Path(f'data/albany/meetingId/{mtg["id"]}')
-    outDir.mkdir(parents=True, exist_ok=True)
-    outFile = outDir / 'agenda.json'
-    if outFile.exists():
-        return
-    meetingUrl = None
-    for doc in mtg.get('documentList', []):
-        if doc['templateName'] == 'HTML Agenda Packet':
-            tid = doc['templateId']
-            meetingUrl = f'https://albanyca.primegov.com/Portal/Meeting?meetingTemplateId={tid}'
-
-    row = {
-        'created': time.time(),
-        'mtg': mtg,
-        'videoUrl': mtg['videoUrl'],
-        'meetingUrl': meetingUrl,
-        'phrases': getMeetingText(meetingUrl) if meetingUrl else [],
-    }
-    outFile.write_text(json.dumps(row, indent=2))
-    print(f'wrote {outFile}')
-
-
-for mtg in (requests.get(
-        "https://albanyca.primegov.com/api/v2/PublicPortal/ListArchivedMeetings?year=2024"
-).json()):
-    gatherMtgs(mtg)
-
-for mtg in (requests.get(
-        "https://albanyca.primegov.com/api/v2/PublicPortal/ListUpcomingMeetings"
-).json()):
-    gatherMtgs(mtg)
-    break