view search/meeting_docs.py @ 9:d1b54241a731

rewrite meeting fetcher
author drewp@bigasterisk.com
date Wed, 10 Jul 2024 12:25:06 -0700
parents
children 13438795d896
line wrap: on
line source

import json
from pathlib import Path
from pprint import pprint
import time
import requests
from lxml.html import fromstring


def getMeetingText(meetingUrl, timeout: float = 30) -> list[str]:
    """Fetch a meeting page and return its non-blank text chunks.

    Downloads `meetingUrl`, takes the first `div#meetingSection` element,
    blanks out any `<style>` elements inside it, and returns every text node
    under that section that is not purely whitespace. Chunks are returned
    unstripped (only all-whitespace chunks are filtered out).

    Args:
        meetingUrl: URL of the meeting page to download.
        timeout: Seconds to wait on the server before giving up.

    Returns:
        The text chunks as plain `str` objects.

    Raises:
        requests.RequestException: the request timed out or failed, or the
            server answered with a 4xx/5xx status.
        IndexError: the page has no `div#meetingSection` element.
    """
    resp = requests.get(meetingUrl, timeout=timeout)
    # Fail here rather than trying to parse an error page as a meeting.
    resp.raise_for_status()
    page = fromstring(resp.text)

    sections = page.cssselect('div#meetingSection')
    if not sections:
        raise IndexError(f'no div#meetingSection found at {meetingUrl}')
    section = sections[0]

    for st in section.cssselect('style'):
        # Drop the CSS source but keep the tail: text that follows the
        # closing </style> tag is page content, and a plain clear() would
        # delete it along with the stylesheet.
        st.clear(keep_tail=True)

    # The expression uses no namespace prefixes, so no namespace map is
    # needed. str() turns lxml's string results into plain strings.
    return [
        str(chunk) for chunk in section.xpath('.//text()') if chunk.strip()
    ]


def gatherMtgs(mtg):
    """Cache one meeting's agenda text as a JSON file on disk.

    Writes `data/albany/meetingId/<id>/agenda.json` (relative to the current
    working directory) holding the creation time, the raw meeting record, its
    video URL, the HTML agenda packet URL and the text extracted from that
    page. Does nothing when the file already exists, so a rerun only fetches
    meetings that have not been saved yet.

    Args:
        mtg: One meeting record (a dict). Must contain `id`; `videoUrl` and
            `documentList` are optional and may be missing or null.
    """
    outDir = Path(f'data/albany/meetingId/{mtg["id"]}')
    outFile = outDir / 'agenda.json'
    if outFile.exists():
        return
    outDir.mkdir(parents=True, exist_ok=True)

    meetingUrl = None
    # `or []` also covers a record whose documentList is present but null.
    for doc in mtg.get('documentList') or []:
        # No break: if several documents match, the last one wins.
        if doc.get('templateName') == 'HTML Agenda Packet':
            tid = doc['templateId']
            meetingUrl = f'https://albanyca.primegov.com/Portal/Meeting?meetingTemplateId={tid}'

    row = {
        'created': time.time(),
        'mtg': mtg,
        # Not every record is guaranteed to carry a video; store null then.
        'videoUrl': mtg.get('videoUrl'),
        'meetingUrl': meetingUrl,
        'phrases': getMeetingText(meetingUrl) if meetingUrl else [],
    }

    # Write to a sibling temp file and rename it into place. The exists()
    # check above treats any agenda.json as complete, so a half-written file
    # left by an interrupted run would otherwise never be regenerated.
    tmpFile = outFile.with_name(outFile.name + '.tmp')
    tmpFile.write_text(json.dumps(row, indent=2))
    tmpFile.replace(outFile)
    print(f'wrote {outFile}')


_PORTAL_API = 'https://albanyca.primegov.com/api/v2/PublicPortal'


def _fetchMeetings(url: str, timeout: float = 30) -> list:
    """Return the decoded JSON body of a portal meeting-list endpoint.

    Args:
        url: Full URL of the list endpoint to query.
        timeout: Seconds to wait on the server before giving up.

    Raises:
        requests.RequestException: the request timed out or failed, or the
            server answered with a 4xx/5xx status.
    """
    resp = requests.get(url, timeout=timeout)
    # Fail here rather than iterating over an error payload.
    resp.raise_for_status()
    return resp.json()


# Save every archived meeting that is not on disk yet.
# NOTE(review): the year is hard-coded to 2024 -- confirm whether it should
# follow the current date.
for mtg in _fetchMeetings(f'{_PORTAL_API}/ListArchivedMeetings?year=2024'):
    gatherMtgs(mtg)

# The upcoming list is requested only after the archived pass has finished.
for mtg in _fetchMeetings(f'{_PORTAL_API}/ListUpcomingMeetings'):
    gatherMtgs(mtg)
    # NOTE(review): only the first upcoming meeting is processed. Confirm
    # whether this break is intentional or a leftover from debugging.
    break