view flow/build_index.py @ 10:13438795d896

rewrite with prefect flows and whoosh search, but it's in a nested pdm env
author drewp@bigasterisk.com
date Thu, 11 Jul 2024 17:35:31 -0700
parents
children
line wrap: on
line source

from pathlib import Path
from typing import Iterable, cast

import search_index
from download import getCityMutableJson, getCityPermanent
from local_types import MeetingRow, Url
from lxml.html import fromstring
from prefect import flow, task
from prefect.logging import get_run_logger
from search_index import SearchIndex

# Module-wide logger. Stays None until buildIndex() replaces it with the
# Prefect run logger at the start of a flow run, so helpers in this module
# must not use it before the flow has started.
log = None


@task()
def meetingListUrls() -> Iterable[Url]:
    """Return the meeting-list API URLs that the index build should read.

    Two endpoints on albanyca.primegov.com are listed: the archived
    meetings for 2024 (the year is hard-coded in the query string) and the
    upcoming meetings.
    """
    archived2024 = "https://albanyca.primegov.com/api/v2/PublicPortal/ListArchivedMeetings?year=2024"
    upcoming = "https://albanyca.primegov.com/api/v2/PublicPortal/ListUpcomingMeetings"
    return [archived2024, upcoming]


def meetingAgendaUrl(mtg: MeetingRow) -> Url:
    """Return the portal URL of the HTML agenda packet for a meeting.

    Scans the meeting's 'documentList' for the first entry whose
    'templateName' is 'HTML Agenda Packet' and builds the portal URL from
    that entry's 'templateId'.

    Args:
        mtg: one meeting row as returned by the meeting-list API.

    Returns:
        The URL of the meeting's HTML agenda page.

    Raises:
        ValueError: if the meeting has no HTML agenda packet document,
            including when 'documentList' is missing, null, or empty.
    """
    # `or []` covers a key that is absent as well as one explicitly set to
    # null, which `.get(key, [])` alone would pass through as None.
    for doc in mtg.get('documentList') or []:
        # Entries without a 'templateName' cannot be the agenda packet;
        # skip them instead of failing the whole lookup with KeyError.
        if doc.get('templateName') == 'HTML Agenda Packet':
            tid = doc['templateId']
            return f'https://albanyca.primegov.com/Portal/Meeting?meetingTemplateId={tid}'
    # Use .get() so a row without 'id' still raises the ValueError callers
    # catch, rather than a KeyError from building the message.
    raise ValueError(f"no agenda doc found for mtg['id']={mtg.get('id')!r}")


def extractMeetingText(mhtml: str) -> list[str]:
    """Extract the non-blank text chunks from a meeting agenda page.

    Parses `mhtml`, takes the first `div#meetingSection` element, removes
    every `<style>` element inside it, and collects the text nodes below
    that div.

    Args:
        mhtml: HTML source of the agenda page.

    Returns:
        The text nodes under the meeting section, excluding nodes that are
        empty or whitespace-only. Chunks are returned unstripped.

    Raises:
        IndexError: if the page contains no `div#meetingSection`.
    """
    el = fromstring(mhtml)
    m = el.cssselect('div#meetingSection')[0]
    for st in m.cssselect('style'):
        # drop_tree() discards the element and its CSS text but joins its
        # tail onto the preceding text. clear() would also reset the tail,
        # silently losing visible text that directly follows </style>.
        st.drop_tree()
    return [
        chunk for chunk in m.xpath('.//text()', namespaces=el.nsmap)
        if chunk.strip()
    ]


def addMeeting(index: SearchIndex, mtg: MeetingRow):
    """Add one meeting's agenda text to the search index.

    If the meeting has an HTML agenda packet, its page is fetched with
    getCityPermanent(), reduced to text chunks, and added to `index` under
    a title made of the meeting's date and title. Meetings without an
    agenda packet are skipped without error. Video handling is a
    placeholder that currently does nothing.
    """
    agendaUrl = None
    try:
        agendaUrl = meetingAgendaUrl(mtg)
    except ValueError:
        # No agenda packet for this meeting; nothing to index from it.
        pass

    if agendaUrl is not None:
        page = getCityPermanent(agendaUrl)
        chunks = extractMeetingText(page)
        # todo group phrases
        index.addDoc(title=f'{mtg["date"]} {mtg["title"]}', content=chunks)

    # A missing key and an empty/None value both mean "no video".
    if mtg.get('videoUrl'):
        # todo: transcribe and index video...
        pass


@flow(persist_result=True)
def buildIndex():
    """Prefect flow: rebuild the meeting search index at /tmp/scoindex.

    Installs the flow's run logger as the `log` global of both this module
    and `search_index`, then reads every meeting list from
    meetingListUrls(), passes each meeting row to addMeeting(), and commits
    the index once after all lists have been processed.
    """
    global log
    # Share the single run logger with the search_index module as well.
    log = search_index.log = get_run_logger()

    index = SearchIndex(Path('/tmp/scoindex'))
    for listUrl in meetingListUrls():
        # The API response is trusted to be a list of meeting rows; cast()
        # only informs the type checker and performs no runtime check.
        mtgs = cast(list[MeetingRow], getCityMutableJson(listUrl))
        log.info(f'got {len(mtgs)=}')

        for row in mtgs:
            addMeeting(index, row)

    index.commit()


if __name__ == '__main__':
    # Script entry point: hand the flow to Prefect via serve() instead of
    # calling buildIndex() directly.
    # NOTE(review): per the Prefect docs serve() registers a deployment and
    # blocks waiting for runs -- confirm for the installed Prefect version.
    buildIndex.serve()