view search/extract_agenda.py @ 9:d1b54241a731

rewrite meeting fetcher
author drewp@bigasterisk.com
date Wed, 10 Jul 2024 12:25:06 -0700
parents
children
line wrap: on
line source

import json
import re
from pathlib import Path
from typing import Iterable

import nltk
from doc import Doc


def files() -> Iterable[Path]:
    """Lazily yield every agenda.json path one level below the meetingId dir.

    The search root is the relative path 'data/albany/meetingId', so results
    depend on the current working directory. Each path is printed just before
    it is handed to the caller.
    """
    meeting_root = Path('data/albany/meetingId')
    agenda_paths = meeting_root.glob('*/agenda.json')
    for agenda_path in agenda_paths:
        print(f'file {agenda_path}')
        yield agenda_path


def phrasesFromFile(p: Path) -> Iterable[Doc]:
    """Yield one Doc per usable sentence found in an agenda JSON file.

    The file at `p` is parsed as JSON. Its 'phrases' entries are joined with
    single spaces and split into sentences with nltk.sent_tokenize. In each
    sentence, whitespace runs are collapsed to one space and the ends are
    stripped. A sentence is skipped when it is shorter than 5 characters or
    contains no run of five consecutive word characters.

    Each Doc id is "<meeting id>_sentence<i>", where i is the sentence's
    position in the tokenizer output. Ids are therefore unique within a file;
    skipped sentences leave gaps in the numbering.

    Reads mtg['mtg']['id'], ['date'] and ['title'] from the parsed JSON, and
    prints the phrase count as a progress message.
    """
    mtg = json.loads(p.read_text())
    print(f'  has {len(mtg["phrases"])} phrases')
    text = ' '.join(mtg['phrases'])

    # The index comes from enumerate() so that it actually advances; a counter
    # that stays at 0 would give every sentence of a meeting the same Doc id.
    for i, sentence in enumerate(nltk.sent_tokenize(text)):
        sentence = re.sub(r'\s+', ' ', sentence).strip()
        if len(sentence) < 5:
            continue
        # Require at least one 5-character word-like run; drops fragments made
        # of punctuation, numbering, or very short tokens.
        if not re.search(r'\w\w\w\w\w', sentence):
            continue

        yield Doc(id=f"{mtg['mtg']['id']}_sentence{i}",
                  title=f"{mtg['mtg']['date']} {mtg['mtg']['title']}",
                  sourceFile=str(p),
                  posJson="[]",
                  phrase=sentence)