Mercurial > code > home > repos > sco-bot
view search/extract_agenda.py @ 9:d1b54241a731
rewrite meeting fetcher
author | drewp@bigasterisk.com |
---|---|
date | Wed, 10 Jul 2024 12:25:06 -0700 |
parents | |
children |
line wrap: on
line source
import json
import re
from pathlib import Path
from typing import Iterable

import nltk

from doc import Doc


def files() -> Iterable[Path]:
    """Yield the path of every ``agenda.json`` one level below
    ``data/albany/meetingId``.

    The directory is resolved relative to the current working directory.
    Each path is printed as it is found.
    """
    for p in Path('data/albany/meetingId').glob('*/agenda.json'):
        print(f'file {p}')
        yield p


def phrasesFromFile(p: Path) -> Iterable[Doc]:
    """Split one agenda JSON file into sentence-level ``Doc`` records.

    The file is parsed as JSON; its ``phrases`` list is joined with spaces
    into a single text, which is then split into sentences with
    ``nltk.sent_tokenize``. Each sentence has runs of whitespace collapsed to
    a single space and is skipped if it is shorter than 5 characters or has
    no run of 5 consecutive word characters.

    Args:
        p: Path to an agenda JSON file. The code reads the keys ``phrases``
            (joined as strings) and ``mtg`` (with ``id``, ``date`` and
            ``title``).

    Yields:
        One ``Doc`` per kept sentence. Its ``id`` is
        ``"<meeting id>_sentence<N>"`` where N counts the kept sentences of
        this file starting at 0, so ids are unique within a file.

    Raises:
        KeyError: if an expected key is missing from the JSON.
    """
    mtg = json.loads(p.read_text())
    print(f' has {len(mtg["phrases"])} phrases')
    text = ' '.join(mtg['phrases'])

    i = 0
    for sentence in nltk.sent_tokenize(text):
        sentence = re.sub(r'\s+', ' ', sentence).strip()
        if len(sentence) < 5:
            continue
        # Drop fragments with no real word in them (numbering, bullets, ...).
        if not re.search(r'\w\w\w\w\w', sentence):
            continue

        yield Doc(id=f"{mtg['mtg']['id']}_sentence{i}",
                  title=f"{mtg['mtg']['date']} {mtg['mtg']['title']}",
                  sourceFile=str(p),
                  posJson="[]",
                  phrase=sentence)
        # Advance the counter so each yielded sentence gets a distinct id;
        # without this every Doc from the file would be "<id>_sentence0".
        i += 1