Mercurial > code > home > repos > sco-bot
diff search/extract_agenda.py @ 9:d1b54241a731
rewrite meeting fetcher
author | drewp@bigasterisk.com |
---|---|
date | Wed, 10 Jul 2024 12:25:06 -0700 |
parents | |
children |
line wrap: on
line diff
# Recovered from the diff:
#   --- /dev/null Thu Jan 01 00:00:00 1970 +0000
#   +++ b/search/extract_agenda.py Wed Jul 10 12:25:06 2024 -0700
import json
import re
from pathlib import Path
from typing import Iterable

import nltk
from doc import Doc


def files() -> Iterable[Path]:
    """Yield every ``agenda.json`` found under ``data/albany/meetingId/*/``.

    The path is relative to the current working directory. Each path is
    printed before it is yielded.
    """
    for p in Path('data/albany/meetingId').glob('*/agenda.json'):
        print(f'file {p}')
        yield p


def phrasesFromFile(p: Path) -> Iterable[Doc]:
    """Yield one ``Doc`` per usable sentence in the agenda file at ``p``.

    The file is parsed as JSON. The code reads ``mtg["phrases"]`` (joined
    with single spaces into one text) and ``mtg["mtg"]["id"]``,
    ``mtg["mtg"]["date"]`` and ``mtg["mtg"]["title"]``. The joined text is
    split with ``nltk.sent_tokenize``; each sentence has its whitespace runs
    collapsed to one space and is stripped. Sentences shorter than 5
    characters, or without 5 consecutive word characters, are skipped.

    Each emitted Doc gets the id ``"<meeting id>_sentence<n>"`` where ``n``
    counts emitted sentences from 0, so ids are unique within a file.

    Raises whatever ``Path.read_text``, ``json.loads`` or the key lookups
    raise on a missing or malformed file.
    """
    mtg = json.loads(p.read_text())
    print(f' has {len(mtg["phrases"])} phrases')
    text = ' '.join(mtg['phrases'])

    i = 0
    for sentence in nltk.sent_tokenize(text):
        sentence = re.sub(r'\s+', ' ', sentence).strip()
        if len(sentence) < 5:
            continue
        if not re.search(r'\w\w\w\w\w', sentence):
            continue

        yield Doc(id=f"{mtg['mtg']['id']}_sentence{i}",
                  title=f"{mtg['mtg']['date']} {mtg['mtg']['title']}",
                  sourceFile=str(p),
                  posJson="[]",
                  phrase=sentence)
        # Advance the counter per emitted sentence; without this every Doc
        # from a meeting shared the id "<meeting id>_sentence0".
        i += 1