Mercurial > code > home > repos > sco-bot
comparison search/extract_agenda.py @ 9:d1b54241a731
rewrite meeting fetcher
author | drewp@bigasterisk.com |
---|---|
date | Wed, 10 Jul 2024 12:25:06 -0700 |
parents | |
children | |
comparison
equal
deleted
inserted
replaced
8:f23b21bd0fce | 9:d1b54241a731 |
---|---|
1 import json | |
2 import re | |
3 from pathlib import Path | |
4 from typing import Iterable | |
5 | |
6 import nltk | |
7 from doc import Doc | |
8 | |
9 | |
10 def files() -> Iterable[Path]: | |
11 for p in Path('data/albany/meetingId').glob('*/agenda.json'): | |
12 print(f'file {p}') | |
13 yield p | |
14 | |
15 | |
16 def phrasesFromFile(p: Path) -> Iterable[Doc]: | |
17 mtg = json.loads(p.read_text()) | |
18 print(f' has {len(mtg["phrases"])} phrases') | |
19 text = ' '.join(mtg['phrases']) | |
20 | |
21 i = 0 | |
22 for sentence in nltk.sent_tokenize(text): | |
23 sentence = re.sub(r'\s+', ' ', sentence).strip() | |
24 if len(sentence) < 5: | |
25 continue | |
26 if not re.search(r'\w\w\w\w\w', sentence): | |
27 continue | |
28 | |
29 yield Doc(id=f"{mtg['mtg']['id']}_sentence{i}", | |
30 title=f"{mtg['mtg']['date']} {mtg['mtg']['title']}", | |
31 sourceFile=str(p), | |
32 posJson="[]", | |
33 phrase=sentence) |