comparison search/extract_agenda.py @ 9:d1b54241a731

rewrite meeting fetcher
author drewp@bigasterisk.com
date Wed, 10 Jul 2024 12:25:06 -0700
parents
children
comparison
equal deleted inserted replaced
8:f23b21bd0fce 9:d1b54241a731
1 import json
2 import re
3 from pathlib import Path
4 from typing import Iterable
5
6 import nltk
7 from doc import Doc
8
9
def files() -> Iterable[Path]:
    """Yield the path of every agenda.json one directory below the meetingId root.

    The root 'data/albany/meetingId' is relative to the current working
    directory. Each path is printed before it is yielded.
    """
    meetingRoot = Path('data/albany/meetingId')
    for agendaPath in meetingRoot.glob('*/agenda.json'):
        print(f'file {agendaPath}')
        yield agendaPath
14
15
def phrasesFromFile(p: Path) -> Iterable[Doc]:
    """Yield one Doc per usable sentence found in the agenda JSON file `p`.

    The file's 'phrases' strings are joined with single spaces and split
    into sentences with nltk.sent_tokenize. After whitespace runs are
    collapsed, a sentence is skipped if it is shorter than 5 characters or
    contains no run of 5 consecutive word characters.

    Each Doc id is "<meeting id>_sentence<i>", where i is the sentence's
    index in the tokenizer output. Ids are therefore unique within a file;
    skipped sentences leave gaps in the numbering.
    """
    mtg = json.loads(p.read_text())
    print(f' has {len(mtg["phrases"])} phrases')
    text = ' '.join(mtg['phrases'])

    # enumerate supplies the index. A manually initialised counter that is
    # never advanced would give every Doc the same "_sentence0" id.
    for i, sentence in enumerate(nltk.sent_tokenize(text)):
        sentence = re.sub(r'\s+', ' ', sentence).strip()
        if len(sentence) < 5:
            continue
        # Drop fragments with no substantial word (e.g. numbering, bullets).
        if not re.search(r'\w\w\w\w\w', sentence):
            continue

        yield Doc(id=f"{mtg['mtg']['id']}_sentence{i}",
                  title=f"{mtg['mtg']['date']} {mtg['mtg']['title']}",
                  sourceFile=str(p),
                  posJson="[]",
                  phrase=sentence)