diff search/extract_agenda.py @ 9:d1b54241a731

rewrite meeting fetcher
author drewp@bigasterisk.com
date Wed, 10 Jul 2024 12:25:06 -0700
parents
children
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/search/extract_agenda.py	Wed Jul 10 12:25:06 2024 -0700
@@ -0,0 +1,33 @@
+import json
+import re
+from pathlib import Path
+from typing import Iterable
+
+import nltk
+from doc import Doc
+
+
+def files() -> Iterable[Path]:
+    """Yield each ``agenda.json`` found one directory below the meeting root.
+
+    The root ``data/albany/meetingId`` is resolved relative to the current
+    working directory. Each path is printed just before it is yielded.
+    """
+    meetingRoot = Path('data/albany/meetingId')
+    agendaPaths = meetingRoot.glob('*/agenda.json')
+    for agendaPath in agendaPaths:
+        print(f'file {agendaPath}')
+        yield agendaPath
+
+
+def phrasesFromFile(p: Path) -> Iterable[Doc]:
+    """Yield one Doc per usable sentence found in the agenda JSON file `p`.
+
+    The file's 'phrases' list is joined with single spaces and split into
+    sentences with nltk.sent_tokenize. Whitespace runs inside each sentence
+    are collapsed to one space. A sentence is skipped when it is shorter
+    than 5 characters or contains no run of 5 consecutive word characters.
+
+    Each yielded Doc gets the id '<meeting id>_sentence<n>', where n counts
+    the sentences already yielded from this file (0, 1, 2, ...), so ids are
+    distinct within a file.
+    """
+    mtg = json.loads(p.read_text())
+    print(f'  has {len(mtg["phrases"])} phrases')
+    text = ' '.join(mtg['phrases'])
+
+    i = 0
+    for sentence in nltk.sent_tokenize(text):
+        sentence = re.sub(r'\s+', ' ', sentence).strip()
+        if len(sentence) < 5:
+            continue
+        if not re.search(r'\w\w\w\w\w', sentence):
+            continue
+
+        yield Doc(id=f"{mtg['mtg']['id']}_sentence{i}",
+                  title=f"{mtg['mtg']['date']} {mtg['mtg']['title']}",
+                  sourceFile=str(p),
+                  posJson="[]",
+                  phrase=sentence)
+        # Advance the counter only for sentences that were emitted; without
+        # this every Doc from the file would share the id '..._sentence0'.
+        i += 1