changeset 16:7a87ba2f00d9

reformat, fix some types, make more async
author drewp@bigasterisk.com
date Fri, 19 Jul 2024 00:49:38 -0700
parents 6ed25bcaaf1f
children 0d72635fc501
files k8s/deploy.yaml k8s/volumes.yaml scobot/index/access.py scobot/index/build_index_flow.py scobot/index/download_tasks.py scobot/local_types.py scobot/service/query.py skaffold.yaml
diffstat 8 files changed, 45 insertions(+), 37 deletions(-) [+]
line wrap: on
line diff
--- a/k8s/deploy.yaml	Fri Jul 19 00:30:47 2024 -0700
+++ b/k8s/deploy.yaml	Fri Jul 19 00:49:38 2024 -0700
@@ -46,8 +46,8 @@
             - containerPort: 8001
               name: server
           volumeMounts:
-          - name: sco-bot-data
-            mountPath: /opt/data
+            - name: sco-bot-data
+              mountPath: /opt/data
 
         - name: prefect
           image: reg:5000/sco_bot_server
@@ -63,8 +63,8 @@
             - containerPort: 4200
               name: prefect
           volumeMounts:
-          - name: sco-bot-data
-            mountPath: /opt/data
+            - name: sco-bot-data
+              mountPath: /opt/data
 
         - name: build-flow
           image: reg:5000/sco_bot_server
@@ -77,14 +77,14 @@
             - pdm
             - run_build_flow
           volumeMounts:
-          - name: sco-bot-data
-            mountPath: /opt/data
+            - name: sco-bot-data
+              mountPath: /opt/data
 
       affinity:
         nodeAffinity:
           requiredDuringSchedulingIgnoredDuringExecution:
             nodeSelectorTerms:
-            - matchExpressions:
-              - key: "kubernetes.io/hostname"
-                operator: In
-                values: ["ditto", "dash"] # need /my/serv
\ No newline at end of file
+              - matchExpressions:
+                  - key: "kubernetes.io/hostname"
+                    operator: In
+                    values: ["ditto", "dash"] # need /my/serv
--- a/k8s/volumes.yaml	Fri Jul 19 00:30:47 2024 -0700
+++ b/k8s/volumes.yaml	Fri Jul 19 00:49:38 2024 -0700
@@ -1,22 +1,21 @@
-
 apiVersion: v1
 kind: PersistentVolume
 metadata:
   name: sco-bot-data
-  labels: {type: local}
+  labels: { type: local }
 spec:
   storageClassName: manual
-  hostPath: {path: "/my/serv/sco-bot/data"}
-  capacity: {storage: 5Mi}
+  hostPath: { path: "/my/serv/sco-bot/data" }
+  capacity: { storage: 5Mi }
   accessModes: ["ReadWriteMany"]
   persistentVolumeReclaimPolicy: Retain
-  claimRef: {namespace: default, name: sco-bot-data}
+  claimRef: { namespace: default, name: sco-bot-data }
 ---
 apiVersion: v1
 kind: PersistentVolumeClaim
-metadata: {name: sco-bot-data}
+metadata: { name: sco-bot-data }
 spec:
   storageClassName: ""
   volumeName: "sco-bot-data"
   accessModes: ["ReadWriteMany"]
-  resources: { requests: { storage: 5Mi } }
\ No newline at end of file
+  resources: { requests: { storage: 5Mi } }
--- a/scobot/index/access.py	Fri Jul 19 00:30:47 2024 -0700
+++ b/scobot/index/access.py	Fri Jul 19 00:49:38 2024 -0700
@@ -1,11 +1,13 @@
+import logging
 from pathlib import Path
 import shutil
+from typing import cast
 
 from whoosh.index import create_in, open_dir
 
 from scobot.index.schema import schema
 
-log = None  # set by flow
+log = cast(logging.Logger, None)  # set by flow
 
 
 class SearchIndex:
@@ -27,8 +29,10 @@
         with self.ix.searcher() as searcher:
             log.info(f'index doc count = {searcher.doc_count()}')
 
+
 class SearchIndexRO:
+
     def __init__(self, indexDir: Path):
         self.ix = open_dir(indexDir, readonly=True)
         self.searcher = self.ix.searcher()
-        print(f'{self.searcher.doc_count()=}')
\ No newline at end of file
+        print(f'{self.searcher.doc_count()=}')
--- a/scobot/index/build_index_flow.py	Fri Jul 19 00:30:47 2024 -0700
+++ b/scobot/index/build_index_flow.py	Fri Jul 19 00:49:38 2024 -0700
@@ -19,8 +19,10 @@
 @task()
 def meetingListUrls() -> Iterable[Url]:
     return [
-        "https://albanyca.primegov.com/api/v2/PublicPortal/ListArchivedMeetings?year=2024",
-        "https://albanyca.primegov.com/api/v2/PublicPortal/ListUpcomingMeetings",
+        Url("https://albanyca.primegov.com/api/v2/PublicPortal/ListArchivedMeetings?year=2024"
+            ),
+        Url("https://albanyca.primegov.com/api/v2/PublicPortal/ListUpcomingMeetings"
+            ),
     ]
 
 
@@ -28,7 +30,9 @@
     for doc in mtg.get('documentList', []):
         if doc['templateName'] == 'HTML Agenda Packet':
             tid = doc['templateId']
-            return f'https://albanyca.primegov.com/Portal/Meeting?meetingTemplateId={tid}'
+            return Url(
+                f'https://albanyca.primegov.com/Portal/Meeting?meetingTemplateId={tid}'
+            )
     raise ValueError(f"no agenda doc found for {mtg['id']=}")
 
 
@@ -64,16 +68,17 @@
                    phrase=sentence)
 
 
-def addMeeting(index: SearchIndex, mtg: MeetingRow):
+async def addMeeting(index: SearchIndex, mtg: MeetingRow):
     try:
         agendaUrl = meetingAgendaUrl(mtg)
     except ValueError:
         pass
     else:
-        html = getCityPermanent(agendaUrl)
+        html = await getCityPermanent(agendaUrl)
         texts = extractMeetingText(html)
-        for se in nltk.sent_tokenize(' '.join(texts)):
-            index.addDoc(sourceTitle=f'{mtg["date"]} {mtg["title"]}', phrase=se)
+        for sentence in nltk.sent_tokenize(' '.join(texts)):
+            index.addDoc(sourceTitle=f'{mtg["date"]} {mtg["title"]}',
+                         phrase=sentence)
 
     try:
         videoUrl = mtg['videoUrl']
@@ -86,19 +91,20 @@
 
 
 @flow(persist_result=True)
-def buildIndex():
+async def buildIndex():
     global log
     log = get_run_logger()
     scobot.index.access.log = log
 
     index = SearchIndex(Path('data/build/index0'))
     for url in meetingListUrls():
-        mtgs = cast(list[MeetingRow], getCityMutableJson(url))
+        mtgs = cast(list[MeetingRow], await getCityMutableJson(url))
         log.info(f'got {len(mtgs)=}')
 
         for mtg in mtgs:
-            addMeeting(index, mtg)
+            await addMeeting(index, mtg)
     index.commit()
+    # todo: kill search to restart it
 
 
 if __name__ == '__main__':
--- a/scobot/index/download_tasks.py	Fri Jul 19 00:30:47 2024 -0700
+++ b/scobot/index/download_tasks.py	Fri Jul 19 00:49:38 2024 -0700
@@ -14,8 +14,8 @@
     cache_expiration=datetime.timedelta(seconds=86400),
     tags=['city'],  # todo ratelimit based on tag
 )
-def getCityMutableJson(url: Url):
-    create_link_artifact("get", url)
+async def getCityMutableJson(url: Url):
+    await create_link_artifact("get", url)
     req = httpx.get(url)  # todo async
     req.raise_for_status()
     return req.json()
@@ -24,8 +24,8 @@
 @task(task_run_name=lambda: f'getHttp-{int(time.time())}',
       cache_key_fn=lambda _, args: f'getHttp-{args["url"]}',
       tags=['city'])
-def getCityPermanent(url: Url) -> str:
-    create_link_artifact("get", url)
+async def getCityPermanent(url: Url) -> str:
+    await create_link_artifact("get", url)
     req = httpx.get(url)
     req.raise_for_status()
     return req.text
--- a/scobot/local_types.py	Fri Jul 19 00:30:47 2024 -0700
+++ b/scobot/local_types.py	Fri Jul 19 00:49:38 2024 -0700
@@ -1,5 +1,4 @@
 from typing import NewType
 
-
 Url = NewType('Url', str)
-MeetingRow = NewType('MeetingRow', dict)
\ No newline at end of file
+MeetingRow = NewType('MeetingRow', dict)
--- a/scobot/service/query.py	Fri Jul 19 00:30:47 2024 -0700
+++ b/scobot/service/query.py	Fri Jul 19 00:49:38 2024 -0700
@@ -48,7 +48,7 @@
 
 @asynccontextmanager
 async def lifespan(app: FastAPI):
-    app.state.index = SearchIndexRO('data/build/index0')
+    app.state.index = SearchIndexRO(Path('data/build/index0'))
     yield
 
 app = FastAPI(lifespan=lifespan)
--- a/skaffold.yaml	Fri Jul 19 00:30:47 2024 -0700
+++ b/skaffold.yaml	Fri Jul 19 00:49:38 2024 -0700
@@ -17,7 +17,7 @@
       platforms: [amd64]
       sync:
         infer:
-          - 'scobot/**'
+          - "scobot/**"
   tagPolicy:
     dateTime:
       format: 2006-01-02_15-04-05