# HG changeset patch # User drewp@bigasterisk.com # Date 1721375378 25200 # Node ID 7a87ba2f00d9308b8aa8a9175b9788fe73df821e # Parent 6ed25bcaaf1fee57e50c355b945f91edfdc3f1ea reformat, fix some types, make more async diff -r 6ed25bcaaf1f -r 7a87ba2f00d9 k8s/deploy.yaml --- a/k8s/deploy.yaml Fri Jul 19 00:30:47 2024 -0700 +++ b/k8s/deploy.yaml Fri Jul 19 00:49:38 2024 -0700 @@ -46,8 +46,8 @@ - containerPort: 8001 name: server volumeMounts: - - name: sco-bot-data - mountPath: /opt/data + - name: sco-bot-data + mountPath: /opt/data - name: prefect image: reg:5000/sco_bot_server @@ -63,8 +63,8 @@ - containerPort: 4200 name: prefect volumeMounts: - - name: sco-bot-data - mountPath: /opt/data + - name: sco-bot-data + mountPath: /opt/data - name: build-flow image: reg:5000/sco_bot_server @@ -77,14 +77,14 @@ - pdm - run_build_flow volumeMounts: - - name: sco-bot-data - mountPath: /opt/data + - name: sco-bot-data + mountPath: /opt/data affinity: nodeAffinity: requiredDuringSchedulingIgnoredDuringExecution: nodeSelectorTerms: - - matchExpressions: - - key: "kubernetes.io/hostname" - operator: In - values: ["ditto", "dash"] # need /my/serv \ No newline at end of file + - matchExpressions: + - key: "kubernetes.io/hostname" + operator: In + values: ["ditto", "dash"] # need /my/serv diff -r 6ed25bcaaf1f -r 7a87ba2f00d9 k8s/volumes.yaml --- a/k8s/volumes.yaml Fri Jul 19 00:30:47 2024 -0700 +++ b/k8s/volumes.yaml Fri Jul 19 00:49:38 2024 -0700 @@ -1,22 +1,21 @@ - apiVersion: v1 kind: PersistentVolume metadata: name: sco-bot-data - labels: {type: local} + labels: { type: local } spec: storageClassName: manual - hostPath: {path: "/my/serv/sco-bot/data"} - capacity: {storage: 5Mi} + hostPath: { path: "/my/serv/sco-bot/data" } + capacity: { storage: 5Mi } accessModes: ["ReadWriteMany"] persistentVolumeReclaimPolicy: Retain - claimRef: {namespace: default, name: sco-bot-data} + claimRef: { namespace: default, name: sco-bot-data } --- apiVersion: v1 kind: PersistentVolumeClaim -metadata: {name: sco-bot-data} +metadata: { name: sco-bot-data } spec: storageClassName: "" volumeName: "sco-bot-data" accessModes: ["ReadWriteMany"] - resources: { requests: { storage: 5Mi } } \ No newline at end of file + resources: { requests: { storage: 5Mi } } diff -r 6ed25bcaaf1f -r 7a87ba2f00d9 scobot/index/access.py --- a/scobot/index/access.py Fri Jul 19 00:30:47 2024 -0700 +++ b/scobot/index/access.py Fri Jul 19 00:49:38 2024 -0700 @@ -1,11 +1,13 @@ +import logging from pathlib import Path import shutil +from typing import cast from whoosh.index import create_in, open_dir from scobot.index.schema import schema -log = None # set by flow +log = cast(logging.Logger, None) # set by flow class SearchIndex: @@ -27,8 +29,10 @@ with self.ix.searcher() as searcher: log.info(f'index doc count = {searcher.doc_count()}') + class SearchIndexRO: + def __init__(self, indexDir: Path): self.ix = open_dir(indexDir, readonly=True) self.searcher = self.ix.searcher() - print(f'{self.searcher.doc_count()=}') \ No newline at end of file + print(f'{self.searcher.doc_count()=}') diff -r 6ed25bcaaf1f -r 7a87ba2f00d9 scobot/index/build_index_flow.py --- a/scobot/index/build_index_flow.py Fri Jul 19 00:30:47 2024 -0700 +++ b/scobot/index/build_index_flow.py Fri Jul 19 00:49:38 2024 -0700 @@ -19,8 +19,10 @@ @task() def meetingListUrls() -> Iterable[Url]: return [ - "https://albanyca.primegov.com/api/v2/PublicPortal/ListArchivedMeetings?year=2024", - "https://albanyca.primegov.com/api/v2/PublicPortal/ListUpcomingMeetings", + Url("https://albanyca.primegov.com/api/v2/PublicPortal/ListArchivedMeetings?year=2024" + ), + Url("https://albanyca.primegov.com/api/v2/PublicPortal/ListUpcomingMeetings" + ), ] @@ -28,7 +30,9 @@ for doc in mtg.get('documentList', []): if doc['templateName'] == 'HTML Agenda Packet': tid = doc['templateId'] - return f'https://albanyca.primegov.com/Portal/Meeting?meetingTemplateId={tid}' + return Url( + f'https://albanyca.primegov.com/Portal/Meeting?meetingTemplateId={tid}' + ) raise ValueError(f"no agenda doc found for {mtg['id']=}") @@ -64,16 +68,17 @@ phrase=sentence) -def addMeeting(index: SearchIndex, mtg: MeetingRow): +async def addMeeting(index: SearchIndex, mtg: MeetingRow): try: agendaUrl = meetingAgendaUrl(mtg) except ValueError: pass else: - html = getCityPermanent(agendaUrl) + html = await getCityPermanent(agendaUrl) texts = extractMeetingText(html) - for se in nltk.sent_tokenize(' '.join(texts)): - index.addDoc(sourceTitle=f'{mtg["date"]} {mtg["title"]}', phrase=se) + for sentence in nltk.sent_tokenize(' '.join(texts)): + index.addDoc(sourceTitle=f'{mtg["date"]} {mtg["title"]}', + phrase=sentence) try: videoUrl = mtg['videoUrl'] @@ -86,19 +91,20 @@ @flow(persist_result=True) -def buildIndex(): +async def buildIndex(): global log log = get_run_logger() scobot.index.access.log = log index = SearchIndex(Path('data/build/index0')) for url in meetingListUrls(): - mtgs = cast(list[MeetingRow], getCityMutableJson(url)) + mtgs = cast(list[MeetingRow], await getCityMutableJson(url)) log.info(f'got {len(mtgs)=}') for mtg in mtgs: - addMeeting(index, mtg) + await addMeeting(index, mtg) index.commit() + # todo: kill search to restart it if __name__ == '__main__': diff -r 6ed25bcaaf1f -r 7a87ba2f00d9 scobot/index/download_tasks.py --- a/scobot/index/download_tasks.py Fri Jul 19 00:30:47 2024 -0700 +++ b/scobot/index/download_tasks.py Fri Jul 19 00:49:38 2024 -0700 @@ -14,8 +14,8 @@ cache_expiration=datetime.timedelta(seconds=86400), tags=['city'], # todo ratelimit based on tag ) -def getCityMutableJson(url: Url): - create_link_artifact("get", url) +async def getCityMutableJson(url: Url): + await create_link_artifact("get", url) req = httpx.get(url) # todo async req.raise_for_status() return req.json() @@ -24,8 +24,8 @@ @task(task_run_name=lambda: f'getHttp-{int(time.time())}', cache_key_fn=lambda _, args: f'getHttp-{args["url"]}', tags=['city']) -def getCityPermanent(url: Url) -> str: - create_link_artifact("get", url) +async def getCityPermanent(url: Url) -> str: + await create_link_artifact("get", url) req = httpx.get(url) req.raise_for_status() return req.text diff -r 6ed25bcaaf1f -r 7a87ba2f00d9 scobot/local_types.py --- a/scobot/local_types.py Fri Jul 19 00:30:47 2024 -0700 +++ b/scobot/local_types.py Fri Jul 19 00:49:38 2024 -0700 @@ -1,5 +1,4 @@ from typing import NewType - Url = NewType('Url', str) -MeetingRow = NewType('MeetingRow', dict) \ No newline at end of file +MeetingRow = NewType('MeetingRow', dict) diff -r 6ed25bcaaf1f -r 7a87ba2f00d9 scobot/service/query.py --- a/scobot/service/query.py Fri Jul 19 00:30:47 2024 -0700 +++ b/scobot/service/query.py Fri Jul 19 00:49:38 2024 -0700 @@ -48,7 +48,7 @@ @asynccontextmanager async def lifespan(app: FastAPI): - app.state.index = SearchIndexRO('data/build/index0') + app.state.index = SearchIndexRO(Path('data/build/index0')) yield app = FastAPI(lifespan=lifespan) diff -r 6ed25bcaaf1f -r 7a87ba2f00d9 skaffold.yaml --- a/skaffold.yaml Fri Jul 19 00:30:47 2024 -0700 +++ b/skaffold.yaml Fri Jul 19 00:49:38 2024 -0700 @@ -17,7 +17,7 @@ platforms: [amd64] sync: infer: - - 'scobot/**' + - "scobot/**" tagPolicy: dateTime: format: 2006-01-02_15-04-05