changeset 13:403eff4a16c8

fix up indexer flow and fastapi server
author drewp@bigasterisk.com
date Thu, 11 Jul 2024 21:32:24 -0700
parents 7f36497bfac3
children b9c2b7fedbcd
files .hgignore env k8s/deploy.yaml pyproject.toml scobot/__init__.py scobot/index/access.py scobot/index/build_index_flow.py scobot/service/query.py
diffstat 8 files changed, 40 insertions(+), 33 deletions(-) [+]
line wrap: on
line diff
--- a/.hgignore	Thu Jul 11 18:16:20 2024 -0700
+++ b/.hgignore	Thu Jul 11 21:32:24 2024 -0700
@@ -6,3 +6,4 @@
 data/
 milvus_demo.db
 web/node_modules/
+prefect/
--- a/env	Thu Jul 11 18:16:20 2024 -0700
+++ b/env	Thu Jul 11 21:32:24 2024 -0700
@@ -1,2 +1,3 @@
-PREFECT_API_URL=http://127.0.0.1:4200/api
+PREFECT_API_URL=http://dash:4200/api
 PREFECT_HOME=./prefect
+PREFECT_SERVER_API_HOST=0.0.0.0
\ No newline at end of file
--- a/k8s/deploy.yaml	Thu Jul 11 18:16:20 2024 -0700
+++ b/k8s/deploy.yaml	Thu Jul 11 21:32:24 2024 -0700
@@ -38,7 +38,7 @@
             - "0.0.0.0"
             - --port
             - "8001"
-            - scobot/service/query.py
+            - scobot
           ports:
             - containerPort: 8001
           volumeMounts:
--- a/pyproject.toml	Thu Jul 11 18:16:20 2024 -0700
+++ b/pyproject.toml	Thu Jul 11 21:32:24 2024 -0700
@@ -31,5 +31,6 @@
 [tool.pdm.scripts]
 _.env_file = "env"
 run_prefect_server = "prefect server start"
-run_build_flow = "python build_index.py"
-start_build = "prefect deployment run buildIndex/buildIndex"
\ No newline at end of file
+run_build_flow = "python -c 'from scobot.index.build_index_flow import buildIndex; buildIndex.serve(buildIndex.__name__)'"
+start_build = "prefect deployment run buildIndex/buildIndex"
+run_local_deploy = "fastapi dev --host 0.0.0.0 --port 8001 scobot"
\ No newline at end of file
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/scobot/__init__.py	Thu Jul 11 21:32:24 2024 -0700
@@ -0,0 +1,1 @@
+from scobot.service.query import app
--- a/scobot/index/access.py	Thu Jul 11 18:16:20 2024 -0700
+++ b/scobot/index/access.py	Thu Jul 11 21:32:24 2024 -0700
@@ -1,6 +1,7 @@
 from pathlib import Path
+import shutil
 
-from whoosh.index import create_in
+from whoosh.index import create_in, open_dir
 
 from scobot.index.schema import schema
 
@@ -9,9 +10,13 @@
 
 class SearchIndex:
 
-    def __init__(self, indexDir: Path):
-        indexDir.mkdir(parents=True, exist_ok=True)
-        self.ix = create_in(indexDir, schema)
+    def __init__(self, indexDir: Path, delete_existing=True):  # delete_existing=True wipes indexDir and creates a fresh index
+        if delete_existing:
+            shutil.rmtree(indexDir, ignore_errors=True)  # indexDir may not exist yet on a first run
+            indexDir.mkdir(parents=True, exist_ok=True)
+            self.ix = create_in(indexDir, schema)
+        else:
+            self.ix = open_dir(indexDir)  # reuse the index already present in indexDir
         self.writer = self.ix.writer()
 
     def addDoc(self, **kw):
@@ -21,3 +26,9 @@
         self.writer.commit()
         with self.ix.searcher() as searcher:
             log.info(f'index doc count = {searcher.doc_count()}')
+
+class SearchIndexRO:  # opens an existing index directory with readonly=True and holds one searcher
+    def __init__(self, indexDir: Path):
+        self.ix = open_dir(indexDir, readonly=True)
+        self.searcher = self.ix.searcher()  # kept on the instance; nothing in this class closes it
+        print(f'{self.searcher.doc_count()=}')  # NOTE(review): the code above reports its doc count via log.info; confirm print is intended
\ No newline at end of file
--- a/scobot/index/build_index_flow.py	Thu Jul 11 18:16:20 2024 -0700
+++ b/scobot/index/build_index_flow.py	Thu Jul 11 21:32:24 2024 -0700
@@ -71,9 +71,9 @@
         pass
     else:
         html = getCityPermanent(agendaUrl)
-        text = extractMeetingText(html)
-        # todo group phrases phrasesFromFile
-        index.addDoc(title=f'{mtg["date"]} {mtg["title"]}', content=text)
+        texts = extractMeetingText(html)
+        for se in nltk.sent_tokenize(' '.join(texts)):
+            index.addDoc(sourceTitle=f'{mtg["date"]} {mtg["title"]}', phrase=se)
 
     try:
         videoUrl = mtg['videoUrl']
@@ -102,4 +102,4 @@
 
 
 if __name__ == '__main__':
-    buildIndex.serve()
+    buildIndex.serve(name='buildIndex')
--- a/scobot/service/query.py	Thu Jul 11 18:16:20 2024 -0700
+++ b/scobot/service/query.py	Thu Jul 11 21:32:24 2024 -0700
@@ -1,5 +1,9 @@
+from scobot.index.access import SearchIndexRO
+from whoosh.qparser import QueryParser
 import json
 from pathlib import Path
+from pprint import pprint
+from contextlib import asynccontextmanager
 
 # from pymilvus import MilvusClient
 # from milvus_model.dense.onnx import OnnxEmbeddingFunction
@@ -34,37 +38,25 @@
     print('insert:', res['insert_count'])
 
 
-def xxsearch(q, embedding_fn, client):
-    query_vectors = embedding_fn.encode_queries([q])
-
-    [query_result] = client.search(
-        collection_name="demo_collection",
-        data=query_vectors,
-        limit=5,
-        output_fields=["text"],
-    )
-    query_result.sort(key=lambda x: x["distance"], reverse=True)
-
-    for row in query_result:
-        print(f'{row["distance"]:.6f} {row["entity"]["text"]}')
-
-
-# q, = sys.argv[1:]
-
 # https://huggingface.co/models?pipeline_tag=feature-extraction&library=onnx&sort=trending
 # embedding_fn = OnnxEmbeddingFunction(model_name="jinaai/jina-embeddings-v2-base-en")
 # client = MilvusClient("milvus_demo.db")
 # rebuild(client, embedding_fn, dim=embedding_fn.dim)
 # search(q, embedding_fn, client)
 
-app = FastAPI()
 
-#search = Search()
+@asynccontextmanager
+async def lifespan(app: FastAPI):  # FastAPI lifespan hook: code before the yield runs at application startup
+    app.state.index = SearchIndexRO(Path('/tmp/scoindex'))  # Path matches the indexDir annotation on SearchIndexRO.__init__
+    yield
 
+app = FastAPI(lifespan=lifespan)
 
 @app.get("/sco/query")
 def read_query1(q: str):
-    results = []
-    results = search.search(q)
+    index = app.state.index  # the SearchIndexRO stored on app.state by lifespan
 
+    query = QueryParser("phrase", index.ix.schema).parse(q)  # parse q with "phrase" as the parser's field argument
+    pprint(query)  # prints the parsed query on every request
+    results = list(index.searcher.search(query))  # NOTE(review): returns whoosh hit objects as-is; confirm they serialise as intended
     return {"results": results}