diff scobot/service/query.py @ 11:6622bacb0b84

first pass at reorg
author drewp@bigasterisk.com
date Thu, 11 Jul 2024 18:15:44 -0700
parents search/query.py@f23b21bd0fce
children 403eff4a16c8
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/scobot/service/query.py	Thu Jul 11 18:15:44 2024 -0700
@@ -0,0 +1,70 @@
+import json
+from pathlib import Path
+
+# from pymilvus import MilvusClient
+# from milvus_model.dense.onnx import OnnxEmbeddingFunction
+from fastapi import FastAPI
+from tqdm import tqdm
+
+
+def rebuild(client, embedding_fn, dim):
+    """Drop, recreate and refill the "demo_collection" vector collection.
+
+    Every (bbox, phrase) pair yielded by phrasesFromFile for the hard-coded
+    PDF under data/ is embedded on its own and stored as one row: a running
+    integer id, the vector, the phrase text and the bbox as a JSON string.
+    All rows go to the client in one insert; its insert_count is printed.
+
+    dim is passed through as the dimension of the new collection.
+    NOTE(review): phrasesFromFile is not defined or imported in the code
+    shown here -- confirm where it is expected to come from.
+    """
+    name = "demo_collection"
+    client.drop_collection(collection_name=name)
+    if not client.has_collection(collection_name=name):
+        client.create_collection(collection_name=name, dimension=dim)
+
+    rows = []
+    pdf = Path("data") / "Meetings2226Minutes_20240702182359526 (1).pdf"
+    numbered = enumerate(phrasesFromFile(pdf))
+    for i, (bbox, phrase) in tqdm(numbered, desc="rebuilding", unit=' phrase'):
+        [vector] = embedding_fn.encode_documents([phrase])  # exactly one vector expected
+        rows.append({"id": i, "vector": vector, "text": phrase, "bbox": json.dumps(bbox)})
+    res = client.insert(collection_name=name, data=rows)
+    print('insert:', res['insert_count'])
+
+
+def xxsearch(q, embedding_fn, client):
+    """Print up to five "demo_collection" search hits for the query q.
+
+    q is encoded with embedding_fn.encode_queries and searched through
+    client; each hit is printed as its distance (six decimals) followed by
+    its stored text, largest distance first. Nothing is returned.
+    """
+    encoded = embedding_fn.encode_queries([q])
+    [hits] = client.search(collection_name="demo_collection", data=encoded,
+                           limit=5, output_fields=["text"])
+    hits.sort(key=lambda hit: hit["distance"], reverse=True)
+    for hit in hits:
+        print(f'{hit["distance"]:.6f} {hit["entity"]["text"]}')
+
+
+# q, = sys.argv[1:]
+
+# https://huggingface.co/models?pipeline_tag=feature-extraction&library=onnx&sort=trending
+# embedding_fn = OnnxEmbeddingFunction(model_name="jinaai/jina-embeddings-v2-base-en")
+# client = MilvusClient("milvus_demo.db")
+# rebuild(client, embedding_fn, dim=embedding_fn.dim)
+# search(q, embedding_fn, client)
+
+app = FastAPI()  # application object; the /sco/query route below is registered on it
+
+#search = Search()  # NOTE(review): disabled, yet read_query1 uses `search` -- confirm
+
+
+@app.get("/sco/query")
+def read_query1(q: str):
+    """Handle GET /sco/query: return search.search(q) as {"results": ...}."""
+    # NOTE(review): `search` is not bound in the code shown here (its setup is commented out).
+    results = search.search(q)
+    return {"results": results}