diff query.py @ 0:ca5da75f03ee

start
author drewp@bigasterisk.com
date Wed, 03 Jul 2024 19:16:28 -0700
parents
children 82428652cda1
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/query.py	Wed Jul 03 19:16:28 2024 -0700
@@ -0,0 +1,59 @@
+from pathlib import Path
+from pprint import pprint
+import re
+import sys
+from extract_pdf import phrasesFromFile
+from pymilvus import model
+from pymilvus import MilvusClient
+
+# The script takes exactly one command-line argument, the query text; the
+# single-element unpack raises ValueError for any other argument count.
+[q] = sys.argv[1:]
+
+def cleanup(phrase: str) -> str:
+    """Collapse every whitespace run in `phrase` (newlines included) to one space.
+
+    Returns the collapsed text, or '' when it is shorter than 5 characters."""
+    squeezed = re.sub(r'\s+', ' ', phrase)
+    return squeezed if len(squeezed) >= 5 else ''
+
+
+# Embedding function used further down to turn the query text into a vector.
+embedding_fn = model.DefaultEmbeddingFunction()
+
+# Milvus client bound to "milvus_demo.db" (presumably a local Milvus Lite
+# database file -- confirm against the pymilvus docs).
+client = MilvusClient("milvus_demo.db")
+
+# client.drop_collection(collection_name="demo_collection")
+# if not client.has_collection(collection_name="demo_collection"):
+#     client.create_collection(
+#         collection_name="demo_collection",
+#         dimension=768,  # The vectors we will use in this demo have 768 dimensions
+# )
+
+# docs = []
+# for i, (bbox, phrase) in enumerate(phrasesFromFile(Path("data") / "Meetings2226Minutes_20240702182359526 (1).pdf")):
+#     phrase = cleanup(phrase)
+#     print(f'{phrase=}')
+#     if not phrase:
+#         continue
+
+#     [vector] = embedding_fn.encode_documents([phrase])
+#     doc = {
+        
+#     "id": i,
+#     "vector": vector,
+#     "text": phrase,
+# }
+#     docs.append(doc)
+# res = client.insert(collection_name="demo_collection", data=docs)
+# print('insert:', res)
+
+# Embed the query text. encode_queries is given a one-element list here.
+query_vectors = embedding_fn.encode_queries([q])
+
+# Search "demo_collection" with the query vector(s), asking for up to 15 hits
+# and the "text" field of each.
+hits_per_query = client.search(
+    collection_name="demo_collection",
+    data=query_vectors,
+    limit=15,
+    output_fields=["text"],
+)
+# The single-element unpack requires that search returned exactly one result
+# list (presumably one per query vector -- confirm against the pymilvus docs).
+[query_result] = hits_per_query
+
+# One line per hit: the reported distance to six decimals, then the hit's text.
+for row in query_result:
+    print(f'{row["distance"]:.6f} {row["entity"]["text"]}')
+# import ipdb; ipdb.set_trace()
\ No newline at end of file