Mercurial > code > home > repos > sco-bot
diff query.py @ 0:ca5da75f03ee
start
author | drewp@bigasterisk.com |
---|---|
date | Wed, 03 Jul 2024 19:16:28 -0700 |
parents | |
children | 82428652cda1 |
line wrap: on
line diff
"""Search the Milvus demo collection for phrases similar to a query string.

Usage:
    python query.py "<search text>"

The collection is expected to be populated already; ``rebuild_collection``
holds the ingestion step, which is not run by default.
"""
from pathlib import Path
from pprint import pprint
import re
import sys
from typing import Optional, Sequence

DB_PATH = "milvus_demo.db"
COLLECTION_NAME = "demo_collection"
VECTOR_DIM = 768  # the vectors used in this demo have 768 dimensions
SOURCE_PDF = Path("data") / "Meetings2226Minutes_20240702182359526 (1).pdf"
MIN_PHRASE_LEN = 5  # cleaned phrases shorter than this are discarded
RESULT_LIMIT = 15  # number of hits printed per query

_WHITESPACE_RUN = re.compile(r'\s+')


def cleanup(phrase: str) -> str:
    """Collapse whitespace in *phrase* and reject fragments that are too short.

    Every run of whitespace (newlines included) becomes a single space.
    The text is not stripped, so a leading or trailing space still counts
    toward the length check.

    Returns:
        The collapsed text, or '' when it is shorter than MIN_PHRASE_LEN.
    """
    # r'\s+' already matches '\n', so no separate newline replacement is needed.
    p = _WHITESPACE_RUN.sub(' ', phrase)
    if len(p) < MIN_PHRASE_LEN:
        return ''
    return p


def rebuild_collection(client, embedding_fn, pdf_path: Path = SOURCE_PDF):
    """Drop, recreate and refill the demo collection from *pdf_path*.

    Ported from the commented-out ingestion code this script shipped with;
    nothing calls it by default. Each phrase that survives ``cleanup`` is
    embedded on its own and inserted with its enumeration index as ``id``
    (indices of skipped phrases are therefore left unused).

    Returns:
        Whatever ``client.insert`` returns.
    """
    # Imported here because only the ingestion path needs the PDF extractor.
    from extract_pdf import phrasesFromFile

    client.drop_collection(collection_name=COLLECTION_NAME)
    if not client.has_collection(collection_name=COLLECTION_NAME):
        client.create_collection(
            collection_name=COLLECTION_NAME,
            dimension=VECTOR_DIM,
        )

    docs = []
    for i, (_bbox, raw_phrase) in enumerate(phrasesFromFile(pdf_path)):
        phrase = cleanup(raw_phrase)
        print(f'{phrase=}')
        if not phrase:
            continue
        [vector] = embedding_fn.encode_documents([phrase])
        docs.append({
            "id": i,
            "vector": vector,
            "text": phrase,
        })

    res = client.insert(collection_name=COLLECTION_NAME, data=docs)
    print('insert:', res)
    return res


def main(argv: Optional[Sequence[str]] = None) -> int:
    """Embed the single query argument and print the nearest stored phrases.

    Args:
        argv: Command-line arguments without the program name; defaults to
            ``sys.argv[1:]``.

    Returns:
        0 after printing results, 2 when the argument count is not exactly one.
    """
    args = sys.argv[1:] if argv is None else list(argv)
    if len(args) != 1:
        print('usage: query.py QUERY', file=sys.stderr)
        return 2
    [q] = args

    # Imported here rather than at module top so that argument errors are
    # reported before the third-party package is loaded, and so `cleanup`
    # can be imported without it.
    from pymilvus import model
    from pymilvus import MilvusClient

    embedding_fn = model.DefaultEmbeddingFunction()
    client = MilvusClient(DB_PATH)

    # Uncomment to re-index the source PDF before querying:
    # rebuild_collection(client, embedding_fn)

    query_vectors = embedding_fn.encode_queries([q])

    # One query vector was sent, so exactly one result list comes back.
    [query_result] = client.search(
        collection_name=COLLECTION_NAME,
        data=query_vectors,
        limit=RESULT_LIMIT,
        output_fields=["text"],
    )

    for row in query_result:
        print(f'{row["distance"]:.6f} {row["entity"]["text"]}')
    return 0


if __name__ == "__main__":
    sys.exit(main())