view query.py @ 0:ca5da75f03ee

start
author drewp@bigasterisk.com
date Wed, 03 Jul 2024 19:16:28 -0700
parents
children 82428652cda1
line wrap: on
line source

from pathlib import Path
from pprint import pprint
import re
import sys
from extract_pdf import phrasesFromFile
from pymilvus import model
from pymilvus import MilvusClient

# The script takes exactly one command-line argument, bound to `q`.
# The single-element unpacking raises ValueError when zero or several
# arguments are supplied.
q, = sys.argv[1:]

def cleanup(phrase: str) -> str:
    """Normalize whitespace in *phrase* and drop results that are too short.

    Every run of whitespace (newlines included) becomes a single space.
    No stripping is performed, so a leading or trailing space survives and
    counts toward the length check.

    Returns the normalized text, or '' when it has fewer than 5 characters.
    """
    collapsed = re.sub(r'\s+', ' ', phrase)
    return collapsed if len(collapsed) >= 5 else ''


# Embedding function from pymilvus's `model` module; used further down to
# turn the query text into a vector.
# NOTE(review): presumably produces 768-dimensional vectors, matching the
# `dimension=768` in the commented-out collection setup below — confirm.
embedding_fn = model.DefaultEmbeddingFunction()

# Milvus client constructed from the path "milvus_demo.db".
# NOTE(review): presumably a local file-backed (Milvus Lite) database,
# resolved relative to the current working directory — confirm.
client = MilvusClient("milvus_demo.db")

# client.drop_collection(collection_name="demo_collection")
# if not client.has_collection(collection_name="demo_collection"):
#     client.create_collection(
#         collection_name="demo_collection",
#         dimension=768,  # The vectors we will use in this demo have 768 dimensions
# )

# docs = []
# for i, (bbox, phrase) in enumerate(phrasesFromFile(Path("data") / "Meetings2226Minutes_20240702182359526 (1).pdf")):
#     phrase = cleanup(phrase)
#     print(f'{phrase=}')
#     if not phrase:
#         continue

#     [vector] = embedding_fn.encode_documents([phrase])
#     doc = {
        
#     "id": i,
#     "vector": vector,
#     "text": phrase,
# }
#     docs.append(doc)
# res = client.insert(collection_name="demo_collection", data=docs)
# print('insert:', res)

# Embed the command-line query string. A one-element list goes in and the
# result is passed straight to search() as its `data` argument.
query_vectors = embedding_fn.encode_queries([q])

# Search "demo_collection" for up to 15 matches, requesting the "text" field
# of each. The single-element unpacking expects exactly one result list
# (one query was sent) and raises ValueError otherwise.
[query_result] = client.search(
    collection_name="demo_collection",  
    data=query_vectors,  
    limit=15,  
    output_fields=["text"],  
)

# Print one line per hit: the "distance" value to 6 decimal places, then the
# stored text.
# NOTE(review): whether a larger or smaller distance means a closer match
# depends on the collection's metric — confirm.
for row in query_result:
    print(f'{row["distance"]:.6f} {row["entity"]["text"]}')
# import ipdb; ipdb.set_trace()