Mercurial > code > home > repos > sco-bot
view query.py @ 0:ca5da75f03ee
start
author | drewp@bigasterisk.com |
---|---|
date | Wed, 03 Jul 2024 19:16:28 -0700 |
parents | |
children | 82428652cda1 |
line wrap: on
line source
"""Search the "demo_collection" Milvus collection for phrases similar to a query.

Usage: query.py QUERY

The single command-line argument is embedded with pymilvus's default
embedding function and searched against "demo_collection" in the
"milvus_demo.db" client target. Up to 15 hits are printed, one per line,
as "<distance> <text>".

The code that populates the collection is kept below, commented out.
"""
from pathlib import Path
from pprint import pprint
import re
import sys

from extract_pdf import phrasesFromFile
from pymilvus import model
from pymilvus import MilvusClient

# Exactly one argument (the query text) is required. Check the count
# explicitly so a wrong invocation produces a usage message instead of the
# "not enough/too many values to unpack" traceback of a bare tuple-unpack.
if len(sys.argv) != 2:
    sys.exit(f'usage: {sys.argv[0]} QUERY')
q = sys.argv[1]


def cleanup(phrase: str) -> str:
    """Collapse every whitespace run in *phrase* to a single space.

    Newlines count as whitespace, so multi-line phrases become one line.
    Leading/trailing whitespace is collapsed to one space, not stripped.

    Returns:
        The collapsed phrase, or '' if it is shorter than 5 characters.
    """
    # r'\s+' already matches '\n', so no separate newline replacement is needed.
    p = re.sub(r'\s+', ' ', phrase)
    if len(p) < 5:
        return ''
    return p


embedding_fn = model.DefaultEmbeddingFunction()

client = MilvusClient("milvus_demo.db")

# --- Disabled indexing step -------------------------------------------------
# Recreates "demo_collection" and fills it with the cleaned phrases of one PDF.
# NOTE(review): presumably run once to build the collection that the search
# below reads; the search assumes the collection already exists.
#
# client.drop_collection(collection_name="demo_collection")
# if not client.has_collection(collection_name="demo_collection"):
#     client.create_collection(
#         collection_name="demo_collection",
#         dimension=768,  # The vectors we will use in this demo have 768 dimensions
#     )
#
# docs = []
# for i, (bbox, phrase) in enumerate(phrasesFromFile(Path("data") / "Meetings2226Minutes_20240702182359526 (1).pdf")):
#     phrase = cleanup(phrase)
#     print(f'{phrase=}')
#     if not phrase:
#         continue
#     [vector] = embedding_fn.encode_documents([phrase])
#     doc = {
#         "id": i,
#         "vector": vector,
#         "text": phrase,
#     }
#     docs.append(doc)
# res = client.insert(collection_name="demo_collection", data=docs)
# print('insert:', res)
# -----------------------------------------------------------------------------

query_vectors = embedding_fn.encode_queries([q])

# One query vector goes in, so exactly one result list is expected back;
# the single-element unpack fails loudly if that ever stops being true.
[query_result] = client.search(
    collection_name="demo_collection",
    data=query_vectors,
    limit=15,
    output_fields=["text"],
)

for row in query_result:
    print(f'{row["distance"]:.6f} {row["entity"]["text"]}')

# import ipdb; ipdb.set_trace()