Mercurial > code > home > repos > sco-bot
diff scobot/service/query.py @ 11:6622bacb0b84
first pass at reorg
author | drewp@bigasterisk.com |
---|---|
date | Thu, 11 Jul 2024 18:15:44 -0700 |
parents | search/query.py@f23b21bd0fce |
children | 403eff4a16c8 |
line wrap: on
line diff
"""Query service for sco-bot.

Contains two helpers that operate on a vector collection named
"demo_collection" (`rebuild` to repopulate it, `xxsearch` to query it and
print the hits) and a FastAPI app exposing ``GET /sco/query``.
"""
import json
from pathlib import Path

# from pymilvus import MilvusClient
# from milvus_model.dense.onnx import OnnxEmbeddingFunction
from fastapi import FastAPI
from tqdm import tqdm


def rebuild(client, embedding_fn, dim):
    """Drop and repopulate "demo_collection" from the meeting-minutes PDF.

    Args:
        client: Object providing ``drop_collection``, ``has_collection``,
            ``create_collection`` and ``insert`` (presumably a pymilvus
            ``MilvusClient``, per the commented-out import -- confirm).
        embedding_fn: Object providing ``encode_documents``; called with a
            one-element list and expected to return exactly one vector.
        dim: Vector dimension passed to ``create_collection``.

    Prints the ``insert_count`` reported by ``client.insert``.
    """
    client.drop_collection(collection_name="demo_collection")
    if not client.has_collection(collection_name="demo_collection"):
        client.create_collection(
            collection_name="demo_collection",
            dimension=dim,
        )

    # NOTE(review): `phrasesFromFile` is neither defined nor imported in the
    # visible code; it is expected to yield (bbox, phrase) pairs.
    phrases = phrasesFromFile(
        Path("data") / "Meetings2226Minutes_20240702182359526 (1).pdf")

    docs = []
    for i, (bbox, phrase) in tqdm(enumerate(phrases),
                                  desc="rebuilding",
                                  unit=' phrase'):
        # One phrase per call; the unpacking asserts a single vector returns.
        [vector] = embedding_fn.encode_documents([phrase])
        doc = {
            "id": i,
            "vector": vector,
            "text": phrase,
            "bbox": json.dumps(bbox),  # stored as a JSON string
        }
        docs.append(doc)
    res = client.insert(collection_name="demo_collection", data=docs)
    print('insert:', res['insert_count'])


def xxsearch(q, embedding_fn, client):
    """Search "demo_collection" for `q` and print up to five hits.

    Args:
        q: Query text.
        embedding_fn: Object providing ``encode_queries``.
        client: Object providing ``search``; exactly one result list is
            expected back for the single query.

    Each hit is printed as ``<distance> <text>``, ordered by descending
    ``distance``. Nothing is returned.
    """
    query_vectors = embedding_fn.encode_queries([q])

    [query_result] = client.search(
        collection_name="demo_collection",
        data=query_vectors,
        limit=5,
        output_fields=["text"],
    )
    query_result.sort(key=lambda x: x["distance"], reverse=True)

    for row in query_result:
        print(f'{row["distance"]:.6f} {row["entity"]["text"]}')


# q, = sys.argv[1:]

# https://huggingface.co/models?pipeline_tag=feature-extraction&library=onnx&sort=trending
# embedding_fn = OnnxEmbeddingFunction(model_name="jinaai/jina-embeddings-v2-base-en")
# client = MilvusClient("milvus_demo.db")
# rebuild(client, embedding_fn, dim=embedding_fn.dim)
# search(q, embedding_fn, client)

app = FastAPI()

#search = Search()


@app.get("/sco/query")
def read_query1(q: str):
    """Handle ``GET /sco/query?q=...`` and return ``{"results": ...}``.

    NOTE(review): `search` is not defined in the visible code (its only
    assignment, ``search = Search()``, is commented out above), so this
    handler raises NameError until that object is provided.
    """
    results = search.search(q)

    return {"results": results}