2
|
1 import json
|
0
|
2 import sys
|
2
|
3 from pathlib import Path
|
|
4
|
|
5 from tqdm import tqdm
|
|
6
|
|
7 from pymilvus import MilvusClient
|
|
8 from milvus_model.dense.onnx import OnnxEmbeddingFunction
|
|
9
|
0
|
10 from extract_pdf import phrasesFromFile
|
2
|
11
|
|
12
|
|
def rebuild(client, embedding_fn, dim, *, pdf_path=None,
            collection_name="demo_collection"):
    """Recreate the collection from scratch and fill it with PDF phrases.

    The collection is dropped, created again with vectors of size ``dim``,
    and then populated with one record per phrase yielded by
    ``phrasesFromFile``.  The insert result is printed.

    Args:
        client: Object exposing ``drop_collection``, ``has_collection``,
            ``create_collection`` and ``insert`` (a ``MilvusClient`` in
            this script).
        embedding_fn: Object exposing ``encode_documents``; called with a
            one-element list and expected to return a one-element result.
        dim: Vector dimension passed to ``create_collection``.
        pdf_path: PDF to index.  Defaults to the demo document under
            ``data/``.
        collection_name: Name of the collection to rebuild.
    """
    if pdf_path is None:
        pdf_path = (
            Path("data") / "Meetings2226Minutes_20240702182359526 (1).pdf"
        )

    # Start from an empty collection so stale rows never survive a rebuild.
    client.drop_collection(collection_name=collection_name)
    if not client.has_collection(collection_name=collection_name):
        client.create_collection(
            collection_name=collection_name,
            dimension=dim,
        )

    docs = []
    # tqdm wraps the phrase source directly (not the enumerate object) so it
    # can pick up a length for the progress bar if the source exposes one.
    for i, (bbox, phrase) in enumerate(tqdm(phrasesFromFile(pdf_path))):
        # One phrase per call; the single-element unpack fails loudly if the
        # embedder ever returns a different number of vectors.
        [vector] = embedding_fn.encode_documents([phrase])
        docs.append({
            "id": i,
            "vector": vector,
            "text": phrase,
            # The bounding box is stored as a JSON string.
            "bbox": json.dumps(bbox),
        })

    res = client.insert(collection_name=collection_name, data=docs)
    print('insert:', res)
|
|
36
|
|
37
|
|
def search(q, embedding_fn, client, *, collection_name="demo_collection",
           limit=5):
    """Run a vector search for ``q`` and print the matching texts.

    The query is embedded with ``embedding_fn.encode_queries`` and sent to
    ``client.search``.  Each hit is printed as ``<distance> <text>``, with
    the distance formatted to six decimals, ordered by descending distance.

    Args:
        q: Query string.
        embedding_fn: Object exposing ``encode_queries``.
        client: Object exposing ``search`` (a ``MilvusClient`` in this
            script).
        collection_name: Collection to search.
        limit: Maximum number of hits requested from the client.
    """
    query_vectors = embedding_fn.encode_queries([q])

    # Exactly one query is sent, so exactly one result list is expected back.
    [query_result] = client.search(
        collection_name=collection_name,
        data=query_vectors,
        limit=limit,
        output_fields=["text"],
    )
    # NOTE(review): descending order assumes a larger "distance" means a
    # better match (similarity-style metric) — confirm the collection's metric.
    query_result.sort(key=lambda hit: hit["distance"], reverse=True)

    for row in query_result:
        print(f'{row["distance"]:.6f} {row["entity"]["text"]}')
|
|
51
|
0
|
52
|
|
# Script body: takes exactly one command-line argument, the query text.
_args = sys.argv[1:]
if len(_args) != 1:
    # Exit with a readable message instead of an unpacking ValueError.
    sys.exit("expected exactly one argument: the query text")
q = _args[0]

embedding_fn = OnnxEmbeddingFunction(model_name="GPTCache/paraphrase-albert-onnx")
client = MilvusClient("milvus_demo.db")
# The collection is rebuilt on every run before the query is executed.
rebuild(client, embedding_fn, dim=embedding_fn.dim)
search(q, embedding_fn, client)
|