0
|
1 from pathlib import Path
|
|
2 from pprint import pprint
|
|
3 import re
|
|
4 import sys
|
|
5 from extract_pdf import phrasesFromFile
|
|
6 from pymilvus import model
|
|
7 from pymilvus import MilvusClient
|
|
8
|
|
# Exactly one command-line argument is expected: the query text to search for.
if len(sys.argv) != 2:
    # Exit with a usage message rather than an opaque tuple-unpacking ValueError.
    sys.exit(f"usage: {sys.argv[0]} QUERY")
q = sys.argv[1]
|
|
10
|
|
def cleanup(phrase: str) -> str:
    """Normalise the whitespace of *phrase* and discard very short results.

    Newlines are turned into spaces and every run of whitespace is collapsed
    into a single space. Leading/trailing whitespace is not stripped, so a
    surviving space at either end counts towards the length check.

    Returns the normalised text, or '' when it has fewer than 5 characters.
    """
    collapsed = re.sub(r'\s+', ' ', phrase.replace('\n', ' '))
    return collapsed if len(collapsed) >= 5 else ''
|
|
17
|
|
18
|
|
# Embedding function used to turn text into vectors (pymilvus default model).
embedding_fn = model.DefaultEmbeddingFunction()

# Milvus client opened on "milvus_demo.db".
client = MilvusClient("milvus_demo.db")
|
|
22
|
|
23 # client.drop_collection(collection_name="demo_collection")
|
|
24 # if not client.has_collection(collection_name="demo_collection"):
|
|
25 # client.create_collection(
|
|
26 # collection_name="demo_collection",
|
|
27 # dimension=768, # The vectors we will use in this demo have 768 dimensions
|
|
28 # )
|
|
29
|
|
30 # docs = []
|
|
31 # for i, (bbox, phrase) in enumerate(phrasesFromFile(Path("data") / "Meetings2226Minutes_20240702182359526 (1).pdf")):
|
|
32 # phrase = cleanup(phrase)
|
|
33 # print(f'{phrase=}')
|
|
34 # if not phrase:
|
|
35 # continue
|
|
36
|
|
37 # [vector] = embedding_fn.encode_documents([phrase])
|
|
38 # doc = {
|
|
39
|
|
40 # "id": i,
|
|
41 # "vector": vector,
|
|
42 # "text": phrase,
|
|
43 # }
|
|
44 # docs.append(doc)
|
|
45 # res = client.insert(collection_name="demo_collection", data=docs)
|
|
46 # print('insert:', res)
|
|
47
|
|
# Embed the single query string; the result is passed straight to the search.
query_vectors = embedding_fn.encode_queries([q])

# One query vector goes in, so exactly one result list is expected back;
# the unpacking below fails loudly if that is not the case.
search_response = client.search(
    collection_name="demo_collection",
    data=query_vectors,
    limit=15,
    output_fields=["text"],
)
[query_result] = search_response

# Print each hit as "<distance> <text>".
for row in query_result:
    print(f'{row["distance"]:.6f} {row["entity"]["text"]}')
# import ipdb; ipdb.set_trace()