annotate query.py @ 0:ca5da75f03ee

start
author drewp@bigasterisk.com
date Wed, 03 Jul 2024 19:16:28 -0700
parents
children 82428652cda1
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
0
drewp@bigasterisk.com
parents:
diff changeset
1 from pathlib import Path
drewp@bigasterisk.com
parents:
diff changeset
2 from pprint import pprint
drewp@bigasterisk.com
parents:
diff changeset
3 import re
drewp@bigasterisk.com
parents:
diff changeset
4 import sys
drewp@bigasterisk.com
parents:
diff changeset
5 from extract_pdf import phrasesFromFile
drewp@bigasterisk.com
parents:
diff changeset
6 from pymilvus import model
drewp@bigasterisk.com
parents:
diff changeset
7 from pymilvus import MilvusClient
drewp@bigasterisk.com
parents:
diff changeset
8
drewp@bigasterisk.com
parents:
diff changeset
# The script takes exactly one command-line argument: the query text to
# search for. Exit with a usage message (non-zero status) rather than an
# opaque tuple-unpacking traceback when the argument count is wrong.
if len(sys.argv) != 2:
    sys.exit(f'usage: {sys.argv[0]} QUERY')
q = sys.argv[1]
drewp@bigasterisk.com
parents:
diff changeset
10
drewp@bigasterisk.com
parents:
diff changeset
def cleanup(phrase: str, *, min_length: int = 5) -> str:
    """Collapse whitespace in *phrase* and drop it if the result is too short.

    Every run of whitespace characters (spaces, tabs, newlines, ...) is
    replaced by a single space. Leading and trailing whitespace is collapsed
    to one space but not stripped.

    Args:
        phrase: The raw text to normalize.
        min_length: Minimum length, measured after whitespace collapsing,
            that a phrase must have to be kept. Defaults to 5.

    Returns:
        The normalized phrase, or the empty string when the normalized
        phrase is shorter than ``min_length``.
    """
    # r'\s+' already matches newlines, so a separate replace('\n', ' ')
    # pass before the substitution is unnecessary.
    p = re.sub(r'\s+', ' ', phrase)
    if len(p) < min_length:
        return ''
    return p
drewp@bigasterisk.com
parents:
diff changeset
17
drewp@bigasterisk.com
parents:
diff changeset
18
drewp@bigasterisk.com
parents:
diff changeset
# Embedding function used to turn text into vectors.
embedding_fn = model.DefaultEmbeddingFunction()

# NOTE(review): presumably a local Milvus Lite database file -- confirm.
client = MilvusClient("milvus_demo.db")

# --- Disabled: one-off (re)build of "demo_collection" ---------------------
# Kept for reference only; nothing below this banner runs until the
# "Query" section.
#
# client.drop_collection(collection_name="demo_collection")
# if not client.has_collection(collection_name="demo_collection"):
#     client.create_collection(
#         collection_name="demo_collection",
#         dimension=768,  # The vectors we will use in this demo have 768 dimensions
#     )
#
# docs = []
# for i, (bbox, phrase) in enumerate(phrasesFromFile(Path("data") / "Meetings2226Minutes_20240702182359526 (1).pdf")):
#     phrase = cleanup(phrase)
#     print(f'{phrase=}')
#     if not phrase:
#         continue
#
#     [vector] = embedding_fn.encode_documents([phrase])
#     doc = {
#         "id": i,
#         "vector": vector,
#         "text": phrase,
#     }
#     docs.append(doc)
# res = client.insert(collection_name="demo_collection", data=docs)
# print('insert:', res)

# --- Query ------------------------------------------------------------------
# Embed the single command-line query string.
query_vectors = embedding_fn.encode_queries([q])

# Ask for up to 15 matches, returning the stored "text" field of each.
search_results = client.search(
    collection_name="demo_collection",
    data=query_vectors,
    limit=15,
    output_fields=["text"],
)
# Exactly one result list is expected; unpacking raises if that is not so.
(query_result,) = search_results

# One line per hit: distance to six decimal places, then the matched text.
for row in query_result:
    distance = row["distance"]
    text = row["entity"]["text"]
    print(f'{distance:.6f} {text}')