Mercurial > code > home > repos > sco-bot
comparison query.py @ 0:ca5da75f03ee
start
author | drewp@bigasterisk.com |
---|---|
date | Wed, 03 Jul 2024 19:16:28 -0700 |
parents | |
children | 82428652cda1 |
comparison
equal
deleted
inserted
replaced
-1:000000000000 | 0:ca5da75f03ee |
---|---|
1 from pathlib import Path | |
2 from pprint import pprint | |
3 import re | |
4 import sys | |
5 from extract_pdf import phrasesFromFile | |
6 from pymilvus import model | |
7 from pymilvus import MilvusClient | |
8 | |
# The script takes exactly one command-line argument: the query text.
# Exit with a usage message rather than an opaque unpacking traceback
# when the argument count is wrong.
if len(sys.argv) != 2:
    sys.exit(f'usage: {sys.argv[0]} QUERY')
[q] = sys.argv[1:]
10 | |
def cleanup(phrase: str, min_length: int = 5) -> str:
    """Normalize whitespace in *phrase* and reject fragments that are too short.

    Every run of whitespace (spaces, tabs, newlines, ...) is collapsed to a
    single space and leading/trailing whitespace is removed, so padding
    neither counts toward the length check nor ends up in the result.

    Args:
        phrase: Raw text to normalize.
        min_length: Minimum number of characters the normalized text must
            have to be kept.

    Returns:
        The normalized text, or '' when it is shorter than *min_length*.
    """
    # r'\s+' already matches newlines, so no separate '\n' replacement is
    # needed; after collapsing, at most one space remains at either end.
    p = re.sub(r'\s+', ' ', phrase).strip()
    if len(p) < min_length:
        return ''
    return p
17 | |
18 | |
# Embedding function from pymilvus' `model` module; used further down to
# turn the query text into vectors via encode_queries().
19 embedding_fn = model.DefaultEmbeddingFunction() | |
20 | |
# Milvus client opened on "milvus_demo.db" -- presumably a local database
# file in the working directory (TODO confirm against the pymilvus docs).
21 client = MilvusClient("milvus_demo.db") | |
22 | |
23 # client.drop_collection(collection_name="demo_collection") | |
24 # if not client.has_collection(collection_name="demo_collection"): | |
25 # client.create_collection( | |
26 # collection_name="demo_collection", | |
27 # dimension=768, # The vectors we will use in this demo have 768 dimensions | |
28 # ) | |
29 | |
30 # docs = [] | |
31 # for i, (bbox, phrase) in enumerate(phrasesFromFile(Path("data") / "Meetings2226Minutes_20240702182359526 (1).pdf")): | |
32 # phrase = cleanup(phrase) | |
33 # print(f'{phrase=}') | |
34 # if not phrase: | |
35 # continue | |
36 | |
37 # [vector] = embedding_fn.encode_documents([phrase]) | |
38 # doc = { | |
39 | |
40 # "id": i, | |
41 # "vector": vector, | |
42 # "text": phrase, | |
43 # } | |
44 # docs.append(doc) | |
45 # res = client.insert(collection_name="demo_collection", data=docs) | |
46 # print('insert:', res) | |
47 | |
# Embed the query text; encode_queries() takes a list, so q is wrapped in one.
query_vectors = embedding_fn.encode_queries([q])

# Search "demo_collection" with the query vectors, asking for the stored
# "text" field of each match and passing limit=15. The one-element unpacking
# asserts that exactly one result list comes back.
(query_result,) = client.search(
    data=query_vectors,
    collection_name="demo_collection",
    output_fields=["text"],
    limit=15,
)

# One line per match: distance to six decimal places, then the stored text.
for row in query_result:
    print(f'{row["distance"]:.6f} {row["entity"]["text"]}')
59 # import ipdb; ipdb.set_trace() |