8
|
1 from dataclasses import dataclass
|
|
2 import html
|
2
|
3 import json
|
8
|
4 from pprint import pprint
|
0
|
5 import sys
|
2
|
6 from pathlib import Path
|
8
|
7 from typing import Iterable
|
2
|
8
|
|
9 from tqdm import tqdm
|
|
10
|
|
11 from pymilvus import MilvusClient
|
|
12 from milvus_model.dense.onnx import OnnxEmbeddingFunction
|
|
13
|
8
|
14 from extract_pdf import files, phrasesFromFile
|
2
|
15
|
4
|
16 from fastapi import FastAPI
|
8
|
17 from search_apex import Search
|
|
18
|
2
|
19
|
|
def rebuild(client, embedding_fn, dim, *, collection_name="demo_collection",
            pdf_path=None):
    """Recreate a vector collection and fill it with phrases from a PDF.

    The collection is dropped, created again with vectors of size ``dim``
    and populated with one row per ``(bbox, phrase)`` pair yielded by
    ``phrasesFromFile``. The number of inserted rows is printed.

    Args:
        client: Object exposing ``drop_collection``, ``has_collection``,
            ``create_collection`` and ``insert`` (a ``MilvusClient`` in the
            commented-out driver code of this file).
        embedding_fn: Object whose ``encode_documents`` takes a list of
            texts and returns one vector per text.
        dim: Vector dimension passed to ``create_collection``.
        collection_name: Name of the collection to rebuild. The default
            keeps the previously hard-coded name.
        pdf_path: PDF to index. ``None`` selects the previously hard-coded
            minutes file under ``data/``.
    """
    if pdf_path is None:
        pdf_path = Path("data") / "Meetings2226Minutes_20240702182359526 (1).pdf"

    client.drop_collection(collection_name=collection_name)
    # Defensive guard kept from the original: only create the collection
    # when it is really absent after the drop.
    if not client.has_collection(collection_name=collection_name):
        client.create_collection(
            collection_name=collection_name,
            dimension=dim,
        )

    numbered_phrases = tqdm(
        enumerate(phrasesFromFile(Path(pdf_path))),
        desc="rebuilding",
        unit=' phrase',
    )

    docs = []
    for i, (bbox, phrase) in numbered_phrases:
        # One phrase per call, so the progress bar advances per phrase.
        [vector] = embedding_fn.encode_documents([phrase])
        docs.append({
            "id": i,
            "vector": vector,
            "text": phrase,
            # Stored as a JSON string rather than as a nested structure.
            "bbox": json.dumps(bbox),
        })

    # Single bulk insert once every phrase has been embedded.
    res = client.insert(collection_name=collection_name, data=docs)
    print('insert:', res['insert_count'])
|
2
|
45
|
|
46
|
8
|
def xxsearch(q, embedding_fn, client, *, limit=5,
             collection_name="demo_collection"):
    """Print the stored phrases closest to a query string.

    The query is embedded with ``embedding_fn.encode_queries``, looked up
    through ``client.search`` and the hits are printed one per line as
    ``<distance> <text>``, highest distance value first.

    Args:
        q: Query text.
        embedding_fn: Object whose ``encode_queries`` takes a list of texts
            and returns one vector per text.
        client: Object whose ``search`` returns one list of hits per query
            vector; each hit is indexed with ``"distance"`` and
            ``["entity"]["text"]``.
        limit: Maximum number of hits requested from ``client.search``.
        collection_name: Collection to search. The default keeps the
            previously hard-coded name.

    Returns:
        None. The hits are only printed.
    """
    query_vectors = embedding_fn.encode_queries([q])

    # Exactly one query vector is sent, so exactly one hit list is expected;
    # the unpacking raises if the client answers with anything else.
    [hits] = client.search(
        collection_name=collection_name,
        data=query_vectors,
        limit=limit,
        output_fields=["text"],
    )

    # sorted() leaves the object returned by the client untouched.
    ranked = sorted(hits, key=lambda hit: hit["distance"], reverse=True)
    for row in ranked:
        print(f'{row["distance"]:.6f} {row["entity"]["text"]}')
|
|
60
|
0
|
61
|
4
|
# Disabled command-line driver: rebuild the collection, then run one query.
# q, = sys.argv[1:]

# https://huggingface.co/models?pipeline_tag=feature-extraction&library=onnx&sort=trending
# embedding_fn = OnnxEmbeddingFunction(model_name="jinaai/jina-embeddings-v2-base-en")
# client = MilvusClient("milvus_demo.db")
# rebuild(client, embedding_fn, dim=embedding_fn.dim)
# xxsearch(q, embedding_fn, client)
|
0
|
69
|
4
|
# FastAPI application object; the route below is registered on it through
# the ``@app.get`` decorator.
app = FastAPI()

# Module-level Search instance used by the /sco/query handler. Note that
# this binds the name ``search`` at module scope.
search = Search()
|
|
73
|
4
|
74
|
|
@app.get("/sco/query")
def read_query1(q: str):
    """Handle ``GET /sco/query`` by delegating to the module-level ``search``.

    Args:
        q: Query text, taken from the request's ``q`` parameter.

    Returns:
        A dict with the single key ``"results"`` holding whatever
        ``search.search(q)`` returned.
    """
    # No return annotation on purpose: FastAPI would treat it as a response
    # model and could change how the payload is validated and serialised.
    return {"results": search.search(q)}
|