annotate search/query.py @ 8:f23b21bd0fce

apex search
author drewp@bigasterisk.com
date Sun, 07 Jul 2024 16:26:56 -0700
parents 0e33c65f1904
children
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
8
f23b21bd0fce apex search
drewp@bigasterisk.com
parents: 4
diff changeset
1 from dataclasses import dataclass
f23b21bd0fce apex search
drewp@bigasterisk.com
parents: 4
diff changeset
2 import html
2
82428652cda1 rewrite
drewp@bigasterisk.com
parents: 0
diff changeset
3 import json
8
f23b21bd0fce apex search
drewp@bigasterisk.com
parents: 4
diff changeset
4 from pprint import pprint
0
drewp@bigasterisk.com
parents:
diff changeset
5 import sys
2
82428652cda1 rewrite
drewp@bigasterisk.com
parents: 0
diff changeset
6 from pathlib import Path
8
f23b21bd0fce apex search
drewp@bigasterisk.com
parents: 4
diff changeset
7 from typing import Iterable
2
82428652cda1 rewrite
drewp@bigasterisk.com
parents: 0
diff changeset
8
82428652cda1 rewrite
drewp@bigasterisk.com
parents: 0
diff changeset
9 from tqdm import tqdm
82428652cda1 rewrite
drewp@bigasterisk.com
parents: 0
diff changeset
10
82428652cda1 rewrite
drewp@bigasterisk.com
parents: 0
diff changeset
11 from pymilvus import MilvusClient
82428652cda1 rewrite
drewp@bigasterisk.com
parents: 0
diff changeset
12 from milvus_model.dense.onnx import OnnxEmbeddingFunction
82428652cda1 rewrite
drewp@bigasterisk.com
parents: 0
diff changeset
13
8
f23b21bd0fce apex search
drewp@bigasterisk.com
parents: 4
diff changeset
14 from extract_pdf import files, phrasesFromFile
2
82428652cda1 rewrite
drewp@bigasterisk.com
parents: 0
diff changeset
15
4
0e33c65f1904 playing with extractors
drewp@bigasterisk.com
parents: 2
diff changeset
16 from fastapi import FastAPI
8
f23b21bd0fce apex search
drewp@bigasterisk.com
parents: 4
diff changeset
17 from search_apex import Search
f23b21bd0fce apex search
drewp@bigasterisk.com
parents: 4
diff changeset
18
2
82428652cda1 rewrite
drewp@bigasterisk.com
parents: 0
diff changeset
19
82428652cda1 rewrite
drewp@bigasterisk.com
parents: 0
diff changeset
def rebuild(client, embedding_fn, dim, collection_name="demo_collection",
            pdf_path=None):
    """Recreate a vector collection from scratch and index one PDF's phrases.

    The collection is dropped, created again with the requested vector
    dimension, and then filled with one row per phrase yielded by
    ``phrasesFromFile``.

    Args:
        client: Vector-store client. Must provide ``drop_collection``,
            ``create_collection`` and ``insert`` accepting the keyword
            arguments used below.
        embedding_fn: Object whose ``encode_documents(list_of_str)`` returns
            one vector per input string.
        dim: Vector dimension passed to ``create_collection``.
        collection_name: Name of the collection to rebuild.
        pdf_path: PDF to read phrases from. ``None`` selects the
            meeting-minutes file under ``data/`` that this function has
            always indexed.

    Returns:
        None. The insert count reported by the client is printed.
    """
    if pdf_path is None:
        pdf_path = Path("data") / "Meetings2226Minutes_20240702182359526 (1).pdf"

    # The collection is dropped unconditionally, so it never exists when we
    # get here; create it without a separate existence check.
    client.drop_collection(collection_name=collection_name)
    client.create_collection(
        collection_name=collection_name,
        dimension=dim,
    )

    docs = []
    phrases = enumerate(phrasesFromFile(pdf_path))
    for i, (bbox, phrase) in tqdm(phrases, desc="rebuilding", unit=' phrase'):
        # One phrase per call keeps the progress bar advancing per phrase;
        # encode_documents returns a one-element sequence for a one-element
        # input, which the unpacking below also asserts.
        [vector] = embedding_fn.encode_documents([phrase])
        docs.append({
            "id": i,  # position of the phrase in extraction order
            "vector": vector,
            "text": phrase,
            "bbox": json.dumps(bbox),  # stored as a JSON-encoded string
        })

    res = client.insert(collection_name=collection_name, data=docs)
    print('insert:', res['insert_count'])
2
82428652cda1 rewrite
drewp@bigasterisk.com
parents: 0
diff changeset
45
82428652cda1 rewrite
drewp@bigasterisk.com
parents: 0
diff changeset
46
8
f23b21bd0fce apex search
drewp@bigasterisk.com
parents: 4
diff changeset
def xxsearch(q, embedding_fn, client, limit=5,
             collection_name="demo_collection"):
    """Run one vector search for *q* and print the hits, best score first.

    Args:
        q: Query text.
        embedding_fn: Object whose ``encode_queries(list_of_str)`` returns
            one vector per input string.
        client: Vector-store client. Its ``search`` must return one result
            sequence per query vector, each hit being a mapping with
            ``"distance"`` and ``"entity"`` (containing ``"text"``).
        limit: Maximum number of hits requested from the client.
        collection_name: Collection to search.

    Returns:
        None. One line per hit is printed as ``"<distance> <text>"``.
    """
    query_vectors = embedding_fn.encode_queries([q])

    # Exactly one query vector was sent, so exactly one result list is
    # expected back; the unpacking fails loudly otherwise.
    [hits] = client.search(
        collection_name=collection_name,
        data=query_vectors,
        limit=limit,
        output_fields=["text"],
    )

    # Largest "distance" first. NOTE(review): this ordering assumes a
    # similarity-style metric where larger means closer — confirm against
    # the collection's metric type.
    # sorted() leaves the client's result object untouched and does not
    # require it to be a real list.
    ranked = sorted(hits, key=lambda hit: hit["distance"], reverse=True)

    for row in ranked:
        print(f'{row["distance"]:.6f} {row["entity"]["text"]}')
82428652cda1 rewrite
drewp@bigasterisk.com
parents: 0
diff changeset
60
0
drewp@bigasterisk.com
parents:
diff changeset
61
4
0e33c65f1904 playing with extractors
drewp@bigasterisk.com
parents: 2
diff changeset
62 # q, = sys.argv[1:]
0e33c65f1904 playing with extractors
drewp@bigasterisk.com
parents: 2
diff changeset
63
0e33c65f1904 playing with extractors
drewp@bigasterisk.com
parents: 2
diff changeset
64 # https://huggingface.co/models?pipeline_tag=feature-extraction&library=onnx&sort=trending
0e33c65f1904 playing with extractors
drewp@bigasterisk.com
parents: 2
diff changeset
65 # embedding_fn = OnnxEmbeddingFunction(model_name="jinaai/jina-embeddings-v2-base-en")
0e33c65f1904 playing with extractors
drewp@bigasterisk.com
parents: 2
diff changeset
66 # client = MilvusClient("milvus_demo.db")
0e33c65f1904 playing with extractors
drewp@bigasterisk.com
parents: 2
diff changeset
67 # rebuild(client, embedding_fn, dim=embedding_fn.dim)
0e33c65f1904 playing with extractors
drewp@bigasterisk.com
parents: 2
diff changeset
68 # xxsearch(q, embedding_fn, client)
0
drewp@bigasterisk.com
parents:
diff changeset
69
4
0e33c65f1904 playing with extractors
drewp@bigasterisk.com
parents: 2
diff changeset
# Web application object; the route below registers itself on it.
app = FastAPI()

# One search backend instance, built at import time and shared by every
# request handled by this module.
search = Search()


@app.get("/sco/query")
def read_query1(q: str):
    """Handle ``GET /sco/query?q=...`` by delegating to the search backend.

    Args:
        q: Query text, taken from the ``q`` query-string parameter.

    Returns:
        A dict with the single key ``"results"`` holding whatever
        ``search.search(q)`` returned.
    """
    return {"results": search.search(q)}