annotate scobot/service/query.py @ 17:0d72635fc501

reloadIndexIfChanged
author drewp@bigasterisk.com
date Fri, 19 Jul 2024 00:59:45 -0700
parents 7a87ba2f00d9
children
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
14
b9c2b7fedbcd fix up deployment and connect ui to server again
drewp@bigasterisk.com
parents: 13
diff changeset
1 import html
13
403eff4a16c8 fix up indexer flow and fastapi server
drewp@bigasterisk.com
parents: 11
diff changeset
2 from scobot.index.access import SearchIndexRO
403eff4a16c8 fix up indexer flow and fastapi server
drewp@bigasterisk.com
parents: 11
diff changeset
3 from whoosh.qparser import QueryParser
2
82428652cda1 rewrite
drewp@bigasterisk.com
parents: 0
diff changeset
4 import json
82428652cda1 rewrite
drewp@bigasterisk.com
parents: 0
diff changeset
5 from pathlib import Path
13
403eff4a16c8 fix up indexer flow and fastapi server
drewp@bigasterisk.com
parents: 11
diff changeset
6 from pprint import pprint
403eff4a16c8 fix up indexer flow and fastapi server
drewp@bigasterisk.com
parents: 11
diff changeset
7 from contextlib import asynccontextmanager
2
82428652cda1 rewrite
drewp@bigasterisk.com
parents: 0
diff changeset
8
11
6622bacb0b84 first pass at reorg
drewp@bigasterisk.com
parents: 8
diff changeset
9 # from pymilvus import MilvusClient
6622bacb0b84 first pass at reorg
drewp@bigasterisk.com
parents: 8
diff changeset
10 # from milvus_model.dense.onnx import OnnxEmbeddingFunction
6622bacb0b84 first pass at reorg
drewp@bigasterisk.com
parents: 8
diff changeset
11 from fastapi import FastAPI
2
82428652cda1 rewrite
drewp@bigasterisk.com
parents: 0
diff changeset
12 from tqdm import tqdm
82428652cda1 rewrite
drewp@bigasterisk.com
parents: 0
diff changeset
13
82428652cda1 rewrite
drewp@bigasterisk.com
parents: 0
diff changeset
14
82428652cda1 rewrite
drewp@bigasterisk.com
parents: 0
diff changeset
def rebuild(client, embedding_fn, dim, *,
            collection_name="demo_collection",
            source_path=None):
    """Recreate a vector collection from the phrases of one source document.

    The collection is dropped, created again with vectors of size ``dim``,
    and filled with one record per (bbox, phrase) pair yielded by
    ``phrasesFromFile``.

    Args:
        client: vector-store client exposing ``drop_collection``,
            ``has_collection``, ``create_collection`` and ``insert``
            (presumably a pymilvus ``MilvusClient``, going by the
            commented-out imports in this module; confirm).
        embedding_fn: object whose ``encode_documents(list_of_str)`` returns
            one vector per input string.
        dim: dimension passed to ``create_collection``.
        collection_name: collection to rebuild; defaults to the name that
            was previously hard-coded.
        source_path: document to read phrases from; defaults to the PDF
            path that was previously hard-coded.

    Raises:
        ValueError: if ``encode_documents`` does not return exactly one
            vector for a single phrase (the unpacking below enforces this).
    """
    if source_path is None:
        source_path = (Path("data") /
                       "Meetings2226Minutes_20240702182359526 (1).pdf")

    # Always start from an empty collection.
    client.drop_collection(collection_name=collection_name)
    if not client.has_collection(collection_name=collection_name):
        client.create_collection(
            collection_name=collection_name,
            dimension=dim,
        )

    # NOTE(review): phrasesFromFile is neither imported nor defined in the
    # visible part of this module; calling rebuild() as-is would raise
    # NameError unless it is provided elsewhere. Confirm before reviving.
    phrases = tqdm(enumerate(phrasesFromFile(source_path)),
                   desc="rebuilding",
                   unit=' phrase')

    docs = []
    for i, (bbox, phrase) in phrases:
        # Encode one phrase at a time; exactly one vector is expected back.
        [vector] = embedding_fn.encode_documents([phrase])
        docs.append({
            "id": i,
            "vector": vector,
            "text": phrase,
            # bbox is stored as a JSON string rather than a nested value.
            "bbox": json.dumps(bbox),
        })

    res = client.insert(collection_name=collection_name, data=docs)
    print('insert:', res['insert_count'])
2
82428652cda1 rewrite
drewp@bigasterisk.com
parents: 0
diff changeset
40
82428652cda1 rewrite
drewp@bigasterisk.com
parents: 0
diff changeset
41
4
0e33c65f1904 playing with extractors
drewp@bigasterisk.com
parents: 2
diff changeset
42 # https://huggingface.co/models?pipeline_tag=feature-extraction&library=onnx&sort=trending
0e33c65f1904 playing with extractors
drewp@bigasterisk.com
parents: 2
diff changeset
43 # embedding_fn = OnnxEmbeddingFunction(model_name="jinaai/jina-embeddings-v2-base-en")
0e33c65f1904 playing with extractors
drewp@bigasterisk.com
parents: 2
diff changeset
44 # client = MilvusClient("milvus_demo.db")
0e33c65f1904 playing with extractors
drewp@bigasterisk.com
parents: 2
diff changeset
45 # rebuild(client, embedding_fn, dim=embedding_fn.dim)
0e33c65f1904 playing with extractors
drewp@bigasterisk.com
parents: 2
diff changeset
46 # search(q, embedding_fn, client)
0
drewp@bigasterisk.com
parents:
diff changeset
47
17
0d72635fc501 reloadIndexIfChanged
drewp@bigasterisk.com
parents: 16
diff changeset
48 indexPath = Path('data/build/index0')
0d72635fc501 reloadIndexIfChanged
drewp@bigasterisk.com
parents: 16
diff changeset
49
4
0e33c65f1904 playing with extractors
drewp@bigasterisk.com
parents: 2
diff changeset
50
13
403eff4a16c8 fix up indexer flow and fastapi server
drewp@bigasterisk.com
parents: 11
diff changeset
@asynccontextmanager
async def lifespan(app: FastAPI):
    """Lifespan hook handed to the FastAPI constructor.

    Runs ``reloadIndexIfChanged`` once before yielding control, so the
    index is attached to ``app.state`` before the application body runs.
    Nothing is done after the yield.
    """
    # Startup phase: make sure app.state.index exists.
    reloadIndexIfChanged(app)

    # Hand control back; there is no shutdown work.
    yield
8
f23b21bd0fce apex search
drewp@bigasterisk.com
parents: 4
diff changeset
55
17
0d72635fc501 reloadIndexIfChanged
drewp@bigasterisk.com
parents: 16
diff changeset
56
13
403eff4a16c8 fix up indexer flow and fastapi server
drewp@bigasterisk.com
parents: 11
diff changeset
57 app = FastAPI(lifespan=lifespan)
4
0e33c65f1904 playing with extractors
drewp@bigasterisk.com
parents: 2
diff changeset
58
17
0d72635fc501 reloadIndexIfChanged
drewp@bigasterisk.com
parents: 16
diff changeset
59
0d72635fc501 reloadIndexIfChanged
drewp@bigasterisk.com
parents: 16
diff changeset
def reloadIndexIfChanged(app: FastAPI):
    """Open the search index again if ``indexPath`` has a new mtime.

    The modification time of ``indexPath`` is compared with the value
    remembered on ``app.state.indexMtime``. On the first call (no value
    remembered yet) or when the two differ, a fresh
    ``SearchIndexRO(indexPath)`` is stored on ``app.state.index``.

    The mtime is read once and only recorded after the index has been
    opened successfully, so a failed load is retried on the next call
    instead of being remembered as up to date.

    Raises:
        OSError: propagated from ``indexPath.stat()`` (for example when the
            path does not exist).
    """
    currentMtime = indexPath.stat().st_mtime

    # getattr with a default covers the "never loaded" case that the
    # original checked with hasattr.
    if getattr(app.state, 'indexMtime', None) == currentMtime:
        return

    print('reloading index')
    app.state.index = SearchIndexRO(indexPath)
    # Record the mtime last: if opening the index raised, the stale (or
    # missing) indexMtime makes the next call try again.
    app.state.indexMtime = currentMtime
0d72635fc501 reloadIndexIfChanged
drewp@bigasterisk.com
parents: 16
diff changeset
66
0d72635fc501 reloadIndexIfChanged
drewp@bigasterisk.com
parents: 16
diff changeset
67
4
0e33c65f1904 playing with extractors
drewp@bigasterisk.com
parents: 2
diff changeset
def _withSnippetHtml(hit):
    """Copy one search hit into a dict and add its HTML-escaped phrase."""
    fields = dict(hit)
    fields['snippetHtml'] = html.escape(fields['phrase'])
    return fields


@app.get("/sco/query")
def read_query1(q: str):
    """Handle GET /sco/query: search the index for ``q``.

    The query string is parsed against the index schema with "phrase" as
    the default field, and the parsed query is pretty-printed to stdout.
    Each hit is returned as a dict of its fields plus a ``snippetHtml``
    entry holding the HTML-escaped ``phrase`` value.

    Returns:
        ``{"results": [...]}`` with one dict per hit.
    """
    # Pick up a rebuilt index before searching.
    reloadIndexIfChanged(app)
    searchIndex = app.state.index

    parser = QueryParser("phrase", searchIndex.ix.schema)
    parsed = parser.parse(q)
    pprint(parsed)

    hits = searchIndex.searcher.search(parsed)
    return {"results": [_withSnippetHtml(hit) for hit in hits]}