Mercurial > code > home > repos > sco-bot
annotate scobot/service/query.py @ 17:0d72635fc501
reloadIndexIfChanged
author | drewp@bigasterisk.com |
---|---|
date | Fri, 19 Jul 2024 00:59:45 -0700 |
parents | 7a87ba2f00d9 |
children |
rev | line source |
---|---|
14
b9c2b7fedbcd
fix up deployment and connect ui to server again
drewp@bigasterisk.com
parents:
13
diff
changeset
|
1 import html |
13
403eff4a16c8
fix up indexer flow and fastapi server
drewp@bigasterisk.com
parents:
11
diff
changeset
|
2 from scobot.index.access import SearchIndexRO |
403eff4a16c8
fix up indexer flow and fastapi server
drewp@bigasterisk.com
parents:
11
diff
changeset
|
3 from whoosh.qparser import QueryParser |
2 | 4 import json |
5 from pathlib import Path | |
13
403eff4a16c8
fix up indexer flow and fastapi server
drewp@bigasterisk.com
parents:
11
diff
changeset
|
6 from pprint import pprint |
403eff4a16c8
fix up indexer flow and fastapi server
drewp@bigasterisk.com
parents:
11
diff
changeset
|
7 from contextlib import asynccontextmanager |
2 | 8 |
11 | 9 # from pymilvus import MilvusClient |
10 # from milvus_model.dense.onnx import OnnxEmbeddingFunction | |
11 from fastapi import FastAPI | |
2 | 12 from tqdm import tqdm |
13 | |
14 | |
def rebuild(client, embedding_fn, dim):
    """Recreate "demo_collection" and fill it with embedded phrases from the minutes PDF.

    The collection is dropped, created again with dimension ``dim`` if
    ``client`` reports it missing, and then loaded with one row per
    ``(bbox, phrase)`` pair yielded for the PDF. Each row holds the
    enumeration index, the phrase's vector, the phrase text, and the bbox
    serialized as JSON. The insert count reported by ``client`` is printed.

    NOTE(review): ``phrasesFromFile`` is not imported in the visible part of
    this file -- confirm where it is expected to come from before calling this.

    Args:
        client: object providing ``drop_collection``, ``has_collection``,
            ``create_collection`` and ``insert`` (presumably a MilvusClient,
            per the commented-out import -- TODO confirm).
        embedding_fn: object whose ``encode_documents`` maps a list of
            phrases to a list of vectors.
        dim: vector dimension passed to ``create_collection``.
    """
    collectionName = "demo_collection"

    client.drop_collection(collection_name=collectionName)
    collectionExists = client.has_collection(collection_name=collectionName)
    if not collectionExists:
        client.create_collection(
            collection_name=collectionName,
            dimension=dim,
        )

    pdfPath = Path("data") / "Meetings2226Minutes_20240702182359526 (1).pdf"
    numberedPhrases = enumerate(phrasesFromFile(pdfPath))
    progress = tqdm(numberedPhrases, desc="rebuilding", unit=' phrase')

    rows = []
    for rowId, (bbox, phrase) in progress:
        # One phrase per call; unpacking insists on exactly one vector back.
        [vector] = embedding_fn.encode_documents([phrase])
        rows.append({
            "id": rowId,
            "vector": vector,
            "text": phrase,
            "bbox": json.dumps(bbox),
        })

    # All rows go in with a single insert once every phrase is embedded.
    outcome = client.insert(collection_name=collectionName, data=rows)
    print('insert:', outcome['insert_count'])
2 | 40 |
41 | |
4 | 42 # https://huggingface.co/models?pipeline_tag=feature-extraction&library=onnx&sort=trending |
43 # embedding_fn = OnnxEmbeddingFunction(model_name="jinaai/jina-embeddings-v2-base-en") | |
44 # client = MilvusClient("milvus_demo.db") | |
45 # rebuild(client, embedding_fn, dim=embedding_fn.dim) | |
46 # search(q, embedding_fn, client) | |
0 | 47 |
# Location of the search index opened by reloadIndexIfChanged; its mtime is what gets polled.
17 | 48 indexPath = Path('data/build/index0')
49 | |
4 | 50 |
13
403eff4a16c8
fix up indexer flow and fastapi server
drewp@bigasterisk.com
parents:
11
diff
changeset
|
@asynccontextmanager
async def lifespan(app: FastAPI):
    """Lifespan context for the FastAPI app: load the search index, then serve.

    The code before ``yield`` makes sure ``app.state`` holds an index; nothing
    follows the ``yield``, so no teardown work is done here.
    """
    reloadIndexIfChanged(app)
    yield


# The service's application object, wired to the lifespan context above.
app = FastAPI(lifespan=lifespan)
4 | 58 |
17 | 59 |
def reloadIndexIfChanged(app: FastAPI):
    """Open a fresh SearchIndexRO on ``app.state`` when ``indexPath`` has a new mtime.

    The first call (no ``app.state.indexMtime`` recorded yet) always loads the
    index. Later calls compare the recorded mtime with the current mtime of
    ``indexPath`` and reload only when the two differ.

    Args:
        app: the application whose ``state`` carries ``index`` and
            ``indexMtime``.

    Raises:
        Whatever ``indexPath.stat()`` or ``SearchIndexRO(indexPath)`` raise.
        In that case ``app.state`` is left as it was, so the next call
        attempts the reload again.
    """
    # Stat once so the value we compare is exactly the value we record.
    currentMtime = indexPath.stat().st_mtime
    if getattr(app.state, 'indexMtime', None) == currentMtime:
        return

    print('reloading index')
    # Open the index BEFORE recording the mtime: if opening fails, the old
    # (or absent) mtime makes the next call retry instead of wrongly treating
    # the stale/missing index as current.
    freshIndex = SearchIndexRO(indexPath)
    # NOTE(review): the previously loaded index object is simply replaced;
    # whether SearchIndexRO needs an explicit close is not visible here.
    app.state.index = freshIndex
    app.state.indexMtime = currentMtime
66 | |
67 | |
@app.get("/sco/query")
def read_query1(q: str):
    """Handle GET /sco/query: search the "phrase" field and return the hits.

    The index is refreshed first if its mtime changed. ``q`` is parsed with a
    QueryParser built on the index schema, the parsed query is pretty-printed
    to stdout, and each hit is returned as a dict of its fields plus a
    ``snippetHtml`` entry holding the HTML-escaped phrase.

    Args:
        q: the query string taken from the request.

    Returns:
        ``{"results": [...]}`` with one dict per search hit.
    """
    reloadIndexIfChanged(app)
    searchIndex = app.state.index

    parser = QueryParser("phrase", searchIndex.ix.schema)
    parsed = parser.parse(q)
    pprint(parsed)

    hits = searchIndex.searcher.search(parsed)
    # Copy each hit into a plain dict, then add the escaped phrase for the UI.
    found = [
        {**fields, 'snippetHtml': html.escape(fields['phrase'])}
        for fields in map(dict, hits)
    ]
    return {"results": found}