Mercurial > code > home > repos > sco-bot
annotate scobot/service/query.py @ 13:403eff4a16c8
fix up indexer flow and fastapi server
author | drewp@bigasterisk.com |
---|---|
date | Thu, 11 Jul 2024 21:32:24 -0700 |
parents | 6622bacb0b84 |
children | b9c2b7fedbcd |
rev | line source |
---|---|
13
403eff4a16c8
fix up indexer flow and fastapi server
drewp@bigasterisk.com
parents:
11
diff
changeset
|
1 from scobot.index.access import SearchIndexRO |
403eff4a16c8
fix up indexer flow and fastapi server
drewp@bigasterisk.com
parents:
11
diff
changeset
|
2 from whoosh.qparser import QueryParser |
2 | 3 import json |
4 from pathlib import Path | |
13
403eff4a16c8
fix up indexer flow and fastapi server
drewp@bigasterisk.com
parents:
11
diff
changeset
|
5 from pprint import pprint |
403eff4a16c8
fix up indexer flow and fastapi server
drewp@bigasterisk.com
parents:
11
diff
changeset
|
6 from contextlib import asynccontextmanager |
2 | 7 |
11 | 8 # from pymilvus import MilvusClient |
9 # from milvus_model.dense.onnx import OnnxEmbeddingFunction | |
10 from fastapi import FastAPI | |
2 | 11 from tqdm import tqdm |
12 | |
13 | |
def rebuild(client, embedding_fn, dim):
    """Recreate "demo_collection" and refill it with embedded phrases.

    The collection is dropped, then created again with ``dimension=dim``
    when the client reports it as absent. One record is built per
    ``(bbox, phrase)`` pair yielded by ``phrasesFromFile`` for the
    hard-coded minutes PDF under ``data/``: the running index as ``id``,
    the vector returned by ``embedding_fn.encode_documents``, the phrase
    text, and the bbox serialized as JSON. All records are inserted in one
    call and the client's reported insert count is printed.

    NOTE(review): ``phrasesFromFile`` is not imported or defined in the
    visible part of this module -- confirm where it is meant to come from.
    """
    collection = "demo_collection"
    client.drop_collection(collection_name=collection)
    collection_missing = not client.has_collection(collection_name=collection)
    if collection_missing:
        client.create_collection(collection_name=collection, dimension=dim)

    source_pdf = Path("data") / "Meetings2226Minutes_20240702182359526 (1).pdf"
    numbered_phrases = enumerate(phrasesFromFile(source_pdf))
    progress = tqdm(numbered_phrases, desc="rebuilding", unit=' phrase')

    records = []
    for doc_id, (bbox, phrase) in progress:
        # encode_documents takes a batch; a one-item batch yields one vector.
        [vector] = embedding_fn.encode_documents([phrase])
        records.append({
            "id": doc_id,
            "vector": vector,
            "text": phrase,
            "bbox": json.dumps(bbox),
        })
    outcome = client.insert(collection_name=collection, data=records)
    print('insert:', outcome['insert_count'])
2 | 39 |
40 | |
4 | 41 # https://huggingface.co/models?pipeline_tag=feature-extraction&library=onnx&sort=trending |
42 # embedding_fn = OnnxEmbeddingFunction(model_name="jinaai/jina-embeddings-v2-base-en") | |
43 # client = MilvusClient("milvus_demo.db") | |
44 # rebuild(client, embedding_fn, dim=embedding_fn.dim) | |
45 # search(q, embedding_fn, client) | |
0 | 46 |
4 | 47 |
13
403eff4a16c8
fix up indexer flow and fastapi server
drewp@bigasterisk.com
parents:
11
diff
changeset
|
@asynccontextmanager
async def lifespan(app: FastAPI):
    """Attach the search index to the application before yielding.

    Builds a ``SearchIndexRO`` for ``/tmp/scoindex`` and stores it on
    ``app.state.index``. Nothing is done after the ``yield``.
    """
    search_index = SearchIndexRO('/tmp/scoindex')
    app.state.index = search_index
    yield
8 | 52 |
13
403eff4a16c8
fix up indexer flow and fastapi server
drewp@bigasterisk.com
parents:
11
diff
changeset
|
# FastAPI application; `lifespan` stores the search index on app.state.
app = FastAPI(lifespan=lifespan)


@app.get("/sco/query")
def read_query1(q: str):
    """Run the query string ``q`` against the index and return the hits.

    ``q`` is parsed by a whoosh ``QueryParser`` built with "phrase" as its
    field name and the schema of the index held on ``app.state.index``.
    The parsed query is pretty-printed to stdout, then passed to the
    index's searcher.

    Returns a dict with a single key, "results", holding the search
    results as a list.
    """
    search_index = app.state.index

    parser = QueryParser("phrase", search_index.ix.schema)
    parsed = parser.parse(q)
    pprint(parsed)

    hits = search_index.searcher.search(parsed)
    return {"results": list(hits)}