Mercurial > code > home > repos > sco-bot
view scobot/service/query.py @ 17:0d72635fc501
reloadIndexIfChanged
author | drewp@bigasterisk.com |
---|---|
date | Fri, 19 Jul 2024 00:59:45 -0700 |
parents | 7a87ba2f00d9 |
children |
line wrap: on
line source
import html
import json
from contextlib import asynccontextmanager
from pathlib import Path
from pprint import pprint

# from pymilvus import MilvusClient
# from milvus_model.dense.onnx import OnnxEmbeddingFunction
from fastapi import FastAPI
from tqdm import tqdm
from whoosh.qparser import QueryParser

from scobot.index.access import SearchIndexRO


def rebuild(client, embedding_fn, dim):
    """Recreate "demo_collection" and fill it with one embedded doc per phrase.

    The collection is dropped, created again with dimension ``dim``, and then
    every (bbox, phrase) pair yielded by ``phrasesFromFile`` for the meeting
    minutes PDF is encoded with ``embedding_fn.encode_documents`` and inserted
    in a single ``client.insert`` call.

    ``client`` is presumably a pymilvus ``MilvusClient`` and ``embedding_fn``
    an ``OnnxEmbeddingFunction`` (see the commented-out setup below) -- TODO
    confirm; nothing in this module currently calls this function.
    """
    client.drop_collection(collection_name="demo_collection")
    if not client.has_collection(collection_name="demo_collection"):
        client.create_collection(
            collection_name="demo_collection",
            dimension=dim,
        )

    docs = []
    # NOTE(review): phrasesFromFile is not imported or defined in this module
    # as shown, so calling rebuild() would raise NameError -- confirm where it
    # is supposed to come from.
    for i, (bbox, phrase) in tqdm(enumerate(
            phrasesFromFile(
                Path("data") /
                "Meetings2226Minutes_20240702182359526 (1).pdf")),
                                  desc="rebuilding",
                                  unit=' phrase'):
        # encode_documents takes a batch; a one-element batch is unpacked
        # into its single vector.
        [vector] = embedding_fn.encode_documents([phrase])
        doc = {
            "id": i,
            "vector": vector,
            "text": phrase,
            "bbox": json.dumps(bbox),
        }
        docs.append(doc)
    res = client.insert(collection_name="demo_collection", data=docs)
    print('insert:', res['insert_count'])


# https://huggingface.co/models?pipeline_tag=feature-extraction&library=onnx&sort=trending
# embedding_fn = OnnxEmbeddingFunction(model_name="jinaai/jina-embeddings-v2-base-en")
# client = MilvusClient("milvus_demo.db")
# rebuild(client, embedding_fn, dim=embedding_fn.dim)
# search(q, embedding_fn, client)

# Location of the search index opened by reloadIndexIfChanged.
indexPath = Path('data/build/index0')


@asynccontextmanager
async def lifespan(app: FastAPI):
    """Load the search index once at application startup."""
    reloadIndexIfChanged(app)
    yield


app = FastAPI(lifespan=lifespan)


def reloadIndexIfChanged(app: FastAPI):
    """Open the search index on ``app.state`` if it is new or has changed.

    The modification time of ``indexPath`` is compared with the value stored
    in ``app.state.indexMtime``; when they differ (or nothing has been loaded
    yet) a fresh ``SearchIndexRO`` is stored in ``app.state.index``.

    Raises whatever ``indexPath.stat()`` or ``SearchIndexRO`` raise, e.g. when
    the index path does not exist.
    """
    # Stat once so the value compared is the same value that gets recorded.
    mtime = indexPath.stat().st_mtime
    if getattr(app.state, 'indexMtime', None) == mtime:
        return

    print('reloading index')
    # Open the index before recording the mtime: if opening fails, the next
    # call retries instead of believing the new index is already loaded.
    app.state.index = SearchIndexRO(indexPath)
    app.state.indexMtime = mtime


@app.get("/sco/query")
def read_query1(q: str):
    """Search the "phrase" field for ``q`` and return the matching documents.

    Each result is the stored fields of the hit plus ``snippetHtml``, the
    HTML-escaped phrase text. The response has the shape
    ``{"results": [doc, ...]}``.
    """
    reloadIndexIfChanged(app)

    index = app.state.index
    query = QueryParser("phrase", index.ix.schema).parse(q)
    pprint(query)
    results = index.searcher.search(query)

    docs = []
    for hit in results:
        doc = dict(hit)
        doc['snippetHtml'] = html.escape(doc['phrase'])
        docs.append(doc)
    return {"results": docs}