view scobot/service/query.py @ 17:0d72635fc501

reloadIndexIfChanged
author drewp@bigasterisk.com
date Fri, 19 Jul 2024 00:59:45 -0700
parents 7a87ba2f00d9
children
line wrap: on
line source

import html
from scobot.index.access import SearchIndexRO
from whoosh.qparser import QueryParser
import json
from pathlib import Path
from pprint import pprint
from contextlib import asynccontextmanager

# from pymilvus import MilvusClient
# from milvus_model.dense.onnx import OnnxEmbeddingFunction
from fastapi import FastAPI
from tqdm import tqdm


def rebuild(client, embedding_fn, dim):
    """Recreate "demo_collection" from scratch and fill it with phrase vectors.

    The collection is dropped, created again with vector dimension ``dim`` if
    it is then absent, and populated with one row per ``(bbox, phrase)`` pair
    yielded by ``phrasesFromFile`` for the hard-coded minutes PDF. All rows
    are sent in a single ``client.insert`` call and the reported insert count
    is printed.

    NOTE(review): ``phrasesFromFile`` is not defined or imported in the
    visible part of this file, so calling this function as-is would raise
    NameError. The only call site is commented out.

    Args:
        client: object providing ``drop_collection``, ``has_collection``,
            ``create_collection`` and ``insert`` (presumably a pymilvus
            ``MilvusClient``, per the commented-out import -- confirm).
        embedding_fn: object whose ``encode_documents([text])`` returns a
            one-element sequence holding the vector for ``text``.
        dim: vector dimension passed to ``create_collection``.
    """
    collection = "demo_collection"

    client.drop_collection(collection_name=collection)
    if not client.has_collection(collection_name=collection):
        client.create_collection(collection_name=collection, dimension=dim)

    sourcePdf = Path("data") / "Meetings2226Minutes_20240702182359526 (1).pdf"
    progress = tqdm(enumerate(phrasesFromFile(sourcePdf)),
                    desc="rebuilding",
                    unit=' phrase')

    rows = []
    for rowId, (bbox, phrase) in progress:
        # One phrase per call; the unpacking insists on exactly one vector.
        (vector,) = embedding_fn.encode_documents([phrase])
        rows.append({
            "id": rowId,
            "vector": vector,
            "text": phrase,
            # bbox is stored as a JSON string rather than a structured value.
            "bbox": json.dumps(bbox),
        })

    outcome = client.insert(collection_name=collection, data=rows)
    print('insert:', outcome['insert_count'])


# https://huggingface.co/models?pipeline_tag=feature-extraction&library=onnx&sort=trending
# embedding_fn = OnnxEmbeddingFunction(model_name="jinaai/jina-embeddings-v2-base-en")
# client = MilvusClient("milvus_demo.db")
# rebuild(client, embedding_fn, dim=embedding_fn.dim)
# search(q, embedding_fn, client)

# Location of the search index opened by reloadIndexIfChanged(). The path is
# relative, so it is resolved against the process's working directory.
indexPath = Path('data/build/index0')


@asynccontextmanager
async def lifespan(app: FastAPI):
    """Lifespan context for the FastAPI app: load the search index, then yield.

    Everything before the ``yield`` runs when the context is entered; there is
    no code after the ``yield``, so nothing is done when the context exits.
    """
    # On a fresh app.state (no indexMtime attribute yet) this always loads.
    reloadIndexIfChanged(app)
    yield


# The web application; `lifespan` is registered so the index is loaded when
# the app starts up.
app = FastAPI(lifespan=lifespan)


def reloadIndexIfChanged(app: FastAPI):
    """(Re)open the search index when ``indexPath``'s mtime has changed.

    State is kept on ``app.state``:
      * ``indexMtime`` -- the ``st_mtime`` of ``indexPath`` seen at the last
        successful load.
      * ``index`` -- the ``SearchIndexRO`` opened at that time.

    A load happens when ``indexMtime`` has never been set or differs from the
    current mtime; otherwise the function returns without touching state.

    Raises:
        OSError: from ``indexPath.stat()`` (e.g. FileNotFoundError when the
            path does not exist).
        Whatever ``SearchIndexRO(indexPath)`` raises. In that case
        ``app.state`` is left exactly as it was, so the next call retries.

    NOTE(review): a previously loaded index object is simply replaced; whether
    SearchIndexRO needs an explicit close is not visible from this file.
    """
    # Stat once so the value compared and the value stored are the same.
    currentMtime = indexPath.stat().st_mtime
    if getattr(app.state, 'indexMtime', None) == currentMtime:
        return

    print('reloading index')
    # Open the new index BEFORE recording the mtime: if opening fails, the
    # stale/missing mtime makes the next call try again instead of silently
    # treating the failed load as up to date.
    freshIndex = SearchIndexRO(indexPath)
    app.state.index = freshIndex
    app.state.indexMtime = currentMtime


@app.get("/sco/query")
def read_query1(q: str):
    """Handle GET /sco/query: run ``q`` against the search index.

    The index is refreshed first via ``reloadIndexIfChanged``. ``q`` is parsed
    by a whoosh ``QueryParser`` created with field name "phrase" and the
    index's schema; the parsed query is pretty-printed to stdout and then
    executed with the index's searcher.

    Returns:
        ``{"results": [...]}`` where each entry is the hit converted to a
        dict, plus a ``snippetHtml`` key holding the HTML-escaped ``phrase``
        value of that hit.
    """
    reloadIndexIfChanged(app)
    index = app.state.index

    parser = QueryParser("phrase", index.ix.schema)
    parsed = parser.parse(q)
    pprint(parsed)

    found = []
    for hit in index.searcher.search(parsed):
        fields = dict(hit)
        # Escape so the phrase can be dropped into HTML by the caller as-is.
        found.append({**fields, 'snippetHtml': html.escape(fields['phrase'])})
    return {"results": found}