# HG changeset patch # User drewp@bigasterisk.com # Date 1720758744 25200 # Node ID 403eff4a16c8ac6a8ad1f357b2f88aa45c91c5ef # Parent 7f36497bfac3778cf012eb80822b583ee103e87d fix up indexer flow and fastapi server diff -r 7f36497bfac3 -r 403eff4a16c8 .hgignore --- a/.hgignore Thu Jul 11 18:16:20 2024 -0700 +++ b/.hgignore Thu Jul 11 21:32:24 2024 -0700 @@ -6,3 +6,4 @@ data/ milvus_demo.db web/node_modules/ +prefect/ diff -r 7f36497bfac3 -r 403eff4a16c8 env --- a/env Thu Jul 11 18:16:20 2024 -0700 +++ b/env Thu Jul 11 21:32:24 2024 -0700 @@ -1,2 +1,3 @@ -PREFECT_API_URL=http://127.0.0.1:4200/api +PREFECT_API_URL=http://dash:4200/api PREFECT_HOME=./prefect +PREFECT_SERVER_API_HOST=0.0.0.0 \ No newline at end of file diff -r 7f36497bfac3 -r 403eff4a16c8 k8s/deploy.yaml --- a/k8s/deploy.yaml Thu Jul 11 18:16:20 2024 -0700 +++ b/k8s/deploy.yaml Thu Jul 11 21:32:24 2024 -0700 @@ -38,7 +38,7 @@ - "0.0.0.0" - --port - "8001" - - scobot/service/query.py + - scobot ports: - containerPort: 8001 volumeMounts: diff -r 7f36497bfac3 -r 403eff4a16c8 pyproject.toml --- a/pyproject.toml Thu Jul 11 18:16:20 2024 -0700 +++ b/pyproject.toml Thu Jul 11 21:32:24 2024 -0700 @@ -31,5 +31,6 @@ [tool.pdm.scripts] _.env_file = "env" run_prefect_server = "prefect server start" -run_build_flow = "python build_index.py" -start_build = "prefect deployment run buildIndex/buildIndex" \ No newline at end of file +run_build_flow = "python -c 'from scobot.index.build_index_flow import buildIndex; buildIndex.serve(buildIndex.__name__)'" +start_build = "prefect deployment run buildIndex/buildIndex" +run_local_deploy = "fastapi dev --host 0.0.0.0 --port 8001 scobot" \ No newline at end of file diff -r 7f36497bfac3 -r 403eff4a16c8 scobot/__init__.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/scobot/__init__.py Thu Jul 11 21:32:24 2024 -0700 @@ -0,0 +1,1 @@ +from scobot.service.query import app diff -r 7f36497bfac3 -r 403eff4a16c8 scobot/index/access.py --- a/scobot/index/access.py Thu Jul 11 18:16:20 2024 -0700 +++ b/scobot/index/access.py Thu Jul 11 21:32:24 2024 -0700 @@ -1,6 +1,7 @@ from pathlib import Path +import shutil -from whoosh.index import create_in +from whoosh.index import create_in, open_dir from scobot.index.schema import schema @@ -9,9 +10,13 @@ class SearchIndex: - def __init__(self, indexDir: Path): - indexDir.mkdir(parents=True, exist_ok=True) - self.ix = create_in(indexDir, schema) + def __init__(self, indexDir: Path, delete_existing=True): + if delete_existing: + shutil.rmtree(indexDir) + indexDir.mkdir(parents=True, exist_ok=True) + self.ix = create_in(indexDir, schema) + else: + self.ix = open_dir(indexDir) self.writer = self.ix.writer() def addDoc(self, **kw): @@ -21,3 +26,9 @@ self.writer.commit() with self.ix.searcher() as searcher: log.info(f'index doc count = {searcher.doc_count()}') + +class SearchIndexRO: + def __init__(self, indexDir: Path): + self.ix = open_dir(indexDir, readonly=True) + self.searcher = self.ix.searcher() + print(f'{self.searcher.doc_count()=}') \ No newline at end of file diff -r 7f36497bfac3 -r 403eff4a16c8 scobot/index/build_index_flow.py --- a/scobot/index/build_index_flow.py Thu Jul 11 18:16:20 2024 -0700 +++ b/scobot/index/build_index_flow.py Thu Jul 11 21:32:24 2024 -0700 @@ -71,9 +71,9 @@ pass else: html = getCityPermanent(agendaUrl) - text = extractMeetingText(html) - # todo group phrases phrasesFromFile - index.addDoc(title=f'{mtg["date"]} {mtg["title"]}', content=text) + texts = extractMeetingText(html) + for se in nltk.sent_tokenize(' '.join(texts)): + index.addDoc(sourceTitle=f'{mtg["date"]} {mtg["title"]}', phrase=se) try: videoUrl = mtg['videoUrl'] @@ -102,4 +102,4 @@ if __name__ == '__main__': - buildIndex.serve() + buildIndex.serve(name='buildIndex') diff -r 7f36497bfac3 -r 403eff4a16c8 scobot/service/query.py --- a/scobot/service/query.py Thu Jul 11 18:16:20 2024 -0700 +++ b/scobot/service/query.py Thu Jul 11 21:32:24 2024 -0700 @@ -1,5 +1,9 @@ +from scobot.index.access import SearchIndexRO +from whoosh.qparser import QueryParser import json from pathlib import Path +from pprint import pprint +from contextlib import asynccontextmanager # from pymilvus import MilvusClient # from milvus_model.dense.onnx import OnnxEmbeddingFunction @@ -34,37 +38,25 @@ print('insert:', res['insert_count']) -def xxsearch(q, embedding_fn, client): - query_vectors = embedding_fn.encode_queries([q]) - - [query_result] = client.search( - collection_name="demo_collection", - data=query_vectors, - limit=5, - output_fields=["text"], - ) - query_result.sort(key=lambda x: x["distance"], reverse=True) - - for row in query_result: - print(f'{row["distance"]:.6f} {row["entity"]["text"]}') - - -# q, = sys.argv[1:] - # https://huggingface.co/models?pipeline_tag=feature-extraction&library=onnx&sort=trending # embedding_fn = OnnxEmbeddingFunction(model_name="jinaai/jina-embeddings-v2-base-en") # client = MilvusClient("milvus_demo.db") # rebuild(client, embedding_fn, dim=embedding_fn.dim) # search(q, embedding_fn, client) -app = FastAPI() -#search = Search() +@asynccontextmanager +async def lifespan(app: FastAPI): + app.state.index = SearchIndexRO('/tmp/scoindex') + yield +app = FastAPI(lifespan=lifespan) @app.get("/sco/query") def read_query1(q: str): - results = [] - results = search.search(q) + index = app.state.index + query = QueryParser("phrase", index.ix.schema).parse(q) + pprint(query) + results = list(index.searcher.search(query)) return {"results": results}