changeset 8:f23b21bd0fce
apex search
author    drewp@bigasterisk.com
date      Sun, 07 Jul 2024 16:26:56 -0700
parents   53ae53f7d1b3
children  d1b54241a731
files     k8s/deploy.yaml k8s/volumes.yaml pdm.lock pyproject.toml search/Dockerfile search/extract_pdf.py search/query.py search/search_apex.py search/search_apex_rebuild.py search/search_base.py web/src/main.ts
diffstat  11 files changed, 218 insertions(+), 25 deletions(-)

--- a/k8s/deploy.yaml	Sat Jul 06 16:45:19 2024 -0700
+++ b/k8s/deploy.yaml	Sun Jul 07 16:26:56 2024 -0700
@@ -12,6 +12,10 @@
       labels:
         app: sco-bot
     spec:
+      volumes:
+        - name: sco-bot-data
+          persistentVolumeClaim:
+            claimName: sco-bot-data
       containers:
         - name: vite
           image: reg:5000/sco_bot_web
@@ -36,4 +40,15 @@
           - "8001"
          - search/query.py
          ports:
-           - containerPort: 8001
\ No newline at end of file
+           - containerPort: 8001
+         volumeMounts:
+           - name: sco-bot-data
+             mountPath: /opt/data
+      affinity:
+        nodeAffinity:
+          requiredDuringSchedulingIgnoredDuringExecution:
+            nodeSelectorTerms:
+              - matchExpressions:
+                  - key: "kubernetes.io/hostname"
+                    operator: In
+                    values: ["ditto", "dash"] # need /my/serv
\ No newline at end of file

--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/k8s/volumes.yaml	Sun Jul 07 16:26:56 2024 -0700
@@ -0,0 +1,22 @@
+
+apiVersion: v1
+kind: PersistentVolume
+metadata:
+  name: sco-bot-data
+  labels: {type: local}
+spec:
+  storageClassName: manual
+  hostPath: {path: "/my/serv/sco-bot/data"}
+  capacity: {storage: 5Mi}
+  accessModes: ["ReadWriteOnce"]
+  persistentVolumeReclaimPolicy: Retain
+  claimRef: {namespace: default, name: sco-bot-data}
+---
+apiVersion: v1
+kind: PersistentVolumeClaim
+metadata: {name: sco-bot-data}
+spec:
+  storageClassName: ""
+  volumeName: "sco-bot-data"
+  accessModes: ["ReadWriteOnce"]
+  resources: { requests: { storage: 5Mi } }
\ No newline at end of file

--- a/pdm.lock	Sat Jul 06 16:45:19 2024 -0700
+++ b/pdm.lock	Sun Jul 07 16:26:56 2024 -0700
@@ -5,7 +5,7 @@
 groups = ["default"]
 strategy = ["cross_platform", "inherit_metadata"]
 lock_version = "4.4.1"
-content_hash = "sha256:914f437191f234f800e6f04c0fedb35ceea1ea96f0c2b60406a111aa691ab8df"
+content_hash = "sha256:50f2904cbccac5298835948c8e9bc0168395a04c59bc0967ae6e8c073b2adfb8"
 
 [[package]]
 name = "absl-py"
@@ -106,6 +106,16 @@
 ]
 
 [[package]]
+name = "apex-search"
+version = "0.0.2"
+requires_python = ">=3.10"
+summary = "A full text search implementation in Python"
+groups = ["default"]
+files = [
+    {file = "apex-search-0.0.2.tar.gz", hash = "sha256:e7593732b8b27e0994ed1d2f8b9f213444ee60f42da0bce39352cad343c68364"},
+]
+
+[[package]]
 name = "asttokens"
 version = "2.4.1"
 summary = "Annotate AST trees with source code positions"
@@ -130,6 +140,20 @@
 ]
 
 [[package]]
+name = "beautifulsoup4"
+version = "4.12.3"
+requires_python = ">=3.6.0"
+summary = "Screen-scraping library"
+groups = ["default"]
+dependencies = [
+    "soupsieve>1.2",
+]
+files = [
+    {file = "beautifulsoup4-4.12.3-py3-none-any.whl", hash = "sha256:b80878c9f40111313e55da8ba20bdba06d8fa3969fc68304167741bbf9e082ed"},
+    {file = "beautifulsoup4-4.12.3.tar.gz", hash = "sha256:74e3d1928edc070d21748185c46e3fb33490f22f52a3addee9aee0f4f7781051"},
+]
+
+[[package]]
 name = "certifi"
 version = "2024.7.4"
 requires_python = ">=3.6"
@@ -906,6 +930,20 @@
 ]
 
 [[package]]
+name = "markdownify"
+version = "0.12.1"
+summary = "Convert HTML to markdown."
+groups = ["default"]
+dependencies = [
+    "beautifulsoup4<5,>=4.9",
+    "six<2,>=1.15",
+]
+files = [
+    {file = "markdownify-0.12.1-py3-none-any.whl", hash = "sha256:a3805abd8166dbb7b27783c5599d91f54f10d79894b2621404d85b333c7ce561"},
+    {file = "markdownify-0.12.1.tar.gz", hash = "sha256:1fb08c618b30e0ee7a31a39b998f44a18fb28ab254f55f4af06b6d35a2179e27"},
+]
+
+[[package]]
 name = "markupsafe"
 version = "2.1.5"
 requires_python = ">=3.7"
@@ -1861,6 +1899,17 @@
 ]
 
 [[package]]
+name = "soupsieve"
+version = "2.5"
+requires_python = ">=3.8"
+summary = "A modern CSS selector implementation for Beautiful Soup."
+groups = ["default"]
+files = [
+    {file = "soupsieve-2.5-py3-none-any.whl", hash = "sha256:eaa337ff55a1579b6549dc679565eac1e3d000563bcb1c8ab0d0fefbc0c2cdc7"},
+    {file = "soupsieve-2.5.tar.gz", hash = "sha256:5663d5a7b3bfaeee0bc4372e7fc48f9cff4940b3eec54a6451cc5299f1097690"},
+]
+
+[[package]]
 name = "sqlalchemy"
 version = "2.0.31"
 requires_python = ">=3.7"
@@ -1935,6 +1984,26 @@
 ]
 
 [[package]]
+name = "tantivy"
+version = "0.22.0"
+requires_python = ">=3.8"
+summary = ""
+groups = ["default"]
+files = [
+    {file = "tantivy-0.22.0-cp311-cp311-macosx_10_7_x86_64.whl", hash = "sha256:ec693abf38f229bc1361b0d34029a8bb9f3ee5bb956a3e745e0c4a66ea815bec"},
+    {file = "tantivy-0.22.0-cp311-cp311-macosx_10_9_x86_64.macosx_11_0_arm64.macosx_10_9_universal2.whl", hash = "sha256:e385839badc12b81e38bf0a4d865ee7c3a992fea9f5ce4117adae89369e7d1eb"},
+    {file = "tantivy-0.22.0-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:b6c097d94be1af106676c86c02b185f029484fdbd9a2b9f17cb980e840e7bdad"},
+    {file = "tantivy-0.22.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:c47a5cdec306ea8594cb6e7effd4b430932ebfd969f9e8f99e343adf56a79bc9"},
+    {file = "tantivy-0.22.0-cp311-none-win_amd64.whl", hash = "sha256:ba0ca878ed025d79edd9c51cda80b0105be8facbaec180fea64a17b80c74e7db"},
+    {file = "tantivy-0.22.0-cp312-cp312-macosx_10_7_x86_64.whl", hash = "sha256:925682f3acb65c85c2a5a5b131401b9f30c184ea68aa73a8cc7c2ea6115e8ae3"},
+    {file = "tantivy-0.22.0-cp312-cp312-macosx_10_9_x86_64.macosx_11_0_arm64.macosx_10_9_universal2.whl", hash = "sha256:d75760e45a329313001354d6ca415ff12d9d812343792ae133da6bfbdc4b04a5"},
+    {file = "tantivy-0.22.0-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:fd909d122b5af457d955552c304f8d5d046aee7024c703c62652ad72af89f3c7"},
+    {file = "tantivy-0.22.0-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:c99266ffb204721eb2bd5b3184aa87860a6cff51b4563f808f78fa22d85a8093"},
+    {file = "tantivy-0.22.0-cp312-none-win_amd64.whl", hash = "sha256:9ed6b813a1e7769444e33979b46b470b2f4c62d983c2560ce9486fb9be1491c9"},
+    {file = "tantivy-0.22.0.tar.gz", hash = "sha256:dce07fa2910c94934aa3d96c91087936c24e4a5802d839625d67edc6d1c95e5c"},
+]
+
+[[package]]
 name = "tenacity"
 version = "8.4.2"
 requires_python = ">=3.8"

--- a/pyproject.toml	Sat Jul 06 16:45:19 2024 -0700
+++ b/pyproject.toml	Sun Jul 07 16:26:56 2024 -0700
@@ -15,6 +15,9 @@
     "nltk>=3.8.1",
     "langchain>=0.2.6",
     "fastapi>=0.111.0",
+    "apex-search>=0.0.2",
+    "tantivy>=0.22.0",
+    "markdownify>=0.12.1",
 ]
 requires-python = ">=3.11"
 readme = "README.md"

--- a/search/Dockerfile	Sat Jul 06 16:45:19 2024 -0700
+++ b/search/Dockerfile	Sun Jul 07 16:26:56 2024 -0700
@@ -5,4 +5,6 @@
 
 COPY pyproject.toml pdm.lock ./
 RUN pdm sync
+RUN pdm run python -c 'import nltk; nltk.download("punkt")'
+
 COPY search/** ./search/
\ No newline at end of file
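
The punkt download is baked into the image at build time, presumably because the phrase extraction tokenizes text with nltk and would otherwise hit a missing-model error on first use inside the container. A minimal sketch of that assumption (hypothetical snippet, not part of the changeset):

    import nltk

    nltk.download("punkt")  # same model the Dockerfile RUN line fetches
    print(nltk.sent_tokenize("Motion carried. Meeting adjourned at 9:02 PM."))
    # expected: ['Motion carried.', 'Meeting adjourned at 9:02 PM.']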

--- a/search/extract_pdf.py	Sat Jul 06 16:45:19 2024 -0700
+++ b/search/extract_pdf.py	Sun Jul 07 16:26:56 2024 -0700
@@ -8,6 +8,9 @@
 from pdfminer.pdfinterp import PDFPageInterpreter, PDFResourceManager
 from pdfminer.pdfpage import PDFPage
 
+def files() -> Iterable[Path]:
+    for p in Path('data').glob('*.pdf'):
+        yield p
 
 def phrasesFromFile(p: Path) -> Iterable:
     fp = open(p, 'rb')

--- a/search/query.py	Sat Jul 06 16:45:19 2024 -0700
+++ b/search/query.py	Sun Jul 07 16:26:56 2024 -0700
@@ -1,15 +1,21 @@
+from dataclasses import dataclass
+import html
 import json
+from pprint import pprint
 import sys
 from pathlib import Path
+from typing import Iterable
 
 from tqdm import tqdm
 from pymilvus import MilvusClient
 from milvus_model.dense.onnx import OnnxEmbeddingFunction
-from extract_pdf import phrasesFromFile
+from extract_pdf import files, phrasesFromFile
 from fastapi import FastAPI
 
+from search_apex import Search
+
 
 def rebuild(client, embedding_fn, dim):
     client.drop_collection(collection_name="demo_collection")
@@ -24,8 +30,8 @@
             phrasesFromFile(
                 Path("data") /
                 "Meetings2226Minutes_20240702182359526 (1).pdf")),
-        desc="rebuilding",
-        unit=' phrase'):
+            desc="rebuilding",
+            unit=' phrase'):
         [vector] = embedding_fn.encode_documents([phrase])
         doc = {
             "id": i,
@@ -38,7 +44,7 @@
     print('insert:', res['insert_count'])
 
 
-def search(q, embedding_fn, client):
+def xxsearch(q, embedding_fn, client):
     query_vectors = embedding_fn.encode_queries([q])
 
     [query_result] = client.search(
@@ -63,8 +69,12 @@
 
 app = FastAPI()
 
+search = Search()
+
 
 @app.get("/sco/query")
-def read_query1(q: str|None):
-    print(f'1 {q=}')
-    return {"Hello": "World"}
+def read_query1(q: str):
+    results = []
+    results = search.search(q)
+
+    return {"results": results}
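
With this change /sco/query returns real hits from the apex-search index instead of a placeholder response. A quick client sketch, assuming the service is reachable on port 8001 (the containerPort from deploy.yaml) and the route is served at /sco/query:

    import requests

    resp = requests.get("http://localhost:8001/sco/query", params={"q": "climate"})
    resp.raise_for_status()
    for hit in resp.json()["results"]:
        print(hit["title"], "-", hit["snippetHtml"][:60])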

--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/search/search_apex.py	Sun Jul 07 16:26:56 2024 -0700
@@ -0,0 +1,27 @@
+from pprint import pprint
+from typing import Iterable
+from apexsearch import ApexSearch
+
+
+class Search:
+
+    def __init__(self):
+        self.apex = ApexSearch('data/apex',
+                               tables={
+                                   "docs": {
+                                       "content": ["phrase"],
+                                       "title": "title",
+                                       "extras": ["sourceFile", "pos"],
+                                   }
+                               },
+                               id_field='id')
+
+    def rebuild(self, docs: Iterable):
+        self.apex.build_complete_index(lambda *a: docs)
+        print('rebuild complete')
+
+    def search(self, q: str):
+        res = self.apex.search(q, target_number=100)
+        pprint(res)
+        for row in res['results']:
+            yield {'title': row['title'], 'snippetHtml': row['highlighted_content']}
\ No newline at end of file
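
Search wraps the ApexSearch table configuration so callers only deal with plain dicts. For an ad-hoc query outside the web service, something like the following should work, assuming the index under data/apex has already been built (see the rebuild script in the next file) and the process runs from the repo root so the relative path resolves:

    from search.search_apex import Search

    s = Search()
    for hit in s.search("public comment"):
        print(hit["title"], hit["snippetHtml"][:80])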

--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/search/search_apex_rebuild.py	Sun Jul 07 16:26:56 2024 -0700
@@ -0,0 +1,7 @@
+
+from search.search_apex import Search
+from search.search_base import allDocs
+
+
+search = Search()
+search.rebuild(allDocs())
\ No newline at end of file

--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/search/search_base.py	Sun Jul 07 16:26:56 2024 -0700
@@ -0,0 +1,32 @@
+
+from dataclasses import dataclass
+import json
+from typing import Iterable
+
+from search.extract_pdf import files, phrasesFromFile
+
+
+@dataclass
+class Doc:
+    id: int
+    title: str
+    sourceFile: str
+    posJson: str
+    phrase: str
+
+    def __getitem__(self, k):
+        return getattr(self, k)
+
+    pop = __getitem__
+
+
+def allDocs() -> Iterable[Doc]:
+    id = 0
+    for src in files():
+        for pos, line in phrasesFromFile(src):
+            yield Doc(id=id,
+                      title=src.name,
+                      sourceFile=str(src),
+                      posJson=json.dumps(pos),
+                      phrase=line)
+            id += 1
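
The __getitem__ / pop aliases appear to let the indexer treat Doc rows like dicts; note that pop here only reads a field and never removes it. A tiny illustration with hypothetical values:

    from search.search_base import Doc

    d = Doc(id=0, title="minutes.pdf", sourceFile="data/minutes.pdf",
            posJson="[1, 0]", phrase="Call to order at 7:00 PM.")
    print(d["phrase"])     # dict-style lookup via __getitem__
    print(d.pop("title"))  # same lookup; d is left unchanged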

--- a/web/src/main.ts	Sat Jul 06 16:45:19 2024 -0700
+++ b/web/src/main.ts	Sun Jul 07 16:26:56 2024 -0700
@@ -43,6 +43,12 @@
       top: -22px;
       z-index: -1;
     }
+    dt {
+      color: blue;
+    }
+    dd {
+      margin-bottom: 25px;
+    }
   `,
   ];
   @state() query: string = "climate";
@@ -106,11 +112,13 @@
         </form>
       </section>
      <section id="results">
-        ${this.results.map(
-          (r) =>
-            html`<div>${r.title}</div>
-              <div>${unsafeHTML(r.snippetHtml)}</div>`
-        )}
+        <dl>
+          ${this.results.map(
+            (r) =>
+              html`<dt>${r.title}</dt>
+                <dd>${unsafeHTML(r.snippetHtml)}</dd>`
+          )}
+        </dl>
         <div>Matching results: ${this.results.length}</div>
       </section>
     `;
@@ -127,17 +135,12 @@
 
     const sentQ = await this.getCurrentQuery();
 
-    // const resp = await fetch("query", {
-    //   method: "POST",
-    //   body: "query=" + encodeURIComponent(sentQ),
-    //   headers: { "Content-Type": "application/x-www-form-urlencoded" },
-    // });
-    // if (sentQ != (await this.getCurrentQuery())) {
-    //   // old result- ignore
-    //   return;
-    // }
-    this.results.push({ title: "doc1", snippetHtml: "<h1>hello</h1>" });
-    console.log("ScoSearchPage ~ submit ~ this.results:", this.results);
+    const resp = await fetch("query?" + new URLSearchParams({ q: sentQ }));
+    if (sentQ != (await this.getCurrentQuery())) {
+      // old result- ignore
+      return;
+    }
+    this.results = (await resp.json()).results;
     this.requestUpdate();
   }
 }