changeset 8:f23b21bd0fce

apex search
author drewp@bigasterisk.com
date Sun, 07 Jul 2024 16:26:56 -0700
parents 53ae53f7d1b3
children d1b54241a731
files k8s/deploy.yaml k8s/volumes.yaml pdm.lock pyproject.toml search/Dockerfile search/extract_pdf.py search/query.py search/search_apex.py search/search_apex_rebuild.py search/search_base.py web/src/main.ts
diffstat 11 files changed, 218 insertions(+), 25 deletions(-) [+]
line wrap: on
line diff
--- a/k8s/deploy.yaml	Sat Jul 06 16:45:19 2024 -0700
+++ b/k8s/deploy.yaml	Sun Jul 07 16:26:56 2024 -0700
@@ -12,6 +12,10 @@
       labels:
         app: sco-bot
     spec:
+      volumes:
+        - name: sco-bot-data
+          persistentVolumeClaim:
+            claimName: sco-bot-data
       containers:
         - name: vite
           image: reg:5000/sco_bot_web
@@ -36,4 +40,15 @@
             - "8001"
             - search/query.py
           ports:
-            - containerPort: 8001
\ No newline at end of file
+            - containerPort: 8001
+          volumeMounts:
+          - name: sco-bot-data
+            mountPath: /opt/data
+      affinity:
+        nodeAffinity:
+          requiredDuringSchedulingIgnoredDuringExecution:
+            nodeSelectorTerms:
+            - matchExpressions:
+              - key: "kubernetes.io/hostname"
+                operator: In
+                values: ["ditto", "dash"] # pod needs a node with /my/serv (hostPath root of the sco-bot-data PV)
\ No newline at end of file
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/k8s/volumes.yaml	Sun Jul 07 16:26:56 2024 -0700
@@ -0,0 +1,22 @@
+
+apiVersion: v1
+kind: PersistentVolume  # statically defined volume backing the sco-bot data directory
+metadata:
+  name: sco-bot-data
+  labels: {type: local}
+spec:
+  storageClassName: manual
+  hostPath: {path: "/my/serv/sco-bot/data"}  # node-local directory; deploy.yaml pins the pod to nodes listed as having /my/serv
+  capacity: {storage: 5Mi}
+  accessModes: ["ReadWriteOnce"]
+  persistentVolumeReclaimPolicy: Retain
+  claimRef: {namespace: default, name: sco-bot-data}  # reserves this volume for the sco-bot-data claim in namespace "default"
+---
+apiVersion: v1
+kind: PersistentVolumeClaim  # claim referenced by the sco-bot deployment's `volumes` entry
+metadata: {name: sco-bot-data}  # no namespace given here; the PV's claimRef expects "default"
+spec:
+  storageClassName: ""  # NOTE(review): the PV declares class "manual"; presumably binding relies on volumeName/claimRef -- confirm the mismatch is intended
+  volumeName: "sco-bot-data"  # bind to the PersistentVolume of the same name
+  accessModes: ["ReadWriteOnce"]
+  resources: { requests: { storage: 5Mi } }
\ No newline at end of file
--- a/pdm.lock	Sat Jul 06 16:45:19 2024 -0700
+++ b/pdm.lock	Sun Jul 07 16:26:56 2024 -0700
@@ -5,7 +5,7 @@
 groups = ["default"]
 strategy = ["cross_platform", "inherit_metadata"]
 lock_version = "4.4.1"
-content_hash = "sha256:914f437191f234f800e6f04c0fedb35ceea1ea96f0c2b60406a111aa691ab8df"
+content_hash = "sha256:50f2904cbccac5298835948c8e9bc0168395a04c59bc0967ae6e8c073b2adfb8"
 
 [[package]]
 name = "absl-py"
@@ -106,6 +106,16 @@
 ]
 
 [[package]]
+name = "apex-search"
+version = "0.0.2"
+requires_python = ">=3.10"
+summary = "A full text search implementation in Python"
+groups = ["default"]
+files = [
+    {file = "apex-search-0.0.2.tar.gz", hash = "sha256:e7593732b8b27e0994ed1d2f8b9f213444ee60f42da0bce39352cad343c68364"},
+]
+
+[[package]]
 name = "asttokens"
 version = "2.4.1"
 summary = "Annotate AST trees with source code positions"
@@ -130,6 +140,20 @@
 ]
 
 [[package]]
+name = "beautifulsoup4"
+version = "4.12.3"
+requires_python = ">=3.6.0"
+summary = "Screen-scraping library"
+groups = ["default"]
+dependencies = [
+    "soupsieve>1.2",
+]
+files = [
+    {file = "beautifulsoup4-4.12.3-py3-none-any.whl", hash = "sha256:b80878c9f40111313e55da8ba20bdba06d8fa3969fc68304167741bbf9e082ed"},
+    {file = "beautifulsoup4-4.12.3.tar.gz", hash = "sha256:74e3d1928edc070d21748185c46e3fb33490f22f52a3addee9aee0f4f7781051"},
+]
+
+[[package]]
 name = "certifi"
 version = "2024.7.4"
 requires_python = ">=3.6"
@@ -906,6 +930,20 @@
 ]
 
 [[package]]
+name = "markdownify"
+version = "0.12.1"
+summary = "Convert HTML to markdown."
+groups = ["default"]
+dependencies = [
+    "beautifulsoup4<5,>=4.9",
+    "six<2,>=1.15",
+]
+files = [
+    {file = "markdownify-0.12.1-py3-none-any.whl", hash = "sha256:a3805abd8166dbb7b27783c5599d91f54f10d79894b2621404d85b333c7ce561"},
+    {file = "markdownify-0.12.1.tar.gz", hash = "sha256:1fb08c618b30e0ee7a31a39b998f44a18fb28ab254f55f4af06b6d35a2179e27"},
+]
+
+[[package]]
 name = "markupsafe"
 version = "2.1.5"
 requires_python = ">=3.7"
@@ -1861,6 +1899,17 @@
 ]
 
 [[package]]
+name = "soupsieve"
+version = "2.5"
+requires_python = ">=3.8"
+summary = "A modern CSS selector implementation for Beautiful Soup."
+groups = ["default"]
+files = [
+    {file = "soupsieve-2.5-py3-none-any.whl", hash = "sha256:eaa337ff55a1579b6549dc679565eac1e3d000563bcb1c8ab0d0fefbc0c2cdc7"},
+    {file = "soupsieve-2.5.tar.gz", hash = "sha256:5663d5a7b3bfaeee0bc4372e7fc48f9cff4940b3eec54a6451cc5299f1097690"},
+]
+
+[[package]]
 name = "sqlalchemy"
 version = "2.0.31"
 requires_python = ">=3.7"
@@ -1935,6 +1984,26 @@
 ]
 
 [[package]]
+name = "tantivy"
+version = "0.22.0"
+requires_python = ">=3.8"
+summary = ""
+groups = ["default"]
+files = [
+    {file = "tantivy-0.22.0-cp311-cp311-macosx_10_7_x86_64.whl", hash = "sha256:ec693abf38f229bc1361b0d34029a8bb9f3ee5bb956a3e745e0c4a66ea815bec"},
+    {file = "tantivy-0.22.0-cp311-cp311-macosx_10_9_x86_64.macosx_11_0_arm64.macosx_10_9_universal2.whl", hash = "sha256:e385839badc12b81e38bf0a4d865ee7c3a992fea9f5ce4117adae89369e7d1eb"},
+    {file = "tantivy-0.22.0-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:b6c097d94be1af106676c86c02b185f029484fdbd9a2b9f17cb980e840e7bdad"},
+    {file = "tantivy-0.22.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:c47a5cdec306ea8594cb6e7effd4b430932ebfd969f9e8f99e343adf56a79bc9"},
+    {file = "tantivy-0.22.0-cp311-none-win_amd64.whl", hash = "sha256:ba0ca878ed025d79edd9c51cda80b0105be8facbaec180fea64a17b80c74e7db"},
+    {file = "tantivy-0.22.0-cp312-cp312-macosx_10_7_x86_64.whl", hash = "sha256:925682f3acb65c85c2a5a5b131401b9f30c184ea68aa73a8cc7c2ea6115e8ae3"},
+    {file = "tantivy-0.22.0-cp312-cp312-macosx_10_9_x86_64.macosx_11_0_arm64.macosx_10_9_universal2.whl", hash = "sha256:d75760e45a329313001354d6ca415ff12d9d812343792ae133da6bfbdc4b04a5"},
+    {file = "tantivy-0.22.0-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:fd909d122b5af457d955552c304f8d5d046aee7024c703c62652ad72af89f3c7"},
+    {file = "tantivy-0.22.0-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:c99266ffb204721eb2bd5b3184aa87860a6cff51b4563f808f78fa22d85a8093"},
+    {file = "tantivy-0.22.0-cp312-none-win_amd64.whl", hash = "sha256:9ed6b813a1e7769444e33979b46b470b2f4c62d983c2560ce9486fb9be1491c9"},
+    {file = "tantivy-0.22.0.tar.gz", hash = "sha256:dce07fa2910c94934aa3d96c91087936c24e4a5802d839625d67edc6d1c95e5c"},
+]
+
+[[package]]
 name = "tenacity"
 version = "8.4.2"
 requires_python = ">=3.8"
--- a/pyproject.toml	Sat Jul 06 16:45:19 2024 -0700
+++ b/pyproject.toml	Sun Jul 07 16:26:56 2024 -0700
@@ -15,6 +15,9 @@
     "nltk>=3.8.1",
     "langchain>=0.2.6",
     "fastapi>=0.111.0",
+    "apex-search>=0.0.2",
+    "tantivy>=0.22.0",
+    "markdownify>=0.12.1",
 ]
 requires-python = ">=3.11"
 readme = "README.md"
--- a/search/Dockerfile	Sat Jul 06 16:45:19 2024 -0700
+++ b/search/Dockerfile	Sun Jul 07 16:26:56 2024 -0700
@@ -5,4 +5,6 @@
 COPY pyproject.toml pdm.lock ./
 RUN pdm sync
 
+RUN pdm run python -c 'import nltk; nltk.download("punkt")'
+
 COPY search/** ./search/
\ No newline at end of file
--- a/search/extract_pdf.py	Sat Jul 06 16:45:19 2024 -0700
+++ b/search/extract_pdf.py	Sun Jul 07 16:26:56 2024 -0700
@@ -8,6 +8,9 @@
 from pdfminer.pdfinterp import PDFPageInterpreter, PDFResourceManager
 from pdfminer.pdfpage import PDFPage
 
+def files() -> Iterable[Path]:
+    """Yield every '*.pdf' path found directly inside the relative 'data' directory."""
+    yield from Path('data').glob('*.pdf')
 
 def phrasesFromFile(p: Path) -> Iterable:
     fp = open(p, 'rb')
--- a/search/query.py	Sat Jul 06 16:45:19 2024 -0700
+++ b/search/query.py	Sun Jul 07 16:26:56 2024 -0700
@@ -1,15 +1,21 @@
+from dataclasses import dataclass
+import html
 import json
+from pprint import pprint
 import sys
 from pathlib import Path
+from typing import Iterable
 
 from tqdm import tqdm
 
 from pymilvus import MilvusClient
 from milvus_model.dense.onnx import OnnxEmbeddingFunction
 
-from extract_pdf import phrasesFromFile
+from extract_pdf import files, phrasesFromFile
 
 from fastapi import FastAPI
+from search_apex import Search
+
 
 def rebuild(client, embedding_fn, dim):
     client.drop_collection(collection_name="demo_collection")
@@ -24,8 +30,8 @@
             phrasesFromFile(
                 Path("data") /
                 "Meetings2226Minutes_20240702182359526 (1).pdf")),
-                desc="rebuilding",
-                unit=' phrase'):
+                                  desc="rebuilding",
+                                  unit=' phrase'):
         [vector] = embedding_fn.encode_documents([phrase])
         doc = {
             "id": i,
@@ -38,7 +44,7 @@
     print('insert:', res['insert_count'])
 
 
-def search(q, embedding_fn, client):
+def xxsearch(q, embedding_fn, client):
     query_vectors = embedding_fn.encode_queries([q])
 
     [query_result] = client.search(
@@ -63,8 +69,12 @@
 
 app = FastAPI()
 
+search = Search()  # module-level index handle used by read_query1; takes over the name of the old search() function (now xxsearch)
+
 
 @app.get("/sco/query")
-def read_query1(q: str|None):
-    print(f'1 {q=}')
-    return {"Hello": "World"}
+def read_query1(q: str):
+    # Look `q` up through the module-level `search` object and hand the hits
+    # back under the "results" key (search.search() produces them lazily).
+    results = search.search(q)
+    return {"results": results}
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/search/search_apex.py	Sun Jul 07 16:26:56 2024 -0700
@@ -0,0 +1,27 @@
+from pprint import pprint
+from typing import Iterable
+from apexsearch import ApexSearch
+
+
+class Search:
+    """Wrapper around an ApexSearch index opened at 'data/apex'."""
+
+    def __init__(self):
+        # NOTE(review): extras lists "pos" but Doc (search_base.py) only has `posJson`.
+        docs_table = {"content": ["phrase"], "title": "title",
+                      "extras": ["sourceFile", "pos"]}
+        self.apex = ApexSearch('data/apex',
+                               tables={"docs": docs_table},
+                               id_field='id')
+
+    def rebuild(self, docs: Iterable):
+        """Rebuild the complete index; the loader callback ignores its arguments and returns `docs`."""
+        self.apex.build_complete_index(lambda *_unused: docs)
+        print('rebuild complete')
+
+    def search(self, q: str):
+        """Lazily yield a {'title', 'snippetHtml'} dict per result of querying `q` (target_number=100)."""
+        found = self.apex.search(q, target_number=100)
+        pprint(found)
+        for hit in found['results']:
+            yield {'title': hit['title'], 'snippetHtml': hit['highlighted_content']}
\ No newline at end of file
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/search/search_apex_rebuild.py	Sun Jul 07 16:26:56 2024 -0700
@@ -0,0 +1,7 @@
+
+from search.search_apex import Search
+from search.search_base import allDocs
+
+
+search = Search()  # construct the index wrapper from search.search_apex
+search.rebuild(allDocs())  # re-index every Doc produced by search.search_base.allDocs()
\ No newline at end of file
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/search/search_base.py	Sun Jul 07 16:26:56 2024 -0700
@@ -0,0 +1,32 @@
+
+from dataclasses import dataclass
+import json
+from typing import Iterable
+
+from search.extract_pdf import files, phrasesFromFile
+
+
+@dataclass
+class Doc:  # one phrase record, as produced by allDocs()
+    id: int  # running counter assigned by allDocs(), starting at 0
+    title: str  # the source path's .name
+    sourceFile: str  # str() of the source path
+    posJson: str  # json.dumps() of the position yielded alongside the phrase
+    phrase: str  # the text yielded by phrasesFromFile
+
+    def __getitem__(self, k):  # dict-style read access: doc['title'] returns doc.title
+        return getattr(self, k)
+
+    pop = __getitem__  # alias: returns the attribute but, unlike dict.pop, removes nothing (presumably for dict-expecting callers -- confirm)
+
+
+def allDocs() -> Iterable[Doc]:
+    """Yield one Doc per (pos, line) pair of every file from files(), ids counting up from 0."""
+    phrases = ((path, pos, text)
+               for path in files()
+               for pos, text in phrasesFromFile(path))
+    # ids run across all files; they do not restart per file
+    for doc_id, (path, pos, text) in enumerate(phrases):
+        name, location = path.name, str(path)
+        yield Doc(id=doc_id, title=name, sourceFile=location,
+                  posJson=json.dumps(pos), phrase=text)
--- a/web/src/main.ts	Sat Jul 06 16:45:19 2024 -0700
+++ b/web/src/main.ts	Sun Jul 07 16:26:56 2024 -0700
@@ -43,6 +43,12 @@
         top: -22px;
         z-index: -1;
       }
+      dt {
+        color: blue;
+      }
+      dd {
+        margin-bottom: 25px;
+      }
     `,
   ];
   @state() query: string = "climate";
@@ -106,11 +112,13 @@
         </form>
       </section>
       <section id="results">
-        ${this.results.map(
-          (r) =>
-            html`<div>${r.title}</div>
-              <div>${unsafeHTML(r.snippetHtml)}</div>`
-        )}
+        <dl>
+          ${this.results.map(
+            (r) =>
+              html`<dt>${r.title}</dt>
+                <dd>${unsafeHTML(r.snippetHtml)}</dd>`
+          )}
+        </dl>
         <div>Matching results: ${this.results.length}</div>
       </section>
     `;
@@ -127,17 +135,12 @@
 
     const sentQ = await this.getCurrentQuery();
 
-    // const resp = await fetch("query", {
-    //   method: "POST",
-    //   body: "query=" + encodeURIComponent(sentQ),
-    //   headers: { "Content-Type": "application/x-www-form-urlencoded" },
-    // });
-    // if (sentQ != (await this.getCurrentQuery())) {
-    //   // old result- ignore
-    //   return;
-    // }
-    this.results.push({ title: "doc1", snippetHtml: "<h1>hello</h1>" });
-    console.log("ScoSearchPage ~ submit ~  this.results:", this.results);
+    const resp = await fetch("query?" + new URLSearchParams({ q: sentQ }));
+    if (sentQ != (await this.getCurrentQuery())) {
+      // old result- ignore
+      return;
+    }
+    this.results = (await resp.json()).results;
     this.requestUpdate();
   }
 }