# HG changeset patch
# User drewp@bigasterisk.com
# Date 1720394816 25200
# Node ID f23b21bd0fcefecc52cec22cb176290b1f68e9aa
# Parent 53ae53f7d1b3026cd71ad8738630ac61cd0e1b0f
apex search
diff -r 53ae53f7d1b3 -r f23b21bd0fce k8s/deploy.yaml
--- a/k8s/deploy.yaml Sat Jul 06 16:45:19 2024 -0700
+++ b/k8s/deploy.yaml Sun Jul 07 16:26:56 2024 -0700
@@ -12,6 +12,10 @@
labels:
app: sco-bot
spec:
+ volumes:
+ - name: sco-bot-data
+ persistentVolumeClaim:
+ claimName: sco-bot-data
containers:
- name: vite
image: reg:5000/sco_bot_web
@@ -36,4 +40,15 @@
- "8001"
- search/query.py
ports:
- - containerPort: 8001
\ No newline at end of file
+ - containerPort: 8001
+ volumeMounts:
+ - name: sco-bot-data
+ mountPath: /opt/data
+ affinity:
+ nodeAffinity:
+ requiredDuringSchedulingIgnoredDuringExecution:
+ nodeSelectorTerms:
+ - matchExpressions:
+ - key: "kubernetes.io/hostname"
+ operator: In
+ values: ["ditto", "dash"] # need /my/serv
\ No newline at end of file
diff -r 53ae53f7d1b3 -r f23b21bd0fce k8s/volumes.yaml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/k8s/volumes.yaml Sun Jul 07 16:26:56 2024 -0700
@@ -0,0 +1,22 @@
+
+apiVersion: v1
+kind: PersistentVolume
+metadata:
+ name: sco-bot-data
+ labels: {type: local}
+spec:
+ storageClassName: manual
+ hostPath: {path: "/my/serv/sco-bot/data"}
+ capacity: {storage: 5Mi}
+ accessModes: ["ReadWriteOnce"]
+ persistentVolumeReclaimPolicy: Retain
+ claimRef: {namespace: default, name: sco-bot-data}
+---
+apiVersion: v1
+kind: PersistentVolumeClaim
+metadata: {name: sco-bot-data}
+spec:
+ storageClassName: ""
+ volumeName: "sco-bot-data"
+ accessModes: ["ReadWriteOnce"]
+ resources: { requests: { storage: 5Mi } }
\ No newline at end of file
diff -r 53ae53f7d1b3 -r f23b21bd0fce pdm.lock
--- a/pdm.lock Sat Jul 06 16:45:19 2024 -0700
+++ b/pdm.lock Sun Jul 07 16:26:56 2024 -0700
@@ -5,7 +5,7 @@
groups = ["default"]
strategy = ["cross_platform", "inherit_metadata"]
lock_version = "4.4.1"
-content_hash = "sha256:914f437191f234f800e6f04c0fedb35ceea1ea96f0c2b60406a111aa691ab8df"
+content_hash = "sha256:50f2904cbccac5298835948c8e9bc0168395a04c59bc0967ae6e8c073b2adfb8"
[[package]]
name = "absl-py"
@@ -106,6 +106,16 @@
]
[[package]]
+name = "apex-search"
+version = "0.0.2"
+requires_python = ">=3.10"
+summary = "A full text search implementation in Python"
+groups = ["default"]
+files = [
+ {file = "apex-search-0.0.2.tar.gz", hash = "sha256:e7593732b8b27e0994ed1d2f8b9f213444ee60f42da0bce39352cad343c68364"},
+]
+
+[[package]]
name = "asttokens"
version = "2.4.1"
summary = "Annotate AST trees with source code positions"
@@ -130,6 +140,20 @@
]
[[package]]
+name = "beautifulsoup4"
+version = "4.12.3"
+requires_python = ">=3.6.0"
+summary = "Screen-scraping library"
+groups = ["default"]
+dependencies = [
+ "soupsieve>1.2",
+]
+files = [
+ {file = "beautifulsoup4-4.12.3-py3-none-any.whl", hash = "sha256:b80878c9f40111313e55da8ba20bdba06d8fa3969fc68304167741bbf9e082ed"},
+ {file = "beautifulsoup4-4.12.3.tar.gz", hash = "sha256:74e3d1928edc070d21748185c46e3fb33490f22f52a3addee9aee0f4f7781051"},
+]
+
+[[package]]
name = "certifi"
version = "2024.7.4"
requires_python = ">=3.6"
@@ -906,6 +930,20 @@
]
[[package]]
+name = "markdownify"
+version = "0.12.1"
+summary = "Convert HTML to markdown."
+groups = ["default"]
+dependencies = [
+ "beautifulsoup4<5,>=4.9",
+ "six<2,>=1.15",
+]
+files = [
+ {file = "markdownify-0.12.1-py3-none-any.whl", hash = "sha256:a3805abd8166dbb7b27783c5599d91f54f10d79894b2621404d85b333c7ce561"},
+ {file = "markdownify-0.12.1.tar.gz", hash = "sha256:1fb08c618b30e0ee7a31a39b998f44a18fb28ab254f55f4af06b6d35a2179e27"},
+]
+
+[[package]]
name = "markupsafe"
version = "2.1.5"
requires_python = ">=3.7"
@@ -1861,6 +1899,17 @@
]
[[package]]
+name = "soupsieve"
+version = "2.5"
+requires_python = ">=3.8"
+summary = "A modern CSS selector implementation for Beautiful Soup."
+groups = ["default"]
+files = [
+ {file = "soupsieve-2.5-py3-none-any.whl", hash = "sha256:eaa337ff55a1579b6549dc679565eac1e3d000563bcb1c8ab0d0fefbc0c2cdc7"},
+ {file = "soupsieve-2.5.tar.gz", hash = "sha256:5663d5a7b3bfaeee0bc4372e7fc48f9cff4940b3eec54a6451cc5299f1097690"},
+]
+
+[[package]]
name = "sqlalchemy"
version = "2.0.31"
requires_python = ">=3.7"
@@ -1935,6 +1984,26 @@
]
[[package]]
+name = "tantivy"
+version = "0.22.0"
+requires_python = ">=3.8"
+summary = ""
+groups = ["default"]
+files = [
+ {file = "tantivy-0.22.0-cp311-cp311-macosx_10_7_x86_64.whl", hash = "sha256:ec693abf38f229bc1361b0d34029a8bb9f3ee5bb956a3e745e0c4a66ea815bec"},
+ {file = "tantivy-0.22.0-cp311-cp311-macosx_10_9_x86_64.macosx_11_0_arm64.macosx_10_9_universal2.whl", hash = "sha256:e385839badc12b81e38bf0a4d865ee7c3a992fea9f5ce4117adae89369e7d1eb"},
+ {file = "tantivy-0.22.0-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:b6c097d94be1af106676c86c02b185f029484fdbd9a2b9f17cb980e840e7bdad"},
+ {file = "tantivy-0.22.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:c47a5cdec306ea8594cb6e7effd4b430932ebfd969f9e8f99e343adf56a79bc9"},
+ {file = "tantivy-0.22.0-cp311-none-win_amd64.whl", hash = "sha256:ba0ca878ed025d79edd9c51cda80b0105be8facbaec180fea64a17b80c74e7db"},
+ {file = "tantivy-0.22.0-cp312-cp312-macosx_10_7_x86_64.whl", hash = "sha256:925682f3acb65c85c2a5a5b131401b9f30c184ea68aa73a8cc7c2ea6115e8ae3"},
+ {file = "tantivy-0.22.0-cp312-cp312-macosx_10_9_x86_64.macosx_11_0_arm64.macosx_10_9_universal2.whl", hash = "sha256:d75760e45a329313001354d6ca415ff12d9d812343792ae133da6bfbdc4b04a5"},
+ {file = "tantivy-0.22.0-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:fd909d122b5af457d955552c304f8d5d046aee7024c703c62652ad72af89f3c7"},
+ {file = "tantivy-0.22.0-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:c99266ffb204721eb2bd5b3184aa87860a6cff51b4563f808f78fa22d85a8093"},
+ {file = "tantivy-0.22.0-cp312-none-win_amd64.whl", hash = "sha256:9ed6b813a1e7769444e33979b46b470b2f4c62d983c2560ce9486fb9be1491c9"},
+ {file = "tantivy-0.22.0.tar.gz", hash = "sha256:dce07fa2910c94934aa3d96c91087936c24e4a5802d839625d67edc6d1c95e5c"},
+]
+
+[[package]]
name = "tenacity"
version = "8.4.2"
requires_python = ">=3.8"
diff -r 53ae53f7d1b3 -r f23b21bd0fce pyproject.toml
--- a/pyproject.toml Sat Jul 06 16:45:19 2024 -0700
+++ b/pyproject.toml Sun Jul 07 16:26:56 2024 -0700
@@ -15,6 +15,9 @@
"nltk>=3.8.1",
"langchain>=0.2.6",
"fastapi>=0.111.0",
+ "apex-search>=0.0.2",
+ "tantivy>=0.22.0",
+ "markdownify>=0.12.1",
]
requires-python = ">=3.11"
readme = "README.md"
diff -r 53ae53f7d1b3 -r f23b21bd0fce search/Dockerfile
--- a/search/Dockerfile Sat Jul 06 16:45:19 2024 -0700
+++ b/search/Dockerfile Sun Jul 07 16:26:56 2024 -0700
@@ -5,4 +5,6 @@
COPY pyproject.toml pdm.lock ./
RUN pdm sync
+RUN pdm run python -c 'import nltk; nltk.download("punkt")'
+
COPY search/** ./search/
\ No newline at end of file
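
Note: the extra RUN step bakes the NLTK "punkt" tokenizer data into the image so it does not have to be fetched at pod startup. A minimal sanity check of that bundled resource, assuming the extraction code relies on NLTK's sentence splitter:

    import nltk

    # punkt backs nltk's sentence splitter; this raises if the model is missing
    print(nltk.sent_tokenize("First sentence. Second sentence."))
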
diff -r 53ae53f7d1b3 -r f23b21bd0fce search/extract_pdf.py
--- a/search/extract_pdf.py Sat Jul 06 16:45:19 2024 -0700
+++ b/search/extract_pdf.py Sun Jul 07 16:26:56 2024 -0700
@@ -8,6 +8,9 @@
from pdfminer.pdfinterp import PDFPageInterpreter, PDFResourceManager
from pdfminer.pdfpage import PDFPage
+def files() -> Iterable[Path]:
+ for p in Path('data').glob('*.pdf'):
+ yield p
def phrasesFromFile(p: Path) -> Iterable:
fp = open(p, 'rb')
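
Note: files() is a small generator yielding every *.pdf under the data/ directory relative to the working directory. A usage sketch, assuming the flat import style that query.py uses (run from within search/):

    from extract_pdf import files, phrasesFromFile

    # list the PDFs that would be indexed, with a phrase count per file
    for pdf in files():
        print(pdf.name, sum(1 for _ in phrasesFromFile(pdf)))
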
diff -r 53ae53f7d1b3 -r f23b21bd0fce search/query.py
--- a/search/query.py Sat Jul 06 16:45:19 2024 -0700
+++ b/search/query.py Sun Jul 07 16:26:56 2024 -0700
@@ -1,15 +1,21 @@
+from dataclasses import dataclass
+import html
import json
+from pprint import pprint
import sys
from pathlib import Path
+from typing import Iterable
from tqdm import tqdm
from pymilvus import MilvusClient
from milvus_model.dense.onnx import OnnxEmbeddingFunction
-from extract_pdf import phrasesFromFile
+from extract_pdf import files, phrasesFromFile
from fastapi import FastAPI
+from search_apex import Search
+
def rebuild(client, embedding_fn, dim):
client.drop_collection(collection_name="demo_collection")
@@ -24,8 +30,8 @@
phrasesFromFile(
Path("data") /
"Meetings2226Minutes_20240702182359526 (1).pdf")),
- desc="rebuilding",
- unit=' phrase'):
+ desc="rebuilding",
+ unit=' phrase'):
[vector] = embedding_fn.encode_documents([phrase])
doc = {
"id": i,
@@ -38,7 +44,7 @@
print('insert:', res['insert_count'])
-def search(q, embedding_fn, client):
+def xxsearch(q, embedding_fn, client):
query_vectors = embedding_fn.encode_queries([q])
[query_result] = client.search(
@@ -63,8 +69,12 @@
app = FastAPI()
+search = Search()
+
@app.get("/sco/query")
-def read_query1(q: str|None):
- print(f'1 {q=}')
- return {"Hello": "World"}
+def read_query1(q: str):
+    # Search.search yields result dicts; collect them for the JSON response
+    results = list(search.search(q))
+
+ return {"results": results}
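
Note: with the Milvus path sidelined (search renamed to xxsearch), /sco/query now answers from the ApexSearch index. A quick client-side check, assuming the service is reachable on port 8001 as configured in k8s/deploy.yaml and that the requests package is available:

    import requests

    resp = requests.get("http://localhost:8001/sco/query", params={"q": "climate"})
    resp.raise_for_status()
    for hit in resp.json()["results"]:
        print(hit["title"], "-", hit["snippetHtml"][:80])
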
diff -r 53ae53f7d1b3 -r f23b21bd0fce search/search_apex.py
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/search/search_apex.py Sun Jul 07 16:26:56 2024 -0700
@@ -0,0 +1,27 @@
+from pprint import pprint
+from typing import Iterable
+from apexsearch import ApexSearch
+
+
+class Search:
+
+ def __init__(self):
+ self.apex = ApexSearch('data/apex',
+ tables={
+ "docs": {
+ "content": ["phrase"],
+ "title": "title",
+ "extras": ["sourceFile", "pos"],
+ }
+ },
+ id_field='id')
+
+ def rebuild(self, docs: Iterable):
+ self.apex.build_complete_index(lambda *a: docs)
+ print('rebuild complete')
+
+    def search(self, q: str):
+        res = self.apex.search(q, target_number=100)
+        pprint(res)
+        for row in res['results']:
+            yield {'title': row['title'], 'snippetHtml': row['highlighted_content']}
\ No newline at end of file
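
Note: Search wraps ApexSearch with the docs table layout used throughout this change; search() is a generator of {'title', 'snippetHtml'} dicts. A direct usage sketch, assuming an index has already been built under data/apex (see search_apex_rebuild.py below):

    from search_apex import Search

    s = Search()
    for hit in s.search("climate"):
        print(hit["title"])
        print(" ", hit["snippetHtml"])
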
diff -r 53ae53f7d1b3 -r f23b21bd0fce search/search_apex_rebuild.py
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/search/search_apex_rebuild.py Sun Jul 07 16:26:56 2024 -0700
@@ -0,0 +1,7 @@
+
+from search.search_apex import Search
+from search.search_base import allDocs
+
+
+search = Search()
+search.rebuild(allDocs())
\ No newline at end of file
diff -r 53ae53f7d1b3 -r f23b21bd0fce search/search_base.py
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/search/search_base.py Sun Jul 07 16:26:56 2024 -0700
@@ -0,0 +1,32 @@
+
+from dataclasses import dataclass
+import json
+from typing import Iterable
+
+from search.extract_pdf import files, phrasesFromFile
+
+
+@dataclass
+class Doc:
+ id: int
+ title: str
+ sourceFile: str
+ posJson: str
+ phrase: str
+
+ def __getitem__(self, k):
+ return getattr(self, k)
+
+    # lets Doc double as a mapping: doc['k'] and doc.pop('k') both return the attribute
+    pop = __getitem__
+
+
+def allDocs() -> Iterable[Doc]:
+ id = 0
+ for src in files():
+ for pos, line in phrasesFromFile(src):
+ yield Doc(id=id,
+ title=src.name,
+ sourceFile=str(src),
+ posJson=json.dumps(pos),
+ phrase=line)
+ id += 1
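
Note: allDocs() flattens every extracted phrase into a Doc record with a running integer id, which is what Search.rebuild feeds to ApexSearch. A small sketch for inspecting the first few records before a rebuild, assuming it is run from the project root so the package-style import resolves:

    from itertools import islice

    from search.search_base import allDocs

    for doc in islice(allDocs(), 5):
        print(doc.id, doc.title, doc.phrase[:60])
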
diff -r 53ae53f7d1b3 -r f23b21bd0fce web/src/main.ts
--- a/web/src/main.ts Sat Jul 06 16:45:19 2024 -0700
+++ b/web/src/main.ts Sun Jul 07 16:26:56 2024 -0700
@@ -43,6 +43,12 @@
top: -22px;
z-index: -1;
}
+ dt {
+ color: blue;
+ }
+ dd {
+ margin-bottom: 25px;
+ }
`,
];
@state() query: string = "climate";
@@ -106,11 +112,13 @@
+ ${this.results.map(
+ (r) =>
+ html`