changeset 11:6622bacb0b84

first pass at reorg
author drewp@bigasterisk.com
date Thu, 11 Jul 2024 18:15:44 -0700
parents 13438795d896
children 7f36497bfac3
files Dockerfile.server Dockerfile.web env flow/build_index.py flow/download.py flow/env flow/local_types.py flow/pyproject.toml flow/schema.py flow/search_index.py k8s/deploy.yaml scobot/index/access.py scobot/index/build_index_flow.py scobot/index/download_tasks.py scobot/index/schema.py scobot/local_types.py scobot/service/query.py search/Dockerfile search/doc.py search/extract_agenda.py search/extract_all.py search/extract_pdf.py search/meeting_docs.py search/query.py search/search_apex.py search/search_apex_rebuild.py skaffold.yaml web/Dockerfile
diffstat 27 files changed, 279 insertions(+), 428 deletions(-)
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/Dockerfile.server	Thu Jul 11 18:15:44 2024 -0700
@@ -0,0 +1,11 @@
+FROM reg:5000/base_basic
+
+WORKDIR /opt
+
+COPY pyproject.toml pdm.lock ./
+RUN pdm sync
+
+RUN pdm run python -c 'import nltk; nltk.download("punkt")'
+
+COPY env ./
+COPY scobot/** ./scobot/
\ No newline at end of file
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/Dockerfile.web	Thu Jul 11 18:15:44 2024 -0700
@@ -0,0 +1,9 @@
+FROM reg:5000/base_basic
+
+WORKDIR /opt
+
+COPY web/package.json web/pnpm-lock.yaml ./
+RUN pnpm install
+
+COPY web/vite.config.ts web/tsconfig.json ./
+COPY web/src/ ./src/
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/env	Thu Jul 11 18:15:44 2024 -0700
@@ -0,0 +1,2 @@
+PREFECT_API_URL=http://127.0.0.1:4200/api
+PREFECT_HOME=./prefect
--- a/flow/build_index.py	Thu Jul 11 17:35:31 2024 -0700
+++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
@@ -1,81 +0,0 @@
-from pathlib import Path
-from typing import Iterable, cast
-
-import search_index
-from download import getCityMutableJson, getCityPermanent
-from local_types import MeetingRow, Url
-from lxml.html import fromstring
-from prefect import flow, task
-from prefect.logging import get_run_logger
-from search_index import SearchIndex
-
-log = None
-
-
-@task()
-def meetingListUrls() -> Iterable[Url]:
-    return [
-        "https://albanyca.primegov.com/api/v2/PublicPortal/ListArchivedMeetings?year=2024",
-        "https://albanyca.primegov.com/api/v2/PublicPortal/ListUpcomingMeetings",
-    ]
-
-
-def meetingAgendaUrl(mtg: MeetingRow) -> Url:
-    for doc in mtg.get('documentList', []):
-        if doc['templateName'] == 'HTML Agenda Packet':
-            tid = doc['templateId']
-            return f'https://albanyca.primegov.com/Portal/Meeting?meetingTemplateId={tid}'
-    raise ValueError(f"no agenda doc found for {mtg['id']=}")
-
-
-def extractMeetingText(mhtml: str) -> list[str]:
-    el = fromstring(mhtml)
-    m = el.cssselect('div#meetingSection')[0]
-    for st in m.cssselect('style'):
-        st.clear()
-    meetingText = [
-        chunk for chunk in m.xpath('.//text()', namespaces=el.nsmap)
-        if chunk.strip()
-    ]
-    return meetingText
-
-
-def addMeeting(index: SearchIndex, mtg: MeetingRow):
-    try:
-        agendaUrl = meetingAgendaUrl(mtg)
-    except ValueError:
-        pass
-    else:
-        html = getCityPermanent(agendaUrl)
-        text = extractMeetingText(html)
-        # todo group phrases
-        index.addDoc(title=f'{mtg["date"]} {mtg["title"]}', content=text)
-
-    try:
-        videoUrl = mtg['videoUrl']
-        if not videoUrl:
-            raise KeyError
-    except KeyError:
-        pass
-    else:
-        '''transcribe and index video...'''
-
-
-@flow(persist_result=True)
-def buildIndex():
-    global log
-    log = get_run_logger()
-    search_index.log = log
-
-    index = SearchIndex(Path('/tmp/scoindex'))
-    for url in meetingListUrls():
-        mtgs = cast(list[MeetingRow], getCityMutableJson(url))
-        log.info(f'got {len(mtgs)=}')
-
-        for mtg in mtgs:
-            addMeeting(index, mtg)
-    index.commit()
-
-
-if __name__ == '__main__':
-    buildIndex.serve()
--- a/flow/download.py	Thu Jul 11 17:35:31 2024 -0700
+++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
@@ -1,36 +0,0 @@
-import datetime
-import time
-from local_types import Url
-
-import httpx
-from prefect import task
-from prefect.artifacts import create_link_artifact
-
-
-@task(
-    task_run_name=lambda: f'getHttp-{int(time.time())}',
-    cache_key_fn=lambda _, args: f'getHttp-{args["url"]}',
-    cache_expiration=datetime.timedelta(seconds=86400),
-    tags=['city'],  # todo ratelimit based on tag
-)
-def getCityMutableJson(url: Url):
-    create_link_artifact("get", url)
-    req = httpx.get(url)  # todo async
-    req.raise_for_status()
-    return req.json()
-
-
-@task(task_run_name=lambda: f'getHttp-{int(time.time())}',
-      cache_key_fn=lambda _, args: f'getHttp-{args["url"]}',
-      tags=['city'])
-def getCityPermanent(url: Url) -> str:
-    create_link_artifact("get", url)
-    req = httpx.get(url)
-    req.raise_for_status()
-    return req.text
-
-
-@task
-def getYoutubePermanent(url: str):
-    time.sleep(5)
-    return 'video' * 10000
--- a/flow/env	Thu Jul 11 17:35:31 2024 -0700
+++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
@@ -1,2 +0,0 @@
-PREFECT_API_URL=http://127.0.0.1:4200/api
-PREFECT_HOME=./prefect
--- a/flow/local_types.py	Thu Jul 11 17:35:31 2024 -0700
+++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
@@ -1,5 +0,0 @@
-from typing import NewType
-
-
-Url = NewType('Url', str)
-MeetingRow = NewType('MeetingRow', dict)
\ No newline at end of file
--- a/flow/pyproject.toml	Thu Jul 11 17:35:31 2024 -0700
+++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
@@ -1,28 +0,0 @@
-[project]
-name = "flow"
-version = "0.1.0"
-description = "Default template for PDM package"
-authors = [
-    {name = "", email = ""},
-]
-dependencies = [
-    "prefect>=2.19.7",
-    "lxml>=5.2.2",
-    "httpx>=0.27.0",
-    "cssselect>=1.2.0",
-    "whoosh>=2.7.4",
-    "ipython>=8.26.0",
-]
-requires-python = "==3.11.*"
-readme = "README.md"
-license = {text = "MIT"}
-
-
-[tool.pdm]
-distribution = false
-
-[tool.pdm.scripts]
-_.env_file = "env"
-run_prefect_server = "prefect server start"
-run_build_flow = "python build_index.py"
-start_build = "prefect deployment run buildIndex/buildIndex"
\ No newline at end of file
--- a/flow/schema.py	Thu Jul 11 17:35:31 2024 -0700
+++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
@@ -1,4 +0,0 @@
-from whoosh.fields import TEXT, Schema
-
-
-schema = Schema(title=TEXT(stored=True), content=TEXT(stored=True))
\ No newline at end of file
--- a/flow/search_index.py	Thu Jul 11 17:35:31 2024 -0700
+++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
@@ -1,24 +0,0 @@
-from pathlib import Path
-
-from whoosh.fields import ID
-from whoosh.index import create_in
-
-from schema import schema
-
-log = None  # set by flow
-
-
-class SearchIndex:
-
-    def __init__(self, indexDir: Path):
-        indexDir.mkdir(parents=True, exist_ok=True)
-        self.ix = create_in(indexDir, schema)
-        self.writer = self.ix.writer()
-
-    def addDoc(self, **kw):
-        self.writer.add_document(**kw)
-
-    def commit(self):
-        self.writer.commit()
-        with self.ix.searcher() as searcher:
-            log.info(f'index doc count = {searcher.doc_count()}')
--- a/k8s/deploy.yaml	Thu Jul 11 17:35:31 2024 -0700
+++ b/k8s/deploy.yaml	Thu Jul 11 18:15:44 2024 -0700
@@ -26,8 +26,8 @@
             - vite
           ports:
             - containerPort: 8002
-        - name: search
-          image: reg:5000/sco_bot_search
+        - name: server
+          image: reg:5000/sco_bot_server
           workingDir: /opt
           command:
             - pdm
@@ -38,7 +38,7 @@
             - "0.0.0.0"
             - --port
             - "8001"
-            - search/query.py
+            - scobot.service.query:app
           ports:
             - containerPort: 8001
           volumeMounts:
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/scobot/index/access.py	Thu Jul 11 18:15:44 2024 -0700
@@ -0,0 +1,23 @@
+from pathlib import Path
+
+from whoosh.index import create_in
+
+from scobot.index.schema import schema
+
+log = None  # set by flow
+
+
+class SearchIndex:
+
+    def __init__(self, indexDir: Path):
+        indexDir.mkdir(parents=True, exist_ok=True)
+        self.ix = create_in(indexDir, schema)
+        self.writer = self.ix.writer()
+
+    def addDoc(self, **kw):
+        self.writer.add_document(**kw)
+
+    def commit(self):
+        self.writer.commit()
+        with self.ix.searcher() as searcher:
+            log.info(f'index doc count = {searcher.doc_count()}')
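For orientation, a minimal usage sketch of the relocated SearchIndex (field names follow scobot/index/schema.py; the module-global log must be set before commit(), as build_index_flow.py does; values here are hypothetical):

    import logging
    from pathlib import Path

    import scobot.index.access
    from scobot.index.access import SearchIndex

    scobot.index.access.log = logging.getLogger('demo')  # commit() calls log.info()

    index = SearchIndex(Path('/tmp/demo_index'))
    index.addDoc(sourceUrl='https://example.org/agenda',
                 sourceTitle='2024-07-11 City Council',
                 posJson='[]',
                 phrase='Call to order and roll call.')
    index.commit()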
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/scobot/index/build_index_flow.py	Thu Jul 11 18:15:44 2024 -0700
@@ -0,0 +1,105 @@
+import json
+import re
+from pathlib import Path
+from typing import Iterable, cast
+
+import lxml.html
+import nltk
+from prefect import flow, task
+from prefect.logging import get_run_logger
+
+import scobot.index.access
+from scobot.index.access import SearchIndex
+from scobot.index.download_tasks import getCityMutableJson, getCityPermanent
+from scobot.local_types import MeetingRow, Url
+
+log = None
+
+
+@task()
+def meetingListUrls() -> Iterable[Url]:
+    return [
+        "https://albanyca.primegov.com/api/v2/PublicPortal/ListArchivedMeetings?year=2024",
+        "https://albanyca.primegov.com/api/v2/PublicPortal/ListUpcomingMeetings",
+    ]
+
+
+def meetingAgendaUrl(mtg: MeetingRow) -> Url:
+    for doc in mtg.get('documentList', []):
+        if doc['templateName'] == 'HTML Agenda Packet':
+            tid = doc['templateId']
+            return f'https://albanyca.primegov.com/Portal/Meeting?meetingTemplateId={tid}'
+    raise ValueError(f"no agenda doc found for {mtg['id']=}")
+
+
+def extractMeetingText(mhtml: str) -> list[str]:
+    el = lxml.html.fromstring(mhtml)
+    m = el.cssselect('div#meetingSection')[0]
+    for st in m.cssselect('style'):
+        st.clear()
+    meetingText = [
+        chunk for chunk in m.xpath('.//text()', namespaces=el.nsmap)
+        if chunk.strip()
+    ]
+    return meetingText
+
+
+def phrasesFromFile(p: Path) -> Iterable[dict]:
+    mtg = json.loads(p.read_text())
+    print(f'  has {len(mtg["phrases"])} phrases')
+    text = ' '.join(mtg['phrases'])
+
+    # enumerate so each yielded sentence gets a distinct _sentence<i> id
+    for i, sentence in enumerate(nltk.sent_tokenize(text)):
+        sentence = re.sub(r'\s+', ' ', sentence).strip()
+        if len(sentence) < 5:
+            continue
+        if not re.search(r'\w\w\w\w\w', sentence):
+            continue
+
+        yield dict(id=f"{mtg['mtg']['id']}_sentence{i}",
+                   title=f"{mtg['mtg']['date']} {mtg['mtg']['title']}",
+                   sourceFile=str(p),
+                   posJson="[]",
+                   phrase=sentence)
+
+
+def addMeeting(index: SearchIndex, mtg: MeetingRow):
+    try:
+        agendaUrl = meetingAgendaUrl(mtg)
+    except ValueError:
+        pass
+    else:
+        html = getCityPermanent(agendaUrl)
+        text = extractMeetingText(html)
+        # todo group phrases via phrasesFromFile; fields follow scobot/index/schema.py
+        index.addDoc(sourceUrl=agendaUrl, sourceTitle=f'{mtg["date"]} {mtg["title"]}', posJson="[]", phrase=' '.join(text))
+
+    try:
+        videoUrl = mtg['videoUrl']
+        if not videoUrl:
+            raise KeyError
+    except KeyError:
+        pass
+    else:
+        pass  # todo: transcribe and index video (see getYoutubePermanent)
+
+
+@flow(persist_result=True)
+def buildIndex():
+    global log
+    log = get_run_logger()
+    scobot.index.access.log = log
+
+    index = SearchIndex(Path('/tmp/scoindex'))
+    for url in meetingListUrls():
+        mtgs = cast(list[MeetingRow], getCityMutableJson(url))
+        log.info(f'got {len(mtgs)=}')
+
+        for mtg in mtgs:
+            addMeeting(index, mtg)
+    index.commit()
+
+
+if __name__ == '__main__':
+    buildIndex.serve()
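After a buildIndex run, the index under /tmp/scoindex can be spot-checked with a plain whoosh query; a sketch assuming the index exists and using the field names from scobot/index/schema.py (the query string is made up):

    from whoosh.index import open_dir
    from whoosh.qparser import QueryParser

    ix = open_dir('/tmp/scoindex')
    with ix.searcher() as searcher:
        query = QueryParser('phrase', ix.schema).parse('sidewalk repair')
        for hit in searcher.search(query, limit=5):
            print(hit['sourceTitle'], '|', hit['phrase'])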
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/scobot/index/download_tasks.py	Thu Jul 11 18:15:44 2024 -0700
@@ -0,0 +1,37 @@
+import datetime
+import time
+
+import httpx
+from prefect import task
+from prefect.artifacts import create_link_artifact
+
+from scobot.local_types import Url
+
+
+@task(
+    task_run_name=lambda: f'getHttp-{int(time.time())}',
+    cache_key_fn=lambda _, args: f'getHttp-{args["url"]}',
+    cache_expiration=datetime.timedelta(seconds=86400),
+    tags=['city'],  # todo ratelimit based on tag
+)
+def getCityMutableJson(url: Url):
+    create_link_artifact("get", url)
+    req = httpx.get(url)  # todo async
+    req.raise_for_status()
+    return req.json()
+
+
+@task(task_run_name=lambda: f'getHttp-{int(time.time())}',
+      cache_key_fn=lambda _, args: f'getHttp-{args["url"]}',
+      tags=['city'])
+def getCityPermanent(url: Url) -> str:
+    create_link_artifact("get", url)
+    req = httpx.get(url)
+    req.raise_for_status()
+    return req.text
+
+
+@task
+def getYoutubePermanent(url: str):
+    time.sleep(5)
+    return 'video' * 10000
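Since both city tasks compute their cache key as getHttp-{url}, a second call with the same URL inside a flow resolves from Prefect's result cache instead of re-fetching; a small sketch under that assumption (the URL is hypothetical):

    from prefect import flow

    from scobot.index.download_tasks import getCityPermanent

    @flow
    def demo():
        first = getCityPermanent('https://example.org/agenda.html')   # HTTP fetch
        again = getCityPermanent('https://example.org/agenda.html')   # cache hit
        return len(first), len(again)

    if __name__ == '__main__':
        demo()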
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/scobot/index/schema.py	Thu Jul 11 18:15:44 2024 -0700
@@ -0,0 +1,8 @@
+from whoosh.fields import ID, TEXT, Schema
+
+schema = Schema(
+    sourceUrl=ID(stored=True),
+    sourceTitle=TEXT(stored=True),
+    posJson=ID(stored=True),
+    phrase=TEXT(stored=True),
+)
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/scobot/local_types.py	Thu Jul 11 18:15:44 2024 -0700
@@ -0,0 +1,5 @@
+from typing import NewType
+
+
+Url = NewType('Url', str)
+MeetingRow = NewType('MeetingRow', dict)
\ No newline at end of file
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/scobot/service/query.py	Thu Jul 11 18:15:44 2024 -0700
@@ -0,0 +1,70 @@
+import json
+from pathlib import Path
+
+# from pymilvus import MilvusClient
+# from milvus_model.dense.onnx import OnnxEmbeddingFunction
+from fastapi import FastAPI
+from tqdm import tqdm
+
+# note: phrasesFromFile is not defined in this module (it lived in search/extract_pdf.py), so rebuild() is dead code until that's ported
+def rebuild(client, embedding_fn, dim):
+    client.drop_collection(collection_name="demo_collection")
+    if not client.has_collection(collection_name="demo_collection"):
+        client.create_collection(
+            collection_name="demo_collection",
+            dimension=dim,
+        )
+
+    docs = []
+    for i, (bbox, phrase) in tqdm(enumerate(
+            phrasesFromFile(
+                Path("data") /
+                "Meetings2226Minutes_20240702182359526 (1).pdf")),
+                                  desc="rebuilding",
+                                  unit=' phrase'):
+        [vector] = embedding_fn.encode_documents([phrase])
+        doc = {
+            "id": i,
+            "vector": vector,
+            "text": phrase,
+            "bbox": json.dumps(bbox),
+        }
+        docs.append(doc)
+    res = client.insert(collection_name="demo_collection", data=docs)
+    print('insert:', res['insert_count'])
+
+
+def xxsearch(q, embedding_fn, client):
+    query_vectors = embedding_fn.encode_queries([q])
+
+    [query_result] = client.search(
+        collection_name="demo_collection",
+        data=query_vectors,
+        limit=5,
+        output_fields=["text"],
+    )
+    query_result.sort(key=lambda x: x["distance"], reverse=True)
+
+    for row in query_result:
+        print(f'{row["distance"]:.6f} {row["entity"]["text"]}')
+
+
+# q, = sys.argv[1:]
+
+# https://huggingface.co/models?pipeline_tag=feature-extraction&library=onnx&sort=trending
+# embedding_fn = OnnxEmbeddingFunction(model_name="jinaai/jina-embeddings-v2-base-en")
+# client = MilvusClient("milvus_demo.db")
+# rebuild(client, embedding_fn, dim=embedding_fn.dim)
+# search(q, embedding_fn, client)
+
+app = FastAPI()
+
+# search = Search()  # todo: port Search (was search/search_apex.py) into scobot
+
+
+@app.get("/sco/query")
+def read_query1(q: str):
+    # 'search' is not wired up yet (see todo above), so this endpoint
+    # raises NameError until Search is ported into scobot.
+    results = list(search.search(q))
+    return {"results": results}
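Once Search is wired up, the endpoint can be exercised with httpx (already a project dependency); a client-side sketch assuming the service listens on port 8001 as in k8s/deploy.yaml:

    import httpx

    resp = httpx.get('http://127.0.0.1:8001/sco/query', params={'q': 'sidewalk'})
    resp.raise_for_status()
    for row in resp.json()['results']:
        print(row)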
--- a/search/Dockerfile	Thu Jul 11 17:35:31 2024 -0700
+++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
@@ -1,10 +0,0 @@
-FROM reg:5000/base_basic
-
-WORKDIR /opt
-
-COPY pyproject.toml pdm.lock ./
-RUN pdm sync
-
-RUN pdm run python -c 'import nltk; nltk.download("punkt")'
-
-COPY search/** ./search/
\ No newline at end of file
--- a/search/doc.py	Thu Jul 11 17:35:31 2024 -0700
+++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
@@ -1,15 +0,0 @@
-from dataclasses import dataclass
-
-
-@dataclass
-class Doc:
-    id: str
-    title: str
-    sourceFile: str
-    posJson: str
-    phrase: str
-
-    def __getitem__(self, k):
-        return getattr(self, k)
-
-    pop = __getitem__
--- a/search/extract_agenda.py	Thu Jul 11 17:35:31 2024 -0700
+++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
@@ -1,33 +0,0 @@
-import json
-import re
-from pathlib import Path
-from typing import Iterable
-
-import nltk
-from doc import Doc
-
-
-def files() -> Iterable[Path]:
-    for p in Path('data/albany/meetingId').glob('*/agenda.json'):
-        print(f'file {p}')
-        yield p
-
-
-def phrasesFromFile(p: Path) -> Iterable[Doc]:
-    mtg = json.loads(p.read_text())
-    print(f'  has {len(mtg["phrases"])} phrases')
-    text = ' '.join(mtg['phrases'])
-
-    i = 0
-    for sentence in nltk.sent_tokenize(text):
-        sentence = re.sub(r'\s+', ' ', sentence).strip()
-        if len(sentence) < 5:
-            continue
-        if not re.search(r'\w\w\w\w\w', sentence):
-            continue
-
-        yield Doc(id=f"{mtg['mtg']['id']}_sentence{i}",
-                  title=f"{mtg['mtg']['date']} {mtg['mtg']['title']}",
-                  sourceFile=str(p),
-                  posJson="[]",
-                  phrase=sentence)
--- a/search/extract_all.py	Thu Jul 11 17:35:31 2024 -0700
+++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
@@ -1,14 +0,0 @@
-from typing import Iterable
-
-import extract_agenda
-import extract_pdf
-from doc import Doc
-
-
-def allDocs() -> Iterable[Doc]:
-    for mod in [
-            #extract_pdf,
-            extract_agenda
-    ]:
-        for src in mod.files():
-            yield from mod.phrasesFromFile(src)
--- a/search/extract_pdf.py	Thu Jul 11 17:35:31 2024 -0700
+++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
@@ -1,36 +0,0 @@
-import re
-from pathlib import Path
-from typing import Iterable
-
-import nltk
-from pdfminer.converter import PDFPageAggregator
-from pdfminer.layout import LAParams, LTTextBox
-from pdfminer.pdfinterp import PDFPageInterpreter, PDFResourceManager
-from pdfminer.pdfpage import PDFPage
-
-def files() -> Iterable[Path]:
-    for p in Path('data').glob('*.pdf'):
-        yield p
-
-def phrasesFromFile(p: Path) -> Iterable:
-    fp = open(p, 'rb')
-    rsrcmgr = PDFResourceManager()
-    laparams = LAParams()
-    device = PDFPageAggregator(rsrcmgr, laparams=laparams)
-    interpreter = PDFPageInterpreter(rsrcmgr, device)
-    pages = PDFPage.get_pages(fp)
-
-    for page in pages:
-        interpreter.process_page(page)
-        layout = device.get_result()
-        for lobj in layout:
-            if isinstance(lobj, LTTextBox):
-                text = lobj.get_text()
-                for sentence in nltk.sent_tokenize(text):
-                    sentence = re.sub(r'\s+', ' ', sentence).strip()
-                    if len(sentence) < 5:
-                        continue
-                    if not re.search(r'\w\w\w\w\w', sentence):
-                        continue
-
-                    yield list(map(int, lobj.bbox)), sentence
--- a/search/query.py	Thu Jul 11 17:35:31 2024 -0700
+++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
@@ -1,80 +0,0 @@
-from dataclasses import dataclass
-import html
-import json
-from pprint import pprint
-import sys
-from pathlib import Path
-from typing import Iterable
-
-from tqdm import tqdm
-
-from pymilvus import MilvusClient
-from milvus_model.dense.onnx import OnnxEmbeddingFunction
-
-from extract_pdf import files, phrasesFromFile
-
-from fastapi import FastAPI
-from search_apex import Search
-
-
-def rebuild(client, embedding_fn, dim):
-    client.drop_collection(collection_name="demo_collection")
-    if not client.has_collection(collection_name="demo_collection"):
-        client.create_collection(
-            collection_name="demo_collection",
-            dimension=dim,
-        )
-
-    docs = []
-    for i, (bbox, phrase) in tqdm(enumerate(
-            phrasesFromFile(
-                Path("data") /
-                "Meetings2226Minutes_20240702182359526 (1).pdf")),
-                                  desc="rebuilding",
-                                  unit=' phrase'):
-        [vector] = embedding_fn.encode_documents([phrase])
-        doc = {
-            "id": i,
-            "vector": vector,
-            "text": phrase,
-            "bbox": json.dumps(bbox),
-        }
-        docs.append(doc)
-    res = client.insert(collection_name="demo_collection", data=docs)
-    print('insert:', res['insert_count'])
-
-
-def xxsearch(q, embedding_fn, client):
-    query_vectors = embedding_fn.encode_queries([q])
-
-    [query_result] = client.search(
-        collection_name="demo_collection",
-        data=query_vectors,
-        limit=5,
-        output_fields=["text"],
-    )
-    query_result.sort(key=lambda x: x["distance"], reverse=True)
-
-    for row in query_result:
-        print(f'{row["distance"]:.6f} {row["entity"]["text"]}')
-
-
-# q, = sys.argv[1:]
-
-# https://huggingface.co/models?pipeline_tag=feature-extraction&library=onnx&sort=trending
-# embedding_fn = OnnxEmbeddingFunction(model_name="jinaai/jina-embeddings-v2-base-en")
-# client = MilvusClient("milvus_demo.db")
-# rebuild(client, embedding_fn, dim=embedding_fn.dim)
-# search(q, embedding_fn, client)
-
-app = FastAPI()
-
-search = Search()
-
-
-@app.get("/sco/query")
-def read_query1(q: str):
-    results = []
-    results = search.search(q)
-
-    return {"results": results}
--- a/search/search_apex.py	Thu Jul 11 17:35:31 2024 -0700
+++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
@@ -1,30 +0,0 @@
-from pprint import pprint
-from typing import Iterable
-from apexsearch import ApexSearch
-
-
-class Search:
-
-    def __init__(self):
-        self.apex = ApexSearch('data/apex',
-                               tables={
-                                   "docs": {
-                                       "content": ["phrase"],
-                                       "title": "title",
-                                       "extras": ["sourceFile", "pos"],
-                                   }
-                               },
-                               id_field='id')
-
-    def rebuild(self, docs: Iterable):
-        self.apex.build_complete_index(lambda *a: docs)
-        print('rebuild complete')
-
-    def search(self, q: str):
-        res = self.apex.search(q, target_number=100)
-        pprint(res)
-        for row in res['results']:
-            yield {
-                'title': row['title'],
-                'snippetHtml': row['highlighted_content']
-            }
--- a/search/search_apex_rebuild.py	Thu Jul 11 17:35:31 2024 -0700
+++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
@@ -1,10 +0,0 @@
-
-import subprocess
-from search_apex import Search
-from extract_all import allDocs
-
-subprocess.check_call('rm -rf data/apex', shell=True)
-subprocess.check_call('mkdir data/apex', shell=True)
-search = Search()
-search.rebuild(allDocs())
-subprocess.check_call('chmod -R a+rw data/apex', shell=True)
--- a/skaffold.yaml	Thu Jul 11 17:35:31 2024 -0700
+++ b/skaffold.yaml	Thu Jul 11 18:15:44 2024 -0700
@@ -4,22 +4,20 @@
   name: sco-bot
 build:
   artifacts:
-    - context: .
-      docker:
-        dockerfile: web/Dockerfile
+    - docker:
+        dockerfile: Dockerfile.web
       image: reg:5000/sco_bot_web
       platforms: [amd64]
       sync:
         infer:
           - src/**
-    - context: .
-      docker:
-        dockerfile: search/Dockerfile
-      image: reg:5000/sco_bot_search
+    - docker:
+        dockerfile: Dockerfile.server
+      image: reg:5000/sco_bot_server
       platforms: [amd64]
       sync:
         infer:
-          - 'search/**'
+          - 'scobot/**'
   tagPolicy:
     dateTime:
       format: 2006-01-02_15-04-05
--- a/web/Dockerfile	Thu Jul 11 17:35:31 2024 -0700
+++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
@@ -1,9 +0,0 @@
-FROM reg:5000/base_basic
-
-WORKDIR /opt
-
-COPY web/package.json web/pnpm-lock.yaml ./
-RUN pnpm install
-
-COPY web/vite.config.ts web/tsconfig.json ./
-COPY web/src/ ./src/