annotate scobot/index/access.py @ 13:403eff4a16c8

fix up indexer flow and fastapi server
author drewp@bigasterisk.com
date Thu, 11 Jul 2024 21:32:24 -0700
parents 6622bacb0b84
children 6ed25bcaaf1f
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
10
13438795d896 rewrite with prefect flows and whoosh search, but it's in a nested pdm env
drewp@bigasterisk.com
parents:
diff changeset
1 from pathlib import Path
13
403eff4a16c8 fix up indexer flow and fastapi server
drewp@bigasterisk.com
parents: 11
diff changeset
2 import shutil
10
13438795d896 rewrite with prefect flows and whoosh search, but it's in a nested pdm env
drewp@bigasterisk.com
parents:
diff changeset
3
13
403eff4a16c8 fix up indexer flow and fastapi server
drewp@bigasterisk.com
parents: 11
diff changeset
4 from whoosh.index import create_in, open_dir
10
13438795d896 rewrite with prefect flows and whoosh search, but it's in a nested pdm env
drewp@bigasterisk.com
parents:
diff changeset
5
11
6622bacb0b84 first pass at reorg
drewp@bigasterisk.com
parents: 10
diff changeset
6 from scobot.index.schema import schema
10
13438795d896 rewrite with prefect flows and whoosh search, but it's in a nested pdm env
drewp@bigasterisk.com
parents:
diff changeset
7
13438795d896 rewrite with prefect flows and whoosh search, but it's in a nested pdm env
drewp@bigasterisk.com
parents:
diff changeset
log = None  # set by flow; commit() uses a stdlib logger while this is still None


class SearchIndex:
    """Writable handle on the whoosh index stored in a directory.

    A single writer is opened at construction time. Documents passed to
    addDoc() go to that writer and are persisted by commit().
    """

    def __init__(self, indexDir: Path, delete_existing=True):
        """Create or open the index in indexDir and open a writer on it.

        Args:
            indexDir: directory that holds (or will hold) the index files.
            delete_existing: when true (the default), remove indexDir and
                create a new index there using `schema`; when false, open
                the index that is already stored in indexDir.
        """
        if delete_existing:
            # On a first run the directory does not exist yet. rmtree would
            # raise FileNotFoundError even though mkdir below is ready to
            # create it, so only that one error is tolerated here.
            try:
                shutil.rmtree(indexDir)
            except FileNotFoundError:
                pass
            indexDir.mkdir(parents=True, exist_ok=True)
            self.ix = create_in(indexDir, schema)
        else:
            self.ix = open_dir(indexDir)
        self.writer = self.ix.writer()

    def addDoc(self, **kw):
        """Add one document to the open writer; kw is passed through as-is."""
        self.writer.add_document(**kw)

    def commit(self):
        """Commit the writer, then log the index's document count.

        NOTE(review): whoosh writers appear to be single-use, so addDoc()
        after commit() presumably fails on the closed writer -- confirm.
        """
        self.writer.commit()
        with self.ix.searcher() as searcher:
            doc_count = searcher.doc_count()

        # The module-level `log` stays None unless a flow has assigned it.
        # The commit above has already succeeded at this point, so fall back
        # to a stdlib logger instead of raising AttributeError on None.
        logger = log
        if logger is None:
            import logging
            logger = logging.getLogger(__name__)
        logger.info(f'index doc count = {doc_count}')
13
403eff4a16c8 fix up indexer flow and fastapi server
drewp@bigasterisk.com
parents: 11
diff changeset
29
403eff4a16c8 fix up indexer flow and fastapi server
drewp@bigasterisk.com
parents: 11
diff changeset
class SearchIndexRO:
    """Read-only handle on an existing index, with a searcher kept open."""

    def __init__(self, indexDir: Path):
        """Open the index in indexDir read-only and print its doc count.

        Args:
            indexDir: directory containing an already-built index.
        """
        index = open_dir(indexDir, readonly=True)
        self.ix = index
        # The searcher is stored on the instance; nothing visible here
        # closes it.
        self.searcher = index.searcher()
        print(f'{self.searcher.doc_count()=}')