annotate scobot/index/access.py @ 18:a527228aa353 default tip

prefect use postgres
author drewp@bigasterisk.com
date Fri, 19 Jul 2024 21:01:09 -0700
parents 7a87ba2f00d9
children
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
16
7a87ba2f00d9 reformat, fix some types, make more async
drewp@bigasterisk.com
parents: 15
diff changeset
1 import logging
10
13438795d896 rewrite with prefect flows and whoosh search, but it's in a nested pdm env
drewp@bigasterisk.com
parents:
diff changeset
2 from pathlib import Path
13
403eff4a16c8 fix up indexer flow and fastapi server
drewp@bigasterisk.com
parents: 11
diff changeset
3 import shutil
16
7a87ba2f00d9 reformat, fix some types, make more async
drewp@bigasterisk.com
parents: 15
diff changeset
4 from typing import cast
10
13438795d896 rewrite with prefect flows and whoosh search, but it's in a nested pdm env
drewp@bigasterisk.com
parents:
diff changeset
5
13
403eff4a16c8 fix up indexer flow and fastapi server
drewp@bigasterisk.com
parents: 11
diff changeset
6 from whoosh.index import create_in, open_dir
10
13438795d896 rewrite with prefect flows and whoosh search, but it's in a nested pdm env
drewp@bigasterisk.com
parents:
diff changeset
7
11
6622bacb0b84 first pass at reorg
drewp@bigasterisk.com
parents: 10
diff changeset
8 from scobot.index.schema import schema
10
13438795d896 rewrite with prefect flows and whoosh search, but it's in a nested pdm env
drewp@bigasterisk.com
parents:
diff changeset
9
16
7a87ba2f00d9 reformat, fix some types, make more async
drewp@bigasterisk.com
parents: 15
diff changeset
# Module-level logger placeholder. It starts out as None (the cast only
# satisfies the type checker) and, per the original note, is assigned by the
# flow; calling log.info before that assignment would fail on None.
log = cast(logging.Logger, None) # set by flow
10
13438795d896 rewrite with prefect flows and whoosh search, but it's in a nested pdm env
drewp@bigasterisk.com
parents:
diff changeset
11
13438795d896 rewrite with prefect flows and whoosh search, but it's in a nested pdm env
drewp@bigasterisk.com
parents:
diff changeset
12
13438795d896 rewrite with prefect flows and whoosh search, but it's in a nested pdm env
drewp@bigasterisk.com
parents:
diff changeset
class SearchIndex:
    """Writable handle on the whoosh index stored in a directory.

    Construction either rebuilds the index directory from scratch or opens
    what is already there, and then obtains a writer from the index.
    """

    def __init__(self, indexDir: Path, delete_existing=True):
        """Open or (re)create the index at `indexDir` and get a writer.

        Args:
            indexDir: directory holding the index files.
            delete_existing: when true, remove the directory tree (errors
                ignored), recreate it, and create a fresh index using the
                module's `schema`; when false, open the existing index.
        """
        if not delete_existing:
            self.ix = open_dir(indexDir)
        else:
            # Start clean: drop any previous index files before creating.
            shutil.rmtree(indexDir, ignore_errors=True)
            indexDir.mkdir(parents=True, exist_ok=True)
            self.ix = create_in(indexDir, schema)
        self.writer = self.ix.writer()

    def addDoc(self, **kw):
        """Queue one document on the writer; keyword args are the fields."""
        fields = kw
        self.writer.add_document(**fields)

    def commit(self):
        """Commit the writer, then log the index's document count.

        NOTE(review): the writer obtained in __init__ is not replaced here;
        presumably it cannot be reused after commit — confirm against the
        whoosh docs before calling addDoc again on the same instance.
        """
        self.writer.commit()
        with self.ix.searcher() as reader:
            log.info(f'index doc count = {reader.doc_count()}')
13
403eff4a16c8 fix up indexer flow and fastapi server
drewp@bigasterisk.com
parents: 11
diff changeset
31
16
7a87ba2f00d9 reformat, fix some types, make more async
drewp@bigasterisk.com
parents: 15
diff changeset
32
13
403eff4a16c8 fix up indexer flow and fastapi server
drewp@bigasterisk.com
parents: 11
diff changeset
class SearchIndexRO:
    """Read-only handle on an existing whoosh index directory."""

    def __init__(self, indexDir: Path):
        """Open the index at `indexDir` read-only and keep a searcher.

        The searcher is stored on the instance as `self.searcher`; this
        class does not close it. The document count is printed to stdout
        as a startup diagnostic.
        """
        index = open_dir(indexDir, readonly=True)
        self.ix = index
        self.searcher = index.searcher()
        print(f'{self.searcher.doc_count()=}')