8
|
1
|
|
2 from dataclasses import dataclass
|
|
3 import json
|
|
4 from typing import Iterable
|
|
5
|
|
6 from search.extract_pdf import files, phrasesFromFile
|
|
7
|
|
8
|
|
@dataclass
class Doc:
    """One indexed phrase together with where it came from.

    Besides normal attribute access, fields can be read by name with
    subscript syntax (``doc["title"]``) or via ``doc.pop("title")``.
    """

    # Fields are declared in this order; it is also the positional
    # argument order of the generated ``__init__``.
    id: int
    title: str
    sourceFile: str
    posJson: str
    phrase: str

    def __getitem__(self, k):
        """Return the value of the attribute named ``k``.

        An unknown name raises ``AttributeError`` (from ``getattr``),
        not ``KeyError``.
        """
        return getattr(self, k)

    # ``pop`` is the very same function as ``__getitem__``: it only reads
    # the field and never removes it from the instance.
    # NOTE(review): presumably present so dict-style consumers can call
    # ``.pop(name)`` on a Doc -- confirm against the callers.
    pop = __getitem__
|
|
21
|
|
22
|
|
def allDocs() -> Iterable[Doc]:
    """Lazily yield one ``Doc`` per phrase of every file from ``files()``.

    Ids are consecutive integers starting at 0 and keep counting across
    files, in the order the phrases are produced.
    """
    # Flatten (file, phrase) pairs lazily so that enumerate can number
    # them without a hand-maintained counter.
    # NOTE(review): each item from files() must expose ``.name`` and be
    # convertible with ``str()`` -- presumably a pathlib.Path; confirm.
    phrases = (
        (path, position, text)
        for path in files()
        for position, text in phrasesFromFile(path)
    )
    for doc_id, (path, position, text) in enumerate(phrases):
        yield Doc(
            id=doc_id,
            title=path.name,
            sourceFile=str(path),
            # The position is stored as JSON text, not as the raw object.
            posJson=json.dumps(position),
            phrase=text,
        )
|