1
|
1 import re
|
0
|
2 from pathlib import Path
|
1
|
3 from typing import Iterable
|
|
4
|
|
5 import nltk
|
|
6 from pdfminer.converter import PDFPageAggregator
|
0
|
7 from pdfminer.layout import LAParams, LTTextBox
|
1
|
8 from pdfminer.pdfinterp import PDFPageInterpreter, PDFResourceManager
|
0
|
9 from pdfminer.pdfpage import PDFPage
|
|
10
|
8
|
def files() -> Iterable[Path]:
    """Yield every ``*.pdf`` path found directly inside the ``data`` directory.

    ``data`` is resolved relative to the current working directory at the
    time the generator is consumed. Only the top level of the directory is
    searched (the pattern is not recursive).

    Yields:
        One ``Path`` per matching file, in sorted order so that repeated
        runs visit the files in the same sequence regardless of the order
        the filesystem happens to list them in.
    """
    yield from sorted(Path('data').glob('*.pdf'))
|
0
|
14
|
|
def phrasesFromFile(p: Path) -> Iterable:
    """Yield ``(bbox, sentence)`` pairs for the text boxes of the PDF at *p*.

    Each page is laid out with pdfminer using default ``LAParams``; every
    ``LTTextBox`` on the page has its text split into sentences with
    ``nltk.sent_tokenize``. Runs of whitespace inside a sentence are
    collapsed to a single space and the result is stripped.

    Sentences are skipped when they are shorter than five characters or do
    not contain at least five consecutive word characters.

    Args:
        p: Path of the PDF file to read.

    Yields:
        A tuple ``(bbox, sentence)`` where ``bbox`` is a fresh list holding
        the text box's ``bbox`` values converted with ``int()`` and
        ``sentence`` is the normalized sentence string.
    """
    # Compile once per call instead of once per sentence.
    whitespace_run = re.compile(r'\s+')
    five_word_chars = re.compile(r'\w\w\w\w\w')

    rsrcmgr = PDFResourceManager()
    device = PDFPageAggregator(rsrcmgr, laparams=LAParams())
    interpreter = PDFPageInterpreter(rsrcmgr, device)

    # The context manager guarantees the file handle is released when the
    # generator is exhausted, closed, or aborted by an exception; the handle
    # stays open while the generator is suspended between yields.
    with open(p, 'rb') as fp:
        for page in PDFPage.get_pages(fp):
            interpreter.process_page(page)
            for lobj in device.get_result():
                if not isinstance(lobj, LTTextBox):
                    continue
                for sentence in nltk.sent_tokenize(lobj.get_text()):
                    sentence = whitespace_run.sub(' ', sentence).strip()
                    if len(sentence) < 5:
                        continue
                    if not five_word_chars.search(sentence):
                        continue
                    # Build a new list per yield so callers never share one
                    # mutable bbox between sentences of the same box.
                    yield list(map(int, lobj.bbox)), sentence
|