0
|
1 from pathlib import Path
|
|
2 from pprint import pprint
|
|
3 from typing import Iterable, Sequence
|
|
4 from pdfminer.layout import LAParams, LTTextBox
|
|
5 from pdfminer.pdfpage import PDFPage
|
|
6 from pdfminer.pdfinterp import PDFResourceManager
|
|
7 from pdfminer.pdfinterp import PDFPageInterpreter
|
|
8 from pdfminer.converter import PDFPageAggregator
|
|
9
|
|
10
|
|
11
|
|
12
|
|
def phrasesFromFile(p: Path) -> Iterable:
    """Lazily yield the text boxes found in the PDF file at *p*.

    Pages are processed one at a time, in the order produced by
    ``PDFPage.get_pages``. For every layout object on a page that is an
    ``LTTextBox``, one ``(bbox, text)`` pair is yielded:

    * ``bbox`` -- the box's ``bbox`` values converted with ``int()`` and
      collected in a list (fractional parts are truncated).
    * ``text`` -- the string returned by the box's ``get_text()``.

    The file is opened in binary mode when iteration starts and is closed
    as soon as the generator is exhausted, closed, or garbage-collected,
    so callers that stop iterating early do not leak the file handle.

    Raises:
        OSError: if *p* cannot be opened (e.g. ``FileNotFoundError``);
            raised on the first ``next()`` call, not at call time.
    """
    # Open the file first: a bad path fails before any parser state is built,
    # and the ``with`` block guarantees the handle is released on every exit.
    with open(p, 'rb') as fp:
        rsrcmgr = PDFResourceManager()
        device = PDFPageAggregator(rsrcmgr, laparams=LAParams())
        interpreter = PDFPageInterpreter(rsrcmgr, device)

        for page in PDFPage.get_pages(fp):
            interpreter.process_page(page)
            # The aggregator holds the layout of the page just processed.
            for lobj in device.get_result():
                if not isinstance(lobj, LTTextBox):
                    continue
                yield list(map(int, lobj.bbox)), lobj.get_text()