comparison extract_pdf.py @ 0:ca5da75f03ee

start
author drewp@bigasterisk.com
date Wed, 03 Jul 2024 19:16:28 -0700
parents
children c2176e9a2696
comparison
equal deleted inserted replaced
-1:000000000000 0:ca5da75f03ee
1 from pathlib import Path
2 from pprint import pprint
3 from typing import Iterable, Sequence
4 from pdfminer.layout import LAParams, LTTextBox
5 from pdfminer.pdfpage import PDFPage
6 from pdfminer.pdfinterp import PDFResourceManager
7 from pdfminer.pdfinterp import PDFPageInterpreter
8 from pdfminer.converter import PDFPageAggregator
9
10
11
12
def phrasesFromFile(p: Path) -> Iterable:
    """Yield the text boxes found in the PDF file at `p`, page by page.

    Each page is run through a pdfminer layout aggregator (default
    `LAParams`), and every top-level layout object that is an `LTTextBox`
    produces one item. Other layout objects are skipped.

    Args:
        p: Path of the PDF file to read.

    Yields:
        `(bbox, text)` tuples, where `bbox` is the list of the box's `bbox`
        values each converted with `int()`, and `text` is the result of the
        box's `get_text()`.

    Raises:
        OSError: if the file cannot be opened. Because this is a generator,
            the error surfaces on the first iteration, not at call time.
    """
    rsrcmgr = PDFResourceManager()
    device = PDFPageAggregator(rsrcmgr, laparams=LAParams())
    interpreter = PDFPageInterpreter(rsrcmgr, device)

    # Keep the file open for as long as pages are being pulled from it, and
    # guarantee it is closed when the generator finishes, is closed early,
    # or raises.
    with open(p, 'rb') as fp:
        for page in PDFPage.get_pages(fp):
            interpreter.process_page(page)
            layout = device.get_result()
            for lobj in layout:
                if isinstance(lobj, LTTextBox):
                    text = lobj.get_text()
                    yield list(map(int, lobj.bbox)), text