annotate extract_pdf.py @ 0:ca5da75f03ee

start
author drewp@bigasterisk.com
date Wed, 03 Jul 2024 19:16:28 -0700
parents
children c2176e9a2696
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
0
drewp@bigasterisk.com
parents:
diff changeset
1 from pathlib import Path
drewp@bigasterisk.com
parents:
diff changeset
2 from pprint import pprint
drewp@bigasterisk.com
parents:
diff changeset
3 from typing import Iterable, Sequence
drewp@bigasterisk.com
parents:
diff changeset
4 from pdfminer.layout import LAParams, LTTextBox
drewp@bigasterisk.com
parents:
diff changeset
5 from pdfminer.pdfpage import PDFPage
drewp@bigasterisk.com
parents:
diff changeset
6 from pdfminer.pdfinterp import PDFResourceManager
drewp@bigasterisk.com
parents:
diff changeset
7 from pdfminer.pdfinterp import PDFPageInterpreter
drewp@bigasterisk.com
parents:
diff changeset
8 from pdfminer.converter import PDFPageAggregator
drewp@bigasterisk.com
parents:
diff changeset
9
drewp@bigasterisk.com
parents:
diff changeset
10
drewp@bigasterisk.com
parents:
diff changeset
11
drewp@bigasterisk.com
parents:
diff changeset
12
drewp@bigasterisk.com
parents:
diff changeset
def phrasesFromFile(p: Path) -> Iterable:
    """Yield the text boxes found on every page of the PDF at *p*.

    Each page is run through pdfminer's layout analysis using default
    ``LAParams``; only top-level layout objects that are ``LTTextBox``
    instances are reported, in the order the layout yields them.

    Args:
        p: Path of the PDF file to read.

    Yields:
        ``(bbox, text)`` tuples, where ``bbox`` is the box's ``bbox``
        coordinates each converted with ``int()`` (so fractional parts are
        dropped) and ``text`` is the result of the box's ``get_text()``.

    This is a generator: nothing is opened or parsed until iteration
    starts, and the file stays open only while iteration is in progress.
    """
    rsrcmgr = PDFResourceManager()
    device = PDFPageAggregator(rsrcmgr, laparams=LAParams())
    interpreter = PDFPageInterpreter(rsrcmgr, device)

    # `with` guarantees the file handle is released when the generator is
    # exhausted, closed early by the caller, or unwound by a parse error.
    with open(p, 'rb') as fp:
        for page in PDFPage.get_pages(fp):
            interpreter.process_page(page)
            # The aggregator exposes the layout of the page just processed.
            for lobj in device.get_result():
                if isinstance(lobj, LTTextBox):
                    yield list(map(int, lobj.bbox)), lobj.get_text()