Mercurial > code > home > repos > sco-bot
comparison extract_pdf.py @ 0:ca5da75f03ee
start
author | drewp@bigasterisk.com |
---|---|
date | Wed, 03 Jul 2024 19:16:28 -0700 |
parents | |
children | c2176e9a2696 |
comparison
equal
deleted
inserted
replaced
-1:000000000000 | 0:ca5da75f03ee |
---|---|
1 from pathlib import Path | |
2 from pprint import pprint | |
3 from typing import Iterable, Sequence | |
4 from pdfminer.layout import LAParams, LTTextBox | |
5 from pdfminer.pdfpage import PDFPage | |
6 from pdfminer.pdfinterp import PDFResourceManager | |
7 from pdfminer.pdfinterp import PDFPageInterpreter | |
8 from pdfminer.converter import PDFPageAggregator | |
9 | |
10 | |
11 | |
12 | |
def phrasesFromFile(p: Path) -> Iterable:
    """Yield the text boxes found in the PDF at *p*, page by page.

    Each yielded item is a ``(bbox, text)`` pair: ``bbox`` is the text
    box's ``bbox`` with every coordinate truncated to ``int``, returned
    as a list, and ``text`` is the result of the box's ``get_text()``.
    Layout objects that are not ``LTTextBox`` instances are skipped.

    This is a generator: the file stays open while it is being consumed
    and is closed once the generator is exhausted, closed, or collected.
    """
    rsrcmgr = PDFResourceManager()
    device = PDFPageAggregator(rsrcmgr, laparams=LAParams())
    interpreter = PDFPageInterpreter(rsrcmgr, device)

    # Pages are read from the open file as they are iterated, so the
    # handle must outlive the loop; `with` guarantees it is released
    # afterwards instead of leaking.
    with open(p, 'rb') as fp:
        for page in PDFPage.get_pages(fp):
            interpreter.process_page(page)
            layout = device.get_result()
            for lobj in layout:
                if not isinstance(lobj, LTTextBox):
                    continue
                yield list(map(int, lobj.bbox)), lobj.get_text()