Mercurial > code > home > repos > sco-bot
view extract_pdf.py @ 0:ca5da75f03ee
start
author | drewp@bigasterisk.com |
---|---|
date | Wed, 03 Jul 2024 19:16:28 -0700 |
parents | |
children | c2176e9a2696 |
line wrap: on
line source
from pathlib import Path
from pprint import pprint
from typing import Iterable, Sequence

from pdfminer.converter import PDFPageAggregator
from pdfminer.layout import LAParams, LTTextBox
from pdfminer.pdfinterp import PDFPageInterpreter
from pdfminer.pdfinterp import PDFResourceManager
from pdfminer.pdfpage import PDFPage


def phrasesFromFile(p: Path) -> Iterable:
    """Yield the text boxes found in the PDF at *p*, page by page.

    Each page is run through pdfminer's layout analysis with default
    ``LAParams``. Only the objects directly contained in the page layout
    are inspected, and only those that are ``LTTextBox`` instances are
    reported.

    Args:
        p: Path of the PDF file to read.

    Yields:
        ``(bbox, text)`` pairs, where ``bbox`` is a list holding the box's
        ``bbox`` coordinates converted with ``int()`` (fractional parts are
        truncated) and ``text`` is the string returned by the box's
        ``get_text()``.

    Note:
        This is a generator: the file is opened on the first iteration and
        stays open until the generator is exhausted or closed, at which
        point the ``with`` block releases the handle.
    """
    rsrcmgr = PDFResourceManager()
    laparams = LAParams()
    device = PDFPageAggregator(rsrcmgr, laparams=laparams)
    interpreter = PDFPageInterpreter(rsrcmgr, device)

    # Use a context manager so the file handle is not leaked; the original
    # opened the file and never closed it.
    with open(p, 'rb') as fp:
        for page in PDFPage.get_pages(fp):
            interpreter.process_page(page)
            # The aggregator device holds the layout of the page that was
            # just processed.
            layout = device.get_result()
            for lobj in layout:
                if isinstance(lobj, LTTextBox):
                    text = lobj.get_text()
                    yield [int(coord) for coord in lobj.bbox], text