Mercurial > code > home > repos > sco-bot
diff extract_pdf.py @ 0:ca5da75f03ee
start
author | drewp@bigasterisk.com |
---|---|
date | Wed, 03 Jul 2024 19:16:28 -0700 |
parents | |
children | c2176e9a2696 |
line wrap: on
line diff
# extract_pdf.py -- added as a new file in this changeset (diff against /dev/null).
from pathlib import Path
from pprint import pprint
from typing import Iterable, Sequence

from pdfminer.layout import LAParams, LTTextBox
from pdfminer.pdfpage import PDFPage
from pdfminer.pdfinterp import PDFResourceManager
from pdfminer.pdfinterp import PDFPageInterpreter
from pdfminer.converter import PDFPageAggregator


def phrasesFromFile(p: Path) -> Iterable:
    """Lazily yield ``(bbox, text)`` for every text box in the PDF at *p*.

    Each page is run through a ``PDFPageAggregator`` built with default
    ``LAParams``; only ``LTTextBox`` layout objects are reported, in the
    order the layout yields them.

    Args:
        p: Path of the PDF file to read.

    Yields:
        A 2-tuple ``(bbox, text)`` where ``bbox`` is the text box's ``bbox``
        attribute with each coordinate converted by ``int()`` (fractional
        parts are truncated) and returned as a list, and ``text`` is the
        result of the box's ``get_text()``.

    Note:
        This is a generator: the file is opened on first iteration and is
        closed when the generator is exhausted, closed, or garbage-collected.
    """
    rsrcmgr = PDFResourceManager()
    device = PDFPageAggregator(rsrcmgr, laparams=LAParams())
    interpreter = PDFPageInterpreter(rsrcmgr, device)

    # Keep the handle inside a context manager so it is released even when the
    # consumer stops iterating early; the original never closed it.
    with open(p, 'rb') as fp:
        for page in PDFPage.get_pages(fp):
            interpreter.process_page(page)
            # The aggregator holds the layout of the page just processed.
            layout = device.get_result()
            for lobj in layout:
                if not isinstance(lobj, LTTextBox):
                    # Skip figures, lines, images and other non-text objects.
                    continue
                yield list(map(int, lobj.bbox)), lobj.get_text()