diff extract_pdf.py @ 0:ca5da75f03ee

start
author drewp@bigasterisk.com
date Wed, 03 Jul 2024 19:16:28 -0700
parents
children c2176e9a2696
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/extract_pdf.py	Wed Jul 03 19:16:28 2024 -0700
@@ -0,0 +1,28 @@
+from pathlib import Path
+from pprint import pprint
+from typing import Iterable, Sequence
+from pdfminer.layout import LAParams, LTTextBox
+from pdfminer.pdfpage import PDFPage
+from pdfminer.pdfinterp import PDFResourceManager
+from pdfminer.pdfinterp import PDFPageInterpreter
+from pdfminer.converter import PDFPageAggregator
+
+
+
+
+def phrasesFromFile(p: Path) -> Iterable:
+    """Yield a (bbox, text) pair for each LTTextBox found in the PDF at `p`.
+
+    `bbox` is a list of the box's `bbox` values converted with int(). The file
+    stays open while pages are consumed and is closed when the generator ends.
+    """
+    rsrcmgr = PDFResourceManager()
+    device = PDFPageAggregator(rsrcmgr, laparams=LAParams())
+    interpreter = PDFPageInterpreter(rsrcmgr, device)
+    # `with` closes the file once iteration ends or the generator is closed.
+    with open(p, 'rb') as fp:
+        for page in PDFPage.get_pages(fp):
+            interpreter.process_page(page)
+            for lobj in device.get_result():
+                if isinstance(lobj, LTTextBox):
+                    yield list(map(int, lobj.bbox)), lobj.get_text()
\ No newline at end of file