diff search/extract_pdf.py @ 4:0e33c65f1904

playing with extractors
author drewp@bigasterisk.com
date Sat, 06 Jul 2024 16:42:36 -0700
parents extract_pdf.py@c2176e9a2696
children f23b21bd0fce
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/search/extract_pdf.py	Sat Jul 06 16:42:36 2024 -0700
@@ -0,0 +1,52 @@
+import re
+from pathlib import Path
+from typing import Iterable
+
+import nltk
+from pdfminer.converter import PDFPageAggregator
+from pdfminer.layout import LAParams, LTTextBox
+from pdfminer.pdfinterp import PDFPageInterpreter, PDFResourceManager
+from pdfminer.pdfpage import PDFPage
+
+
+def phrasesFromFile(p: Path) -> Iterable:
+    """Yield ``(bbox, sentence)`` pairs for the text of the PDF at `p`.
+
+    Each page is laid out with pdfminer's default `LAParams`. Every
+    `LTTextBox` in the layout is split into sentences with
+    `nltk.sent_tokenize`, and each kept sentence is yielded together with
+    the bounding box of the text box it came from.
+
+    Args:
+        p: path of the PDF file to read.
+
+    Yields:
+        ``(bbox, sentence)``: `bbox` is the text box's `bbox` with every
+        coordinate passed through `int()`; `sentence` has each run of
+        whitespace collapsed to a single space and is stripped. Sentences
+        from the same text box all carry that box's coordinates.
+        Sentences lacking five consecutive word characters are skipped.
+    """
+    # Hold the file in a `with` block so the handle is released when the
+    # generator is exhausted or closed, instead of leaking it.
+    with open(p, 'rb') as fp:
+        rsrcmgr = PDFResourceManager()
+        device = PDFPageAggregator(rsrcmgr, laparams=LAParams())
+        interpreter = PDFPageInterpreter(rsrcmgr, device)
+
+        for page in PDFPage.get_pages(fp):
+            interpreter.process_page(page)
+            layout = device.get_result()
+            for lobj in layout:
+                if not isinstance(lobj, LTTextBox):
+                    continue
+                for sentence in nltk.sent_tokenize(lobj.get_text()):
+                    sentence = re.sub(r'\s+', ' ', sentence).strip()
+                    # Cheap length test before running the regex search.
+                    if len(sentence) < 5:
+                        continue
+                    # Keep only sentences with a run of five word chars.
+                    if not re.search(r'\w\w\w\w\w', sentence):
+                        continue
+
+                    yield list(map(int, lobj.bbox)), sentence