diff extract_pdf.py @ 1:c2176e9a2696

split by sentences
author drewp@bigasterisk.com
date Wed, 03 Jul 2024 20:19:57 -0700
parents ca5da75f03ee
children
line wrap: on
line diff
--- a/extract_pdf.py	Wed Jul 03 19:16:28 2024 -0700
+++ b/extract_pdf.py	Wed Jul 03 20:19:57 2024 -0700
@@ -1,13 +1,12 @@
+import re
 from pathlib import Path
-from pprint import pprint
-from typing import Iterable, Sequence
+from typing import Iterable
+
+import nltk
+from pdfminer.converter import PDFPageAggregator
 from pdfminer.layout import LAParams, LTTextBox
+from pdfminer.pdfinterp import PDFPageInterpreter, PDFResourceManager
 from pdfminer.pdfpage import PDFPage
-from pdfminer.pdfinterp import PDFResourceManager
-from pdfminer.pdfinterp import PDFPageInterpreter
-from pdfminer.converter import PDFPageAggregator
-
-
 
 
 def phrasesFromFile(p: Path) -> Iterable:
@@ -24,5 +23,9 @@
         for lobj in layout:
             if isinstance(lobj, LTTextBox):
                 text = lobj.get_text()
-                
-                yield list(map(int, lobj.bbox)), text
\ No newline at end of file
+                for sentence in nltk.sent_tokenize(text):
+                    sentence = re.sub(r'\s+', ' ', sentence).strip()
+                    if len(sentence) < 5:
+                        continue
+
+                    yield list(map(int, lobj.bbox)), sentence