changeset 1:c2176e9a2696

split by sentences
author drewp@bigasterisk.com
date Wed, 03 Jul 2024 20:19:57 -0700
parents ca5da75f03ee
children 82428652cda1
files extract_pdf.py pdm.lock pyproject.toml
diffstat 3 files changed, 56 insertions(+), 10 deletions(-) [+]
line wrap: on
line diff
--- a/extract_pdf.py	Wed Jul 03 19:16:28 2024 -0700
+++ b/extract_pdf.py	Wed Jul 03 20:19:57 2024 -0700
@@ -1,13 +1,12 @@
+import re
 from pathlib import Path
-from pprint import pprint
-from typing import Iterable, Sequence
+from typing import Iterable
+
+import nltk
+from pdfminer.converter import PDFPageAggregator
 from pdfminer.layout import LAParams, LTTextBox
+from pdfminer.pdfinterp import PDFPageInterpreter, PDFResourceManager
 from pdfminer.pdfpage import PDFPage
-from pdfminer.pdfinterp import PDFResourceManager
-from pdfminer.pdfinterp import PDFPageInterpreter
-from pdfminer.converter import PDFPageAggregator
-
-
 
 
 def phrasesFromFile(p: Path) -> Iterable:
@@ -24,5 +23,9 @@
         for lobj in layout:
             if isinstance(lobj, LTTextBox):
                 text = lobj.get_text()
-                
-                yield list(map(int, lobj.bbox)), text
\ No newline at end of file
+                for sentence in nltk.sent_tokenize(text):
+                    sentence = re.sub(r'\s+', ' ', sentence).strip()
+                    if len(sentence) < 5:
+                        continue
+
+                    yield list(map(int, lobj.bbox)), sentence
--- a/pdm.lock	Wed Jul 03 19:16:28 2024 -0700
+++ b/pdm.lock	Wed Jul 03 20:19:57 2024 -0700
@@ -5,7 +5,7 @@
 groups = ["default"]
 strategy = ["cross_platform", "inherit_metadata"]
 lock_version = "4.4.1"
-content_hash = "sha256:08218e73ce6f19b3ca692a54ab477662493547e84bd56fc549d88c8933ce94d9"
+content_hash = "sha256:61f034c69e8959c0e345276733b3e11679a2d6c4f8b4d80e6323d00d0baaae0e"
 
 [[package]]
 name = "absl-py"
@@ -104,6 +104,20 @@
 ]
 
 [[package]]
+name = "click"
+version = "8.1.7"
+requires_python = ">=3.7"
+summary = "Composable command line interface toolkit"
+groups = ["default"]
+dependencies = [
+    "colorama; platform_system == \"Windows\"",
+]
+files = [
+    {file = "click-8.1.7-py3-none-any.whl", hash = "sha256:ae74fb96c20a0277a1d615f1e4d73c8414f5a98db8b799a7931d1582f3390c28"},
+    {file = "click-8.1.7.tar.gz", hash = "sha256:ca9853ad459e787e2192211578cc907e7594e294c7ccc834310722b41b9ca6de"},
+]
+
+[[package]]
 name = "colorama"
 version = "0.4.6"
 requires_python = "!=3.0.*,!=3.1.*,!=3.2.*,!=3.3.*,!=3.4.*,!=3.5.*,!=3.6.*,>=2.7"
@@ -453,6 +467,17 @@
 ]
 
 [[package]]
+name = "joblib"
+version = "1.4.2"
+requires_python = ">=3.8"
+summary = "Lightweight pipelining with Python functions"
+groups = ["default"]
+files = [
+    {file = "joblib-1.4.2-py3-none-any.whl", hash = "sha256:06d478d5674cbc267e7496a410ee875abd68e4340feff4490bcb7afb88060ae6"},
+    {file = "joblib-1.4.2.tar.gz", hash = "sha256:2382c5816b2636fbd20a09e0f4e9dad4736765fdfb7dca582943b9c1366b3f0e"},
+]
+
+[[package]]
 name = "markdown-it-py"
 version = "3.0.0"
 requires_python = ">=3.8"
@@ -615,6 +640,23 @@
 ]
 
 [[package]]
+name = "nltk"
+version = "3.8.1"
+requires_python = ">=3.7"
+summary = "Natural Language Toolkit"
+groups = ["default"]
+dependencies = [
+    "click",
+    "joblib",
+    "regex>=2021.8.3",
+    "tqdm",
+]
+files = [
+    {file = "nltk-3.8.1-py3-none-any.whl", hash = "sha256:fd5c9109f976fa86bcadba8f91e47f5e9293bd034474752e92a520f81c93dda5"},
+    {file = "nltk-3.8.1.zip", hash = "sha256:1834da3d0682cba4f2cede2f9aad6b0fafb6461ba451db0efb6f9c39798d64d3"},
+]
+
+[[package]]
 name = "numpy"
 version = "1.26.4"
 requires_python = ">=3.9"
--- a/pyproject.toml	Wed Jul 03 19:16:28 2024 -0700
+++ b/pyproject.toml	Wed Jul 03 20:19:57 2024 -0700
@@ -12,6 +12,7 @@
     "pymilvus[model]>=2.4.4",
     "ipdb>=0.13.13",
     "flax>=0.8.5",
+    "nltk>=3.8.1",
 ]
 requires-python = ">=3.11"
 readme = "README.md"