# HG changeset patch # User drewp@bigasterisk.com # Date 1720063197 25200 # Node ID c2176e9a26964592edb686ddbbe74ec3f4a9249a # Parent ca5da75f03ee1c24d1326c509400adfc317b404d split by sentences diff -r ca5da75f03ee -r c2176e9a2696 extract_pdf.py --- a/extract_pdf.py Wed Jul 03 19:16:28 2024 -0700 +++ b/extract_pdf.py Wed Jul 03 20:19:57 2024 -0700 @@ -1,13 +1,12 @@ +import re from pathlib import Path -from pprint import pprint -from typing import Iterable, Sequence +from typing import Iterable + +import nltk +from pdfminer.converter import PDFPageAggregator from pdfminer.layout import LAParams, LTTextBox +from pdfminer.pdfinterp import PDFPageInterpreter, PDFResourceManager from pdfminer.pdfpage import PDFPage -from pdfminer.pdfinterp import PDFResourceManager -from pdfminer.pdfinterp import PDFPageInterpreter -from pdfminer.converter import PDFPageAggregator - - def phrasesFromFile(p: Path) -> Iterable: @@ -24,5 +23,9 @@ for lobj in layout: if isinstance(lobj, LTTextBox): text = lobj.get_text() - - yield list(map(int, lobj.bbox)), text \ No newline at end of file + for sentence in nltk.sent_tokenize(text): + sentence = re.sub(r'\s+', ' ', sentence).strip() + if len(sentence) < 5: + continue + + yield list(map(int, lobj.bbox)), sentence diff -r ca5da75f03ee -r c2176e9a2696 pdm.lock --- a/pdm.lock Wed Jul 03 19:16:28 2024 -0700 +++ b/pdm.lock Wed Jul 03 20:19:57 2024 -0700 @@ -5,7 +5,7 @@ groups = ["default"] strategy = ["cross_platform", "inherit_metadata"] lock_version = "4.4.1" -content_hash = "sha256:08218e73ce6f19b3ca692a54ab477662493547e84bd56fc549d88c8933ce94d9" +content_hash = "sha256:61f034c69e8959c0e345276733b3e11679a2d6c4f8b4d80e6323d00d0baaae0e" [[package]] name = "absl-py" @@ -104,6 +104,20 @@ ] [[package]] +name = "click" +version = "8.1.7" +requires_python = ">=3.7" +summary = "Composable command line interface toolkit" +groups = ["default"] +dependencies = [ + "colorama; platform_system == \"Windows\"", +] +files = [ + {file = "click-8.1.7-py3-none-any.whl", hash = "sha256:ae74fb96c20a0277a1d615f1e4d73c8414f5a98db8b799a7931d1582f3390c28"}, + {file = "click-8.1.7.tar.gz", hash = "sha256:ca9853ad459e787e2192211578cc907e7594e294c7ccc834310722b41b9ca6de"}, +] + +[[package]] name = "colorama" version = "0.4.6" requires_python = "!=3.0.*,!=3.1.*,!=3.2.*,!=3.3.*,!=3.4.*,!=3.5.*,!=3.6.*,>=2.7" @@ -453,6 +467,17 @@ ] [[package]] +name = "joblib" +version = "1.4.2" +requires_python = ">=3.8" +summary = "Lightweight pipelining with Python functions" +groups = ["default"] +files = [ + {file = "joblib-1.4.2-py3-none-any.whl", hash = "sha256:06d478d5674cbc267e7496a410ee875abd68e4340feff4490bcb7afb88060ae6"}, + {file = "joblib-1.4.2.tar.gz", hash = "sha256:2382c5816b2636fbd20a09e0f4e9dad4736765fdfb7dca582943b9c1366b3f0e"}, +] + +[[package]] name = "markdown-it-py" version = "3.0.0" requires_python = ">=3.8" @@ -615,6 +640,23 @@ ] [[package]] +name = "nltk" +version = "3.8.1" +requires_python = ">=3.7" +summary = "Natural Language Toolkit" +groups = ["default"] +dependencies = [ + "click", + "joblib", + "regex>=2021.8.3", + "tqdm", +] +files = [ + {file = "nltk-3.8.1-py3-none-any.whl", hash = "sha256:fd5c9109f976fa86bcadba8f91e47f5e9293bd034474752e92a520f81c93dda5"}, + {file = "nltk-3.8.1.zip", hash = "sha256:1834da3d0682cba4f2cede2f9aad6b0fafb6461ba451db0efb6f9c39798d64d3"}, +] + +[[package]] name = "numpy" version = "1.26.4" requires_python = ">=3.9" diff -r ca5da75f03ee -r c2176e9a2696 pyproject.toml --- a/pyproject.toml Wed Jul 03 19:16:28 2024 -0700 +++ b/pyproject.toml Wed Jul 03 20:19:57 2024 -0700 @@ -12,6 +12,7 @@ "pymilvus[model]>=2.4.4", "ipdb>=0.13.13", "flax>=0.8.5", + "nltk>=3.8.1", ] requires-python = ">=3.11" readme = "README.md"