Mercurial > code > home > repos > sco-bot
diff extract_pdf.py @ 1:c2176e9a2696
split by sentences
author | drewp@bigasterisk.com |
---|---|
date | Wed, 03 Jul 2024 20:19:57 -0700 |
parents | ca5da75f03ee |
children |
line wrap: on
line diff
--- a/extract_pdf.py Wed Jul 03 19:16:28 2024 -0700
+++ b/extract_pdf.py Wed Jul 03 20:19:57 2024 -0700
@@ -1,13 +1,12 @@
+import re
 from pathlib import Path
-from pprint import pprint
-from typing import Iterable, Sequence
+from typing import Iterable
+
+import nltk
+from pdfminer.converter import PDFPageAggregator
 from pdfminer.layout import LAParams, LTTextBox
+from pdfminer.pdfinterp import PDFPageInterpreter, PDFResourceManager
 from pdfminer.pdfpage import PDFPage
-from pdfminer.pdfinterp import PDFResourceManager
-from pdfminer.pdfinterp import PDFPageInterpreter
-from pdfminer.converter import PDFPageAggregator
-
-
 
 
 def phrasesFromFile(p: Path) -> Iterable:
@@ -24,5 +23,9 @@
         for lobj in layout:
             if isinstance(lobj, LTTextBox):
                 text = lobj.get_text()
-
-                yield list(map(int, lobj.bbox)), text
\ No newline at end of file
+                for sentence in nltk.sent_tokenize(text):
+                    sentence = re.sub(r'\s+', ' ', sentence).strip()
+                    if len(sentence) < 5:
+                        continue
+
+                    yield list(map(int, lobj.bbox)), sentence