Mercurial > code > home > repos > sco-bot
view extract_pdf.py @ 1:c2176e9a2696
split by sentences
author | drewp@bigasterisk.com |
---|---|
date | Wed, 03 Jul 2024 20:19:57 -0700 |
parents | ca5da75f03ee |
children |
line wrap: on
line source
import re
from pathlib import Path
from typing import Iterable

import nltk
from pdfminer.converter import PDFPageAggregator
from pdfminer.layout import LAParams, LTTextBox
from pdfminer.pdfinterp import PDFPageInterpreter, PDFResourceManager
from pdfminer.pdfpage import PDFPage


def phrasesFromFile(p: Path) -> Iterable:
    """Yield the sentences found in the text boxes of a PDF file.

    Each page is laid out with pdfminer; every ``LTTextBox`` on the page is
    split into sentences with ``nltk.sent_tokenize``. Whitespace runs inside
    a sentence (including line breaks) are collapsed to single spaces.

    Args:
        p: Path of the PDF file to read.

    Yields:
        ``(bbox, sentence)`` pairs, where ``bbox`` is the text box's
        ``bbox`` with each coordinate truncated to ``int`` and returned as a
        list, and ``sentence`` is the normalized sentence text. All sentences
        from the same text box share that box's bbox.

    Note:
        This is a generator: the file is opened on the first iteration and
        closed when the generator is exhausted, closed, or raises.
    """
    rsrcmgr = PDFResourceManager()
    laparams = LAParams()
    device = PDFPageAggregator(rsrcmgr, laparams=laparams)
    interpreter = PDFPageInterpreter(rsrcmgr, device)

    # The context manager guarantees the handle is released even when the
    # consumer stops iterating early or parsing fails part-way through.
    with open(p, 'rb') as fp:
        for page in PDFPage.get_pages(fp):
            interpreter.process_page(page)
            layout = device.get_result()
            for lobj in layout:
                if not isinstance(lobj, LTTextBox):
                    continue
                bbox = list(map(int, lobj.bbox))
                for sentence in nltk.sent_tokenize(lobj.get_text()):
                    sentence = re.sub(r'\s+', ' ', sentence).strip()
                    # Drop fragments shorter than 5 characters
                    # (presumably page numbers / stray marks -- confirm).
                    if len(sentence) < 5:
                        continue
                    # Fresh list per item so callers can mutate one result
                    # without affecting the others from the same text box.
                    yield list(bbox), sentence