Mercurial > code > home > repos > sco-bot
view search/extract_pdf.py @ 4:0e33c65f1904
playing with extractors
author | drewp@bigasterisk.com |
---|---|
date | Sat, 06 Jul 2024 16:42:36 -0700 |
parents | extract_pdf.py@c2176e9a2696 |
children | f23b21bd0fce |
line wrap: on
line source
import re
from pathlib import Path
from typing import Iterable

import nltk
from pdfminer.converter import PDFPageAggregator
from pdfminer.layout import LAParams, LTTextBox
from pdfminer.pdfinterp import PDFPageInterpreter, PDFResourceManager
from pdfminer.pdfpage import PDFPage

# Compiled once at import time so the per-sentence loop does not repeat the
# pattern lookup for every sentence of every text box.
_WHITESPACE_RUN = re.compile(r'\s+')
_FIVE_WORD_CHARS = re.compile(r'\w\w\w\w\w')


def phrasesFromFile(p: Path) -> Iterable:
    """Yield ``(bbox, sentence)`` pairs for the text found in the PDF at *p*.

    Each page is laid out with pdfminer; every ``LTTextBox`` on the page has
    its text split into sentences with ``nltk.sent_tokenize``.  For each
    sentence, runs of whitespace are collapsed to a single space and the
    result is stripped.  Sentences shorter than 5 characters, or containing
    no run of 5 consecutive word characters, are skipped.

    Args:
        p: Path of the PDF file to read.

    Yields:
        ``(bbox, sentence)`` where ``bbox`` is the text box's ``bbox`` with
        each component converted via ``int()`` and collected in a list, and
        ``sentence`` is the normalized sentence string.  All sentences from
        the same text box share equal ``bbox`` values.

    This is a generator: the file is opened on the first ``next()`` and is
    closed when iteration finishes, when the generator is closed, or when an
    error propagates out of it.
    """
    rsrcmgr = PDFResourceManager()
    laparams = LAParams()
    device = PDFPageAggregator(rsrcmgr, laparams=laparams)
    interpreter = PDFPageInterpreter(rsrcmgr, device)

    # `with` guarantees the handle is released even if the consumer abandons
    # the generator early or pdfminer raises part-way through the document.
    with open(p, 'rb') as fp:
        for page in PDFPage.get_pages(fp):
            interpreter.process_page(page)
            layout = device.get_result()
            for lobj in layout:
                if not isinstance(lobj, LTTextBox):
                    continue
                text = lobj.get_text()
                for sentence in nltk.sent_tokenize(text):
                    sentence = _WHITESPACE_RUN.sub(' ', sentence).strip()
                    # Cheap length check first; the regex check below then
                    # requires 5 consecutive word characters.
                    if len(sentence) < 5:
                        continue
                    if not _FIVE_WORD_CHARS.search(sentence):
                        continue
                    yield list(map(int, lobj.bbox)), sentence