Mercurial > code > home > repos > sco-bot
view search/extract_pdf.py @ 8:f23b21bd0fce
apex search
author | drewp@bigasterisk.com |
---|---|
date | Sun, 07 Jul 2024 16:26:56 -0700 |
parents | 0e33c65f1904 |
children |
line wrap: on
line source
import re
from pathlib import Path
from typing import Iterable, List, Tuple

import nltk
from pdfminer.converter import PDFPageAggregator
from pdfminer.layout import LAParams, LTTextBox
from pdfminer.pdfinterp import PDFPageInterpreter, PDFResourceManager
from pdfminer.pdfpage import PDFPage

# Compiled once at import time; both patterns are applied to every sentence.
_WHITESPACE_RUN = re.compile(r'\s+')
_FIVE_WORD_CHARS = re.compile(r'\w\w\w\w\w')

# Sentences shorter than this (after whitespace normalization) are dropped.
_MIN_SENTENCE_LEN = 5


def files() -> Iterable[Path]:
    """Yield every ``*.pdf`` path directly inside the relative ``data`` directory.

    The directory is resolved against the current working directory and is
    not searched recursively.
    """
    yield from Path('data').glob('*.pdf')


def phrasesFromFile(p: Path) -> Iterable[Tuple[List[int], str]]:
    """Yield ``(bbox, sentence)`` pairs for the text found in the PDF at *p*.

    Each page is laid out with pdfminer, and the text of every ``LTTextBox``
    is split into sentences with ``nltk.sent_tokenize``. Whitespace runs in a
    sentence are collapsed to single spaces and the ends are stripped. A
    sentence is skipped when it is shorter than 5 characters or contains no
    run of 5 consecutive word characters.

    Args:
        p: Path of the PDF file to read.

    Yields:
        A 2-tuple of:
          * the enclosing text box's ``bbox`` with each coordinate passed
            through ``int()`` (presumably ``[x0, y0, x1, y1]`` in PDF points;
            confirm against the pdfminer docs). All sentences from one text
            box share the same bbox.
          * the normalized sentence text.

    Raises:
        OSError: if *p* cannot be opened for reading.

    Note:
        This is a generator, so the file stays open while iteration is in
        progress. It is closed when the generator is exhausted, explicitly
        closed, or when an exception propagates out of it.
    """
    # The `with` block guarantees the handle is released; previously the file
    # object was opened and never closed.
    with open(p, 'rb') as fp:
        rsrcmgr = PDFResourceManager()
        laparams = LAParams()
        device = PDFPageAggregator(rsrcmgr, laparams=laparams)
        interpreter = PDFPageInterpreter(rsrcmgr, device)

        for page in PDFPage.get_pages(fp):
            interpreter.process_page(page)
            # The aggregator holds the layout of the page just processed.
            layout = device.get_result()
            for lobj in layout:
                if not isinstance(lobj, LTTextBox):
                    continue
                bbox = list(map(int, lobj.bbox))
                for sentence in nltk.sent_tokenize(lobj.get_text()):
                    sentence = _WHITESPACE_RUN.sub(' ', sentence).strip()
                    if len(sentence) < _MIN_SENTENCE_LEN:
                        continue
                    # Require at least one 5-character word-like run to
                    # filter out fragments such as page numbers or bullets.
                    if not _FIVE_WORD_CHARS.search(sentence):
                        continue
                    # Copy the bbox so each yielded pair owns its own list,
                    # matching the original's fresh list per sentence.
                    yield list(bbox), sentence