view search/extract_pdf.py @ 8:f23b21bd0fce

apex search
author drewp@bigasterisk.com
date Sun, 07 Jul 2024 16:26:56 -0700
parents 0e33c65f1904
children
line wrap: on
line source

import re
from pathlib import Path
from typing import Iterable

import nltk
from pdfminer.converter import PDFPageAggregator
from pdfminer.layout import LAParams, LTTextBox
from pdfminer.pdfinterp import PDFPageInterpreter, PDFResourceManager
from pdfminer.pdfpage import PDFPage

def files() -> Iterable[Path]:
    """Yield the path of every `*.pdf` entry directly inside `data/`.

    The `data` directory is resolved relative to the current working
    directory, and paths are produced in whatever order `Path.glob`
    returns them.
    """
    yield from Path('data').glob('*.pdf')

def phrasesFromFile(p: Path) -> Iterable:
    """Yield `(bbox, sentence)` pairs for the sentences found in a PDF.

    Each page of the PDF at `p` is laid out with pdfminer; every
    `LTTextBox` on the page has its text split into sentences with
    `nltk.sent_tokenize`. Whitespace runs inside a sentence are collapsed
    to single spaces, and sentences that are too short to be useful are
    dropped.

    Args:
        p: Path of the PDF file to read.

    Yields:
        A 2-tuple of:
          * the enclosing text box's `bbox` values, each truncated to an
            `int`, as a list (all sentences from one box share the same
            bbox);
          * the cleaned-up sentence text.

    Raises:
        OSError: if `p` cannot be opened (raised on first iteration, since
            this is a generator).
    """
    rsrcmgr = PDFResourceManager()
    device = PDFPageAggregator(rsrcmgr, laparams=LAParams())
    interpreter = PDFPageInterpreter(rsrcmgr, device)

    # Keep the file open only for as long as pages are being read; the
    # `with` block closes it when the generator finishes, is closed early
    # by the caller, or fails with an exception.
    with open(p, 'rb') as fp:
        for page in PDFPage.get_pages(fp):
            interpreter.process_page(page)
            layout = device.get_result()
            for lobj in layout:
                if not isinstance(lobj, LTTextBox):
                    continue
                bbox = list(map(int, lobj.bbox))
                for sentence in nltk.sent_tokenize(lobj.get_text()):
                    sentence = re.sub(r'\s+', ' ', sentence).strip()
                    # Skip fragments: fewer than 5 characters overall, or
                    # no run of 5 consecutive word characters.
                    if len(sentence) < 5 or not re.search(r'\w\w\w\w\w', sentence):
                        continue

                    # Fresh list per item so callers can mutate it safely.
                    yield list(bbox), sentence