Mercurial > code > home > repos > sco-bot
diff search/extract_pdf.py @ 4:0e33c65f1904
playing with extractors
author | drewp@bigasterisk.com |
---|---|
date | Sat, 06 Jul 2024 16:42:36 -0700 |
parents | extract_pdf.py@c2176e9a2696 |
children | f23b21bd0fce |
line wrap: on
line diff
# search/extract_pdf.py -- added as a new file by this changeset
# (diff was: --- /dev/null  +++ b/search/extract_pdf.py  @@ -0,0 +1,33 @@;
# the "+" markers are stripped and the line breaks restored here).
"""Pull sentence-sized phrases, with their text-box bounding boxes, out of PDFs."""
import re
from pathlib import Path
from typing import Iterable

import nltk
from pdfminer.converter import PDFPageAggregator
from pdfminer.layout import LAParams, LTTextBox
from pdfminer.pdfinterp import PDFPageInterpreter, PDFResourceManager
from pdfminer.pdfpage import PDFPage

# Compiled once at import time instead of being looked up per sentence.
_WHITESPACE_RUN = re.compile(r'\s+')
_FIVE_WORD_CHARS = re.compile(r'\w\w\w\w\w')


def phrasesFromFile(p: Path) -> Iterable:
    """Yield ``(bbox, sentence)`` pairs for the sentences found in a PDF.

    Every page is laid out with pdfminer; each ``LTTextBox`` on the page has
    its text split into sentences by ``nltk.sent_tokenize``.  Whitespace runs
    inside a sentence are collapsed to single spaces and the ends stripped.
    Sentences shorter than 5 characters, or without at least 5 consecutive
    word characters, are skipped.

    This is a generator: the PDF is opened when iteration starts and is
    closed when the generator is exhausted, closed, or raises.

    Args:
        p: Path of the PDF file to read.

    Yields:
        A 2-tuple ``(bbox, sentence)``.  ``bbox`` is a new list holding the
        text box's ``bbox`` values converted with ``int()`` (so fractional
        parts are truncated) -- presumably ``[x0, y0, x1, y1]`` in page
        coordinates; confirm against the pdfminer docs.  All sentences from
        the same text box carry equal ``bbox`` values.  ``sentence`` is the
        cleaned-up sentence text.
    """
    rsrcmgr = PDFResourceManager()
    device = PDFPageAggregator(rsrcmgr, laparams=LAParams())
    interpreter = PDFPageInterpreter(rsrcmgr, device)

    # The original never closed this handle.  Keeping the whole page loop
    # inside the ``with`` releases it on every exit path, including a caller
    # that stops iterating early (generator close unwinds the ``with``).
    with open(p, 'rb') as fp:
        for page in PDFPage.get_pages(fp):
            interpreter.process_page(page)
            # After process_page() the aggregator holds this page's layout.
            for lobj in device.get_result():
                if not isinstance(lobj, LTTextBox):
                    continue

                for sentence in nltk.sent_tokenize(lobj.get_text()):
                    sentence = _WHITESPACE_RUN.sub(' ', sentence).strip()
                    # Cheap length test first; the regex then rejects
                    # fragments with no run of 5 word characters.
                    if len(sentence) < 5:
                        continue
                    if not _FIVE_WORD_CHARS.search(sentence):
                        continue

                    # Fresh list per yield so callers may mutate it freely.
                    yield list(map(int, lobj.bbox)), sentence