comparison search/extract_pdf.py @ 4:0e33c65f1904

playing with extractors
author drewp@bigasterisk.com
date Sat, 06 Jul 2024 16:42:36 -0700
parents extract_pdf.py@c2176e9a2696
children f23b21bd0fce
comparison
equal deleted inserted replaced
3:ba1ce5921a4b 4:0e33c65f1904
1 import re
2 from pathlib import Path
3 from typing import Iterable
4
5 import nltk
6 from pdfminer.converter import PDFPageAggregator
7 from pdfminer.layout import LAParams, LTTextBox
8 from pdfminer.pdfinterp import PDFPageInterpreter, PDFResourceManager
9 from pdfminer.pdfpage import PDFPage
10
11
# Compiled once at import time; both patterns are applied to every sentence.
_WHITESPACE_RUN = re.compile(r'\s+')
_FIVE_WORD_CHARS = re.compile(r'\w\w\w\w\w')


def phrasesFromFile(p: Path) -> Iterable:
    """Yield ``(bbox, sentence)`` pairs for the text found in a PDF file.

    Each page of the PDF at ``p`` is run through pdfminer's layout
    analysis. For every ``LTTextBox`` on the page, the box text is split
    into sentences with ``nltk.sent_tokenize``. Whitespace runs inside a
    sentence are collapsed to a single space and the ends are stripped.

    Sentences shorter than 5 characters, or without a run of at least five
    consecutive word characters, are skipped.

    Args:
        p: Path of the PDF file to read.

    Yields:
        A 2-tuple of the text box's ``bbox`` with every value converted to
        ``int`` (as a list), and the cleaned sentence string. All sentences
        from one text box share that box's bbox.

    Raises:
        OSError: if ``p`` cannot be opened (raised on first iteration,
            since this is a generator).
    """
    # The file must stay open while pages are parsed lazily; the ``with``
    # block closes it when the generator is exhausted, closed, or collected.
    with open(p, 'rb') as fp:
        rsrcmgr = PDFResourceManager()
        device = PDFPageAggregator(rsrcmgr, laparams=LAParams())
        interpreter = PDFPageInterpreter(rsrcmgr, device)

        for page in PDFPage.get_pages(fp):
            interpreter.process_page(page)
            layout = device.get_result()
            for lobj in layout:
                if not isinstance(lobj, LTTextBox):
                    continue
                bbox = list(map(int, lobj.bbox))
                for sentence in nltk.sent_tokenize(lobj.get_text()):
                    sentence = _WHITESPACE_RUN.sub(' ', sentence).strip()
                    # Cheap length check first, then require something
                    # word-like so bare numbers/punctuation are dropped.
                    if len(sentence) < 5:
                        continue
                    if not _FIVE_WORD_CHARS.search(sentence):
                        continue

                    # Fresh list per yield so callers may mutate it safely.
                    yield list(bbox), sentence