Mercurial > code > home > repos > sco-bot
comparison search/extract_pdf.py @ 4:0e33c65f1904
playing with extractors
author | drewp@bigasterisk.com |
---|---|
date | Sat, 06 Jul 2024 16:42:36 -0700 |
parents | extract_pdf.py@c2176e9a2696 |
children | f23b21bd0fce |
comparison
equal
deleted
inserted
replaced
3:ba1ce5921a4b | 4:0e33c65f1904 |
---|---|
1 import re | |
2 from pathlib import Path | |
3 from typing import Iterable | |
4 | |
5 import nltk | |
6 from pdfminer.converter import PDFPageAggregator | |
7 from pdfminer.layout import LAParams, LTTextBox | |
8 from pdfminer.pdfinterp import PDFPageInterpreter, PDFResourceManager | |
9 from pdfminer.pdfpage import PDFPage | |
10 | |
11 | |
def phrasesFromFile(p: Path) -> Iterable:
    """Yield ``(bbox, sentence)`` pairs for the text found in the PDF at *p*.

    Each page is laid out with pdfminer; every ``LTTextBox`` on the page has
    its text split into sentences by ``nltk.sent_tokenize``. Runs of
    whitespace inside a sentence are collapsed to a single space and the
    result is stripped.

    A sentence is skipped when it is shorter than 5 characters or when it
    contains no run of five consecutive word characters.

    Args:
        p: Path of the PDF file to read.

    Yields:
        A 2-tuple of ``list(map(int, lobj.bbox))`` -- the text box's bounding
        box with each coordinate converted by ``int()`` -- and the cleaned
        sentence string. All sentences from one text box share that box's
        bbox.

    The file stays open while the generator is being consumed and is closed
    when the generator is exhausted or closed.
    """
    # Compiled once up front; the patterns are the same ones the loop used inline.
    whitespace_run = re.compile(r'\s+')
    five_word_chars = re.compile(r'\w\w\w\w\w')

    # `with` releases the file handle even if the consumer stops iterating
    # early (generator close) or pdfminer/nltk raises mid-document.
    with open(p, 'rb') as fp:
        rsrcmgr = PDFResourceManager()
        laparams = LAParams()
        device = PDFPageAggregator(rsrcmgr, laparams=laparams)
        interpreter = PDFPageInterpreter(rsrcmgr, device)

        for page in PDFPage.get_pages(fp):
            interpreter.process_page(page)
            # The aggregator holds the layout of the page just processed.
            layout = device.get_result()
            for lobj in layout:
                if not isinstance(lobj, LTTextBox):
                    continue
                text = lobj.get_text()
                for sentence in nltk.sent_tokenize(text):
                    sentence = whitespace_run.sub(' ', sentence).strip()
                    if len(sentence) < 5:
                        continue
                    if not five_word_chars.search(sentence):
                        continue

                    yield list(map(int, lobj.bbox)), sentence