annotate search/extract_pdf.py @ 8:f23b21bd0fce

apex search
author drewp@bigasterisk.com
date Sun, 07 Jul 2024 16:26:56 -0700
parents 0e33c65f1904
children
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
1
c2176e9a2696 split by sentences
drewp@bigasterisk.com
parents: 0
diff changeset
1 import re
0
drewp@bigasterisk.com
parents:
diff changeset
2 from pathlib import Path
1
c2176e9a2696 split by sentences
drewp@bigasterisk.com
parents: 0
diff changeset
3 from typing import Iterable
c2176e9a2696 split by sentences
drewp@bigasterisk.com
parents: 0
diff changeset
4
c2176e9a2696 split by sentences
drewp@bigasterisk.com
parents: 0
diff changeset
5 import nltk
c2176e9a2696 split by sentences
drewp@bigasterisk.com
parents: 0
diff changeset
6 from pdfminer.converter import PDFPageAggregator
0
drewp@bigasterisk.com
parents:
diff changeset
7 from pdfminer.layout import LAParams, LTTextBox
1
c2176e9a2696 split by sentences
drewp@bigasterisk.com
parents: 0
diff changeset
8 from pdfminer.pdfinterp import PDFPageInterpreter, PDFResourceManager
0
drewp@bigasterisk.com
parents:
diff changeset
9 from pdfminer.pdfpage import PDFPage
drewp@bigasterisk.com
parents:
diff changeset
10
8
f23b21bd0fce apex search
drewp@bigasterisk.com
parents: 4
diff changeset
def files() -> Iterable[Path]:
    """Yield the path of every ``*.pdf`` directly inside the ``data`` directory.

    ``data`` is resolved relative to the current working directory, and
    the search is not recursive.
    """
    yield from Path('data').glob('*.pdf')
0
drewp@bigasterisk.com
parents:
diff changeset
14
drewp@bigasterisk.com
parents:
diff changeset
def phrasesFromFile(p: Path) -> Iterable:
    """Yield ``(bbox, sentence)`` pairs for the text boxes of the PDF at ``p``.

    Each page is laid out with pdfminer, and the text of every
    ``LTTextBox`` on it is split into sentences with
    ``nltk.sent_tokenize``. Runs of whitespace in a sentence are collapsed
    to a single space and the result is stripped. Sentences shorter than
    five characters, or without five consecutive word characters, are
    skipped.

    Args:
        p: Path of the PDF file to read.

    Yields:
        A 2-tuple of the text box's ``bbox`` with each coordinate
        truncated to ``int`` (as a new list per sentence), and the
        cleaned sentence string.

    Raises:
        OSError: if ``p`` cannot be opened for reading.
    """
    rsrcmgr = PDFResourceManager()
    device = PDFPageAggregator(rsrcmgr, laparams=LAParams())
    interpreter = PDFPageInterpreter(rsrcmgr, device)

    # Keep the file open for as long as pages are being pulled from it, and
    # close it when the generator is exhausted, closed, or fails. The
    # original left the handle open indefinitely.
    with open(p, 'rb') as fp:
        for page in PDFPage.get_pages(fp):
            interpreter.process_page(page)
            for lobj in device.get_result():
                if not isinstance(lobj, LTTextBox):
                    continue
                for sentence in nltk.sent_tokenize(lobj.get_text()):
                    sentence = re.sub(r'\s+', ' ', sentence).strip()
                    if len(sentence) < 5:
                        continue
                    # Drop fragments with no run of five word characters
                    # (e.g. page numbers, stray punctuation).
                    if not re.search(r'\w{5}', sentence):
                        continue

                    yield list(map(int, lobj.bbox)), sentence