comparison search/extract_pdf.py @ 8:f23b21bd0fce

apex search
author drewp@bigasterisk.com
date Sun, 07 Jul 2024 16:26:56 -0700
parents 0e33c65f1904
children
comparison
equal deleted inserted replaced
7:53ae53f7d1b3 8:f23b21bd0fce
6 from pdfminer.converter import PDFPageAggregator 6 from pdfminer.converter import PDFPageAggregator
7 from pdfminer.layout import LAParams, LTTextBox 7 from pdfminer.layout import LAParams, LTTextBox
8 from pdfminer.pdfinterp import PDFPageInterpreter, PDFResourceManager 8 from pdfminer.pdfinterp import PDFPageInterpreter, PDFResourceManager
9 from pdfminer.pdfpage import PDFPage 9 from pdfminer.pdfpage import PDFPage
10 10
11 def files() -> Iterable[Path]:
12 for p in Path('data').glob('*.pdf'):
13 yield p
11 14
12 def phrasesFromFile(p: Path) -> Iterable: 15 def phrasesFromFile(p: Path) -> Iterable:
13 fp = open(p, 'rb') 16 fp = open(p, 'rb')
14 rsrcmgr = PDFResourceManager() 17 rsrcmgr = PDFResourceManager()
15 laparams = LAParams() 18 laparams = LAParams()