diff search/extract_pdf.py @ 8:f23b21bd0fce

apex search
author drewp@bigasterisk.com
date Sun, 07 Jul 2024 16:26:56 -0700
parents 0e33c65f1904
children
line wrap: on
line diff
--- a/search/extract_pdf.py	Sat Jul 06 16:45:19 2024 -0700
+++ b/search/extract_pdf.py	Sun Jul 07 16:26:56 2024 -0700
@@ -8,6 +8,9 @@
 from pdfminer.pdfinterp import PDFPageInterpreter, PDFResourceManager
 from pdfminer.pdfpage import PDFPage
 
+def files() -> Iterable[Path]:  # Generator: yields each path matching '*.pdf' directly inside the relative 'data' directory.
+    for p in Path('data').glob('*.pdf'):  # pattern is non-recursive; 'data' is a relative path, so it resolves against the current working directory
+        yield p
 
 def phrasesFromFile(p: Path) -> Iterable:
     fp = open(p, 'rb')