diff webvtt_1.py @ 4:0e33c65f1904

playing with extractors
author drewp@bigasterisk.com
date Sat, 06 Jul 2024 16:42:36 -0700
parents ba1ce5921a4b
children
line wrap: on
line diff
--- a/webvtt_1.py	Wed Jul 03 20:20:18 2024 -0700
+++ b/webvtt_1.py	Sat Jul 06 16:42:36 2024 -0700
@@ -15,6 +15,7 @@
 #     print(caption.end)
 #     print(caption.text)
 
+# See https://stackoverflow.com/questions/51784232/how-do-i-convert-the-webvtt-format-to-plain-text
 transcript = ""
 
 lines = []
@@ -32,10 +33,18 @@
     transcript += " " + line
     previous = line
 
-print(transcript)
+#print(transcript)
 
 # text = lobj.get_text()
 #                 for sentence in nltk.sent_tokenize(text):
 #                     sentence = re.sub(r'\s+', ' ', sentence).strip()
 #                     if len(sentence) < 5:
-#                         continue
\ No newline at end of file
+#                         continue
+
+from langchain.text_splitter import RecursiveCharacterTextSplitter  # Python import path; no trailing `;`
+
+text = transcript  # plain-text transcript accumulated from the caption lines above
+splitter = RecursiveCharacterTextSplitter(  # Python call syntax: no `const`/`new`, keyword args instead of an options object
+    chunk_size=10,  # Python spelling of JS `chunkSize`; length is measured with len() by default
+    chunk_overlap=1,  # Python spelling of JS `chunkOverlap`
+)
\ No newline at end of file