sco-bot: webvtt_1.py comparison

playing with extractors

comparison

equal deleted inserted replaced

-:ba1ce5921a4b
+:0e33c65f1904
 # for caption in vtt.iter_slice(start='00:01:11.000', end='00:02:27.000'):
 #     print(caption.start)
 #     print(caption.end)
 #     print(caption.text)
+##https://stackoverflow.com/questions/51784232/how-do-i-convert-the-webvtt-format-to-plain-text
 transcript = ""
 lines = []
 for line in vtt:
 # Strip the newlines from the end of the text.
 if line == previous:
 continue
 transcript += " " + line
 previous = line
-print(transcript)
+#print(transcript)
 # text = lobj.get_text()
 #                 for sentence in nltk.sent_tokenize(text):
 #                     sentence = re.sub(r'\s+', ' ', sentence).strip()
 #                     if len(sentence) < 5:
 #                         continue
+# Build a text splitter over the de-duplicated transcript assembled above.
+# The previous version of these lines used JavaScript syntax (const / new /
+# object literal), which is a SyntaxError in a Python module; this is the
+# equivalent Python LangChain API.
+from langchain.text_splitter import RecursiveCharacterTextSplitter
+text = transcript
+# chunk_size / chunk_overlap are counted with len() (characters) by default.
+# NOTE(review): 10 / 1 is very small for spoken-word text -- confirm intended values.
+splitter = RecursiveCharacterTextSplitter(
+    chunk_size=10,
+    chunk_overlap=1,
+)

Mercurial > code > home > repos > sco-bot