comparison webvtt_1.py @ 4:0e33c65f1904

playing with extractors
author drewp@bigasterisk.com
date Sat, 06 Jul 2024 16:42:36 -0700
parents ba1ce5921a4b
children
comparison
legend: equal | deleted | inserted | replaced
left: 3:ba1ce5921a4b (parent) | right: 4:0e33c65f1904 (this changeset)
13 # for caption in vtt.iter_slice(start='00:01:11.000', end='00:02:27.000'): 13 # for caption in vtt.iter_slice(start='00:01:11.000', end='00:02:27.000'):
14 # print(caption.start) 14 # print(caption.start)
15 # print(caption.end) 15 # print(caption.end)
16 # print(caption.text) 16 # print(caption.text)
17 17
18 ##https://stackoverflow.com/questions/51784232/how-do-i-convert-the-webvtt-format-to-plain-text
18 transcript = "" 19 transcript = ""
19 20
20 lines = [] 21 lines = []
21 for line in vtt: 22 for line in vtt:
22 # Strip the newlines from the end of the text. 23 # Strip the newlines from the end of the text.
30 if line == previous: 31 if line == previous:
31 continue 32 continue
32 transcript += " " + line 33 transcript += " " + line
33 previous = line 34 previous = line
34 35
35 print(transcript) 36 #print(transcript)
36 37
37 # text = lobj.get_text() 38 # text = lobj.get_text()
38 # for sentence in nltk.sent_tokenize(text): 39 # for sentence in nltk.sent_tokenize(text):
39 # sentence = re.sub(r'\s+', ' ', sentence).strip() 40 # sentence = re.sub(r'\s+', ' ', sentence).strip()
40 # if len(sentence) < 5: 41 # if len(sentence) < 5:
41 # continue 42 # continue
43
44 from langchain import RecursiveCharacterTextSplitter ;
45
46 const text = transcript
47 const splitter = new RecursiveCharacterTextSplitter({
48 chunkSize: 10,
49 chunkOverlap: 1,
50 });