import webvtt

# Caption file that this script converts to plain text.
VTT_PATH = 'data/Albany City Council - June 17, 2024 [Ywf6cOduPNI].en.vtt'

# Example (disabled): print the timing and text of every caption.
# for caption in webvtt.read(VTT_PATH):
#     print(caption.start)
#     print(caption.end)
#     print(caption.text)

# Parse the caption file once; the rest of the script iterates over `vtt`.
vtt = webvtt.read(VTT_PATH)

# Example (disabled): print the individual lines of the first caption.
# for line in vtt[0].lines:
#     print(line)

# Example (disabled): print only the captions inside a time window.
# for caption in vtt.iter_slice(start='00:01:11.000', end='00:02:27.000'):
#     print(caption.start)
#     print(caption.end)
#     print(caption.text)
# Approach adapted from:
# https://stackoverflow.com/questions/51784232/how-do-i-convert-the-webvtt-format-to-plain-text

# Flatten the captions into individual text lines: strip surrounding
# whitespace from each caption's text, then split on any newline left in
# the middle so every entry of `lines` is a single line.
lines = []
for line in vtt:
    lines.extend(line.text.strip().splitlines())

# Drop a line when it is identical to the line immediately before it.
# Only *consecutive* repeats are removed; a line that reappears later,
# after different text, is kept.
unique_lines = []
previous = None
for line in lines:
    if line == previous:
        continue
    unique_lines.append(line)
    previous = line

# Every kept line is prefixed with a single space, so a non-empty
# transcript starts with a space. Built with one join instead of repeated
# string concatenation inside the loop.
transcript = "".join(" " + line for line in unique_lines)

# print(transcript)

# Disabled sketch of a sentence-level cleanup pass. NOTE(review): `lobj`,
# `nltk` and `re` are not defined or imported in the visible part of this
# script, and the loop body is cut off after the `continue`.
# text = lobj.get_text()
# for sentence in nltk.sent_tokenize(text):
#     sentence = re.sub(r'\s+', ' ', sentence).strip()
#     if len(sentence) < 5:
#         continue
# The original lines used JavaScript syntax (`const`, `new`, an object
# literal with camelCase keys); this is the Python form of the same setup.
# NOTE(review): depending on the installed langchain release, this class may
# instead need to be imported from `langchain_text_splitters` - confirm
# against the project's pinned version.
from langchain.text_splitter import RecursiveCharacterTextSplitter

# Text to be split: the transcript string assembled earlier in this script.
text = transcript

# NOTE(review): chunk_size=10 / chunk_overlap=1 are carried over unchanged
# from the original snippet; they look like placeholder values - confirm
# the intended chunk sizes before using the splitter's output.
splitter = RecursiveCharacterTextSplitter(
    chunk_size=10,
    chunk_overlap=1,
)
|
49 chunkOverlap: 1,
|
|
50 }); |