3
|
1 import webvtt
|
|
2 # for caption in webvtt.read('data/Albany City Council - June 17, 2024 [Ywf6cOduPNI].en.vtt'):
|
|
3 # print(caption.start)
|
|
4 # print(caption.end)
|
|
5 # print(caption.text)
|
|
6
|
|
# Caption file that the script converts into a plain-text transcript.
DEFAULT_VTT_PATH = 'data/Albany City Council - June 17, 2024 [Ywf6cOduPNI].en.vtt'


def caption_lines(captions):
    """Flatten captions into a list of trimmed, non-empty text lines.

    Args:
        captions: Iterable of objects exposing a ``text`` string attribute
            (one caption each); a caption's text may span several lines.

    Returns:
        A list with every line of every caption, in order. Each line is
        stripped of surrounding whitespace and blank lines are skipped, so
        padding inside a caption cannot leak into the transcript or hide a
        repeated line from the duplicate check.
    """
    lines = []
    for caption in captions:
        for raw_line in caption.text.splitlines():
            line = raw_line.strip()
            if line:
                lines.append(line)
    return lines


def drop_consecutive_duplicates(lines):
    """Return *lines* with runs of identical adjacent entries collapsed.

    Only back-to-back repeats are removed; a line that reappears later,
    after a different line, is kept.
    """
    unique = []
    previous = None
    for line in lines:
        if line != previous:
            unique.append(line)
            previous = line
    return unique


def build_transcript(captions):
    """Join the de-duplicated caption lines into one space-separated string.

    Returns an empty string when there are no captions (or no non-blank
    lines). The result has no leading or trailing separator.
    """
    return " ".join(drop_consecutive_duplicates(caption_lines(captions)))


if __name__ == "__main__":
    # Reading the file and printing happen only when run as a script, so the
    # helpers above can be imported without touching the filesystem.
    vtt = webvtt.read(DEFAULT_VTT_PATH)
    transcript = build_transcript(vtt)
    print(transcript)
|
|
36
|
|
37 # text = lobj.get_text()
|
|
38 # for sentence in nltk.sent_tokenize(text):
|
|
39 # sentence = re.sub(r'\s+', ' ', sentence).strip()
|
|
40 # if len(sentence) < 5:
|
|
41 # continue |