comparison webvtt_1.py @ 3:ba1ce5921a4b

start
author drewp@bigasterisk.com
date Wed, 03 Jul 2024 20:20:18 -0700
parents
children 0e33c65f1904
comparison
equal deleted inserted replaced
2:82428652cda1 3:ba1ce5921a4b
1 import webvtt
2 # for caption in webvtt.read('data/Albany City Council - June 17, 2024 [Ywf6cOduPNI].en.vtt'):
3 # print(caption.start)
4 # print(caption.end)
5 # print(caption.text)
6
def split_caption_lines(caption_texts):
    """Flatten caption texts into a single list of display lines.

    Each text has leading/trailing whitespace (including newlines) stripped
    and is then split on any newlines remaining in the middle.

    Args:
        caption_texts: iterable of strings, one per caption.

    Returns:
        A list of the individual lines, in caption order.
    """
    lines = []
    for text in caption_texts:
        lines.extend(text.strip().splitlines())
    return lines


def drop_consecutive_repeats(lines):
    """Return `lines` with runs of identical adjacent entries collapsed to one.

    Only neighbours are compared, so a line that reappears later (after a
    different line) is kept.
    """
    kept = []
    for line in lines:
        if kept and line == kept[-1]:
            continue
        kept.append(line)
    return kept


def build_transcript(caption_texts):
    """Join caption texts into one space-separated transcript string.

    Lines are extracted with `split_caption_lines`, adjacent repeats are
    removed with `drop_consecutive_repeats`, and the result is joined with
    single spaces. `str.join` puts the separator only *between* lines, so the
    transcript has no leading space, and avoids repeated string concatenation.

    Returns:
        The transcript, or "" when there are no lines.
    """
    return " ".join(drop_consecutive_repeats(split_caption_lines(caption_texts)))


# Only touch the data file when run as a script, so the helpers above can be
# imported without side effects.
if __name__ == "__main__":
    vtt = webvtt.read('data/Albany City Council - June 17, 2024 [Ywf6cOduPNI].en.vtt')

    # Exploration snippets kept for reference:
    #
    # for line in vtt[0].lines:
    #     print(line)
    #
    # for caption in vtt.iter_slice(start='00:01:11.000', end='00:02:27.000'):
    #     print(caption.start)
    #     print(caption.end)
    #     print(caption.text)

    # All caption lines, before repeats are removed.
    lines = split_caption_lines(caption.text for caption in vtt)

    # Remove repeated lines and join what is left.
    transcript = " ".join(drop_consecutive_repeats(lines))

    print(transcript)
36
37 # text = lobj.get_text()
38 # for sentence in nltk.sent_tokenize(text):
39 # sentence = re.sub(r'\s+', ' ', sentence).strip()
40 # if len(sentence) < 5:
41 # continue