Mercurial > code > home > repos > sco-bot
comparison webvtt_1.py @ 3:ba1ce5921a4b
start
author | drewp@bigasterisk.com |
---|---|
date | Wed, 03 Jul 2024 20:20:18 -0700 |
parents | |
children | 0e33c65f1904 |
comparison
equal
deleted
inserted
replaced
2:82428652cda1 | 3:ba1ce5921a4b |
---|---|
1 import webvtt | |
2 # for caption in webvtt.read('data/Albany City Council - June 17, 2024 [Ywf6cOduPNI].en.vtt'): | |
3 # print(caption.start) | |
4 # print(caption.end) | |
5 # print(caption.text) | |
6 | |
7 vtt=webvtt.read('data/Albany City Council - June 17, 2024 [Ywf6cOduPNI].en.vtt') | |
8 | |
9 # for line in vtt[0].lines: | |
10 # print(line) | |
11 | |
12 | |
13 # for caption in vtt.iter_slice(start='00:01:11.000', end='00:02:27.000'): | |
14 # print(caption.start) | |
15 # print(caption.end) | |
16 # print(caption.text) | |
17 | |
18 transcript = "" | |
19 | |
20 lines = [] | |
21 for line in vtt: | |
22 # Strip leading and trailing whitespace (including newlines) from the caption text. | |
23 # Split the text into separate lines if it has a newline in the middle. | |
24 # Add the resulting lines to the list. | |
25 lines.extend(line.text.strip().splitlines()) | |
26 | |
27 # Remove consecutive repeated lines (each line is compared only with the one before it) | |
28 previous = None | |
29 for line in lines: | |
30 if line == previous: | |
31 continue | |
32 transcript += " " + line | |
33 previous = line | |
34 | |
35 print(transcript) | |
36 | |
37 # text = lobj.get_text() | |
38 # for sentence in nltk.sent_tokenize(text): | |
39 # sentence = re.sub(r'\s+', ' ', sentence).strip() | |
40 # if len(sentence) < 5: | |
41 # continue |