annotate webvtt_1.py @ 3:ba1ce5921a4b

start
author drewp@bigasterisk.com
date Wed, 03 Jul 2024 20:20:18 -0700
parents
children 0e33c65f1904
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
3
drewp@bigasterisk.com
parents:
diff changeset
1 import webvtt
drewp@bigasterisk.com
parents:
diff changeset
2 # for caption in webvtt.read('data/Albany City Council - June 17, 2024 [Ywf6cOduPNI].en.vtt'):
drewp@bigasterisk.com
parents:
diff changeset
3 # print(caption.start)
drewp@bigasterisk.com
parents:
diff changeset
4 # print(caption.end)
drewp@bigasterisk.com
parents:
diff changeset
5 # print(caption.text)
drewp@bigasterisk.com
parents:
diff changeset
6
drewp@bigasterisk.com
parents:
diff changeset
# Parse the caption file; `vtt` is iterated further down to build the transcript.
vtt = webvtt.read(
    'data/Albany City Council - June 17, 2024 [Ywf6cOduPNI].en.vtt'
)
drewp@bigasterisk.com
parents:
diff changeset
8
drewp@bigasterisk.com
parents:
diff changeset
9 # for line in vtt[0].lines:
drewp@bigasterisk.com
parents:
diff changeset
10 # print(line)
drewp@bigasterisk.com
parents:
diff changeset
11
drewp@bigasterisk.com
parents:
diff changeset
12
drewp@bigasterisk.com
parents:
diff changeset
13 # for caption in vtt.iter_slice(start='00:01:11.000', end='00:02:27.000'):
drewp@bigasterisk.com
parents:
diff changeset
14 # print(caption.start)
drewp@bigasterisk.com
parents:
diff changeset
15 # print(caption.end)
drewp@bigasterisk.com
parents:
diff changeset
16 # print(caption.text)
drewp@bigasterisk.com
parents:
diff changeset
17
drewp@bigasterisk.com
parents:
diff changeset
# Flatten every caption into its individual text lines: strip the whitespace
# surrounding each caption's text, then split on any newlines left inside it.
lines = [
    text_line
    for caption in vtt
    for text_line in caption.text.strip().splitlines()
]

# Drop a line when it is identical to the line immediately before it (only
# adjacent repeats are removed; a line that reappears later is kept), then
# join the survivors with single spaces.  Joining once replaces the repeated
# `transcript += " " + line` concatenation and removes the stray leading
# space that form put at the start of the output.
transcript = " ".join(
    current
    for previous, current in zip([None] + lines, lines)
    if current != previous
)

print(transcript)
drewp@bigasterisk.com
parents:
diff changeset
36
drewp@bigasterisk.com
parents:
diff changeset
37 # text = lobj.get_text()
drewp@bigasterisk.com
parents:
diff changeset
38 # for sentence in nltk.sent_tokenize(text):
drewp@bigasterisk.com
parents:
diff changeset
39 # sentence = re.sub(r'\s+', ' ', sentence).strip()
drewp@bigasterisk.com
parents:
diff changeset
40 # if len(sentence) < 5:
drewp@bigasterisk.com
parents:
diff changeset
41 # continue