Mercurial > code > home > repos > sco-bot
changeset 3:ba1ce5921a4b
start
author | drewp@bigasterisk.com |
---|---|
date | Wed, 03 Jul 2024 20:20:18 -0700 |
parents | 82428652cda1 |
children | 0e33c65f1904 |
files | webvtt_1.py |
diffstat | 1 files changed, 41 insertions(+), 0 deletions(-) [+] |
line wrap: on
line diff
"""Print a plain-text transcript built from a WebVTT caption file.

Every caption's text is split into individual lines, runs of identical
adjacent lines are collapsed to a single line, and the surviving lines are
joined with single spaces and printed to stdout.
"""
import itertools

import webvtt

VTT_PATH = 'data/Albany City Council - June 17, 2024 [Ywf6cOduPNI].en.vtt'


def _caption_lines(captions):
    """Return the text lines of all *captions*, flattened in caption order.

    Each caption's ``text`` has its surrounding whitespace stripped and is
    then split on embedded newlines, so a multi-line caption contributes
    several entries.
    """
    flattened = []
    for caption in captions:
        flattened.extend(caption.text.strip().splitlines())
    return flattened


def _collapse_repeats(lines):
    """Return *lines* with each run of identical adjacent entries cut to one.

    Only neighbouring duplicates are removed; a line that reappears later,
    after a different line, is kept.
    """
    return [line for line, _run in itertools.groupby(lines)]


vtt = webvtt.read(VTT_PATH)
lines = _caption_lines(vtt)

# Join once instead of growing the string with ``+=`` inside a loop; this
# also drops the stray leading space the incremental version produced.
transcript = " ".join(_collapse_repeats(lines))

print(transcript)

# NOTE: an earlier draft sketched (but never wired up) a follow-up step that
# sentence-tokenizes the text with nltk, normalizes runs of whitespace to a
# single space, and skips sentences shorter than 5 characters.