view webvtt_1.py @ 3:ba1ce5921a4b

start
author drewp@bigasterisk.com
date Wed, 03 Jul 2024 20:20:18 -0700
parents
children 0e33c65f1904
line wrap: on
line source

import webvtt
# for caption in webvtt.read('data/Albany City Council - June 17, 2024 [Ywf6cOduPNI].en.vtt'):
#     print(caption.start)
#     print(caption.end)
#     print(caption.text)

# Load the captions from the WebVTT file; the result is iterated
# caption-by-caption further down in this script.
caption_path = 'data/Albany City Council - June 17, 2024 [Ywf6cOduPNI].en.vtt'
vtt = webvtt.read(caption_path)

# for line in vtt[0].lines:
#     print(line)


# for caption in vtt.iter_slice(start='00:01:11.000', end='00:02:27.000'):
#     print(caption.start)
#     print(caption.end)
#     print(caption.text)

# Flatten every caption into its individual text lines.  Each caption's text
# is stripped of surrounding whitespace first and then split on any newlines
# inside it, so a multi-line caption contributes one entry per line (and an
# empty caption contributes none).
lines = []
for caption in vtt:
    lines.extend(caption.text.strip().splitlines())

# Drop every line that is identical to the line immediately before it
# (presumably rolling captions repeat the previous line -- confirm against
# the data), then join the survivors with single spaces.  Pairing `lines`
# with a copy shifted by one (padded with None) gives each line its
# predecessor; None never equals a str, so the first line is always kept.
# Using a separator-based join also avoids the stray leading space that
# prefixing every line with " " would produce.
transcript = " ".join(
    current
    for earlier, current in zip([None] + lines, lines)
    if current != earlier
)

print(transcript)

# text = lobj.get_text()
#                 for sentence in nltk.sent_tokenize(text):
#                     sentence = re.sub(r'\s+', ' ', sentence).strip()
#                     if len(sentence) < 5:
#                         continue