view webvtt_1.py @ 7:53ae53f7d1b3

add k8s config
author drewp@bigasterisk.com
date Sat, 06 Jul 2024 16:45:19 -0700
parents 0e33c65f1904
children
line wrap: on
line source

import webvtt
# for caption in webvtt.read('data/Albany City Council - June 17, 2024 [Ywf6cOduPNI].en.vtt'):
#     print(caption.start)
#     print(caption.end)
#     print(caption.text)

# Load the captions from the downloaded WebVTT subtitle file; `vtt` is
# iterated caption-by-caption further down to build the transcript.
vtt = webvtt.read(
    'data/Albany City Council - June 17, 2024 [Ywf6cOduPNI].en.vtt'
)

# for line in vtt[0].lines:
#     print(line)


# for caption in vtt.iter_slice(start='00:01:11.000', end='00:02:27.000'):
#     print(caption.start)
#     print(caption.end)
#     print(caption.text)

# Transcript-flattening approach adapted from:
# https://stackoverflow.com/questions/51784232/how-do-i-convert-the-webvtt-format-to-plain-text
# Flatten every caption into individual text lines: strip surrounding
# whitespace from each caption's text, then split on any embedded newlines
# so a multi-line caption contributes one entry per line.
lines = [
    text_line
    for caption in vtt
    for text_line in caption.text.strip().splitlines()
]

# Drop any line that is identical to the line immediately before it, then
# build the transcript in a single join instead of repeated `+=`.
# Pairing each line with its predecessor (None for the first line, which
# never equals a str) is equivalent to tracking the last kept line, because
# a skipped line is by definition equal to the one that was kept.
# Each kept line is prefixed with a space, so a non-empty transcript starts
# with a leading space; an empty caption list yields "".
transcript = "".join(
    " " + current
    for preceding, current in zip([None] + lines, lines)
    if current != preceding
)

#print(transcript)

# text = lobj.get_text()
#                 for sentence in nltk.sent_tokenize(text):
#                     sentence = re.sub(r'\s+', ' ', sentence).strip()
#                     if len(sentence) < 5:
#                         continue

# The original lines here used JavaScript syntax (`const`, `new`, an object
# literal and `;`), which is a SyntaxError in Python and prevented the whole
# script from running. This is the Python equivalent with the same values.
# NOTE(review): the splitter class is imported from `langchain.text_splitter`;
# newer LangChain releases also expose it as
# `langchain_text_splitters.RecursiveCharacterTextSplitter` -- confirm against
# the installed version.
from langchain.text_splitter import RecursiveCharacterTextSplitter

text = transcript

# Configure the splitter only; nothing is split yet.
# chunk_size / chunk_overlap are the Python spellings of the JS options
# chunkSize / chunkOverlap.
splitter = RecursiveCharacterTextSplitter(
    chunk_size=10,
    chunk_overlap=1,
)