Mercurial > code > home > repos > sco-bot
view webvtt_1.py @ 13:403eff4a16c8
fix up indexer flow and fastapi server
author | drewp@bigasterisk.com |
---|---|
date | Thu, 11 Jul 2024 21:32:24 -0700 |
parents | 0e33c65f1904 |
children |
line wrap: on
line source
"""Turn a WebVTT caption file into a plain-text transcript ready for chunking.

Auto-generated captions repeat the same display line across consecutive
cues, so the transcript is built by flattening every cue into its lines and
dropping consecutive repeats.

Approach adapted from:
https://stackoverflow.com/questions/51784232/how-do-i-convert-the-webvtt-format-to-plain-text
"""

from itertools import groupby

# Caption file this script was written against.
VTT_PATH = 'data/Albany City Council - June 17, 2024 [Ywf6cOduPNI].en.vtt'


def caption_lines(caption_texts):
    """Flatten caption texts into a list of individual display lines.

    Args:
        caption_texts: iterable of strings, one per caption cue.

    Returns:
        A list of lines. Leading/trailing whitespace of each caption text is
        stripped first, then the text is split on any embedded newlines.
        Empty caption texts contribute no lines.
    """
    lines = []
    for text in caption_texts:
        lines.extend(text.strip().splitlines())
    return lines


def build_transcript(lines):
    """Join lines into one string, skipping consecutive repeated lines.

    Args:
        lines: iterable of strings.

    Returns:
        The concatenation of every kept line, each prefixed with a single
        space (so a non-empty transcript starts with a space). Only
        *consecutive* duplicates are dropped; a line that reappears later
        is kept.
    """
    # groupby with no key collapses each run of equal neighbours into one
    # group, which is exactly "skip the line if it equals the previous one".
    return "".join(" " + line for line, _ in groupby(lines))


def read_transcript(path=VTT_PATH):
    """Read the WebVTT file at *path* and return its de-duplicated transcript.

    Args:
        path: location of the ``.vtt`` file; defaults to ``VTT_PATH``.

    Returns:
        The transcript string produced by ``build_transcript``.
    """
    # Imported here so the pure text helpers above stay importable without
    # the third-party package installed.
    import webvtt

    vtt = webvtt.read(path)
    return build_transcript(caption_lines(caption.text for caption in vtt))


def make_splitter(chunk_size=10, chunk_overlap=1):
    """Create the text splitter used to chunk the transcript.

    Args:
        chunk_size: value passed as the splitter's ``chunk_size``.
        chunk_overlap: value passed as the splitter's ``chunk_overlap``.

    Returns:
        A ``RecursiveCharacterTextSplitter`` instance.
    """
    # The Python class lives in langchain.text_splitter and takes snake_case
    # keyword arguments (the previous code used the JavaScript API here).
    from langchain.text_splitter import RecursiveCharacterTextSplitter

    return RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
    )


def main():
    """Build the transcript and the splitter.

    Returns:
        A ``(text, splitter)`` tuple: the transcript string and the
        configured splitter.
    """
    text = read_transcript()
    splitter = make_splitter()
    return text, splitter


if __name__ == "__main__":
    main()