Mercurial > code > home > repos > sco-bot
diff webvtt_1.py @ 4:0e33c65f1904
playing with extractors
author | drewp@bigasterisk.com |
---|---|
date | Sat, 06 Jul 2024 16:42:36 -0700 |
parents | ba1ce5921a4b |
children |
line wrap: on
line diff
```diff
--- a/webvtt_1.py Wed Jul 03 20:20:18 2024 -0700
+++ b/webvtt_1.py Sat Jul 06 16:42:36 2024 -0700
@@ -15,6 +15,7 @@
 # print(caption.end)
 # print(caption.text)
 
+##https://stackoverflow.com/questions/51784232/how-do-i-convert-the-webvtt-format-to-plain-text
 transcript = ""
 
 lines = []
@@ -32,10 +33,18 @@
 transcript += " " + line
 previous = line
 
-print(transcript)
+#print(transcript)
 
 # text = lobj.get_text()
 # for sentence in nltk.sent_tokenize(text):
 # sentence = re.sub(r'\s+', ' ', sentence).strip()
 # if len(sentence) < 5:
-# continue
\ No newline at end of file
+# continue
+
+from langchain import RecursiveCharacterTextSplitter ;
+
+const text = transcript
+const splitter = new RecursiveCharacterTextSplitter({
+ chunkSize: 10,
+ chunkOverlap: 1,
+});
\ No newline at end of file
```