Mercurial > code > home > repos > sco-bot
comparison webvtt_1.py @ 4:0e33c65f1904
playing with extractors
author | drewp@bigasterisk.com |
---|---|
date | Sat, 06 Jul 2024 16:42:36 -0700 |
parents | ba1ce5921a4b |
children |
comparison
equal
deleted
inserted
replaced
3:ba1ce5921a4b | 4:0e33c65f1904 |
---|---|
13 # for caption in vtt.iter_slice(start='00:01:11.000', end='00:02:27.000'): | 13 # for caption in vtt.iter_slice(start='00:01:11.000', end='00:02:27.000'): |
14 # print(caption.start) | 14 # print(caption.start) |
15 # print(caption.end) | 15 # print(caption.end) |
16 # print(caption.text) | 16 # print(caption.text) |
17 | 17 |
18 ##https://stackoverflow.com/questions/51784232/how-do-i-convert-the-webvtt-format-to-plain-text | |
18 transcript = "" | 19 transcript = "" |
19 | 20 |
20 lines = [] | 21 lines = [] |
21 for line in vtt: | 22 for line in vtt: |
22 # Strip the newlines from the end of the text. | 23 # Strip the newlines from the end of the text. |
30 if line == previous: | 31 if line == previous: |
31 continue | 32 continue |
32 transcript += " " + line | 33 transcript += " " + line |
33 previous = line | 34 previous = line |
34 | 35 |
35 print(transcript) | 36 #print(transcript) |
36 | 37 |
37 # text = lobj.get_text() | 38 # text = lobj.get_text() |
38 # for sentence in nltk.sent_tokenize(text): | 39 # for sentence in nltk.sent_tokenize(text): |
39 # sentence = re.sub(r'\s+', ' ', sentence).strip() | 40 # sentence = re.sub(r'\s+', ' ', sentence).strip() |
40 # if len(sentence) < 5: | 41 # if len(sentence) < 5: |
41 # continue | 42 # continue |
43 | |
44 from langchain import RecursiveCharacterTextSplitter ; | |
45 | |
46 const text = transcript | |
47 const splitter = new RecursiveCharacterTextSplitter({ | |
48 chunkSize: 10, | |
49 chunkOverlap: 1, | |
50 }); |