annotate webvtt_1.py @ 18:a527228aa353 default tip

prefect use postgres
author drewp@bigasterisk.com
date Fri, 19 Jul 2024 21:01:09 -0700
parents 0e33c65f1904
children
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
3
drewp@bigasterisk.com
parents:
diff changeset
1 import webvtt
drewp@bigasterisk.com
parents:
diff changeset
2 # for caption in webvtt.read('data/Albany City Council - June 17, 2024 [Ywf6cOduPNI].en.vtt'):
drewp@bigasterisk.com
parents:
diff changeset
3 # print(caption.start)
drewp@bigasterisk.com
parents:
diff changeset
4 # print(caption.end)
drewp@bigasterisk.com
parents:
diff changeset
5 # print(caption.text)
drewp@bigasterisk.com
parents:
diff changeset
6
drewp@bigasterisk.com
parents:
diff changeset
# Location of the caption file that the rest of this script works on.
CAPTION_PATH = 'data/Albany City Council - June 17, 2024 [Ywf6cOduPNI].en.vtt'

# Load the captions; each item exposes .start, .end and .text.
vtt = webvtt.read(CAPTION_PATH)
drewp@bigasterisk.com
parents:
diff changeset
8
drewp@bigasterisk.com
parents:
diff changeset
9 # for line in vtt[0].lines:
drewp@bigasterisk.com
parents:
diff changeset
10 # print(line)
drewp@bigasterisk.com
parents:
diff changeset
11
drewp@bigasterisk.com
parents:
diff changeset
12
drewp@bigasterisk.com
parents:
diff changeset
13 # for caption in vtt.iter_slice(start='00:01:11.000', end='00:02:27.000'):
drewp@bigasterisk.com
parents:
diff changeset
14 # print(caption.start)
drewp@bigasterisk.com
parents:
diff changeset
15 # print(caption.end)
drewp@bigasterisk.com
parents:
diff changeset
16 # print(caption.text)
drewp@bigasterisk.com
parents:
diff changeset
17
4
0e33c65f1904 playing with extractors
drewp@bigasterisk.com
parents: 3
diff changeset
18 ##https://stackoverflow.com/questions/51784232/how-do-i-convert-the-webvtt-format-to-plain-text
3
drewp@bigasterisk.com
parents:
diff changeset
# Flatten every caption into its individual text lines: trim the
# surrounding whitespace of each caption's text, then break it apart
# wherever it contains an embedded newline.
lines = [
    piece
    for caption in vtt
    for piece in caption.text.strip().splitlines()
]

# Keep a line only when it differs from the line immediately before it,
# which collapses each run of consecutive duplicates down to one copy.
# The None sentinel makes sure the very first line is always kept.
kept = [
    current
    for earlier, current in zip([None] + lines, lines)
    if current != earlier
]

# Every kept line is preceded by a single space, so a non-empty
# transcript begins with a space.
transcript = "".join(" " + piece for piece in kept)
drewp@bigasterisk.com
parents:
diff changeset
35
4
0e33c65f1904 playing with extractors
drewp@bigasterisk.com
parents: 3
diff changeset
36 #print(transcript)
3
drewp@bigasterisk.com
parents:
diff changeset
37
drewp@bigasterisk.com
parents:
diff changeset
38 # text = lobj.get_text()
drewp@bigasterisk.com
parents:
diff changeset
39 # for sentence in nltk.sent_tokenize(text):
drewp@bigasterisk.com
parents:
diff changeset
40 # sentence = re.sub(r'\s+', ' ', sentence).strip()
drewp@bigasterisk.com
parents:
diff changeset
41 # if len(sentence) < 5:
4
0e33c65f1904 playing with extractors
drewp@bigasterisk.com
parents: 3
diff changeset
42 # continue
0e33c65f1904 playing with extractors
drewp@bigasterisk.com
parents: 3
diff changeset
43
0e33c65f1904 playing with extractors
drewp@bigasterisk.com
parents: 3
diff changeset
# The Python langchain package exposes the splitter from its
# text_splitter submodule.
from langchain.text_splitter import RecursiveCharacterTextSplitter

# Text to be chunked: the de-duplicated transcript built above.
text = transcript

# Python spelling of the splitter configuration: keyword arguments in
# snake_case, no `new`, no object literal. Chunks are at most 10 units
# long with 1 unit of overlap between neighbours (units are whatever the
# splitter's length function counts -- characters by default).
splitter = RecursiveCharacterTextSplitter(
    chunk_size=10,
    chunk_overlap=1,
)