diff webvtt_1.py @ 3:ba1ce5921a4b

start
author drewp@bigasterisk.com
date Wed, 03 Jul 2024 20:20:18 -0700
parents
children 0e33c65f1904
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/webvtt_1.py	Wed Jul 03 20:20:18 2024 -0700
@@ -0,0 +1,41 @@
+import webvtt
+# for caption in webvtt.read('data/Albany City Council - June 17, 2024 [Ywf6cOduPNI].en.vtt'):
+#     print(caption.start)
+#     print(caption.end)
+#     print(caption.text)
+# Load the caption file; the result is iterated below, one caption (with .text) at a time.
+vtt=webvtt.read('data/Albany City Council - June 17, 2024 [Ywf6cOduPNI].en.vtt')
+
+# for line in vtt[0].lines:
+#     print(line)
+
+
+# for caption in vtt.iter_slice(start='00:01:11.000', end='00:02:27.000'):
+#     print(caption.start)
+#     print(caption.end)
+#     print(caption.text)
+
+# Flatten every caption into individual text lines. A caption's text may span
+# several lines, so strip the outer whitespace and split on inner newlines.
+lines = []
+for caption in vtt:
+    lines.extend(caption.text.strip().splitlines())
+
+# Drop a line when it is identical to the one immediately before it; repeats
+# that are not adjacent are kept.
+unique_lines = []
+previous = None
+for line in lines:
+    if line != previous:
+        unique_lines.append(line)
+    previous = line
+
+# Join once with single spaces so the transcript has no stray leading space.
+transcript = " ".join(unique_lines)
+print(transcript)
+
+# text = lobj.get_text()
+#                 for sentence in nltk.sent_tokenize(text):
+#                     sentence = re.sub(r'\s+', ' ', sentence).strip()
+#                     if len(sentence) < 5:
+#                         continue
\ No newline at end of file