Skip to content

Commit

Permalink
Use OpenAI api word-level timestamps
Browse files Browse the repository at this point in the history
  • Loading branch information
tijszwinkels committed Feb 14, 2024
1 parent 46598ad commit f3c76f7
Showing 1 changed file with 10 additions and 26 deletions.
36 changes: 10 additions & 26 deletions whisper_online.py
Original file line number Diff line number Diff line change
Expand Up @@ -176,30 +176,14 @@ def load_model(self, *args, **kwargs):

def ts_words(self, segments):
    """Convert OpenAI API word-level timestamp entries to (start, end, word) tuples.

    Args:
        segments: iterable of word entries as returned by the OpenAI
            transcription API with ``timestamp_granularities=["word"]``;
            each entry is mapping-like with "start", "end" and "word" keys.

    Returns:
        list[tuple]: one ``(start_time, end_time, word)`` tuple per entry.
            Missing keys yield ``None`` in the corresponding position
            (``.get`` is used rather than indexing).
    """
    # TODO: Make VAD work again with word-level timestamps.
    # The previous segment-level filter was:
    #   if self.use_vad and segment["no_speech_prob"] > 0.8: continue
    # but per-word entries carry no "no_speech_prob", so it is disabled.
    return [
        (entry.get("start"), entry.get("end"), entry.get("word"))
        for entry in segments
    ]

Expand All @@ -220,7 +204,8 @@ def transcribe(self, audio_data, prompt=None, *args, **kwargs):
"model": self.modelname,
"file": buffer,
"response_format": self.response_format,
"temperature": self.temperature
"temperature": self.temperature,
"timestamp_granularities": ["word"]
}
if self.task != "translate" and self.language:
params["language"] = self.language
Expand All @@ -233,11 +218,10 @@ def transcribe(self, audio_data, prompt=None, *args, **kwargs):
proc = self.client.audio.transcriptions

# Process transcription/translation

transcript = proc.create(**params)
print(f"OpenAI API processed accumulated {self.transcribed_seconds} seconds",file=self.logfile)

return transcript.segments
return transcript.words

def use_vad(self):
    # Enable the voice-activity-detection flag for this backend.
    # NOTE(review): this assignment rebinds the *instance* attribute
    # ``use_vad`` to True, shadowing this very method — after the first
    # call, ``self.use_vad`` is a bool, not a callable. Presumably
    # intentional (the flag is only read as a bool elsewhere, e.g. the
    # commented-out check in ts_words) — confirm against callers.
    self.use_vad = True
Expand Down

0 comments on commit f3c76f7

Please sign in to comment.