Skip to content

Commit

Permalink
Use OpenAI api word-level timestamps
Browse files Browse the repository at this point in the history
  • Loading branch information
tijszwinkels committed Feb 14, 2024
1 parent 46598ad commit f3c76f7
Showing 1 changed file with 10 additions and 26 deletions.
36 changes: 10 additions & 26 deletions whisper_online.py
Original file line number Diff line number Diff line change
Expand Up @@ -176,30 +176,14 @@ def load_model(self, *args, **kwargs):

def ts_words(self, segments):
    """Convert OpenAI API word-level timestamp entries to (start, end, word) tuples.

    Args:
        segments: iterable of word entries as returned by the OpenAI
            transcription API with ``timestamp_granularities=["word"]``;
            each entry is mapping-like with "start", "end" and "word" keys.

    Returns:
        list[tuple]: one ``(start_time, end_time, word)`` tuple per entry.
            Missing keys yield ``None`` in the corresponding position
            (``.get`` is used rather than indexing).
    """
    # TODO: Make VAD work again with word-level timestamps.
    # The previous segment-level filter was:
    #   if self.use_vad and segment["no_speech_prob"] > 0.8: continue
    # but per-word entries carry no "no_speech_prob", so it is disabled.
    return [
        (entry.get("start"), entry.get("end"), entry.get("word"))
        for entry in segments
    ]

Expand All @@ -220,7 +204,8 @@ def transcribe(self, audio_data, prompt=None, *args, **kwargs):
"model": self.modelname,
"file": buffer,
"response_format": self.response_format,
"temperature": self.temperature
"temperature": self.temperature,
"timestamp_granularities": ["word"]
}
if self.task != "translate" and self.language:
params["language"] = self.language
Expand All @@ -233,11 +218,10 @@ def transcribe(self, audio_data, prompt=None, *args, **kwargs):
proc = self.client.audio.transcriptions

# Process transcription/translation

transcript = proc.create(**params)
print(f"OpenAI API processed accumulated {self.transcribed_seconds} seconds",file=self.logfile)

return transcript.segments
return transcript.words

def use_vad(self):
    # Enable the voice-activity-detection flag for this backend.
    # NOTE(review): this assignment rebinds the *instance* attribute
    # ``use_vad`` to True, shadowing this very method — after the first
    # call, ``self.use_vad`` is a bool, not a callable. Presumably
    # intentional (the flag is only read as a bool elsewhere, e.g. the
    # commented-out check in ts_words) — confirm against callers.
    self.use_vad = True
Expand Down

0 comments on commit f3c76f7

Please sign in to comment.