From 4b786595c3a32af86b79372704f849dcd523672b Mon Sep 17 00:00:00 2001 From: Raivis Dejus Date: Wed, 18 Jun 2025 16:33:31 +0300 Subject: [PATCH] =?UTF-8?q?Adding=20support=20for=20word=20level=20timings?= =?UTF-8?q?=20in=20Whisper=20API=20and=20Whisper.cpp=20=E2=80=A6=20(#1183)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../openai_whisper_api_file_transcriber.py | 93 +++++++++++++++++-- buzz/transcriber/whisper_cpp.py | 45 +++++---- .../file_transcription_form_widget.py | 9 -- docs/docs/preferences.md | 2 +- 4 files changed, 108 insertions(+), 41 deletions(-) diff --git a/buzz/transcriber/openai_whisper_api_file_transcriber.py b/buzz/transcriber/openai_whisper_api_file_transcriber.py index 275c8113..27085503 100644 --- a/buzz/transcriber/openai_whisper_api_file_transcriber.py +++ b/buzz/transcriber/openai_whisper_api_file_transcriber.py @@ -13,6 +13,7 @@ from buzz.settings.settings import Settings from buzz.model_loader import get_custom_api_whisper_model from buzz.transcriber.file_transcriber import FileTranscriber from buzz.transcriber.transcriber import FileTranscriptionTask, Segment, Task +from buzz.transcriber.whisper_cpp import append_segment class OpenAIWhisperAPIFileTranscriber(FileTranscriber): @@ -28,6 +29,7 @@ class OpenAIWhisperAPIFileTranscriber(FileTranscriber): base_url=custom_openai_base_url if custom_openai_base_url else None ) self.whisper_api_model = get_custom_api_whisper_model(custom_openai_base_url) + self.word_level_timings = self.transcription_task.transcription_options.word_level_timings logging.debug("Will use whisper API on %s, %s", custom_openai_base_url, self.whisper_api_model) @@ -136,6 +138,12 @@ class OpenAIWhisperAPIFileTranscriber(FileTranscriber): return segments + @staticmethod + def get_value(segment, key): + if hasattr(segment, key): + return getattr(segment, key) + return segment[key] + def get_segments_for_file(self, file: str, offset_ms: int = 0): with open(file, "rb") as 
file: options = { @@ -144,6 +152,10 @@ class OpenAIWhisperAPIFileTranscriber(FileTranscriber): "response_format": "verbose_json", "prompt": self.transcription_task.transcription_options.initial_prompt, } + + if self.word_level_timings: + options["timestamp_granularities"] = ["word"] + transcript = ( self.openai_client.audio.transcriptions.create( **options, @@ -153,14 +165,79 @@ class OpenAIWhisperAPIFileTranscriber(FileTranscriber): else self.openai_client.audio.translations.create(**options) ) - return [ - Segment( - int(segment["start"] * 1000 + offset_ms), - int(segment["end"] * 1000 + offset_ms), - segment["text"], - ) - for segment in transcript.model_extra["segments"] - ] + segments = getattr(transcript, "segments", None) + + words = getattr(transcript, "words", None) + if words is None and "words" in transcript.model_extra: + words = transcript.model_extra["words"] + + if segments is None: + if "segments" in transcript.model_extra: + segments = transcript.model_extra["segments"] + else: + segments = [{"words": words}] + + result_segments = [] + if self.word_level_timings: + + # Detect response from whisper.cpp API + first_segment = segments[0] if segments else None + is_whisper_cpp = (first_segment and hasattr(first_segment, "tokens") + and hasattr(first_segment, "avg_logprob") and hasattr(first_segment, "no_speech_prob")) + + if is_whisper_cpp: + txt_buffer = b'' + txt_start = 0 + txt_end = 0 + + for segment in segments: + for word in self.get_value(segment, "words"): + + txt = self.get_value(word, "word").encode("utf-8") + start = self.get_value(word, "start") + end = self.get_value(word, "end") + + if txt.startswith(b' ') and append_segment(result_segments, txt_buffer, txt_start, txt_end): + txt_buffer = txt + txt_start = start + txt_end = end + continue + + if txt.startswith(b', '): + txt_buffer += b',' + append_segment(result_segments, txt_buffer, txt_start, txt_end) + txt_buffer = txt.lstrip(b',') + txt_start = start + txt_end = end + continue + + 
txt_buffer += txt + txt_end = end + + # Append the last segment + append_segment(result_segments, txt_buffer, txt_start, txt_end) + + else: + for segment in segments: + for word in self.get_value(segment, "words"): + result_segments.append( + Segment( + int(self.get_value(word, "start") * 1000 + offset_ms), + int(self.get_value(word, "end") * 1000 + offset_ms), + self.get_value(word, "word"), + ) + ) + else: + result_segments = [ + Segment( + int(self.get_value(segment, "start") * 1000 + offset_ms), + int(self.get_value(segment,"end") * 1000 + offset_ms), + self.get_value(segment,"text"), + ) + for segment in segments + ] + + return result_segments def stop(self): pass diff --git a/buzz/transcriber/whisper_cpp.py b/buzz/transcriber/whisper_cpp.py index f7dc3345..c9dc67e1 100644 --- a/buzz/transcriber/whisper_cpp.py +++ b/buzz/transcriber/whisper_cpp.py @@ -23,6 +23,24 @@ if platform.system() == "Darwin" and platform.machine() == "arm64": except ImportError: logging.exception("") +def append_segment(result, txt: bytes, start: int, end: int): + if txt == b'': + return True + + # try-catch will guard against multi-byte utf-8 characters + # https://github.com/ggerganov/whisper.cpp/issues/1798 + try: + result.append( + Segment( + start=start * 10, # centisecond to ms + end=end * 10, # centisecond to ms + text=txt.decode("utf-8"), + ) + ) + + return True + except UnicodeDecodeError: + return False class WhisperCpp: def __init__(self, model: str) -> None: @@ -40,25 +58,6 @@ class WhisperCpp: self.ctx = self.instance.init_from_file(model) self.segments: List[Segment] = [] - def append_segment(self, txt: bytes, start: int, end: int): - if txt == b'': - return True - - # try-catch will guard against multi-byte utf-8 characters - # https://github.com/ggerganov/whisper.cpp/issues/1798 - try: - self.segments.append( - Segment( - start=start * 10, # centisecond to ms - end=end * 10, # centisecond to ms - text=txt.decode("utf-8"), - ) - ) - - return True - except 
UnicodeDecodeError: - return False - def transcribe(self, audio: Union[np.ndarray, str], params: Any): self.segments = [] @@ -87,7 +86,7 @@ class WhisperCpp: start = self.instance.full_get_segment_t0(self.ctx, i) end = self.instance.full_get_segment_t1(self.ctx, i) - if txt.startswith(b' ') and self.append_segment(txt_buffer, txt_start, txt_end): + if txt.startswith(b' ') and append_segment(self.segments, txt_buffer, txt_start, txt_end): txt_buffer = txt txt_start = start txt_end = end @@ -95,7 +94,7 @@ class WhisperCpp: if txt.startswith(b', '): txt_buffer += b',' - self.append_segment(txt_buffer, txt_start, txt_end) + append_segment(self.segments, txt_buffer, txt_start, txt_end) txt_buffer = txt.lstrip(b',') txt_start = start txt_end = end @@ -105,7 +104,7 @@ class WhisperCpp: txt_end = end # Append the last segment - self.append_segment(txt_buffer, txt_start, txt_end) + append_segment(self.segments, txt_buffer, txt_start, txt_end) else: for i in range(n_segments): @@ -113,7 +112,7 @@ class WhisperCpp: start = self.instance.full_get_segment_t0(self.ctx, i) end = self.instance.full_get_segment_t1(self.ctx, i) - self.append_segment(txt, start, end) + append_segment(self.segments, txt, start, end) return { "segments": self.segments, diff --git a/buzz/widgets/transcriber/file_transcription_form_widget.py b/buzz/widgets/transcriber/file_transcription_form_widget.py index 7461953f..8e9ff952 100644 --- a/buzz/widgets/transcriber/file_transcription_form_widget.py +++ b/buzz/widgets/transcriber/file_transcription_form_widget.py @@ -80,13 +80,10 @@ class FileTranscriptionFormWidget(QWidget): layout.addLayout(file_transcription_layout) self.setLayout(layout) - self.reset_word_level_timings() - def on_transcription_options_changed( self, transcription_options: TranscriptionOptions ): self.transcription_options = transcription_options - self.reset_word_level_timings() self.transcription_options_changed.emit( (self.transcription_options, self.file_transcription_options) ) @@ 
-125,9 +122,3 @@ class FileTranscriptionFormWidget(QWidget): ) return on_checkbox_state_changed - - def reset_word_level_timings(self): - self.word_level_timings_checkbox.setDisabled( - self.transcription_options.model.model_type - == ModelType.OPEN_AI_WHISPER_API - ) diff --git a/docs/docs/preferences.md b/docs/docs/preferences.md index b50fe922..0282d06a 100644 --- a/docs/docs/preferences.md +++ b/docs/docs/preferences.md @@ -11,7 +11,7 @@ Open the Preferences window from the Menu bar, or click `Ctrl/Cmd + ,`. **API Key** - key to authenticate your requests to OpenAI API. To get API key from OpenAI see [this article](https://help.openai.com/en/articles/4936850-where-do-i-find-my-openai-api-key). -**Base URL** - By default all requests are sent to API provided by OpenAI company. Their API URL is `https://api.openai.com/v1/`. Compatible APIs are also provided by other companies. List of available API URLs you can find on [discussion page](https://github.com/chidiwilliams/buzz/discussions/827) +**Base URL** - By default all requests are sent to API provided by OpenAI company. Their API URL is `https://api.openai.com/v1/`. Compatible APIs are also provided by other companies. A list of available API URLs and of services you can run yourself is available on the [discussion page](https://github.com/chidiwilliams/buzz/discussions/827) ### Default export file name