From 04c07c6caea88dc61a4753fe5e8d172005a29ece Mon Sep 17 00:00:00 2001 From: Raivis Dejus Date: Sat, 7 Mar 2026 07:58:04 +0200 Subject: [PATCH] Adding VAD to whisper.cpp to reduce hallucinations on audio w silences (#1412) --- Makefile | 3 +++ buzz/recording.py | 4 ++++ buzz/transcriber/whisper_cpp.py | 5 +++++ share/metainfo/io.github.chidiwilliams.Buzz.metainfo.xml | 1 + tests/transcriber/whisper_cpp_test.py | 2 +- 5 files changed, 14 insertions(+), 1 deletion(-) diff --git a/Makefile b/Makefile index 6a316b53..4beb3323 100644 --- a/Makefile +++ b/Makefile @@ -65,6 +65,7 @@ ifeq ($(OS), Windows_NT) cp whisper.cpp/build/bin/Release/whisper-cli.exe buzz/whisper_cpp/ cp whisper.cpp/build/bin/Release/whisper-server.exe buzz/whisper_cpp/ cp dll_backup/SDL2.dll buzz/whisper_cpp + PowerShell -NoProfile -ExecutionPolicy Bypass -Command "if (-not (Test-Path 'buzz\whisper_cpp\ggml-silero-v6.2.0.bin')) { Start-BitsTransfer -Source https://huggingface.co/ggml-org/whisper-vad/resolve/main/ggml-silero-v6.2.0.bin -Destination 'buzz\whisper_cpp\ggml-silero-v6.2.0.bin' }" endif ifeq ($(shell uname -s), Linux) @@ -82,6 +83,7 @@ ifeq ($(shell uname -s), Linux) cp -P whisper.cpp/build/ggml/src/libggml-base.so* buzz/whisper_cpp/ || true cp -P whisper.cpp/build/ggml/src/libggml-cpu.so* buzz/whisper_cpp/ || true cp -P whisper.cpp/build/ggml/src/ggml-vulkan/libggml-vulkan.so* buzz/whisper_cpp/ || true + test -f buzz/whisper_cpp/ggml-silero-v6.2.0.bin || curl -L -o buzz/whisper_cpp/ggml-silero-v6.2.0.bin https://huggingface.co/ggml-org/whisper-vad/resolve/main/ggml-silero-v6.2.0.bin endif # Build on Macs @@ -101,6 +103,7 @@ endif cp whisper.cpp/build/bin/whisper-server buzz/whisper_cpp/ || true cp whisper.cpp/build/src/libwhisper.dylib buzz/whisper_cpp/ || true cp whisper.cpp/build/ggml/src/libggml* buzz/whisper_cpp/ || true + test -f buzz/whisper_cpp/ggml-silero-v6.2.0.bin || curl -L -o buzz/whisper_cpp/ggml-silero-v6.2.0.bin 
https://huggingface.co/ggml-org/whisper-vad/resolve/main/ggml-silero-v6.2.0.bin endif # Prints all the Mac developer identities used for code signing diff --git a/buzz/recording.py b/buzz/recording.py index a831823f..158f6e5c 100644 --- a/buzz/recording.py +++ b/buzz/recording.py @@ -22,6 +22,7 @@ class RecordingAmplitudeListener(QObject): self.input_device_index = input_device_index self.buffer = np.ndarray([], dtype=np.float32) self.accumulation_size = 0 + self._active = True def start_recording(self): try: @@ -38,11 +39,14 @@ class RecordingAmplitudeListener(QObject): logging.exception("Failed to start audio stream on device %s: %s", self.input_device_index, e) def stop_recording(self): + self._active = False if self.stream is not None: self.stream.stop() self.stream.close() def stream_callback(self, in_data: np.ndarray, frame_count, time_info, status): + if not self._active: + return chunk = in_data.ravel() self.amplitude_changed.emit(float(np.sqrt(np.mean(chunk**2)))) diff --git a/buzz/transcriber/whisper_cpp.py b/buzz/transcriber/whisper_cpp.py index 977b785e..db4aff84 100644 --- a/buzz/transcriber/whisper_cpp.py +++ b/buzz/transcriber/whisper_cpp.py @@ -109,6 +109,11 @@ class WhisperCpp: "-f", file_to_process, ] + # Add VAD if the model is available + vad_model_path = os.path.join(os.path.dirname(whisper_cli_path), "ggml-silero-v6.2.0.bin") + if os.path.exists(vad_model_path): + cmd.extend(["--vad", "--vad-model", vad_model_path]) + # Add translate flag if needed if task.transcription_options.task == Task.TRANSLATE: cmd.extend(["--translate"]) diff --git a/share/metainfo/io.github.chidiwilliams.Buzz.metainfo.xml b/share/metainfo/io.github.chidiwilliams.Buzz.metainfo.xml index cc91b618..fd50fb15 100644 --- a/share/metainfo/io.github.chidiwilliams.Buzz.metainfo.xml +++ b/share/metainfo/io.github.chidiwilliams.Buzz.metainfo.xml @@ -74,6 +74,7 @@
  • Added option to import folder
  • Extra settings for live recordings
  • Update checker for Windows and Macs
+ • Added voice activity detection to whisper.cpp
  • diff --git a/tests/transcriber/whisper_cpp_test.py b/tests/transcriber/whisper_cpp_test.py index 310f5f1d..cabc9fe7 100644 --- a/tests/transcriber/whisper_cpp_test.py +++ b/tests/transcriber/whisper_cpp_test.py @@ -37,7 +37,7 @@ class TestWhisperCpp: # Combine all segment texts full_text = " ".join(segment.text for segment in segments) - assert "Bien venu" in full_text + assert "Bien venu" in full_text or "bienvenu" in full_text.lower() def test_transcribe_word_level_timestamps(self): transcription_options = TranscriptionOptions(