Adding VAD to whisper.cpp to reduce hallucinations on audio with silences (#1412)

2026-03-14 22:55:46 +01:00 · 2026-03-07 07:58:04 +02:00 · 2026-03-07 07:58:04 +02:00 · 04c07c6cae
commit 04c07c6cae
parent 981dd3a758
5 changed files with 14 additions and 1 deletions
--- a/3
+++ b/3
@ -65,6 +65,7 @@ ifeq ($(OS), Windows_NT)
 	cp whisper.cpp/build/bin/Release/whisper-cli.exe buzz/whisper_cpp/
 	cp whisper.cpp/build/bin/Release/whisper-server.exe buzz/whisper_cpp/
 	cp dll_backup/SDL2.dll buzz/whisper_cpp
+	PowerShell -NoProfile -ExecutionPolicy Bypass -Command "if (-not (Test-Path 'buzz\whisper_cpp\ggml-silero-v6.2.0.bin')) { Start-BitsTransfer -Source https://huggingface.co/ggml-org/whisper-vad/resolve/main/ggml-silero-v6.2.0.bin -Destination 'buzz\whisper_cpp\ggml-silero-v6.2.0.bin' }"
 endif

 ifeq ($(shell uname -s), Linux)
@ -82,6 +83,7 @@ ifeq ($(shell uname -s), Linux)
 	cp -P whisper.cpp/build/ggml/src/libggml-base.so* buzz/whisper_cpp/ || true
 	cp -P whisper.cpp/build/ggml/src/libggml-cpu.so* buzz/whisper_cpp/ || true
 	cp -P whisper.cpp/build/ggml/src/ggml-vulkan/libggml-vulkan.so* buzz/whisper_cpp/ || true
+	test -f buzz/whisper_cpp/ggml-silero-v6.2.0.bin || curl -L -o buzz/whisper_cpp/ggml-silero-v6.2.0.bin https://huggingface.co/ggml-org/whisper-vad/resolve/main/ggml-silero-v6.2.0.bin
 endif

 # Build on Macs
@ -101,6 +103,7 @@ endif
 	cp whisper.cpp/build/bin/whisper-server buzz/whisper_cpp/ || true
 	cp whisper.cpp/build/src/libwhisper.dylib buzz/whisper_cpp/ || true
 	cp whisper.cpp/build/ggml/src/libggml* buzz/whisper_cpp/ || true
+	test -f buzz/whisper_cpp/ggml-silero-v6.2.0.bin || curl -L -o buzz/whisper_cpp/ggml-silero-v6.2.0.bin https://huggingface.co/ggml-org/whisper-vad/resolve/main/ggml-silero-v6.2.0.bin
 endif

 # Prints all the Mac developer identities used for code signing
--- a/buzz/recording.py
+++ b/buzz/recording.py
@ -22,6 +22,7 @@ class RecordingAmplitudeListener(QObject):
        self.input_device_index = input_device_index
        self.buffer = np.ndarray([], dtype=np.float32)
        self.accumulation_size = 0
+        self._active = True

    def start_recording(self):
        try:
@ -38,11 +39,14 @@ class RecordingAmplitudeListener(QObject):
            logging.exception("Failed to start audio stream on device %s: %s", self.input_device_index, e)

    def stop_recording(self):
+        self._active = False
        if self.stream is not None:
            self.stream.stop()
            self.stream.close()

    def stream_callback(self, in_data: np.ndarray, frame_count, time_info, status):
+        if not self._active:
+            return
        chunk = in_data.ravel()
        self.amplitude_changed.emit(float(np.sqrt(np.mean(chunk**2))))

--- a/buzz/transcriber/whisper_cpp.py
+++ b/buzz/transcriber/whisper_cpp.py
@ -109,6 +109,11 @@ class WhisperCpp:
            "-f", file_to_process,
        ]
    
+        # Add VAD if the model is available
+        vad_model_path = os.path.join(os.path.dirname(whisper_cli_path), "ggml-silero-v6.2.0.bin")
+        if os.path.exists(vad_model_path):
+            cmd.extend(["--vad", "--vad-model", vad_model_path])
+
        # Add translate flag if needed
        if task.transcription_options.task == Task.TRANSLATE:
            cmd.extend(["--translate"])
--- a/share/metainfo/io.github.chidiwilliams.Buzz.metainfo.xml
+++ b/share/metainfo/io.github.chidiwilliams.Buzz.metainfo.xml
@ -74,6 +74,7 @@
          <li>Added option to import folder</li>
          <li>Extra settings for live recordings</li>
          <li>Update checker for Windows and Macs</li>
+          <li>Added voice activity detection to whisper.cpp</li>
        </ul>
      </description>
    </release>
--- a/tests/transcriber/whisper_cpp_test.py
+++ b/tests/transcriber/whisper_cpp_test.py
@ -37,7 +37,7 @@ class TestWhisperCpp:

        # Combine all segment texts
        full_text = " ".join(segment.text for segment in segments)
-        assert "Bien venu" in full_text
+        assert "Bien venu" in full_text or "bienvenu" in full_text.lower()

    def test_transcribe_word_level_timestamps(self):
        transcription_options = TranscriptionOptions(