From 04c07c6caea88dc61a4753fe5e8d172005a29ece Mon Sep 17 00:00:00 2001
From: Raivis Dejus <raivisd@scandiweb.com>
Date: Sat, 7 Mar 2026 07:58:04 +0200
Subject: [PATCH] Adding VAD to whisper.cpp to reduce hallucinations on audio
 with silences (#1412)

---
 Makefile                                                 | 3 +++
 buzz/recording.py                                        | 4 ++++
 buzz/transcriber/whisper_cpp.py                          | 5 +++++
 share/metainfo/io.github.chidiwilliams.Buzz.metainfo.xml | 1 +
 tests/transcriber/whisper_cpp_test.py                    | 2 +-
 5 files changed, 14 insertions(+), 1 deletion(-)

diff --git a/Makefile b/Makefile
index 6a316b53..4beb3323 100644
--- a/Makefile
+++ b/Makefile
@@ -65,6 +65,7 @@ ifeq ($(OS), Windows_NT)
 	cp whisper.cpp/build/bin/Release/whisper-cli.exe buzz/whisper_cpp/
 	cp whisper.cpp/build/bin/Release/whisper-server.exe buzz/whisper_cpp/
 	cp dll_backup/SDL2.dll buzz/whisper_cpp
+	PowerShell -NoProfile -ExecutionPolicy Bypass -Command "if (-not (Test-Path 'buzz\whisper_cpp\ggml-silero-v6.2.0.bin')) { Start-BitsTransfer -Source https://huggingface.co/ggml-org/whisper-vad/resolve/main/ggml-silero-v6.2.0.bin -Destination 'buzz\whisper_cpp\ggml-silero-v6.2.0.bin' }"
 endif
 
 ifeq ($(shell uname -s), Linux)
@@ -82,6 +83,7 @@ ifeq ($(shell uname -s), Linux)
 	cp -P whisper.cpp/build/ggml/src/libggml-base.so* buzz/whisper_cpp/ || true
 	cp -P whisper.cpp/build/ggml/src/libggml-cpu.so* buzz/whisper_cpp/ || true
 	cp -P whisper.cpp/build/ggml/src/ggml-vulkan/libggml-vulkan.so* buzz/whisper_cpp/ || true
+	test -f buzz/whisper_cpp/ggml-silero-v6.2.0.bin || { curl -fL -o buzz/whisper_cpp/ggml-silero-v6.2.0.bin.part https://huggingface.co/ggml-org/whisper-vad/resolve/main/ggml-silero-v6.2.0.bin && mv buzz/whisper_cpp/ggml-silero-v6.2.0.bin.part buzz/whisper_cpp/ggml-silero-v6.2.0.bin; }
 endif
 
 # Build on Macs
@@ -101,6 +103,7 @@ endif
 	cp whisper.cpp/build/bin/whisper-server buzz/whisper_cpp/ || true
 	cp whisper.cpp/build/src/libwhisper.dylib buzz/whisper_cpp/ || true
 	cp whisper.cpp/build/ggml/src/libggml* buzz/whisper_cpp/ || true
+	test -f buzz/whisper_cpp/ggml-silero-v6.2.0.bin || { curl -fL -o buzz/whisper_cpp/ggml-silero-v6.2.0.bin.part https://huggingface.co/ggml-org/whisper-vad/resolve/main/ggml-silero-v6.2.0.bin && mv buzz/whisper_cpp/ggml-silero-v6.2.0.bin.part buzz/whisper_cpp/ggml-silero-v6.2.0.bin; }
 endif
 
 # Prints all the Mac developer identities used for code signing
diff --git a/buzz/recording.py b/buzz/recording.py
index a831823f..158f6e5c 100644
--- a/buzz/recording.py
+++ b/buzz/recording.py
@@ -22,6 +22,7 @@ class RecordingAmplitudeListener(QObject):
         self.input_device_index = input_device_index
         self.buffer = np.ndarray([], dtype=np.float32)
         self.accumulation_size = 0
+        self._active = True
 
     def start_recording(self):
         try:
@@ -38,11 +39,14 @@ class RecordingAmplitudeListener(QObject):
             logging.exception("Failed to start audio stream on device %s: %s", self.input_device_index, e)
 
     def stop_recording(self):
+        self._active = False
         if self.stream is not None:
             self.stream.stop()
             self.stream.close()
 
     def stream_callback(self, in_data: np.ndarray, frame_count, time_info, status):
+        if not self._active:
+            return
         chunk = in_data.ravel()
         self.amplitude_changed.emit(float(np.sqrt(np.mean(chunk**2))))
 
diff --git a/buzz/transcriber/whisper_cpp.py b/buzz/transcriber/whisper_cpp.py
index 977b785e..db4aff84 100644
--- a/buzz/transcriber/whisper_cpp.py
+++ b/buzz/transcriber/whisper_cpp.py
@@ -109,6 +109,11 @@ class WhisperCpp:
             "-f", file_to_process,
         ]
     
+        # Enable VAD only when the Silero model file sits next to whisper-cli
+        vad_model_path = os.path.join(os.path.dirname(whisper_cli_path), "ggml-silero-v6.2.0.bin")
+        if os.path.isfile(vad_model_path):
+            cmd.extend(["--vad", "--vad-model", vad_model_path])
+
         # Add translate flag if needed
         if task.transcription_options.task == Task.TRANSLATE:
             cmd.extend(["--translate"])
diff --git a/share/metainfo/io.github.chidiwilliams.Buzz.metainfo.xml b/share/metainfo/io.github.chidiwilliams.Buzz.metainfo.xml
index cc91b618..fd50fb15 100644
--- a/share/metainfo/io.github.chidiwilliams.Buzz.metainfo.xml
+++ b/share/metainfo/io.github.chidiwilliams.Buzz.metainfo.xml
@@ -74,6 +74,7 @@
           <li>Added option to import folder</li>
           <li>Extra settings for live recordings</li>
           <li>Update checker for Windows and Macs</li>
+          <li>Added voice activity detection to whisper.cpp</li>
         </ul>
       </description>
     </release>
diff --git a/tests/transcriber/whisper_cpp_test.py b/tests/transcriber/whisper_cpp_test.py
index 310f5f1d..cabc9fe7 100644
--- a/tests/transcriber/whisper_cpp_test.py
+++ b/tests/transcriber/whisper_cpp_test.py
@@ -37,7 +37,7 @@ class TestWhisperCpp:
 
         # Combine all segment texts
         full_text = " ".join(segment.text for segment in segments)
-        assert "Bien venu" in full_text
+        assert "Bien venu" in full_text or "bienvenu" in full_text.lower()
 
     def test_transcribe_word_level_timestamps(self):
         transcription_options = TranscriptionOptions(