Adding VAD to whisper.cpp to reduce hallucinations on audio with silences (#1412)

This commit is contained in:
Raivis Dejus 2026-03-07 07:58:04 +02:00 committed by GitHub
commit 04c07c6cae
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
5 changed files with 14 additions and 1 deletions

View file

@ -65,6 +65,7 @@ ifeq ($(OS), Windows_NT)
cp whisper.cpp/build/bin/Release/whisper-cli.exe buzz/whisper_cpp/
cp whisper.cpp/build/bin/Release/whisper-server.exe buzz/whisper_cpp/
cp dll_backup/SDL2.dll buzz/whisper_cpp
PowerShell -NoProfile -ExecutionPolicy Bypass -Command "if (-not (Test-Path 'buzz\whisper_cpp\ggml-silero-v6.2.0.bin')) { Start-BitsTransfer -Source https://huggingface.co/ggml-org/whisper-vad/resolve/main/ggml-silero-v6.2.0.bin -Destination 'buzz\whisper_cpp\ggml-silero-v6.2.0.bin' }"
endif
ifeq ($(shell uname -s), Linux)
@ -82,6 +83,7 @@ ifeq ($(shell uname -s), Linux)
cp -P whisper.cpp/build/ggml/src/libggml-base.so* buzz/whisper_cpp/ || true
cp -P whisper.cpp/build/ggml/src/libggml-cpu.so* buzz/whisper_cpp/ || true
cp -P whisper.cpp/build/ggml/src/ggml-vulkan/libggml-vulkan.so* buzz/whisper_cpp/ || true
# Fetch the Silero VAD model once. --fail plus a temporary .part file keeps an HTTP error page or a partial download from being cached as the model.
test -f buzz/whisper_cpp/ggml-silero-v6.2.0.bin || { curl --fail -L -o buzz/whisper_cpp/ggml-silero-v6.2.0.bin.part https://huggingface.co/ggml-org/whisper-vad/resolve/main/ggml-silero-v6.2.0.bin && mv buzz/whisper_cpp/ggml-silero-v6.2.0.bin.part buzz/whisper_cpp/ggml-silero-v6.2.0.bin; }
endif
# Build on Macs
@ -101,6 +103,7 @@ endif
cp whisper.cpp/build/bin/whisper-server buzz/whisper_cpp/ || true
cp whisper.cpp/build/src/libwhisper.dylib buzz/whisper_cpp/ || true
cp whisper.cpp/build/ggml/src/libggml* buzz/whisper_cpp/ || true
# Fetch the Silero VAD model once. --fail plus a temporary .part file keeps an HTTP error page or a partial download from being cached as the model.
test -f buzz/whisper_cpp/ggml-silero-v6.2.0.bin || { curl --fail -L -o buzz/whisper_cpp/ggml-silero-v6.2.0.bin.part https://huggingface.co/ggml-org/whisper-vad/resolve/main/ggml-silero-v6.2.0.bin && mv buzz/whisper_cpp/ggml-silero-v6.2.0.bin.part buzz/whisper_cpp/ggml-silero-v6.2.0.bin; }
endif
# Prints all the Mac developer identities used for code signing

View file

@ -22,6 +22,7 @@ class RecordingAmplitudeListener(QObject):
self.input_device_index = input_device_index
self.buffer = np.ndarray([], dtype=np.float32)
self.accumulation_size = 0
self._active = True
def start_recording(self):
try:
@ -38,11 +39,14 @@ class RecordingAmplitudeListener(QObject):
logging.exception("Failed to start audio stream on device %s: %s", self.input_device_index, e)
def stop_recording(self):
    """Stop listening and release the input stream.

    Clears ``_active`` first so that ``stream_callback`` ignores any
    callback that still arrives, then stops and closes the stream if one
    was opened. An exception raised by ``stop()`` still propagates, but
    only after ``close()`` has been called.
    """
    self._active = False
    if self.stream is None:
        return
    try:
        self.stream.stop()
    finally:
        # Close even when stop() raises so the stream is not leaked.
        self.stream.close()
def stream_callback(self, in_data: np.ndarray, frame_count, time_info, status):
if not self._active:
return
chunk = in_data.ravel()
self.amplitude_changed.emit(float(np.sqrt(np.mean(chunk**2))))

View file

@ -109,6 +109,11 @@ class WhisperCpp:
"-f", file_to_process,
]
# Add VAD if the model is available
vad_model_path = os.path.join(os.path.dirname(whisper_cli_path), "ggml-silero-v6.2.0.bin")
if os.path.exists(vad_model_path):
cmd.extend(["--vad", "--vad-model", vad_model_path])
# Add translate flag if needed
if task.transcription_options.task == Task.TRANSLATE:
cmd.extend(["--translate"])

View file

@ -74,6 +74,7 @@
<li>Added option to import folder</li>
<li>Extra settings for live recordings</li>
<li>Update checker for Windows and Macs</li>
<li>Added voice activity detection to whisper.cpp</li>
</ul>
</description>
</release>

View file

@ -37,7 +37,7 @@ class TestWhisperCpp:
# Combine all segment texts
full_text = " ".join(segment.text for segment in segments)
assert "Bien venu" in full_text
assert "Bien venu" in full_text or "bienvenu" in full_text.lower()
def test_transcribe_word_level_timestamps(self):
transcription_options = TranscriptionOptions(