Mirror of https://github.com/SYSTRAN/faster-whisper.git
Upgrade to Silero-VAD V6 (#1373)
Co-authored-by: sssshhhhhh <193317444+sssshhhhhh@users.noreply.github.com>
MANIFEST.in
@@ -1,4 +1,3 @@
-include faster_whisper/assets/silero_encoder_v5.onnx
-include faster_whisper/assets/silero_decoder_v5.onnx
+include faster_whisper/assets/silero_vad_v6.onnx
 include requirements.txt
 include requirements.conversion.txt
BIN faster_whisper/assets/silero_encoder_v5.onnx (deleted)
Binary file not shown.
BIN faster_whisper/assets/silero_decoder_v5.onnx (deleted)
Binary file not shown.
BIN faster_whisper/assets/silero_vad_v6.onnx (new file)
Binary file not shown.
faster_whisper/vad.py
@@ -86,7 +86,7 @@ def get_speech_timestamps(
     padded_audio = np.pad(
         audio, (0, window_size_samples - audio.shape[0] % window_size_samples)
     )
-    speech_probs = model(padded_audio.reshape(1, -1)).squeeze(0)
+    speech_probs = model(padded_audio)
 
     triggered = False
     speeches = []
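A note on the unchanged padding above: it rounds the audio up to a whole number of 512-sample windows, which the new 1-D entry point requires. A minimal sketch of the arithmetic, using a hypothetical 1300-sample input:

    import numpy as np

    window_size_samples = 512  # VAD window size at 16 kHz
    audio = np.zeros(1300, dtype="float32")  # hypothetical input
    pad = window_size_samples - audio.shape[0] % window_size_samples  # 512 - 276 = 236
    padded_audio = np.pad(audio, (0, pad))  # zero-pad on the right
    assert padded_audio.shape[0] == 1536  # exactly 3 windows of 512

One quirk of this expression, already present before this change: if the length is already an exact multiple of 512, it pads a full extra window rather than nothing.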
@@ -288,13 +288,12 @@ class SpeechTimestampsMap:
 @functools.lru_cache
 def get_vad_model():
     """Returns the VAD model instance."""
-    encoder_path = os.path.join(get_assets_path(), "silero_encoder_v5.onnx")
-    decoder_path = os.path.join(get_assets_path(), "silero_decoder_v5.onnx")
-    return SileroVADModel(encoder_path, decoder_path)
+    path = os.path.join(get_assets_path(), "silero_vad_v6.onnx")
+    return SileroVADModel(path)
 
 
 class SileroVADModel:
-    def __init__(self, encoder_path, decoder_path):
+    def __init__(self, path):
         try:
             import onnxruntime
         except ImportError as e:
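Because get_vad_model takes no arguments, functools.lru_cache makes it a per-process singleton: the first call builds the SileroVADModel and every later call returns the same cached instance. A small sketch of the behavior being relied on (assuming onnxruntime is installed):

    from faster_whisper.vad import get_vad_model

    m1 = get_vad_model()
    m2 = get_vad_model()
    assert m1 is m2  # cached; the ONNX session is created only once per process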
@@ -308,13 +307,8 @@ class SileroVADModel:
         opts.enable_cpu_mem_arena = False
         opts.log_severity_level = 4
 
-        self.encoder_session = onnxruntime.InferenceSession(
-            encoder_path,
-            providers=["CPUExecutionProvider"],
-            sess_options=opts,
-        )
-        self.decoder_session = onnxruntime.InferenceSession(
-            decoder_path,
+        self.session = onnxruntime.InferenceSession(
+            path,
             providers=["CPUExecutionProvider"],
             sess_options=opts,
         )
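The V5 encoder/decoder session pair collapses into one InferenceSession over the single V6 graph. A hedged way to inspect the graph's signature with the standard onnxruntime API; the input names "input", "h", and "c" are inferred from the feeds in __call__ below, not stated anywhere in this diff:

    import onnxruntime

    session = onnxruntime.InferenceSession(
        "silero_vad_v6.onnx",  # the file packaged under faster_whisper/assets
        providers=["CPUExecutionProvider"],
    )
    print([i.name for i in session.get_inputs()])   # expected: input, h, c
    print([o.name for o in session.get_outputs()])  # per-window output plus updated h, c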
@@ -322,47 +316,36 @@ class SileroVADModel:
     def __call__(
         self, audio: np.ndarray, num_samples: int = 512, context_size_samples: int = 64
     ):
+        assert audio.ndim == 1, "Input should be a 1D array"
         assert (
-            audio.ndim == 2
-        ), "Input should be a 2D array with size (batch_size, num_samples)"
-        assert (
-            audio.shape[1] % num_samples == 0
+            audio.shape[0] % num_samples == 0
         ), "Input size should be a multiple of num_samples"
 
-        batch_size = audio.shape[0]
-        state = np.zeros((2, batch_size, 128), dtype="float32")
+        h = np.zeros((1, 1, 128), dtype="float32")
+        c = np.zeros((1, 1, 128), dtype="float32")
         context = np.zeros(
-            (batch_size, context_size_samples),
+            (1, context_size_samples),
             dtype="float32",
         )
 
-        batched_audio = audio.reshape(batch_size, -1, num_samples)
+        batched_audio = audio.reshape(-1, num_samples)
         context = batched_audio[..., -context_size_samples:]
-        context[:, -1] = 0
-        context = np.roll(context, 1, 1)
-        batched_audio = np.concatenate([context, batched_audio], 2)
+        context[-1] = 0
+        context = np.roll(context, 1, 0)
+        batched_audio = np.concatenate([context, batched_audio], 1)
 
         batched_audio = batched_audio.reshape(-1, num_samples + context_size_samples)
 
         encoder_batch_size = 10000
         num_segments = batched_audio.shape[0]
-        encoder_outputs = []
+        outputs = []
         for i in range(0, num_segments, encoder_batch_size):
-            encoder_output = self.encoder_session.run(
-                None, {"input": batched_audio[i : i + encoder_batch_size]}
-            )[0]
-            encoder_outputs.append(encoder_output)
-
-        encoder_output = np.concatenate(encoder_outputs, axis=0)
-        encoder_output = encoder_output.reshape(batch_size, -1, 128)
-
-        decoder_outputs = []
-        for window in np.split(encoder_output, encoder_output.shape[1], axis=1):
-            out, state = self.decoder_session.run(
-                None, {"input": window.squeeze(1), "state": state}
+            output, h, c = self.session.run(
+                None,
+                {"input": batched_audio[i : i + encoder_batch_size], "h": h, "c": c},
             )
-            decoder_outputs.append(out)
+            outputs.append(output)
 
-
-        out = np.stack(decoder_outputs, axis=1).squeeze(-1)
+        out = np.concatenate(outputs, axis=0)
+
         return out
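The context handling in the new __call__ is the subtle part: every 512-sample window is prefixed with the last 64 samples of the window before it, and the roll is what lines them up. A toy illustration with 3 windows of 4 samples and a 2-sample context; .copy() is added here only so the demo does not write into its input:

    import numpy as np

    batched_audio = np.arange(12, dtype="float32").reshape(3, 4)
    context = batched_audio[..., -2:].copy()  # tail of every window
    context[-1] = 0                    # the last tail is never used as context
    context = np.roll(context, 1, 0)   # row i now holds the tail of window i - 1
    prefixed = np.concatenate([context, batched_audio], 1)
    # window 0 is prefixed with zeros, window i with the tail of window i - 1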
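End to end, the model now takes a flat float32 array and returns one speech probability per 512-sample window. A usage sketch; the silent input and the printed shape are illustrative assumptions, not part of this diff:

    import numpy as np
    from faster_whisper.vad import get_vad_model

    audio = np.zeros(16000, dtype="float32")  # one second of 16 kHz silence
    audio = np.pad(audio, (0, -audio.shape[0] % 512))  # pad to a multiple of 512
    model = get_vad_model()
    speech_probs = model(audio)  # 16384 samples -> 32 windows of probabilities
    print(speech_probs.shape)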