diff --git a/MANIFEST.in b/MANIFEST.in
index b33ab28..304a066 100644
--- a/MANIFEST.in
+++ b/MANIFEST.in
@@ -1,4 +1,3 @@
-include faster_whisper/assets/silero_encoder_v5.onnx
-include faster_whisper/assets/silero_decoder_v5.onnx
+include faster_whisper/assets/silero_vad_v6.onnx
 include requirements.txt
 include requirements.conversion.txt
diff --git a/faster_whisper/assets/silero_decoder_v5.onnx b/faster_whisper/assets/silero_decoder_v5.onnx
deleted file mode 100644
index 0097e74..0000000
Binary files a/faster_whisper/assets/silero_decoder_v5.onnx and /dev/null differ
diff --git a/faster_whisper/assets/silero_encoder_v5.onnx b/faster_whisper/assets/silero_encoder_v5.onnx
deleted file mode 100644
index 0c073e2..0000000
Binary files a/faster_whisper/assets/silero_encoder_v5.onnx and /dev/null differ
diff --git a/faster_whisper/assets/silero_vad_v6.onnx b/faster_whisper/assets/silero_vad_v6.onnx
new file mode 100644
index 0000000..9febab9
Binary files /dev/null and b/faster_whisper/assets/silero_vad_v6.onnx differ
diff --git a/faster_whisper/vad.py b/faster_whisper/vad.py
index cc42f37..305919d 100644
--- a/faster_whisper/vad.py
+++ b/faster_whisper/vad.py
@@ -86,7 +86,7 @@ def get_speech_timestamps(
     padded_audio = np.pad(
         audio, (0, window_size_samples - audio.shape[0] % window_size_samples)
     )
-    speech_probs = model(padded_audio.reshape(1, -1)).squeeze(0)
+    speech_probs = model(padded_audio)
 
     triggered = False
     speeches = []
@@ -288,13 +288,12 @@ class SpeechTimestampsMap:
 @functools.lru_cache
 def get_vad_model():
     """Returns the VAD model instance."""
-    encoder_path = os.path.join(get_assets_path(), "silero_encoder_v5.onnx")
-    decoder_path = os.path.join(get_assets_path(), "silero_decoder_v5.onnx")
-    return SileroVADModel(encoder_path, decoder_path)
+    path = os.path.join(get_assets_path(), "silero_vad_v6.onnx")
+    return SileroVADModel(path)
 
 
 class SileroVADModel:
-    def __init__(self, encoder_path, decoder_path):
+    def __init__(self, path):
         try:
             import onnxruntime
         except ImportError as e:
@@ -308,13 +307,8 @@ class SileroVADModel:
         opts.enable_cpu_mem_arena = False
         opts.log_severity_level = 4
 
-        self.encoder_session = onnxruntime.InferenceSession(
-            encoder_path,
-            providers=["CPUExecutionProvider"],
-            sess_options=opts,
-        )
-        self.decoder_session = onnxruntime.InferenceSession(
-            decoder_path,
+        self.session = onnxruntime.InferenceSession(
+            path,
             providers=["CPUExecutionProvider"],
             sess_options=opts,
         )
@@ -322,47 +316,36 @@ class SileroVADModel:
     def __call__(
         self, audio: np.ndarray, num_samples: int = 512, context_size_samples: int = 64
     ):
+        assert audio.ndim == 1, "Input should be a 1D array"
         assert (
-            audio.ndim == 2
-        ), "Input should be a 2D array with size (batch_size, num_samples)"
-        assert (
-            audio.shape[1] % num_samples == 0
+            audio.shape[0] % num_samples == 0
         ), "Input size should be a multiple of num_samples"
 
-        batch_size = audio.shape[0]
-
-        state = np.zeros((2, batch_size, 128), dtype="float32")
+        h = np.zeros((1, 1, 128), dtype="float32")
+        c = np.zeros((1, 1, 128), dtype="float32")
         context = np.zeros(
-            (batch_size, context_size_samples),
+            (1, context_size_samples),
             dtype="float32",
         )
 
-        batched_audio = audio.reshape(batch_size, -1, num_samples)
+        batched_audio = audio.reshape(-1, num_samples)
         context = batched_audio[..., -context_size_samples:]
-        context[:, -1] = 0
-        context = np.roll(context, 1, 1)
-        batched_audio = np.concatenate([context, batched_audio], 2)
+        context[-1] = 0
+        context = np.roll(context, 1, 0)
+        batched_audio = np.concatenate([context, batched_audio], 1)
 
         batched_audio = batched_audio.reshape(-1, num_samples + context_size_samples)
 
         encoder_batch_size = 10000
         num_segments = batched_audio.shape[0]
-        encoder_outputs = []
+        outputs = []
         for i in range(0, num_segments, encoder_batch_size):
-            encoder_output = self.encoder_session.run(
-                None, {"input": batched_audio[i : i + encoder_batch_size]}
-            )[0]
-            encoder_outputs.append(encoder_output)
-
-        encoder_output = np.concatenate(encoder_outputs, axis=0)
-        encoder_output = encoder_output.reshape(batch_size, -1, 128)
-
-        decoder_outputs = []
-        for window in np.split(encoder_output, encoder_output.shape[1], axis=1):
-            out, state = self.decoder_session.run(
-                None, {"input": window.squeeze(1), "state": state}
+            output, h, c = self.session.run(
+                None,
+                {"input": batched_audio[i : i + encoder_batch_size], "h": h, "c": c},
             )
-            decoder_outputs.append(out)
+            outputs.append(output)
+
+        out = np.concatenate(outputs, axis=0)
 
-        out = np.stack(decoder_outputs, axis=1).squeeze(-1)
         return out
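
Quick sanity check for the new single-session code path (hypothetical snippet, not part of the diff; it only exercises the public get_vad_model() wrapper and assumes the package is installed with the bundled silero_vad_v6.onnx asset; the exact output shape depends on the v6 model's ONNX outputs):

    import numpy as np
    from faster_whisper.vad import get_vad_model

    model = get_vad_model()  # cached SileroVADModel backed by a single ONNX session
    audio = np.random.rand(512 * 10).astype(np.float32)  # 10 windows of 512 samples at 16 kHz
    probs = model(audio)  # 1D input; length must be a multiple of num_samples (512)
    print(probs.shape)  # expect one speech probability per 512-sample window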