diff --git a/MANIFEST.in b/MANIFEST.in
index b33ab28..304a066 100644
--- a/MANIFEST.in
+++ b/MANIFEST.in
@@ -1,4 +1,3 @@
-include faster_whisper/assets/silero_encoder_v5.onnx
-include faster_whisper/assets/silero_decoder_v5.onnx
+include faster_whisper/assets/silero_vad_v6.onnx
 include requirements.txt
 include requirements.conversion.txt
diff --git a/faster_whisper/assets/silero_decoder_v5.onnx b/faster_whisper/assets/silero_decoder_v5.onnx
deleted file mode 100644
index 0097e74..0000000
Binary files a/faster_whisper/assets/silero_decoder_v5.onnx and /dev/null differ
diff --git a/faster_whisper/assets/silero_encoder_v5.onnx b/faster_whisper/assets/silero_encoder_v5.onnx
deleted file mode 100644
index 0c073e2..0000000
Binary files a/faster_whisper/assets/silero_encoder_v5.onnx and /dev/null differ
diff --git a/faster_whisper/assets/silero_vad_v6.onnx b/faster_whisper/assets/silero_vad_v6.onnx
new file mode 100644
index 0000000..9febab9
Binary files /dev/null and b/faster_whisper/assets/silero_vad_v6.onnx differ
diff --git a/faster_whisper/vad.py b/faster_whisper/vad.py
index cc42f37..305919d 100644
--- a/faster_whisper/vad.py
+++ b/faster_whisper/vad.py
@@ -86,7 +86,7 @@ def get_speech_timestamps(
     padded_audio = np.pad(
         audio, (0, window_size_samples - audio.shape[0] % window_size_samples)
     )
-    speech_probs = model(padded_audio.reshape(1, -1)).squeeze(0)
+    speech_probs = model(padded_audio)
 
     triggered = False
     speeches = []
@@ -288,13 +288,12 @@ class SpeechTimestampsMap:
 @functools.lru_cache
 def get_vad_model():
     """Returns the VAD model instance."""
-    encoder_path = os.path.join(get_assets_path(), "silero_encoder_v5.onnx")
-    decoder_path = os.path.join(get_assets_path(), "silero_decoder_v5.onnx")
-    return SileroVADModel(encoder_path, decoder_path)
+    path = os.path.join(get_assets_path(), "silero_vad_v6.onnx")
+    return SileroVADModel(path)
 
 
 class SileroVADModel:
-    def __init__(self, encoder_path, decoder_path):
+    def __init__(self, path):
         try:
             import onnxruntime
         except ImportError as e:
@@ -308,13 +307,8 @@ class SileroVADModel:
         opts.enable_cpu_mem_arena = False
         opts.log_severity_level = 4
 
-        self.encoder_session = onnxruntime.InferenceSession(
-            encoder_path,
-            providers=["CPUExecutionProvider"],
-            sess_options=opts,
-        )
-        self.decoder_session = onnxruntime.InferenceSession(
-            decoder_path,
+        self.session = onnxruntime.InferenceSession(
+            path,
             providers=["CPUExecutionProvider"],
             sess_options=opts,
         )
@@ -322,47 +316,36 @@ class SileroVADModel:
     def __call__(
         self, audio: np.ndarray, num_samples: int = 512, context_size_samples: int = 64
     ):
+        assert audio.ndim == 1, "Input should be a 1D array"
         assert (
-            audio.ndim == 2
-        ), "Input should be a 2D array with size (batch_size, num_samples)"
-        assert (
-            audio.shape[1] % num_samples == 0
+            audio.shape[0] % num_samples == 0
         ), "Input size should be a multiple of num_samples"
 
-        batch_size = audio.shape[0]
-
-        state = np.zeros((2, batch_size, 128), dtype="float32")
+        h = np.zeros((1, 1, 128), dtype="float32")
+        c = np.zeros((1, 1, 128), dtype="float32")
         context = np.zeros(
-            (batch_size, context_size_samples),
+            (1, context_size_samples),
             dtype="float32",
         )
 
-        batched_audio = audio.reshape(batch_size, -1, num_samples)
+        batched_audio = audio.reshape(-1, num_samples)
         context = batched_audio[..., -context_size_samples:]
-        context[:, -1] = 0
-        context = np.roll(context, 1, 1)
-        batched_audio = np.concatenate([context, batched_audio], 2)
+        context[-1] = 0
+        context = np.roll(context, 1, 0)
+        batched_audio = np.concatenate([context, batched_audio], 1)
 
         batched_audio = batched_audio.reshape(-1, num_samples + context_size_samples)
 
         encoder_batch_size = 10000
         num_segments = batched_audio.shape[0]
-        encoder_outputs = []
+        outputs = []
         for i in range(0, num_segments, encoder_batch_size):
-            encoder_output = self.encoder_session.run(
-                None, {"input": batched_audio[i : i + encoder_batch_size]}
-            )[0]
-            encoder_outputs.append(encoder_output)
-
-        encoder_output = np.concatenate(encoder_outputs, axis=0)
-        encoder_output = encoder_output.reshape(batch_size, -1, 128)
-
-        decoder_outputs = []
-        for window in np.split(encoder_output, encoder_output.shape[1], axis=1):
-            out, state = self.decoder_session.run(
-                None, {"input": window.squeeze(1), "state": state}
+            output, h, c = self.session.run(
+                None,
+                {"input": batched_audio[i : i + encoder_batch_size], "h": h, "c": c},
             )
-            decoder_outputs.append(out)
+            outputs.append(output)
+
+        out = np.concatenate(outputs, axis=0)
 
-        out = np.stack(decoder_outputs, axis=1).squeeze(-1)
         return out
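
Quick sanity check for the new single-session code path (hypothetical snippet, not part of the diff; it only exercises the public get_vad_model() wrapper and assumes the package is installed with the bundled silero_vad_v6.onnx asset; the exact output shape depends on the v6 model's ONNX outputs):

    import numpy as np
    from faster_whisper.vad import get_vad_model

    model = get_vad_model()  # cached SileroVADModel backed by a single ONNX session
    audio = np.random.rand(512 * 10).astype(np.float32)  # 10 windows of 512 samples at 16 kHz
    probs = model(audio)  # 1D input; length must be a multiple of num_samples (512)
    print(probs.shape)  # expect one speech probability per 512-sample window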