diff --git a/faster_whisper/vad.py b/faster_whisper/vad.py index 9605931..1f7d205 100644 --- a/faster_whisper/vad.py +++ b/faster_whisper/vad.py @@ -260,8 +260,9 @@ class SileroVADModel: ) from e opts = onnxruntime.SessionOptions() - opts.inter_op_num_threads = 0 - opts.intra_op_num_threads = 0 + opts.inter_op_num_threads = 1 + opts.intra_op_num_threads = 1 + opts.enable_cpu_mem_arena = False opts.log_severity_level = 4 self.encoder_session = onnxruntime.InferenceSession( @@ -301,7 +302,16 @@ class SileroVADModel: batched_audio = batched_audio.reshape(-1, num_samples + context_size_samples) - encoder_output = self.encoder_session.run(None, {"input": batched_audio})[0] + encoder_batch_size = 10000 + num_segments = batched_audio.shape[0] + encoder_outputs = [] + for i in range(0, num_segments, encoder_batch_size): + encoder_output = self.encoder_session.run( + None, {"input": batched_audio[i : i + encoder_batch_size]} + )[0] + encoder_outputs.append(encoder_output) + + encoder_output = np.concatenate(encoder_outputs, axis=0) encoder_output = encoder_output.reshape(batch_size, -1, 128) decoder_outputs = []