From 1b24f284c9b39bcdf47233d7ae507abaea7a98c8 Mon Sep 17 00:00:00 2001 From: Purfview <69023953+Purfview@users.noreply.github.com> Date: Thu, 12 Dec 2024 12:23:30 +0000 Subject: [PATCH] Reduce VAD memory usage (#1198) Co-authored-by: Mahmoud Ashraf --- faster_whisper/vad.py | 16 +++++++++++++--- 1 file changed, 13 insertions(+), 3 deletions(-) diff --git a/faster_whisper/vad.py b/faster_whisper/vad.py index 9605931..1f7d205 100644 --- a/faster_whisper/vad.py +++ b/faster_whisper/vad.py @@ -260,8 +260,9 @@ class SileroVADModel: ) from e opts = onnxruntime.SessionOptions() - opts.inter_op_num_threads = 0 - opts.intra_op_num_threads = 0 + opts.inter_op_num_threads = 1 + opts.intra_op_num_threads = 1 + opts.enable_cpu_mem_arena = False opts.log_severity_level = 4 self.encoder_session = onnxruntime.InferenceSession( @@ -301,7 +302,16 @@ class SileroVADModel: batched_audio = batched_audio.reshape(-1, num_samples + context_size_samples) - encoder_output = self.encoder_session.run(None, {"input": batched_audio})[0] + encoder_batch_size = 10000 + num_segments = batched_audio.shape[0] + encoder_outputs = [] + for i in range(0, num_segments, encoder_batch_size): + encoder_output = self.encoder_session.run( + None, {"input": batched_audio[i : i + encoder_batch_size]} + )[0] + encoder_outputs.append(encoder_output) + + encoder_output = np.concatenate(encoder_outputs, axis=0) encoder_output = encoder_output.reshape(batch_size, -1, 128) decoder_outputs = []