From 1b24f284c9b39bcdf47233d7ae507abaea7a98c8 Mon Sep 17 00:00:00 2001
From: Purfview <69023953+Purfview@users.noreply.github.com>
Date: Thu, 12 Dec 2024 12:23:30 +0000
Subject: [PATCH] Reduce VAD memory usage (#1198)

Co-authored-by: Mahmoud Ashraf <hassouna97.ma@gmail.com>
---
 faster_whisper/vad.py | 16 +++++++++++++---
 1 file changed, 13 insertions(+), 3 deletions(-)

diff --git a/faster_whisper/vad.py b/faster_whisper/vad.py
index 9605931..1f7d205 100644
--- a/faster_whisper/vad.py
+++ b/faster_whisper/vad.py
@@ -260,8 +260,9 @@ class SileroVADModel:
             ) from e
 
         opts = onnxruntime.SessionOptions()
-        opts.inter_op_num_threads = 0
-        opts.intra_op_num_threads = 0
+        opts.inter_op_num_threads = 1
+        opts.intra_op_num_threads = 1
+        opts.enable_cpu_mem_arena = False
         opts.log_severity_level = 4
 
         self.encoder_session = onnxruntime.InferenceSession(
@@ -301,7 +302,16 @@ class SileroVADModel:
 
         batched_audio = batched_audio.reshape(-1, num_samples + context_size_samples)
 
-        encoder_output = self.encoder_session.run(None, {"input": batched_audio})[0]
+        encoder_batch_size = 10000
+        num_segments = batched_audio.shape[0]
+        encoder_outputs = []
+        for i in range(0, num_segments, encoder_batch_size):
+            encoder_output = self.encoder_session.run(
+                None, {"input": batched_audio[i : i + encoder_batch_size]}
+            )[0]
+            encoder_outputs.append(encoder_output)
+
+        encoder_output = np.concatenate(encoder_outputs, axis=0)
         encoder_output = encoder_output.reshape(batch_size, -1, 128)
 
         decoder_outputs = []