Mirror of https://github.com/SYSTRAN/faster-whisper.git
Upgrade to Silero-VAD V6 (#1373)
Co-authored-by: sssshhhhhh <193317444+sssshhhhhh@users.noreply.github.com>
MANIFEST.in
@@ -1,4 +1,3 @@
-include faster_whisper/assets/silero_encoder_v5.onnx
-include faster_whisper/assets/silero_decoder_v5.onnx
+include faster_whisper/assets/silero_vad_v6.onnx
 include requirements.txt
 include requirements.conversion.txt
BIN faster_whisper/assets/silero_encoder_v5.onnx (deleted)
Binary file not shown.
BIN faster_whisper/assets/silero_decoder_v5.onnx (deleted)
Binary file not shown.
BIN faster_whisper/assets/silero_vad_v6.onnx (new file)
Binary file not shown.
faster_whisper/vad.py
@@ -86,7 +86,7 @@ def get_speech_timestamps(
     padded_audio = np.pad(
         audio, (0, window_size_samples - audio.shape[0] % window_size_samples)
     )
-    speech_probs = model(padded_audio.reshape(1, -1)).squeeze(0)
+    speech_probs = model(padded_audio)
 
     triggered = False
     speeches = []
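A note on the unchanged padding above: it rounds the audio up to a whole number of 512-sample windows, which the new 1-D entry point requires. A minimal sketch of the arithmetic, using a hypothetical 1300-sample input:

    import numpy as np

    window_size_samples = 512  # VAD window size at 16 kHz
    audio = np.zeros(1300, dtype="float32")  # hypothetical input
    pad = window_size_samples - audio.shape[0] % window_size_samples  # 512 - 276 = 236
    padded_audio = np.pad(audio, (0, pad))  # zero-pad on the right
    assert padded_audio.shape[0] == 1536  # exactly 3 windows of 512

One quirk of this expression, already present before this change: if the length is already an exact multiple of 512, it pads a full extra window rather than nothing.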
@@ -288,13 +288,12 @@ class SpeechTimestampsMap:
 @functools.lru_cache
 def get_vad_model():
     """Returns the VAD model instance."""
-    encoder_path = os.path.join(get_assets_path(), "silero_encoder_v5.onnx")
-    decoder_path = os.path.join(get_assets_path(), "silero_decoder_v5.onnx")
-    return SileroVADModel(encoder_path, decoder_path)
+    path = os.path.join(get_assets_path(), "silero_vad_v6.onnx")
+    return SileroVADModel(path)
 
 
 class SileroVADModel:
-    def __init__(self, encoder_path, decoder_path):
+    def __init__(self, path):
         try:
             import onnxruntime
         except ImportError as e:
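Because get_vad_model takes no arguments, functools.lru_cache makes it a per-process singleton: the first call builds the SileroVADModel and every later call returns the same cached instance. A small sketch of the behavior being relied on (assuming onnxruntime is installed):

    from faster_whisper.vad import get_vad_model

    m1 = get_vad_model()
    m2 = get_vad_model()
    assert m1 is m2  # cached; the ONNX session is created only once per process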
@@ -308,13 +307,8 @@ class SileroVADModel:
         opts.enable_cpu_mem_arena = False
         opts.log_severity_level = 4
 
-        self.encoder_session = onnxruntime.InferenceSession(
-            encoder_path,
-            providers=["CPUExecutionProvider"],
-            sess_options=opts,
-        )
-        self.decoder_session = onnxruntime.InferenceSession(
-            decoder_path,
+        self.session = onnxruntime.InferenceSession(
+            path,
             providers=["CPUExecutionProvider"],
             sess_options=opts,
         )
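The V5 encoder/decoder session pair collapses into one InferenceSession over the single V6 graph. A hedged way to inspect the graph's signature with the standard onnxruntime API; the input names "input", "h", and "c" are inferred from the feeds in __call__ below, not stated anywhere in this diff:

    import onnxruntime

    session = onnxruntime.InferenceSession(
        "silero_vad_v6.onnx",  # the file packaged under faster_whisper/assets
        providers=["CPUExecutionProvider"],
    )
    print([i.name for i in session.get_inputs()])   # expected: input, h, c
    print([o.name for o in session.get_outputs()])  # per-window output plus updated h, c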
@@ -322,47 +316,36 @@ class SileroVADModel:
     def __call__(
         self, audio: np.ndarray, num_samples: int = 512, context_size_samples: int = 64
     ):
+        assert audio.ndim == 1, "Input should be a 1D array"
         assert (
-            audio.ndim == 2
-        ), "Input should be a 2D array with size (batch_size, num_samples)"
-        assert (
-            audio.shape[1] % num_samples == 0
+            audio.shape[0] % num_samples == 0
         ), "Input size should be a multiple of num_samples"
 
-        batch_size = audio.shape[0]
-        state = np.zeros((2, batch_size, 128), dtype="float32")
+        h = np.zeros((1, 1, 128), dtype="float32")
+        c = np.zeros((1, 1, 128), dtype="float32")
         context = np.zeros(
-            (batch_size, context_size_samples),
+            (1, context_size_samples),
             dtype="float32",
         )
 
-        batched_audio = audio.reshape(batch_size, -1, num_samples)
+        batched_audio = audio.reshape(-1, num_samples)
         context = batched_audio[..., -context_size_samples:]
-        context[:, -1] = 0
-        context = np.roll(context, 1, 1)
-        batched_audio = np.concatenate([context, batched_audio], 2)
+        context[-1] = 0
+        context = np.roll(context, 1, 0)
+        batched_audio = np.concatenate([context, batched_audio], 1)
 
         batched_audio = batched_audio.reshape(-1, num_samples + context_size_samples)
 
         encoder_batch_size = 10000
         num_segments = batched_audio.shape[0]
-        encoder_outputs = []
+        outputs = []
         for i in range(0, num_segments, encoder_batch_size):
-            encoder_output = self.encoder_session.run(
-                None, {"input": batched_audio[i : i + encoder_batch_size]}
-            )[0]
-            encoder_outputs.append(encoder_output)
-
-        encoder_output = np.concatenate(encoder_outputs, axis=0)
-        encoder_output = encoder_output.reshape(batch_size, -1, 128)
-
-        decoder_outputs = []
-        for window in np.split(encoder_output, encoder_output.shape[1], axis=1):
-            out, state = self.decoder_session.run(
-                None, {"input": window.squeeze(1), "state": state}
+            output, h, c = self.session.run(
+                None,
+                {"input": batched_audio[i : i + encoder_batch_size], "h": h, "c": c},
             )
-            decoder_outputs.append(out)
+            outputs.append(output)
 
-
-        out = np.stack(decoder_outputs, axis=1).squeeze(-1)
+        out = np.concatenate(outputs, axis=0)
+
         return out
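The context handling in the new __call__ is the subtle part: every 512-sample window is prefixed with the last 64 samples of the window before it, and the roll is what lines them up. A toy illustration with 3 windows of 4 samples and a 2-sample context; .copy() is added here only so the demo does not write into its input:

    import numpy as np

    batched_audio = np.arange(12, dtype="float32").reshape(3, 4)
    context = batched_audio[..., -2:].copy()  # tail of every window
    context[-1] = 0                    # the last tail is never used as context
    context = np.roll(context, 1, 0)   # row i now holds the tail of window i - 1
    prefixed = np.concatenate([context, batched_audio], 1)
    # window 0 is prefixed with zeros, window i with the tail of window i - 1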
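End to end, the model now takes a flat float32 array and returns one speech probability per 512-sample window. A usage sketch; the silent input and the printed shape are illustrative assumptions, not part of this diff:

    import numpy as np
    from faster_whisper.vad import get_vad_model

    audio = np.zeros(16000, dtype="float32")  # one second of 16 kHz silence
    audio = np.pad(audio, (0, -audio.shape[0] % 512))  # pad to a multiple of 512
    model = get_vad_model()
    speech_probs = model(audio)  # 16384 samples -> 32 windows of probabilities
    print(speech_probs.shape)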