Adds new VAD parameters (#1386)

* Adds new VAD parameters

Adds new VAD parameters: 

min_silence_at_max_speech: Minimum silence duration, in ms, used to avoid abrupt cuts when max_speech_duration_s is reached.

use_max_poss_sil_at_max_speech: Whether to split at the longest possible silence when max_speech_duration_s is reached. If False, the last qualifying silence is used instead.
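
For illustration, a minimal usage sketch (assuming the usual faster-whisper entry point, WhisperModel.transcribe with vad_filter/vad_parameters; the model size and audio path below are placeholders):

    from faster_whisper import WhisperModel

    model = WhisperModel("small")          # placeholder model size
    segments, info = model.transcribe(
        "audio.wav",                       # placeholder path
        vad_filter=True,
        vad_parameters=dict(
            max_speech_duration_s=30,
            min_silence_at_max_speech=98,         # ms of silence required at a forced split
            use_max_poss_sil_at_max_speech=True,  # split at the longest recorded silence
        ),
    )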

* Style

* Update doc

* change min_speech_duration_ms (0 -> 250)

* Change min_speech_duration_ms to zero

Set minimum speech duration to zero for flexibility.

---------

Co-authored-by: Mahmoud Ashraf <hassouna97.ma@gmail.com>
Authored by Purfview on 2025-11-19 14:40:46 +00:00; committed by GitHub.
parent 2eeafe05de
commit ed9a06cd89


@@ -27,11 +27,15 @@ class VadOptions:
       min_speech_duration_ms: Final speech chunks shorter min_speech_duration_ms are thrown out.
       max_speech_duration_s: Maximum duration of speech chunks in seconds. Chunks longer
         than max_speech_duration_s will be split at the timestamp of the last silence that
-        lasts more than 100ms (if any), to prevent aggressive cutting. Otherwise, they will be
-        split aggressively just before max_speech_duration_s.
+        lasts more than min_silence_at_max_speech (if any), to prevent aggressive cutting.
+        Otherwise, they will be split aggressively just before max_speech_duration_s.
       min_silence_duration_ms: In the end of each speech chunk wait for min_silence_duration_ms
         before separating it
       speech_pad_ms: Final speech chunks are padded by speech_pad_ms each side
+      min_silence_at_max_speech: Minimum silence duration in ms which is used to avoid abrupt cuts
+        when max_speech_duration_s is reached.
+      use_max_poss_sil_at_max_speech: Whether to use the maximum possible silence at
+        max_speech_duration_s or not. If not, the last silence is used.
     """
 
     threshold: float = 0.5
@@ -40,6 +44,8 @@ class VadOptions:
     max_speech_duration_s: float = float("inf")
     min_silence_duration_ms: int = 2000
     speech_pad_ms: int = 400
+    min_silence_at_max_speech: int = 98
+    use_max_poss_sil_at_max_speech: bool = True
 
 
 def get_speech_timestamps(
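
min_silence_at_max_speech defaults to the previously hard-coded 98 ms, while use_max_poss_sil_at_max_speech defaults to the new longest-silence behaviour. A hedged sketch of driving the VAD directly (not part of the commit), assuming the usual faster_whisper.vad module layout and a 16 kHz mono waveform; the example values are arbitrary:

    from faster_whisper.audio import decode_audio
    from faster_whisper.vad import VadOptions, get_speech_timestamps

    audio = decode_audio("audio.wav", sampling_rate=16000)  # placeholder path
    opts = VadOptions(
        max_speech_duration_s=30,
        min_silence_at_max_speech=200,         # require >= 200 ms of silence at forced splits
        use_max_poss_sil_at_max_speech=False,  # fall back to the last qualifying silence
    )
    speech_chunks = get_speech_timestamps(audio, opts)  # list of {"start": ..., "end": ...} in samples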
@@ -69,6 +75,9 @@ def get_speech_timestamps(
     min_silence_duration_ms = vad_options.min_silence_duration_ms
     window_size_samples = 512
     speech_pad_ms = vad_options.speech_pad_ms
+    min_silence_at_max_speech = vad_options.min_silence_at_max_speech
+    use_max_poss_sil_at_max_speech = vad_options.use_max_poss_sil_at_max_speech
+
     min_speech_samples = sampling_rate * min_speech_duration_ms / 1000
     speech_pad_samples = sampling_rate * speech_pad_ms / 1000
     max_speech_samples = (
@@ -77,7 +86,7 @@ def get_speech_timestamps(
         - 2 * speech_pad_samples
     )
     min_silence_samples = sampling_rate * min_silence_duration_ms / 1000
-    min_silence_samples_at_max_speech = sampling_rate * 98 / 1000
+    min_silence_samples_at_max_speech = sampling_rate * min_silence_at_max_speech / 1000
 
     audio_length_samples = len(audio)
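
At the default 16 kHz sampling rate the conversion above works out to just over three 512-sample windows (a worked example, not part of the commit):

    sampling_rate = 16000
    min_silence_at_max_speech = 98  # ms, the default
    min_silence_samples_at_max_speech = sampling_rate * min_silence_at_max_speech / 1000
    print(min_silence_samples_at_max_speech)  # 1568.0 samples, ~3.06 windows of 512 samples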
@@ -91,6 +100,8 @@ def get_speech_timestamps(
     triggered = False
     speeches = []
     current_speech = {}
+    possible_ends = []
 
     if neg_threshold is None:
         neg_threshold = max(threshold - 0.15, 0.01)
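
possible_ends accumulates candidate split points as (silence_start_sample, silence_duration_samples) tuples; the longest one can then be selected by duration. A small illustration with made-up values (not part of the commit):

    possible_ends = [(48000, 1600), (96000, 4800), (120000, 2048)]  # hypothetical silences
    prev_end, dur = max(possible_ends, key=lambda x: x[1])
    print(prev_end, dur)  # 96000 4800 -> the longest recorded silence wins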
@@ -100,45 +111,67 @@
     prev_end = next_start = 0
 
     for i, speech_prob in enumerate(speech_probs):
+        cur_sample = window_size_samples * i
+
         if (speech_prob >= threshold) and temp_end:
+            sil_dur = cur_sample - temp_end
+            if sil_dur > min_silence_samples_at_max_speech:
+                possible_ends.append((temp_end, sil_dur))
             temp_end = 0
             if next_start < prev_end:
-                next_start = window_size_samples * i
+                next_start = cur_sample
 
         if (speech_prob >= threshold) and not triggered:
             triggered = True
-            current_speech["start"] = window_size_samples * i
+            current_speech["start"] = cur_sample
             continue
 
-        if (
-            triggered
-            and (window_size_samples * i) - current_speech["start"] > max_speech_samples
-        ):
-            if prev_end:
+        if triggered and (cur_sample - current_speech["start"] > max_speech_samples):
+            if use_max_poss_sil_at_max_speech and possible_ends:
+                prev_end, dur = max(possible_ends, key=lambda x: x[1])
                 current_speech["end"] = prev_end
                 speeches.append(current_speech)
                 current_speech = {}
-                # previously reached silence (< neg_thres) and is still not speech (< thres)
-                if next_start < prev_end:
-                    triggered = False
-                else:
+                next_start = prev_end + dur
+                if next_start < prev_end + cur_sample:
                     current_speech["start"] = next_start
+                else:
+                    triggered = False
                 prev_end = next_start = temp_end = 0
+                possible_ends = []
             else:
-                current_speech["end"] = window_size_samples * i
-                speeches.append(current_speech)
-                current_speech = {}
-                prev_end = next_start = temp_end = 0
-                triggered = False
-                continue
+                if prev_end:
+                    current_speech["end"] = prev_end
+                    speeches.append(current_speech)
+                    current_speech = {}
+                    if next_start < prev_end:
+                        triggered = False
+                    else:
+                        current_speech["start"] = next_start
+                    prev_end = next_start = temp_end = 0
+                    possible_ends = []
+                else:
+                    current_speech["end"] = cur_sample
+                    speeches.append(current_speech)
+                    current_speech = {}
+                    prev_end = next_start = temp_end = 0
+                    triggered = False
+                    possible_ends = []
+                    continue
 
         if (speech_prob < neg_threshold) and triggered:
             if not temp_end:
-                temp_end = window_size_samples * i
-            # condition to avoid cutting in very short silence
-            if (window_size_samples * i) - temp_end > min_silence_samples_at_max_speech:
+                temp_end = cur_sample
+            sil_dur_now = cur_sample - temp_end
+            if (
+                not use_max_poss_sil_at_max_speech
+                and sil_dur_now > min_silence_samples_at_max_speech
+            ):
                 prev_end = temp_end
-            if (window_size_samples * i) - temp_end < min_silence_samples:
+            if sil_dur_now < min_silence_samples:
                 continue
             else:
                 current_speech["end"] = temp_end
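
Taken together, this hunk changes how the forced split point is chosen once max_speech_duration_s is exceeded: with use_max_poss_sil_at_max_speech the longest silence recorded so far wins, otherwise the last qualifying silence is kept. A simplified, self-contained sketch of just that selection step (an illustration of the idea, not the library code; pick_split_point is a hypothetical helper):

    def pick_split_point(silences, use_max_poss_sil, min_silence_samples_at_max_speech):
        """Choose where to cut an over-long speech chunk.

        silences: (start_sample, duration_samples) pairs of silences seen so far.
        Returns the sample index to end the current chunk at, or None if no
        silence qualifies (the caller would then cut right at the limit).
        """
        candidates = [s for s in silences if s[1] > min_silence_samples_at_max_speech]
        if not candidates:
            return None
        if use_max_poss_sil:
            start, _ = max(candidates, key=lambda s: s[1])  # longest silence wins
        else:
            start, _ = candidates[-1]                       # most recent qualifying silence
        return start

    # Example: silences of 0.05 s, 0.3 s and 0.15 s at 16 kHz (made-up positions)
    sils = [(160000, 800), (320000, 4800), (480000, 2400)]
    print(pick_split_point(sils, True, 1568))   # 320000 -> longest silence
    print(pick_split_point(sils, False, 1568))  # 480000 -> last qualifying silence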
@@ -149,6 +182,7 @@ def get_speech_timestamps(
                 current_speech = {}
                 prev_end = next_start = temp_end = 0
                 triggered = False
+                possible_ends = []
                 continue
 
     if (