Adds new VAD parameters (#1386)

* Adds new VAD parameters

Adds new VAD parameters: 

min_silence_at_max_speech: Minimum silence duration in ms, used to avoid abrupt cuts when max_speech_duration_s is reached.

use_max_poss_sil_at_max_speech: Whether to use the maximum possible silence at max_speech_duration_s or not. If not, the last silence is used.

* Style

* Update doc

* change min_speech_duration_ms (0 -> 250)

* Change min_speech_duration_ms to zero

Set minimum speech duration to zero for flexibility.

---------

Co-authored-by: Mahmoud Ashraf <hassouna97.ma@gmail.com>
This commit is contained in:
Purfview
2025-11-19 14:40:46 +00:00
committed by GitHub
parent 2eeafe05de
commit ed9a06cd89

View File

@@ -27,11 +27,15 @@ class VadOptions:
min_speech_duration_ms: Final speech chunks shorter than min_speech_duration_ms are thrown out. min_speech_duration_ms: Final speech chunks shorter than min_speech_duration_ms are thrown out.
max_speech_duration_s: Maximum duration of speech chunks in seconds. Chunks longer max_speech_duration_s: Maximum duration of speech chunks in seconds. Chunks longer
than max_speech_duration_s will be split at the timestamp of the last silence that than max_speech_duration_s will be split at the timestamp of the last silence that
lasts more than 100ms (if any), to prevent aggressive cutting. Otherwise, they will be lasts more than min_silence_at_max_speech (if any), to prevent aggressive cutting.
split aggressively just before max_speech_duration_s. Otherwise, they will be split aggressively just before max_speech_duration_s.
min_silence_duration_ms: In the end of each speech chunk wait for min_silence_duration_ms min_silence_duration_ms: In the end of each speech chunk wait for min_silence_duration_ms
before separating it before separating it
speech_pad_ms: Final speech chunks are padded by speech_pad_ms each side speech_pad_ms: Final speech chunks are padded by speech_pad_ms each side
min_silence_at_max_speech: Minimum silence duration in ms which is used to avoid abrupt cuts
when max_speech_duration_s is reached.
use_max_poss_sil_at_max_speech: Whether to use the maximum possible silence at
max_speech_duration_s or not. If not, the last silence is used.
""" """
threshold: float = 0.5 threshold: float = 0.5
@@ -40,6 +44,8 @@ class VadOptions:
max_speech_duration_s: float = float("inf") max_speech_duration_s: float = float("inf")
min_silence_duration_ms: int = 2000 min_silence_duration_ms: int = 2000
speech_pad_ms: int = 400 speech_pad_ms: int = 400
min_silence_at_max_speech: int = 98
use_max_poss_sil_at_max_speech: bool = True
def get_speech_timestamps( def get_speech_timestamps(
@@ -69,6 +75,9 @@ def get_speech_timestamps(
min_silence_duration_ms = vad_options.min_silence_duration_ms min_silence_duration_ms = vad_options.min_silence_duration_ms
window_size_samples = 512 window_size_samples = 512
speech_pad_ms = vad_options.speech_pad_ms speech_pad_ms = vad_options.speech_pad_ms
min_silence_at_max_speech = vad_options.min_silence_at_max_speech
use_max_poss_sil_at_max_speech = vad_options.use_max_poss_sil_at_max_speech
min_speech_samples = sampling_rate * min_speech_duration_ms / 1000 min_speech_samples = sampling_rate * min_speech_duration_ms / 1000
speech_pad_samples = sampling_rate * speech_pad_ms / 1000 speech_pad_samples = sampling_rate * speech_pad_ms / 1000
max_speech_samples = ( max_speech_samples = (
@@ -77,7 +86,7 @@ def get_speech_timestamps(
- 2 * speech_pad_samples - 2 * speech_pad_samples
) )
min_silence_samples = sampling_rate * min_silence_duration_ms / 1000 min_silence_samples = sampling_rate * min_silence_duration_ms / 1000
min_silence_samples_at_max_speech = sampling_rate * 98 / 1000 min_silence_samples_at_max_speech = sampling_rate * min_silence_at_max_speech / 1000
audio_length_samples = len(audio) audio_length_samples = len(audio)
@@ -91,6 +100,8 @@ def get_speech_timestamps(
triggered = False triggered = False
speeches = [] speeches = []
current_speech = {} current_speech = {}
possible_ends = []
if neg_threshold is None: if neg_threshold is None:
neg_threshold = max(threshold - 0.15, 0.01) neg_threshold = max(threshold - 0.15, 0.01)
@@ -100,45 +111,67 @@ def get_speech_timestamps(
prev_end = next_start = 0 prev_end = next_start = 0
for i, speech_prob in enumerate(speech_probs): for i, speech_prob in enumerate(speech_probs):
cur_sample = window_size_samples * i
if (speech_prob >= threshold) and temp_end: if (speech_prob >= threshold) and temp_end:
sil_dur = cur_sample - temp_end
if sil_dur > min_silence_samples_at_max_speech:
possible_ends.append((temp_end, sil_dur))
temp_end = 0 temp_end = 0
if next_start < prev_end: if next_start < prev_end:
next_start = window_size_samples * i next_start = cur_sample
if (speech_prob >= threshold) and not triggered: if (speech_prob >= threshold) and not triggered:
triggered = True triggered = True
current_speech["start"] = window_size_samples * i current_speech["start"] = cur_sample
continue continue
if ( if triggered and (cur_sample - current_speech["start"] > max_speech_samples):
triggered if use_max_poss_sil_at_max_speech and possible_ends:
and (window_size_samples * i) - current_speech["start"] > max_speech_samples prev_end, dur = max(possible_ends, key=lambda x: x[1])
): current_speech["end"] = prev_end
speeches.append(current_speech)
current_speech = {}
next_start = prev_end + dur
if next_start < prev_end + cur_sample:
current_speech["start"] = next_start
else:
triggered = False
prev_end = next_start = temp_end = 0
possible_ends = []
else:
if prev_end: if prev_end:
current_speech["end"] = prev_end current_speech["end"] = prev_end
speeches.append(current_speech) speeches.append(current_speech)
current_speech = {} current_speech = {}
# previously reached silence (< neg_thres) and is still not speech (< thres)
if next_start < prev_end: if next_start < prev_end:
triggered = False triggered = False
else: else:
current_speech["start"] = next_start current_speech["start"] = next_start
prev_end = next_start = temp_end = 0 prev_end = next_start = temp_end = 0
possible_ends = []
else: else:
current_speech["end"] = window_size_samples * i current_speech["end"] = cur_sample
speeches.append(current_speech) speeches.append(current_speech)
current_speech = {} current_speech = {}
prev_end = next_start = temp_end = 0 prev_end = next_start = temp_end = 0
triggered = False triggered = False
possible_ends = []
continue continue
if (speech_prob < neg_threshold) and triggered: if (speech_prob < neg_threshold) and triggered:
if not temp_end: if not temp_end:
temp_end = window_size_samples * i temp_end = cur_sample
# condition to avoid cutting in very short silence sil_dur_now = cur_sample - temp_end
if (window_size_samples * i) - temp_end > min_silence_samples_at_max_speech:
if (
not use_max_poss_sil_at_max_speech
and sil_dur_now > min_silence_samples_at_max_speech
):
prev_end = temp_end prev_end = temp_end
if (window_size_samples * i) - temp_end < min_silence_samples:
if sil_dur_now < min_silence_samples:
continue continue
else: else:
current_speech["end"] = temp_end current_speech["end"] = temp_end
@@ -149,6 +182,7 @@ def get_speech_timestamps(
current_speech = {} current_speech = {}
prev_end = next_start = temp_end = 0 prev_end = next_start = temp_end = 0
triggered = False triggered = False
possible_ends = []
continue continue
if ( if (