mirror of
https://github.com/SYSTRAN/faster-whisper.git
synced 2026-01-09 13:38:01 -05:00
Adds new VAD parameters (#1386)
* Adds new VAD parameters

  Adds new VAD parameters:
  - min_silence_at_max_speech: Minimum silence duration in ms, used to avoid abrupt cuts when max_speech_duration_s is reached.
  - use_max_poss_sil_at_max_speech: Whether to use the maximum possible (longest) silence when max_speech_duration_s is reached. If not, the last silence is used.
* Style
* Update doc
* Change min_speech_duration_ms (0 -> 250)
* Change min_speech_duration_ms to zero: set the minimum speech duration to zero for flexibility.

---------

Co-authored-by: Mahmoud Ashraf <hassouna97.ma@gmail.com>
This commit is contained in:
@@ -27,11 +27,15 @@ class VadOptions:
|
|||||||
min_speech_duration_ms: Final speech chunks shorter than min_speech_duration_ms are thrown out.
|
min_speech_duration_ms: Final speech chunks shorter than min_speech_duration_ms are thrown out.
|
||||||
max_speech_duration_s: Maximum duration of speech chunks in seconds. Chunks longer
|
max_speech_duration_s: Maximum duration of speech chunks in seconds. Chunks longer
|
||||||
than max_speech_duration_s will be split at the timestamp of the last silence that
|
than max_speech_duration_s will be split at the timestamp of the last silence that
|
||||||
lasts more than 100ms (if any), to prevent aggressive cutting. Otherwise, they will be
|
lasts more than min_silence_at_max_speech (if any), to prevent aggressive cutting.
|
||||||
split aggressively just before max_speech_duration_s.
|
Otherwise, they will be split aggressively just before max_speech_duration_s.
|
||||||
min_silence_duration_ms: In the end of each speech chunk wait for min_silence_duration_ms
|
min_silence_duration_ms: In the end of each speech chunk wait for min_silence_duration_ms
|
||||||
before separating it
|
before separating it
|
||||||
speech_pad_ms: Final speech chunks are padded by speech_pad_ms each side
|
speech_pad_ms: Final speech chunks are padded by speech_pad_ms each side
|
||||||
|
min_silence_at_max_speech: Minimum silence duration in ms which is used to avoid abrupt cuts
|
||||||
|
when max_speech_duration_s is reached.
|
||||||
|
use_max_poss_sil_at_max_speech: Whether to use the maximum possible silence at
|
||||||
|
max_speech_duration_s or not. If not, the last silence is used.
|
||||||
"""
|
"""
|
||||||
|
|
||||||
threshold: float = 0.5
|
threshold: float = 0.5
|
||||||
@@ -40,6 +44,8 @@ class VadOptions:
|
|||||||
max_speech_duration_s: float = float("inf")
|
max_speech_duration_s: float = float("inf")
|
||||||
min_silence_duration_ms: int = 2000
|
min_silence_duration_ms: int = 2000
|
||||||
speech_pad_ms: int = 400
|
speech_pad_ms: int = 400
|
||||||
|
min_silence_at_max_speech: int = 98
|
||||||
|
use_max_poss_sil_at_max_speech: bool = True
|
||||||
|
|
||||||
|
|
||||||
def get_speech_timestamps(
|
def get_speech_timestamps(
|
||||||
@@ -69,6 +75,9 @@ def get_speech_timestamps(
|
|||||||
min_silence_duration_ms = vad_options.min_silence_duration_ms
|
min_silence_duration_ms = vad_options.min_silence_duration_ms
|
||||||
window_size_samples = 512
|
window_size_samples = 512
|
||||||
speech_pad_ms = vad_options.speech_pad_ms
|
speech_pad_ms = vad_options.speech_pad_ms
|
||||||
|
min_silence_at_max_speech = vad_options.min_silence_at_max_speech
|
||||||
|
use_max_poss_sil_at_max_speech = vad_options.use_max_poss_sil_at_max_speech
|
||||||
|
|
||||||
min_speech_samples = sampling_rate * min_speech_duration_ms / 1000
|
min_speech_samples = sampling_rate * min_speech_duration_ms / 1000
|
||||||
speech_pad_samples = sampling_rate * speech_pad_ms / 1000
|
speech_pad_samples = sampling_rate * speech_pad_ms / 1000
|
||||||
max_speech_samples = (
|
max_speech_samples = (
|
||||||
@@ -77,7 +86,7 @@ def get_speech_timestamps(
|
|||||||
- 2 * speech_pad_samples
|
- 2 * speech_pad_samples
|
||||||
)
|
)
|
||||||
min_silence_samples = sampling_rate * min_silence_duration_ms / 1000
|
min_silence_samples = sampling_rate * min_silence_duration_ms / 1000
|
||||||
min_silence_samples_at_max_speech = sampling_rate * 98 / 1000
|
min_silence_samples_at_max_speech = sampling_rate * min_silence_at_max_speech / 1000
|
||||||
|
|
||||||
audio_length_samples = len(audio)
|
audio_length_samples = len(audio)
|
||||||
|
|
||||||
@@ -91,6 +100,8 @@ def get_speech_timestamps(
|
|||||||
triggered = False
|
triggered = False
|
||||||
speeches = []
|
speeches = []
|
||||||
current_speech = {}
|
current_speech = {}
|
||||||
|
possible_ends = []
|
||||||
|
|
||||||
if neg_threshold is None:
|
if neg_threshold is None:
|
||||||
neg_threshold = max(threshold - 0.15, 0.01)
|
neg_threshold = max(threshold - 0.15, 0.01)
|
||||||
|
|
||||||
@@ -100,45 +111,67 @@ def get_speech_timestamps(
|
|||||||
prev_end = next_start = 0
|
prev_end = next_start = 0
|
||||||
|
|
||||||
for i, speech_prob in enumerate(speech_probs):
|
for i, speech_prob in enumerate(speech_probs):
|
||||||
|
cur_sample = window_size_samples * i
|
||||||
|
|
||||||
if (speech_prob >= threshold) and temp_end:
|
if (speech_prob >= threshold) and temp_end:
|
||||||
|
sil_dur = cur_sample - temp_end
|
||||||
|
if sil_dur > min_silence_samples_at_max_speech:
|
||||||
|
possible_ends.append((temp_end, sil_dur))
|
||||||
temp_end = 0
|
temp_end = 0
|
||||||
if next_start < prev_end:
|
if next_start < prev_end:
|
||||||
next_start = window_size_samples * i
|
next_start = cur_sample
|
||||||
|
|
||||||
if (speech_prob >= threshold) and not triggered:
|
if (speech_prob >= threshold) and not triggered:
|
||||||
triggered = True
|
triggered = True
|
||||||
current_speech["start"] = window_size_samples * i
|
current_speech["start"] = cur_sample
|
||||||
continue
|
continue
|
||||||
|
|
||||||
if (
|
if triggered and (cur_sample - current_speech["start"] > max_speech_samples):
|
||||||
triggered
|
if use_max_poss_sil_at_max_speech and possible_ends:
|
||||||
and (window_size_samples * i) - current_speech["start"] > max_speech_samples
|
prev_end, dur = max(possible_ends, key=lambda x: x[1])
|
||||||
):
|
current_speech["end"] = prev_end
|
||||||
|
speeches.append(current_speech)
|
||||||
|
current_speech = {}
|
||||||
|
next_start = prev_end + dur
|
||||||
|
|
||||||
|
if next_start < prev_end + cur_sample:
|
||||||
|
current_speech["start"] = next_start
|
||||||
|
else:
|
||||||
|
triggered = False
|
||||||
|
prev_end = next_start = temp_end = 0
|
||||||
|
possible_ends = []
|
||||||
|
else:
|
||||||
if prev_end:
|
if prev_end:
|
||||||
current_speech["end"] = prev_end
|
current_speech["end"] = prev_end
|
||||||
speeches.append(current_speech)
|
speeches.append(current_speech)
|
||||||
current_speech = {}
|
current_speech = {}
|
||||||
# previously reached silence (< neg_thres) and is still not speech (< thres)
|
|
||||||
if next_start < prev_end:
|
if next_start < prev_end:
|
||||||
triggered = False
|
triggered = False
|
||||||
else:
|
else:
|
||||||
current_speech["start"] = next_start
|
current_speech["start"] = next_start
|
||||||
prev_end = next_start = temp_end = 0
|
prev_end = next_start = temp_end = 0
|
||||||
|
possible_ends = []
|
||||||
else:
|
else:
|
||||||
current_speech["end"] = window_size_samples * i
|
current_speech["end"] = cur_sample
|
||||||
speeches.append(current_speech)
|
speeches.append(current_speech)
|
||||||
current_speech = {}
|
current_speech = {}
|
||||||
prev_end = next_start = temp_end = 0
|
prev_end = next_start = temp_end = 0
|
||||||
triggered = False
|
triggered = False
|
||||||
|
possible_ends = []
|
||||||
continue
|
continue
|
||||||
|
|
||||||
if (speech_prob < neg_threshold) and triggered:
|
if (speech_prob < neg_threshold) and triggered:
|
||||||
if not temp_end:
|
if not temp_end:
|
||||||
temp_end = window_size_samples * i
|
temp_end = cur_sample
|
||||||
# condition to avoid cutting in very short silence
|
sil_dur_now = cur_sample - temp_end
|
||||||
if (window_size_samples * i) - temp_end > min_silence_samples_at_max_speech:
|
|
||||||
|
if (
|
||||||
|
not use_max_poss_sil_at_max_speech
|
||||||
|
and sil_dur_now > min_silence_samples_at_max_speech
|
||||||
|
):
|
||||||
prev_end = temp_end
|
prev_end = temp_end
|
||||||
if (window_size_samples * i) - temp_end < min_silence_samples:
|
|
||||||
|
if sil_dur_now < min_silence_samples:
|
||||||
continue
|
continue
|
||||||
else:
|
else:
|
||||||
current_speech["end"] = temp_end
|
current_speech["end"] = temp_end
|
||||||
@@ -149,6 +182,7 @@ def get_speech_timestamps(
|
|||||||
current_speech = {}
|
current_speech = {}
|
||||||
prev_end = next_start = temp_end = 0
|
prev_end = next_start = temp_end = 0
|
||||||
triggered = False
|
triggered = False
|
||||||
|
possible_ends = []
|
||||||
continue
|
continue
|
||||||
|
|
||||||
if (
|
if (
|
||||||
|
|||||||
Reference in New Issue
Block a user