mirror of
https://github.com/SYSTRAN/faster-whisper.git
synced 2026-01-09 13:38:01 -05:00
Brings back original VAD parameters naming (#1181)
This commit is contained in:
@@ -16,14 +16,14 @@ class VadOptions:
|
||||
"""VAD options.
|
||||
|
||||
Attributes:
|
||||
onset: Speech threshold. Silero VAD outputs speech probabilities for each audio chunk,
|
||||
threshold: Speech threshold. Silero VAD outputs speech probabilities for each audio chunk,
|
||||
probabilities ABOVE this value are considered as SPEECH. It is better to tune this
|
||||
parameter for each dataset separately, but "lazy" 0.5 is pretty good for most datasets.
|
||||
offset: Silence threshold for determining the end of speech. If a probability is lower than
|
||||
the offset, it is always considered silence. Values higher than offset are only considered
|
||||
speech if the previous sample was classified as speech; otherwise, they are treated as
|
||||
silence. This parameter helps refine the detection of speech transitions, ensuring smoother
|
||||
segment boundaries.
|
||||
neg_threshold: Silence threshold for determining the end of speech. If a probability is lower
|
||||
than neg_threshold, it is always considered silence. Values higher than neg_threshold
|
||||
are only considered speech if the previous sample was classified as speech; otherwise,
|
||||
they are treated as silence. This parameter helps refine the detection of speech
|
||||
transitions, ensuring smoother segment boundaries.
|
||||
min_speech_duration_ms: Final speech chunks shorter than min_speech_duration_ms are thrown out.
|
||||
max_speech_duration_s: Maximum duration of speech chunks in seconds. Chunks longer
|
||||
than max_speech_duration_s will be split at the timestamp of the last silence that
|
||||
@@ -34,8 +34,8 @@ class VadOptions:
|
||||
speech_pad_ms: Final speech chunks are padded by speech_pad_ms on each side
|
||||
"""
|
||||
|
||||
onset: float = 0.5
|
||||
offset: float = onset - 0.15
|
||||
threshold: float = 0.5
|
||||
neg_threshold: float = threshold - 0.15
|
||||
min_speech_duration_ms: int = 0
|
||||
max_speech_duration_s: float = float("inf")
|
||||
min_silence_duration_ms: int = 2000
|
||||
@@ -62,7 +62,7 @@ def get_speech_timestamps(
|
||||
if vad_options is None:
|
||||
vad_options = VadOptions(**kwargs)
|
||||
|
||||
onset = vad_options.onset
|
||||
threshold = vad_options.threshold
|
||||
min_speech_duration_ms = vad_options.min_speech_duration_ms
|
||||
max_speech_duration_s = vad_options.max_speech_duration_s
|
||||
min_silence_duration_ms = vad_options.min_silence_duration_ms
|
||||
@@ -90,7 +90,7 @@ def get_speech_timestamps(
|
||||
triggered = False
|
||||
speeches = []
|
||||
current_speech = {}
|
||||
offset = vad_options.offset
|
||||
neg_threshold = vad_options.neg_threshold
|
||||
|
||||
# to save potential segment end (and tolerate some silence)
|
||||
temp_end = 0
|
||||
@@ -98,12 +98,12 @@ def get_speech_timestamps(
|
||||
prev_end = next_start = 0
|
||||
|
||||
for i, speech_prob in enumerate(speech_probs):
|
||||
if (speech_prob >= onset) and temp_end:
|
||||
if (speech_prob >= threshold) and temp_end:
|
||||
temp_end = 0
|
||||
if next_start < prev_end:
|
||||
next_start = window_size_samples * i
|
||||
|
||||
if (speech_prob >= onset) and not triggered:
|
||||
if (speech_prob >= threshold) and not triggered:
|
||||
triggered = True
|
||||
current_speech["start"] = window_size_samples * i
|
||||
continue
|
||||
@@ -130,7 +130,7 @@ def get_speech_timestamps(
|
||||
triggered = False
|
||||
continue
|
||||
|
||||
if (speech_prob < offset) and triggered:
|
||||
if (speech_prob < neg_threshold) and triggered:
|
||||
if not temp_end:
|
||||
temp_end = window_size_samples * i
|
||||
# condition to avoid cutting in very short silence
|
||||
|
||||
Reference in New Issue
Block a user