mirror of
https://github.com/SYSTRAN/faster-whisper.git
synced 2026-01-09 21:48:08 -05:00
Brings back original VAD parameters naming (#1181)
This commit is contained in:
@@ -16,14 +16,14 @@ class VadOptions:
|
|||||||
"""VAD options.
|
"""VAD options.
|
||||||
|
|
||||||
Attributes:
|
Attributes:
|
||||||
onset: Speech threshold. Silero VAD outputs speech probabilities for each audio chunk,
|
threshold: Speech threshold. Silero VAD outputs speech probabilities for each audio chunk,
|
||||||
probabilities ABOVE this value are considered as SPEECH. It is better to tune this
|
probabilities ABOVE this value are considered as SPEECH. It is better to tune this
|
||||||
parameter for each dataset separately, but "lazy" 0.5 is pretty good for most datasets.
|
parameter for each dataset separately, but "lazy" 0.5 is pretty good for most datasets.
|
||||||
offset: Silence threshold for determining the end of speech. If a probability is lower than
|
neg_threshold: Silence threshold for determining the end of speech. If a probability is lower
|
||||||
the offset, it is always considered silence. Values higher than offset are only considered
|
than neg_threshold, it is always considered silence. Values higher than neg_threshold
|
||||||
speech if the previous sample was classified as speech; otherwise, they are treated as
|
are only considered speech if the previous sample was classified as speech; otherwise,
|
||||||
silence. This parameter helps refine the detection of speech transitions, ensuring smoother
|
they are treated as silence. This parameter helps refine the detection of speech
|
||||||
segment boundaries.
|
transitions, ensuring smoother segment boundaries.
|
||||||
min_speech_duration_ms: Final speech chunks shorter than min_speech_duration_ms are thrown out.
|
min_speech_duration_ms: Final speech chunks shorter than min_speech_duration_ms are thrown out.
|
||||||
max_speech_duration_s: Maximum duration of speech chunks in seconds. Chunks longer
|
max_speech_duration_s: Maximum duration of speech chunks in seconds. Chunks longer
|
||||||
than max_speech_duration_s will be split at the timestamp of the last silence that
|
than max_speech_duration_s will be split at the timestamp of the last silence that
|
||||||
@@ -34,8 +34,8 @@ class VadOptions:
|
|||||||
speech_pad_ms: Final speech chunks are padded by speech_pad_ms each side
|
speech_pad_ms: Final speech chunks are padded by speech_pad_ms each side
|
||||||
"""
|
"""
|
||||||
|
|
||||||
onset: float = 0.5
|
threshold: float = 0.5
|
||||||
offset: float = onset - 0.15
|
neg_threshold: float = threshold - 0.15
|
||||||
min_speech_duration_ms: int = 0
|
min_speech_duration_ms: int = 0
|
||||||
max_speech_duration_s: float = float("inf")
|
max_speech_duration_s: float = float("inf")
|
||||||
min_silence_duration_ms: int = 2000
|
min_silence_duration_ms: int = 2000
|
||||||
@@ -62,7 +62,7 @@ def get_speech_timestamps(
|
|||||||
if vad_options is None:
|
if vad_options is None:
|
||||||
vad_options = VadOptions(**kwargs)
|
vad_options = VadOptions(**kwargs)
|
||||||
|
|
||||||
onset = vad_options.onset
|
threshold = vad_options.threshold
|
||||||
min_speech_duration_ms = vad_options.min_speech_duration_ms
|
min_speech_duration_ms = vad_options.min_speech_duration_ms
|
||||||
max_speech_duration_s = vad_options.max_speech_duration_s
|
max_speech_duration_s = vad_options.max_speech_duration_s
|
||||||
min_silence_duration_ms = vad_options.min_silence_duration_ms
|
min_silence_duration_ms = vad_options.min_silence_duration_ms
|
||||||
@@ -90,7 +90,7 @@ def get_speech_timestamps(
|
|||||||
triggered = False
|
triggered = False
|
||||||
speeches = []
|
speeches = []
|
||||||
current_speech = {}
|
current_speech = {}
|
||||||
offset = vad_options.offset
|
neg_threshold = vad_options.neg_threshold
|
||||||
|
|
||||||
# to save potential segment end (and tolerate some silence)
|
# to save potential segment end (and tolerate some silence)
|
||||||
temp_end = 0
|
temp_end = 0
|
||||||
@@ -98,12 +98,12 @@ def get_speech_timestamps(
|
|||||||
prev_end = next_start = 0
|
prev_end = next_start = 0
|
||||||
|
|
||||||
for i, speech_prob in enumerate(speech_probs):
|
for i, speech_prob in enumerate(speech_probs):
|
||||||
if (speech_prob >= onset) and temp_end:
|
if (speech_prob >= threshold) and temp_end:
|
||||||
temp_end = 0
|
temp_end = 0
|
||||||
if next_start < prev_end:
|
if next_start < prev_end:
|
||||||
next_start = window_size_samples * i
|
next_start = window_size_samples * i
|
||||||
|
|
||||||
if (speech_prob >= onset) and not triggered:
|
if (speech_prob >= threshold) and not triggered:
|
||||||
triggered = True
|
triggered = True
|
||||||
current_speech["start"] = window_size_samples * i
|
current_speech["start"] = window_size_samples * i
|
||||||
continue
|
continue
|
||||||
@@ -130,7 +130,7 @@ def get_speech_timestamps(
|
|||||||
triggered = False
|
triggered = False
|
||||||
continue
|
continue
|
||||||
|
|
||||||
if (speech_prob < offset) and triggered:
|
if (speech_prob < neg_threshold) and triggered:
|
||||||
if not temp_end:
|
if not temp_end:
|
||||||
temp_end = window_size_samples * i
|
temp_end = window_size_samples * i
|
||||||
# condition to avoid cutting in very short silence
|
# condition to avoid cutting in very short silence
|
||||||
|
|||||||
Reference in New Issue
Block a user