Brings back original VAD parameters naming (#1181)

Author: Purfview
Date: 2024-12-01 17:41:53 +00:00
Committed by: GitHub
Parent: 22a5238b56
Commit: 8327d8cc64


@@ -16,14 +16,14 @@ class VadOptions:
"""VAD options. """VAD options.
Attributes: Attributes:
onset: Speech threshold. Silero VAD outputs speech probabilities for each audio chunk, threshold: Speech threshold. Silero VAD outputs speech probabilities for each audio chunk,
probabilities ABOVE this value are considered as SPEECH. It is better to tune this probabilities ABOVE this value are considered as SPEECH. It is better to tune this
parameter for each dataset separately, but "lazy" 0.5 is pretty good for most datasets. parameter for each dataset separately, but "lazy" 0.5 is pretty good for most datasets.
offset: Silence threshold for determining the end of speech. If a probability is lower than neg_threshold: Silence threshold for determining the end of speech. If a probability is lower
the offset, it is always considered silence. Values higher than offset are only considered than neg_threshold, it is always considered silence. Values higher than neg_threshold
speech if the previous sample was classified as speech; otherwise, they are treated as are only considered speech if the previous sample was classified as speech; otherwise,
silence. This parameter helps refine the detection of speech transitions, ensuring smoother they are treated as silence. This parameter helps refine the detection of speech
segment boundaries. transitions, ensuring smoother segment boundaries.
min_speech_duration_ms: Final speech chunks shorter min_speech_duration_ms are thrown out. min_speech_duration_ms: Final speech chunks shorter min_speech_duration_ms are thrown out.
max_speech_duration_s: Maximum duration of speech chunks in seconds. Chunks longer max_speech_duration_s: Maximum duration of speech chunks in seconds. Chunks longer
than max_speech_duration_s will be split at the timestamp of the last silence that than max_speech_duration_s will be split at the timestamp of the last silence that
@@ -34,8 +34,8 @@ class VadOptions:
       speech_pad_ms: Final speech chunks are padded by speech_pad_ms each side
     """
 
-    onset: float = 0.5
-    offset: float = onset - 0.15
+    threshold: float = 0.5
+    neg_threshold: float = threshold - 0.15
     min_speech_duration_ms: int = 0
     max_speech_duration_s: float = float("inf")
     min_silence_duration_ms: int = 2000
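
With the original names restored, the options are constructed exactly as before the rename. A minimal usage sketch, assuming the faster-whisper API at this commit (VadOptions importable from faster_whisper.vad, and WhisperModel.transcribe accepting vad_filter / vad_parameters); the audio path and model size below are placeholders, not part of this commit:

# Hypothetical usage of the restored parameter names; "audio.wav" and "small"
# are placeholders, not values from this commit.
from faster_whisper import WhisperModel
from faster_whisper.vad import VadOptions

options = VadOptions(threshold=0.5, neg_threshold=0.35, min_silence_duration_ms=2000)

model = WhisperModel("small")
segments, info = model.transcribe(
    "audio.wav",
    vad_filter=True,
    vad_parameters=options,  # a plain dict such as {"threshold": 0.5} is also accepted
)

Note that the neg_threshold default (threshold - 0.15) is evaluated once when the dataclass is defined, so VadOptions(threshold=0.8) still gets neg_threshold=0.35; pass neg_threshold explicitly if it should track a custom threshold.
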
@@ -62,7 +62,7 @@ def get_speech_timestamps(
     if vad_options is None:
         vad_options = VadOptions(**kwargs)
 
-    onset = vad_options.onset
+    threshold = vad_options.threshold
     min_speech_duration_ms = vad_options.min_speech_duration_ms
     max_speech_duration_s = vad_options.max_speech_duration_s
     min_silence_duration_ms = vad_options.min_silence_duration_ms
@@ -90,7 +90,7 @@ def get_speech_timestamps(
     triggered = False
     speeches = []
     current_speech = {}
-    offset = vad_options.offset
+    neg_threshold = vad_options.neg_threshold
 
     # to save potential segment end (and tolerate some silence)
     temp_end = 0
@@ -98,12 +98,12 @@ def get_speech_timestamps(
     prev_end = next_start = 0
 
     for i, speech_prob in enumerate(speech_probs):
-        if (speech_prob >= onset) and temp_end:
+        if (speech_prob >= threshold) and temp_end:
             temp_end = 0
             if next_start < prev_end:
                 next_start = window_size_samples * i
 
-        if (speech_prob >= onset) and not triggered:
+        if (speech_prob >= threshold) and not triggered:
             triggered = True
             current_speech["start"] = window_size_samples * i
             continue
@@ -130,7 +130,7 @@ def get_speech_timestamps(
                 triggered = False
                 continue
 
-        if (speech_prob < offset) and triggered:
+        if (speech_prob < neg_threshold) and triggered:
             if not temp_end:
                 temp_end = window_size_samples * i
             # condition to avoid cutting in very short silence
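
Together, threshold and neg_threshold implement a simple hysteresis: a segment opens once a frame's probability reaches threshold, and it only begins to close after the probability falls below neg_threshold. A simplified, self-contained sketch of just that rule (it deliberately ignores min_speech_duration_ms, max_speech_duration_s, speech_pad_ms, and the silence-tolerance handling that get_speech_timestamps performs):

def toy_speech_regions(speech_probs, threshold=0.5, neg_threshold=0.35):
    # Illustrative two-threshold hysteresis only; not the library's full logic.
    regions, start, triggered = [], None, False
    for i, prob in enumerate(speech_probs):
        if not triggered and prob >= threshold:
            triggered, start = True, i  # crossed the speech threshold: open a segment
        elif triggered and prob < neg_threshold:
            triggered = False
            regions.append((start, i))  # fell below the silence threshold: close it
    if triggered:
        regions.append((start, len(speech_probs)))
    return regions

# Probabilities between neg_threshold and threshold (e.g. 0.4) keep an open
# segment alive but never start a new one.
print(toy_speech_regions([0.1, 0.7, 0.6, 0.4, 0.2, 0.4, 0.9, 0.8, 0.1]))
# -> [(1, 4), (6, 8)]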