Brings back original VAD parameters naming (#1181)

2026-01-08 13:14:00 -05:00 · 2024-12-01 17:41:53 +00:00
parent 22a5238b56
commit 8327d8cc64
1 changed files with 13 additions and 13 deletions
--- a/faster_whisper/vad.py
+++ b/faster_whisper/vad.py
@@ -16,14 +16,14 @@ class VadOptions:
    """VAD options.

    Attributes:
-      onset: Speech threshold. Silero VAD outputs speech probabilities for each audio chunk,
+      threshold: Speech threshold. Silero VAD outputs speech probabilities for each audio chunk,
        probabilities ABOVE this value are considered as SPEECH. It is better to tune this
        parameter for each dataset separately, but "lazy" 0.5 is pretty good for most datasets.
-      offset: Silence threshold for determining the end of speech. If a probability is lower than
-        the offset, it is always considered silence. Values higher than offset are only considered
-        speech if the previous sample was classified as speech; otherwise, they are treated as
-        silence. This parameter helps refine the detection of speech transitions, ensuring smoother
-        segment boundaries.
+      neg_threshold: Silence threshold for determining the end of speech. If a probability is lower
+        than neg_threshold, it is always considered silence. Values higher than neg_threshold
+        are only considered speech if the previous sample was classified as speech; otherwise,
+        they are treated as silence. This parameter helps refine the detection of speech
+        transitions, ensuring smoother segment boundaries.
      min_speech_duration_ms: Final speech chunks shorter min_speech_duration_ms are thrown out.
      max_speech_duration_s: Maximum duration of speech chunks in seconds. Chunks longer
        than max_speech_duration_s will be split at the timestamp of the last silence that
@@ -34,8 +34,8 @@ class VadOptions:
      speech_pad_ms: Final speech chunks are padded by speech_pad_ms each side
    """

-    onset: float = 0.5
-    offset: float = onset - 0.15
+    threshold: float = 0.5
+    neg_threshold: float = threshold - 0.15
    min_speech_duration_ms: int = 0
    max_speech_duration_s: float = float("inf")
    min_silence_duration_ms: int = 2000
@@ -62,7 +62,7 @@ def get_speech_timestamps(
    if vad_options is None:
        vad_options = VadOptions(**kwargs)

-    onset = vad_options.onset
+    threshold = vad_options.threshold
    min_speech_duration_ms = vad_options.min_speech_duration_ms
    max_speech_duration_s = vad_options.max_speech_duration_s
    min_silence_duration_ms = vad_options.min_silence_duration_ms
@@ -90,7 +90,7 @@ def get_speech_timestamps(
    triggered = False
    speeches = []
    current_speech = {}
-    offset = vad_options.offset
+    neg_threshold = vad_options.neg_threshold

    # to save potential segment end (and tolerate some silence)
    temp_end = 0
@@ -98,12 +98,12 @@ def get_speech_timestamps(
    prev_end = next_start = 0

    for i, speech_prob in enumerate(speech_probs):
-        if (speech_prob >= onset) and temp_end:
+        if (speech_prob >= threshold) and temp_end:
            temp_end = 0
            if next_start < prev_end:
                next_start = window_size_samples * i

-        if (speech_prob >= onset) and not triggered:
+        if (speech_prob >= threshold) and not triggered:
            triggered = True
            current_speech["start"] = window_size_samples * i
            continue
@@ -130,7 +130,7 @@ def get_speech_timestamps(
                triggered = False
                continue

-        if (speech_prob < offset) and triggered:
+        if (speech_prob < neg_threshold) and triggered:
            if not temp_end:
                temp_end = window_size_samples * i
            # condition to avoid cutting in very short silence