3 Commits

Author SHA1 Message Date
Purfview
ed9a06cd89 Adds new VAD parameters (#1386)
* Adds new VAD parameters

Adds new VAD parameters: 

min_silence_at_max_speech: Minimum silence duration in ms that is used to avoid abrupt cuts when max_speech_duration_s is reached.

use_max_poss_sil_at_max_speech: Whether to split at the maximum possible silence when max_speech_duration_s is reached. If False, the last silence is used instead.

* Style

* Update doc

* change min_speech_duration_ms (0 -> 250)

* Change min_speech_duration_ms to zero

Set minimum speech duration to zero for flexibility.

---------

Co-authored-by: Mahmoud Ashraf <hassouna97.ma@gmail.com>
2025-11-19 17:40:46 +03:00
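A minimal usage sketch of the two new options (not part of the commit; model size, audio path, and values are illustrative, and vad_parameters also accepts a plain dict):

from faster_whisper import WhisperModel
from faster_whisper.vad import VadOptions

model = WhisperModel("base")

# The two new fields sit alongside the existing VAD options.
options = VadOptions(
    max_speech_duration_s=30.0,
    min_silence_at_max_speech=98,  # minimum silence (ms) for a non-abrupt cut
    use_max_poss_sil_at_max_speech=True,  # split at the longest silence, not the last one
)

segments, info = model.transcribe("audio.wav", vad_filter=True, vad_parameters=options)
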
Purfview
2eeafe05de Update Silero-VAD weights to v6.2 (#1390)
* Update Silero-VAD weights to v6.2

Overall slight quality improvement (no metrics update);
Higher stability on OOD / rare / strange / unique data;
Significant quality improvements on various known edge cases:

    Unusual voices
    Child voices
    Cartoon voices
    Muted voices
    Muted speech
    Lower quality phone calls

https://github.com/snakers4/silero-vad/releases/tag/v6.2

* Changes: tiny -> base in test_monotonic_timestamps()
2025-11-19 17:14:42 +03:00
Purfview
cf42429f96 Remove "local_dir_use_symlinks" from download_model() (#1389)
* Remove "local_dir_use_symlinks" from download_model()

It has been deprecated since huggingface_hub v0.23.0 and produces this warning:

>   /opt/hostedtoolcache/Python/3.9.24/x64/lib/python3.9/site-packages/huggingface_hub/utils/_validators.py:202: UserWarning: The `local_dir_use_symlinks` argument is deprecated and ignored in `snapshot_download`. Downloading to a local directory does not use symlinks anymore.

* Bump huggingface_hub requirement to v0.23
2025-11-18 21:59:01 +03:00
5 changed files with 61 additions and 28 deletions

faster_whisper/utils.py

@@ -105,7 +105,6 @@ def download_model(
     if output_dir is not None:
         kwargs["local_dir"] = output_dir
-        kwargs["local_dir_use_symlinks"] = False

     if cache_dir is not None:
         kwargs["cache_dir"] = cache_dir
faster_whisper/vad.py

@@ -27,11 +27,15 @@ class VadOptions:
       min_speech_duration_ms: Final speech chunks shorter min_speech_duration_ms are thrown out.
       max_speech_duration_s: Maximum duration of speech chunks in seconds. Chunks longer
         than max_speech_duration_s will be split at the timestamp of the last silence that
-        lasts more than 100ms (if any), to prevent aggressive cutting. Otherwise, they will be
-        split aggressively just before max_speech_duration_s.
+        lasts more than min_silence_at_max_speech (if any), to prevent aggressive cutting.
+        Otherwise, they will be split aggressively just before max_speech_duration_s.
       min_silence_duration_ms: In the end of each speech chunk wait for min_silence_duration_ms
         before separating it
       speech_pad_ms: Final speech chunks are padded by speech_pad_ms each side
+      min_silence_at_max_speech: Minimum silence duration in ms which is used to avoid abrupt cuts
+        when max_speech_duration_s is reached.
+      use_max_poss_sil_at_max_speech: Whether to use the maximum possible silence at
+        max_speech_duration_s or not. If not, the last silence is used.
     """

     threshold: float = 0.5
@@ -40,6 +44,8 @@ class VadOptions:
     max_speech_duration_s: float = float("inf")
     min_silence_duration_ms: int = 2000
     speech_pad_ms: int = 400
+    min_silence_at_max_speech: int = 98
+    use_max_poss_sil_at_max_speech: bool = True


 def get_speech_timestamps(
@@ -69,6 +75,9 @@ def get_speech_timestamps(
     min_silence_duration_ms = vad_options.min_silence_duration_ms
     window_size_samples = 512
     speech_pad_ms = vad_options.speech_pad_ms
+    min_silence_at_max_speech = vad_options.min_silence_at_max_speech
+    use_max_poss_sil_at_max_speech = vad_options.use_max_poss_sil_at_max_speech
+
     min_speech_samples = sampling_rate * min_speech_duration_ms / 1000
     speech_pad_samples = sampling_rate * speech_pad_ms / 1000
     max_speech_samples = (
@@ -77,7 +86,7 @@
         - 2 * speech_pad_samples
     )
     min_silence_samples = sampling_rate * min_silence_duration_ms / 1000
-    min_silence_samples_at_max_speech = sampling_rate * 98 / 1000
+    min_silence_samples_at_max_speech = sampling_rate * min_silence_at_max_speech / 1000

     audio_length_samples = len(audio)
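
As a worked example of the conversion in the hunk above (assuming the VAD's 16 kHz sampling rate and the new 98 ms default):

sampling_rate = 16000
min_silence_at_max_speech = 98  # ms, the VadOptions default

# Same arithmetic as above: 16000 * 98 / 1000
min_silence_samples_at_max_speech = sampling_rate * min_silence_at_max_speech / 1000
print(min_silence_samples_at_max_speech)  # 1568.0 samples, just over three 512-sample windows
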
@@ -91,6 +100,8 @@
     triggered = False
     speeches = []
     current_speech = {}
+    possible_ends = []
+
     if neg_threshold is None:
         neg_threshold = max(threshold - 0.15, 0.01)
@@ -100,45 +111,67 @@
     prev_end = next_start = 0

     for i, speech_prob in enumerate(speech_probs):
+        cur_sample = window_size_samples * i
         if (speech_prob >= threshold) and temp_end:
+            sil_dur = cur_sample - temp_end
+            if sil_dur > min_silence_samples_at_max_speech:
+                possible_ends.append((temp_end, sil_dur))
             temp_end = 0
             if next_start < prev_end:
-                next_start = window_size_samples * i
+                next_start = cur_sample

         if (speech_prob >= threshold) and not triggered:
             triggered = True
-            current_speech["start"] = window_size_samples * i
+            current_speech["start"] = cur_sample
             continue

-        if (
-            triggered
-            and (window_size_samples * i) - current_speech["start"] > max_speech_samples
-        ):
-            if prev_end:
+        if triggered and (cur_sample - current_speech["start"] > max_speech_samples):
+            if use_max_poss_sil_at_max_speech and possible_ends:
+                prev_end, dur = max(possible_ends, key=lambda x: x[1])
                 current_speech["end"] = prev_end
                 speeches.append(current_speech)
                 current_speech = {}
-                # previously reached silence (< neg_thres) and is still not speech (< thres)
                 if next_start < prev_end:
                     triggered = False
                 else:
-                    current_speech["start"] = next_start
+                    next_start = prev_end + dur
+                    if next_start < prev_end + cur_sample:
+                        current_speech["start"] = next_start
+                    else:
+                        triggered = False
                 prev_end = next_start = temp_end = 0
-            else:
-                current_speech["end"] = window_size_samples * i
-                speeches.append(current_speech)
-                current_speech = {}
-                prev_end = next_start = temp_end = 0
-                triggered = False
-                continue
+                possible_ends = []
+            elif prev_end:
+                current_speech["end"] = prev_end
+                speeches.append(current_speech)
+                current_speech = {}
+                if next_start < prev_end:
+                    triggered = False
+                else:
+                    current_speech["start"] = next_start
+                prev_end = next_start = temp_end = 0
+                possible_ends = []
+            else:
+                current_speech["end"] = cur_sample
+                speeches.append(current_speech)
+                current_speech = {}
+                prev_end = next_start = temp_end = 0
+                triggered = False
+                possible_ends = []
+                continue

         if (speech_prob < neg_threshold) and triggered:
             if not temp_end:
-                temp_end = window_size_samples * i
-            # condition to avoid cutting in very short silence
-            if (window_size_samples * i) - temp_end > min_silence_samples_at_max_speech:
+                temp_end = cur_sample
+            sil_dur_now = cur_sample - temp_end
+            if (
+                not use_max_poss_sil_at_max_speech
+                and sil_dur_now > min_silence_samples_at_max_speech
+            ):
                 prev_end = temp_end
-            if (window_size_samples * i) - temp_end < min_silence_samples:
+            if sil_dur_now < min_silence_samples:
                 continue
             else:
                 current_speech["end"] = temp_end
@@ -149,6 +182,7 @@ def get_speech_timestamps(
                 current_speech = {}
                 prev_end = next_start = temp_end = 0
                 triggered = False
+                possible_ends = []
                 continue

         if (
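To illustrate the new bookkeeping (a toy example with made-up numbers, not code from the commit): possible_ends collects (silence_start_sample, silence_duration_samples) pairs, and use_max_poss_sil_at_max_speech selects the longest one instead of the most recent:

possible_ends = [(16000, 1600), (48000, 4800), (80000, 2400)]

# The longest observed silence wins; its start becomes the end of the chunk.
prev_end, dur = max(possible_ends, key=lambda x: x[1])
print(prev_end, dur)  # 48000 4800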

requirements.txt

@@ -1,6 +1,6 @@
 ctranslate2>=4.0,<5
-huggingface_hub>=0.21
+huggingface_hub>=0.23
 tokenizers>=0.13,<1
 onnxruntime>=1.14,<2
 av>=11
-tqdm
\ No newline at end of file
+tqdm
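
A quick sanity check that an environment satisfies the new floor (a sketch; assumes the packaging library is installed):

import huggingface_hub
from packaging.version import Version

assert Version(huggingface_hub.__version__) >= Version("0.23")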

tests/test_transcribe.py

@@ -245,7 +245,7 @@ def test_transcribe_signature():

 def test_monotonic_timestamps(physcisworks_path):
-    model = WhisperModel("tiny")
+    model = WhisperModel("base")
     pipeline = BatchedInferencePipeline(model=model)
     segments, info = model.transcribe(physcisworks_path, word_timestamps=True)