6 Commits

Author SHA1 Message Date
Purfview
ed9a06cd89 Adds new VAD parameters (#1386)
* Adds new VAD parameters

Adds new VAD parameters: 

min_silence_at_max_speech: Minimum silence duration in ms, used to avoid abrupt cuts when max_speech_duration_s is reached.

use_max_poss_sil_at_max_speech: Whether to split at the maximum possible (longest) silence when max_speech_duration_s is reached. If not, the last silence is used.

* Style

* Update doc

* change min_speech_duration_ms (0 -> 250)

* Change min_speech_duration_ms to zero

Set minimum speech duration to zero for flexibility.

---------

Co-authored-by: Mahmoud Ashraf <hassouna97.ma@gmail.com>
2025-11-19 17:40:46 +03:00
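
A minimal usage sketch for the two options described above, passed through faster-whisper's vad_parameters argument (model size, file name, and values are illustrative, not recommendations):

from faster_whisper import WhisperModel

model = WhisperModel("base")
segments, info = model.transcribe(
    "audio.wav",
    vad_filter=True,
    vad_parameters=dict(
        max_speech_duration_s=30.0,
        # Minimum silence (ms) considered when a chunk hits
        # max_speech_duration_s, to avoid an abrupt cut.
        min_silence_at_max_speech=98,
        # Split at the longest detected silence rather than the last one.
        use_max_poss_sil_at_max_speech=True,
    ),
)
for segment in segments:
    print(f"[{segment.start:.2f} -> {segment.end:.2f}] {segment.text}")
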
Purfview
2eeafe05de Update Silero-VAD weights to v6.2 (#1390)
* Update Silero-VAD weights to v6.2

Overall slight quality improvement (no metrics update);
Higher stability on OOD / rare / strange / unique data;
Significant quality improvements on various known edge cases:

    Unusual voices
    Child voices
    Cartoon voices
    Muted voices
    Muted speech
    Lower quality phone calls

https://github.com/snakers4/silero-vad/releases/tag/v6.2

* Changes: tiny -> base in test_monotonic_timestamps()
2025-11-19 17:14:42 +03:00
Purfview
cf42429f96 Remove "local_dir_use_symlinks" from download_model() (#1389)
* Remove "local_dir_use_symlinks" from download_model()

It has been deprecated since huggingface_hub v0.23.0 and produces this warning:

>   /opt/hostedtoolcache/Python/3.9.24/x64/lib/python3.9/site-packages/huggingface_hub/utils/_validators.py:202: UserWarning: The `local_dir_use_symlinks` argument is deprecated and ignored in `snapshot_download`. Downloading to a local directory does not use symlinks anymore.

* Bump huggingface_hub requirement to v0.23
2025-11-18 21:59:01 +03:00
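
After this change, a sketch of the equivalent download call (repo id and file patterns are illustrative; assumes huggingface_hub >= 0.23, where downloading to a local directory no longer uses symlinks):

import huggingface_hub

model_path = huggingface_hub.snapshot_download(
    "Systran/faster-whisper-base",
    local_dir="./models/faster-whisper-base",  # no local_dir_use_symlinks kwarg needed
    allow_patterns=["config.json", "model.bin", "tokenizer.json", "vocabulary.*"],
)
print(model_path)
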
Mahmoud Ashraf
65882eee9f Bump version to 1.2.1 2025-10-31 14:31:14 +03:00
Mahmoud Ashraf
409a6919f9 Prevent timestamps restoration when clip timestamps are provided in batched inference (#1376) 2025-10-31 14:26:17 +03:00
Mahmoud Ashraf
00a5b26b1f Offload retry logic to hf hub (#1382)
* remove requirement for requests
2025-10-30 22:11:01 +03:00
7 changed files with 103 additions and 51 deletions

View File

@@ -418,23 +418,34 @@ class BatchedInferencePipeline:
"Set 'vad_filter' to True or provide 'clip_timestamps'."
)
clip_timestamps_provided = False
audio_chunks, chunks_metadata = collect_chunks(
audio, clip_timestamps, max_duration=chunk_length
)
else:
clip_timestamps_provided = True
clip_timestamps = [
{k: int(v * sampling_rate) for k, v in segment.items()}
for segment in clip_timestamps
]
audio_chunks, chunks_metadata = [], []
for clip in clip_timestamps:
for i, clip in enumerate(clip_timestamps):
audio_chunks.append(audio[clip["start"] : clip["end"]])
clip_duration = (clip["end"] - clip["start"]) / sampling_rate
if clip_duration > 30:
self.model.logger.warning(
"Segment %d is longer than 30 seconds, "
"only the first 30 seconds will be transcribed",
i,
)
chunks_metadata.append(
{
"offset": clip["start"] / sampling_rate,
"duration": (clip["end"] - clip["start"]) / sampling_rate,
"duration": clip_duration,
"segments": [clip],
}
)
@@ -559,7 +570,10 @@ class BatchedInferencePipeline:
options,
log_progress,
)
segments = restore_speech_timestamps(segments, clip_timestamps, sampling_rate)
if not clip_timestamps_provided:
segments = restore_speech_timestamps(
segments, clip_timestamps, sampling_rate
)
return segments, info
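
A minimal usage sketch of the code path changed above: clip timestamps are given in seconds, and with this change the returned segments keep those absolute positions instead of being re-offset by restore_speech_timestamps (file name and clip values are illustrative):

from faster_whisper import BatchedInferencePipeline, WhisperModel, decode_audio

model = WhisperModel("tiny")
pipeline = BatchedInferencePipeline(model=model)
audio = decode_audio("audio.wav")

clip_timestamps = [{"start": 0.0, "end": 5.0}, {"start": 6.0, "end": 15.0}]
segments, info = pipeline.transcribe(audio, clip_timestamps=clip_timestamps)
for segment in segments:
    print(segment.start, segment.end, segment.text)
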

View File

@@ -5,7 +5,6 @@ import re
from typing import List, Optional, Union
import huggingface_hub
import requests
from tqdm.auto import tqdm
@@ -106,7 +105,6 @@ def download_model(
if output_dir is not None:
kwargs["local_dir"] = output_dir
kwargs["local_dir_use_symlinks"] = False
if cache_dir is not None:
kwargs["cache_dir"] = cache_dir
@@ -114,24 +112,7 @@ def download_model(
if use_auth_token is not None:
kwargs["token"] = use_auth_token
try:
return huggingface_hub.snapshot_download(repo_id, **kwargs)
except (
huggingface_hub.utils.HfHubHTTPError,
requests.exceptions.ConnectionError,
) as exception:
logger = get_logger()
logger.warning(
"An error occured while synchronizing the model %s from the Hugging Face Hub:\n%s",
repo_id,
exception,
)
logger.warning(
"Trying to load the model directly from the local cache, if it exists."
)
kwargs["local_files_only"] = True
return huggingface_hub.snapshot_download(repo_id, **kwargs)
return huggingface_hub.snapshot_download(repo_id, **kwargs)
def format_timestamp(

View File

@@ -27,11 +27,15 @@ class VadOptions:
min_speech_duration_ms: Final speech chunks shorter than min_speech_duration_ms are thrown out.
max_speech_duration_s: Maximum duration of speech chunks in seconds. Chunks longer
than max_speech_duration_s will be split at the timestamp of the last silence that
lasts more than 100ms (if any), to prevent aggressive cutting. Otherwise, they will be
split aggressively just before max_speech_duration_s.
lasts more than min_silence_at_max_speech (if any), to prevent aggressive cutting.
Otherwise, they will be split aggressively just before max_speech_duration_s.
min_silence_duration_ms: In the end of each speech chunk wait for min_silence_duration_ms
before separating it
speech_pad_ms: Final speech chunks are padded by speech_pad_ms each side
min_silence_at_max_speech: Minimum silence duration in ms which is used to avoid abrupt cuts
when max_speech_duration_s is reached.
use_max_poss_sil_at_max_speech: Whether to use the maximum possible silence at
max_speech_duration_s or not. If not, the last silence is used.
"""
threshold: float = 0.5
@@ -40,6 +44,8 @@ class VadOptions:
max_speech_duration_s: float = float("inf")
min_silence_duration_ms: int = 2000
speech_pad_ms: int = 400
min_silence_at_max_speech: int = 98
use_max_poss_sil_at_max_speech: bool = True
def get_speech_timestamps(
@@ -69,6 +75,9 @@ def get_speech_timestamps(
min_silence_duration_ms = vad_options.min_silence_duration_ms
window_size_samples = 512
speech_pad_ms = vad_options.speech_pad_ms
min_silence_at_max_speech = vad_options.min_silence_at_max_speech
use_max_poss_sil_at_max_speech = vad_options.use_max_poss_sil_at_max_speech
min_speech_samples = sampling_rate * min_speech_duration_ms / 1000
speech_pad_samples = sampling_rate * speech_pad_ms / 1000
max_speech_samples = (
@@ -77,7 +86,7 @@ def get_speech_timestamps(
- 2 * speech_pad_samples
)
min_silence_samples = sampling_rate * min_silence_duration_ms / 1000
min_silence_samples_at_max_speech = sampling_rate * 98 / 1000
min_silence_samples_at_max_speech = sampling_rate * min_silence_at_max_speech / 1000
audio_length_samples = len(audio)
@@ -91,6 +100,8 @@ def get_speech_timestamps(
triggered = False
speeches = []
current_speech = {}
possible_ends = []
if neg_threshold is None:
neg_threshold = max(threshold - 0.15, 0.01)
@@ -100,45 +111,67 @@ def get_speech_timestamps(
prev_end = next_start = 0
for i, speech_prob in enumerate(speech_probs):
cur_sample = window_size_samples * i
if (speech_prob >= threshold) and temp_end:
sil_dur = cur_sample - temp_end
if sil_dur > min_silence_samples_at_max_speech:
possible_ends.append((temp_end, sil_dur))
temp_end = 0
if next_start < prev_end:
next_start = window_size_samples * i
next_start = cur_sample
if (speech_prob >= threshold) and not triggered:
triggered = True
current_speech["start"] = window_size_samples * i
current_speech["start"] = cur_sample
continue
if (
triggered
and (window_size_samples * i) - current_speech["start"] > max_speech_samples
):
if prev_end:
if triggered and (cur_sample - current_speech["start"] > max_speech_samples):
if use_max_poss_sil_at_max_speech and possible_ends:
prev_end, dur = max(possible_ends, key=lambda x: x[1])
current_speech["end"] = prev_end
speeches.append(current_speech)
current_speech = {}
# previously reached silence (< neg_thres) and is still not speech (< thres)
if next_start < prev_end:
triggered = False
else:
next_start = prev_end + dur
if next_start < prev_end + cur_sample:
current_speech["start"] = next_start
else:
triggered = False
prev_end = next_start = temp_end = 0
possible_ends = []
else:
current_speech["end"] = window_size_samples * i
speeches.append(current_speech)
current_speech = {}
prev_end = next_start = temp_end = 0
triggered = False
continue
if prev_end:
current_speech["end"] = prev_end
speeches.append(current_speech)
current_speech = {}
if next_start < prev_end:
triggered = False
else:
current_speech["start"] = next_start
prev_end = next_start = temp_end = 0
possible_ends = []
else:
current_speech["end"] = cur_sample
speeches.append(current_speech)
current_speech = {}
prev_end = next_start = temp_end = 0
triggered = False
possible_ends = []
continue
if (speech_prob < neg_threshold) and triggered:
if not temp_end:
temp_end = window_size_samples * i
# condition to avoid cutting in very short silence
if (window_size_samples * i) - temp_end > min_silence_samples_at_max_speech:
temp_end = cur_sample
sil_dur_now = cur_sample - temp_end
if (
not use_max_poss_sil_at_max_speech
and sil_dur_now > min_silence_samples_at_max_speech
):
prev_end = temp_end
if (window_size_samples * i) - temp_end < min_silence_samples:
if sil_dur_now < min_silence_samples:
continue
else:
current_speech["end"] = temp_end
@@ -149,6 +182,7 @@ def get_speech_timestamps(
current_speech = {}
prev_end = next_start = temp_end = 0
triggered = False
possible_ends = []
continue
if (
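
A minimal sketch of calling the modified function directly with the new options (audio file and values are illustrative):

from faster_whisper import decode_audio
from faster_whisper.vad import VadOptions, get_speech_timestamps

audio = decode_audio("audio.wav", sampling_rate=16000)
options = VadOptions(
    max_speech_duration_s=30.0,
    min_silence_at_max_speech=98,
    use_max_poss_sil_at_max_speech=True,
)
# Returns a list of {"start": sample_index, "end": sample_index} dicts.
speech_chunks = get_speech_timestamps(audio, options)
print(speech_chunks[:3])
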

View File

@@ -1,3 +1,3 @@
"""Version information."""
__version__ = "1.2.0"
__version__ = "1.2.1"

View File

@@ -1,6 +1,6 @@
ctranslate2>=4.0,<5
huggingface_hub>=0.13
huggingface_hub>=0.23
tokenizers>=0.13,<1
onnxruntime>=1.14,<2
av>=11
tqdm
tqdm

View File

@@ -245,7 +245,7 @@ def test_transcribe_signature():
def test_monotonic_timestamps(physcisworks_path):
model = WhisperModel("tiny")
model = WhisperModel("base")
pipeline = BatchedInferencePipeline(model=model)
segments, info = model.transcribe(physcisworks_path, word_timestamps=True)
@@ -290,3 +290,26 @@ def test_cliptimestamps_segments(jfk_path):
" And so my fellow Americans ask not what your country can do for you, "
"ask what you can do for your country."
)
def test_cliptimestamps_timings(physcisworks_path):
model = WhisperModel("tiny")
pipeline = BatchedInferencePipeline(model=model)
audio = decode_audio(physcisworks_path)
clip_timestamps = [{"start": 0.0, "end": 5.0}, {"start": 6.0, "end": 15.0}]
transcripts = [
" Now I want to return to the conservation of mechanical energy.",
(
" I have here a pendulum. I have an object that weighs 15 kilograms"
" and I can lift it up one meter, which I have done now."
),
]
segments, info = pipeline.transcribe(audio, clip_timestamps=clip_timestamps)
segments = list(segments)
assert len(segments) == 2
for segment, clip, transcript in zip(segments, clip_timestamps, transcripts):
assert clip["start"] == segment.start
assert clip["end"] == segment.end
assert segment.text == transcript