4 Commits
master ... 310

Author          SHA1        Message                                                Date
Mahmoud Ashraf  ba812f55a2  Fix quotes for Python version in CI workflow           2025-10-30 21:14:30 +03:00
Mahmoud Ashraf  44466c7535  Upgrade Python version from 3.9 to 3.10 in CI          2025-10-30 21:12:36 +03:00
Mahmoud Ashraf  e3e46675b2  Update Python version requirements to 3.10 and 3.12    2025-10-30 21:11:50 +03:00
Mahmoud Ashraf  14ad587c98  Update Python version requirement to 3.10 or greater   2025-10-30 21:11:07 +03:00
10 changed files with 60 additions and 112 deletions

View File

@@ -17,10 +17,10 @@ jobs:
steps:
- uses: actions/checkout@v4
- name: Set up Python 3.9
- name: Set up Python 3.10
uses: actions/setup-python@v5
with:
python-version: 3.9
python-version: '3.10'
- name: Install module
run: |
@@ -47,10 +47,10 @@ jobs:
steps:
- uses: actions/checkout@v4
- name: Set up Python 3.9
- name: Set up Python 3.10
uses: actions/setup-python@v5
with:
python-version: 3.9
python-version: '3.10'
- name: Install module
run: |
@@ -69,10 +69,10 @@ jobs:
steps:
- uses: actions/checkout@v4
- name: Set up Python 3.9
- name: Set up Python 3.10
uses: actions/setup-python@v5
with:
python-version: 3.9
python-version: '3.10'
- name: Install dependencies
run: |
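
Editor's note on the quoting change in these workflow hunks: YAML parses an unquoted 3.10 as the float 3.1, so the setup step would silently request Python 3.1 instead of 3.10. A minimal sketch of the pitfall, assuming PyYAML is installed purely to illustrate the parsing rule (it is not a dependency of this project):

import yaml  # assumption: PyYAML, used only to demonstrate YAML number parsing

# Unquoted, YAML reads 3.10 as a float and the trailing zero is lost.
print(yaml.safe_load("python-version: 3.10"))    # {'python-version': 3.1}

# Quoted, the value stays a string and keeps its meaning as a version.
print(yaml.safe_load("python-version: '3.10'"))  # {'python-version': '3.10'}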

View File

@@ -56,7 +56,7 @@ For reference, here's the time and memory usage that are required to transcribe
## Requirements
* Python 3.9 or greater
* Python 3.10 or greater
Unlike openai-whisper, FFmpeg does **not** need to be installed on the system. The audio is decoded with the Python library [PyAV](https://github.com/PyAV-Org/PyAV) which bundles the FFmpeg libraries in its package.
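
To illustrate the requirement described above, here is a minimal transcription sketch that relies only on the package itself, with no system FFmpeg, since PyAV handles decoding; the model size and audio path are illustrative placeholders:

from faster_whisper import WhisperModel

model = WhisperModel("tiny")  # placeholder model size
segments, info = model.transcribe("audio.mp3")  # audio decoded by PyAV, no system FFmpeg
for segment in segments:
    print(f"[{segment.start:.2f} -> {segment.end:.2f}] {segment.text}")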

View File

@@ -418,34 +418,23 @@ class BatchedInferencePipeline:
"Set 'vad_filter' to True or provide 'clip_timestamps'."
)
clip_timestamps_provided = False
audio_chunks, chunks_metadata = collect_chunks(
audio, clip_timestamps, max_duration=chunk_length
)
else:
clip_timestamps_provided = True
clip_timestamps = [
{k: int(v * sampling_rate) for k, v in segment.items()}
for segment in clip_timestamps
]
audio_chunks, chunks_metadata = [], []
for i, clip in enumerate(clip_timestamps):
for clip in clip_timestamps:
audio_chunks.append(audio[clip["start"] : clip["end"]])
clip_duration = (clip["end"] - clip["start"]) / sampling_rate
if clip_duration > 30:
self.model.logger.warning(
"Segment %d is longer than 30 seconds, "
"only the first 30 seconds will be transcribed",
i,
)
chunks_metadata.append(
{
"offset": clip["start"] / sampling_rate,
"duration": clip_duration,
"duration": (clip["end"] - clip["start"]) / sampling_rate,
"segments": [clip],
}
)
@@ -570,10 +559,7 @@ class BatchedInferencePipeline:
options,
log_progress,
)
if not clip_timestamps_provided:
segments = restore_speech_timestamps(
segments, clip_timestamps, sampling_rate
)
segments = restore_speech_timestamps(segments, clip_timestamps, sampling_rate)
return segments, info
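
For readers unfamiliar with the clip_timestamps path this hunk touches, a rough usage sketch follows, mirroring the test removed at the bottom of this diff; the audio path is a placeholder, and how timestamps are restored to the original audio positions differs between the two sides of this comparison:

from faster_whisper import BatchedInferencePipeline, WhisperModel, decode_audio

model = WhisperModel("tiny")
pipeline = BatchedInferencePipeline(model=model)

audio = decode_audio("audio.wav")  # placeholder path
# Clips are given in seconds and converted to samples inside the pipeline.
clip_timestamps = [{"start": 0.0, "end": 5.0}, {"start": 6.0, "end": 15.0}]

segments, info = pipeline.transcribe(audio, clip_timestamps=clip_timestamps)
for segment in segments:
    print(segment.start, segment.end, segment.text)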

View File

@@ -5,6 +5,7 @@ import re
from typing import List, Optional, Union
import huggingface_hub
import requests
from tqdm.auto import tqdm
@@ -105,6 +106,7 @@ def download_model(
if output_dir is not None:
kwargs["local_dir"] = output_dir
kwargs["local_dir_use_symlinks"] = False
if cache_dir is not None:
kwargs["cache_dir"] = cache_dir
@@ -112,7 +114,24 @@ def download_model(
if use_auth_token is not None:
kwargs["token"] = use_auth_token
return huggingface_hub.snapshot_download(repo_id, **kwargs)
try:
return huggingface_hub.snapshot_download(repo_id, **kwargs)
except (
huggingface_hub.utils.HfHubHTTPError,
requests.exceptions.ConnectionError,
) as exception:
logger = get_logger()
logger.warning(
"An error occured while synchronizing the model %s from the Hugging Face Hub:\n%s",
repo_id,
exception,
)
logger.warning(
"Trying to load the model directly from the local cache, if it exists."
)
kwargs["local_files_only"] = True
return huggingface_hub.snapshot_download(repo_id, **kwargs)
def format_timestamp(
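
The hunk above adds a download-then-fallback step; as a standalone sketch of the same pattern, using only the huggingface_hub calls that appear in the hunk (the repo id is illustrative):

import huggingface_hub
import requests

def snapshot_with_offline_fallback(repo_id: str, **kwargs):
    # Try the Hub first; on network trouble, retry against the local cache only.
    try:
        return huggingface_hub.snapshot_download(repo_id, **kwargs)
    except (
        huggingface_hub.utils.HfHubHTTPError,
        requests.exceptions.ConnectionError,
    ):
        kwargs["local_files_only"] = True
        return huggingface_hub.snapshot_download(repo_id, **kwargs)

path = snapshot_with_offline_fallback("Systran/faster-whisper-tiny")  # illustrative repo id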

View File

@@ -27,15 +27,11 @@ class VadOptions:
min_speech_duration_ms: Final speech chunks shorter than min_speech_duration_ms are thrown out.
max_speech_duration_s: Maximum duration of speech chunks in seconds. Chunks longer
than max_speech_duration_s will be split at the timestamp of the last silence that
lasts more than min_silence_at_max_speech (if any), to prevent aggressive cutting.
Otherwise, they will be split aggressively just before max_speech_duration_s.
lasts more than 100ms (if any), to prevent aggressive cutting. Otherwise, they will be
split aggressively just before max_speech_duration_s.
min_silence_duration_ms: At the end of each speech chunk, wait for min_silence_duration_ms
before separating it
speech_pad_ms: Final speech chunks are padded by speech_pad_ms on each side
min_silence_at_max_speech: Minimum silence duration in ms which is used to avoid abrupt cuts
when max_speech_duration_s is reached.
use_max_poss_sil_at_max_speech: Whether to use the maximum possible silence at
max_speech_duration_s or not. If not, the last silence is used.
"""
threshold: float = 0.5
@@ -44,8 +40,6 @@ class VadOptions:
max_speech_duration_s: float = float("inf")
min_silence_duration_ms: int = 2000
speech_pad_ms: int = 400
min_silence_at_max_speech: int = 98
use_max_poss_sil_at_max_speech: bool = True
def get_speech_timestamps(
@@ -75,9 +69,6 @@ def get_speech_timestamps(
min_silence_duration_ms = vad_options.min_silence_duration_ms
window_size_samples = 512
speech_pad_ms = vad_options.speech_pad_ms
min_silence_at_max_speech = vad_options.min_silence_at_max_speech
use_max_poss_sil_at_max_speech = vad_options.use_max_poss_sil_at_max_speech
min_speech_samples = sampling_rate * min_speech_duration_ms / 1000
speech_pad_samples = sampling_rate * speech_pad_ms / 1000
max_speech_samples = (
@@ -86,7 +77,7 @@ def get_speech_timestamps(
- 2 * speech_pad_samples
)
min_silence_samples = sampling_rate * min_silence_duration_ms / 1000
min_silence_samples_at_max_speech = sampling_rate * min_silence_at_max_speech / 1000
min_silence_samples_at_max_speech = sampling_rate * 98 / 1000
audio_length_samples = len(audio)
@@ -100,8 +91,6 @@ def get_speech_timestamps(
triggered = False
speeches = []
current_speech = {}
possible_ends = []
if neg_threshold is None:
neg_threshold = max(threshold - 0.15, 0.01)
@@ -111,67 +100,45 @@ def get_speech_timestamps(
prev_end = next_start = 0
for i, speech_prob in enumerate(speech_probs):
cur_sample = window_size_samples * i
if (speech_prob >= threshold) and temp_end:
sil_dur = cur_sample - temp_end
if sil_dur > min_silence_samples_at_max_speech:
possible_ends.append((temp_end, sil_dur))
temp_end = 0
if next_start < prev_end:
next_start = cur_sample
next_start = window_size_samples * i
if (speech_prob >= threshold) and not triggered:
triggered = True
current_speech["start"] = cur_sample
current_speech["start"] = window_size_samples * i
continue
if triggered and (cur_sample - current_speech["start"] > max_speech_samples):
if use_max_poss_sil_at_max_speech and possible_ends:
prev_end, dur = max(possible_ends, key=lambda x: x[1])
if (
triggered
and (window_size_samples * i) - current_speech["start"] > max_speech_samples
):
if prev_end:
current_speech["end"] = prev_end
speeches.append(current_speech)
current_speech = {}
next_start = prev_end + dur
if next_start < prev_end + cur_sample:
# previously reached silence (< neg_thres) and is still not speech (< thres)
if next_start < prev_end:
triggered = False
else:
current_speech["start"] = next_start
else:
triggered = False
prev_end = next_start = temp_end = 0
possible_ends = []
else:
if prev_end:
current_speech["end"] = prev_end
speeches.append(current_speech)
current_speech = {}
if next_start < prev_end:
triggered = False
else:
current_speech["start"] = next_start
prev_end = next_start = temp_end = 0
possible_ends = []
else:
current_speech["end"] = cur_sample
speeches.append(current_speech)
current_speech = {}
prev_end = next_start = temp_end = 0
triggered = False
possible_ends = []
continue
current_speech["end"] = window_size_samples * i
speeches.append(current_speech)
current_speech = {}
prev_end = next_start = temp_end = 0
triggered = False
continue
if (speech_prob < neg_threshold) and triggered:
if not temp_end:
temp_end = cur_sample
sil_dur_now = cur_sample - temp_end
if (
not use_max_poss_sil_at_max_speech
and sil_dur_now > min_silence_samples_at_max_speech
):
temp_end = window_size_samples * i
# condition to avoid cutting in very short silence
if (window_size_samples * i) - temp_end > min_silence_samples_at_max_speech:
prev_end = temp_end
if sil_dur_now < min_silence_samples:
if (window_size_samples * i) - temp_end < min_silence_samples:
continue
else:
current_speech["end"] = temp_end
@@ -182,7 +149,6 @@ def get_speech_timestamps(
current_speech = {}
prev_end = next_start = temp_end = 0
triggered = False
possible_ends = []
continue
if (
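
Since the docstring hunk above describes the VAD knobs in prose, a hedged configuration sketch may help. It assumes VadOptions is importable from faster_whisper.vad and that WhisperModel.transcribe exposes vad_filter/vad_parameters; neither is shown in this diff, and the audio path and model size are placeholders:

from faster_whisper import WhisperModel
from faster_whisper.vad import VadOptions  # assumption: VadOptions lives in faster_whisper.vad

# Values mirror the defaults visible in the hunk above.
vad_options = VadOptions(
    threshold=0.5,
    min_silence_duration_ms=2000,
    speech_pad_ms=400,
)

model = WhisperModel("tiny")
# Assumption: transcribe() accepts vad_filter/vad_parameters; they are not part of this diff.
segments, info = model.transcribe("audio.mp3", vad_filter=True, vad_parameters=vad_options)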

View File

@@ -1,3 +1,3 @@
"""Version information."""
__version__ = "1.2.1"
__version__ = "1.2.0"

View File

@@ -1,6 +1,6 @@
ctranslate2>=4.0,<5
huggingface_hub>=0.23
huggingface_hub>=0.13
tokenizers>=0.13,<1
onnxruntime>=1.14,<2
av>=11
tqdm
tqdm

View File

@@ -45,13 +45,13 @@ setup(
"License :: OSI Approved :: MIT License",
"Programming Language :: Python :: 3",
"Programming Language :: Python :: 3 :: Only",
"Programming Language :: Python :: 3.9",
"Programming Language :: Python :: 3.10",
"Programming Language :: Python :: 3.11",
"Programming Language :: Python :: 3.12",
"Topic :: Scientific/Engineering :: Artificial Intelligence",
],
keywords="openai whisper speech ctranslate2 inference quantization transformer",
python_requires=">=3.9",
python_requires=">=3.10",
install_requires=install_requires,
extras_require={
"conversion": conversion_requires,

View File

@@ -245,7 +245,7 @@ def test_transcribe_signature():
def test_monotonic_timestamps(physcisworks_path):
model = WhisperModel("base")
model = WhisperModel("tiny")
pipeline = BatchedInferencePipeline(model=model)
segments, info = model.transcribe(physcisworks_path, word_timestamps=True)
@@ -290,26 +290,3 @@ def test_cliptimestamps_segments(jfk_path):
" And so my fellow Americans ask not what your country can do for you, "
"ask what you can do for your country."
)
def test_cliptimestamps_timings(physcisworks_path):
model = WhisperModel("tiny")
pipeline = BatchedInferencePipeline(model=model)
audio = decode_audio(physcisworks_path)
clip_timestamps = [{"start": 0.0, "end": 5.0}, {"start": 6.0, "end": 15.0}]
transcripts = [
" Now I want to return to the conservation of mechanical energy.",
(
" I have here a pendulum. I have an object that weighs 15 kilograms"
" and I can lift it up one meter, which I have done now."
),
]
segments, info = pipeline.transcribe(audio, clip_timestamps=clip_timestamps)
segments = list(segments)
assert len(segments) == 2
for segment, clip, transcript in zip(segments, clip_timestamps, transcripts):
assert clip["start"] == segment.start
assert clip["end"] == segment.end
assert segment.text == transcript