Mirror of https://github.com/SYSTRAN/faster-whisper.git, synced 2026-01-12 23:18:06 -05:00

Compare commits (4 commits)

| SHA1 |
|---|
| ba812f55a2 |
| 44466c7535 |
| e3e46675b2 |
| 14ad587c98 |
.github/workflows/ci.yml (vendored): 12 changed lines
```diff
@@ -17,10 +17,10 @@ jobs:
     steps:
       - uses: actions/checkout@v4
 
-      - name: Set up Python 3.9
+      - name: Set up Python 3.10
         uses: actions/setup-python@v5
         with:
-          python-version: 3.9
+          python-version: '3.10'
 
       - name: Install module
         run: |
@@ -47,10 +47,10 @@ jobs:
     steps:
       - uses: actions/checkout@v4
 
-      - name: Set up Python 3.9
+      - name: Set up Python 3.10
         uses: actions/setup-python@v5
         with:
-          python-version: 3.9
+          python-version: '3.10'
 
       - name: Install module
         run: |
@@ -69,10 +69,10 @@ jobs:
     steps:
       - uses: actions/checkout@v4
 
-      - name: Set up Python 3.9
+      - name: Set up Python 3.10
         uses: actions/setup-python@v5
         with:
-          python-version: 3.9
+          python-version: '3.10'
 
       - name: Install dependencies
         run: |
```
README.md

```diff
@@ -56,7 +56,7 @@ For reference, here's the time and memory usage that are required to transcribe
 
 ## Requirements
 
-* Python 3.9 or greater
+* Python 3.10 or greater
 
 Unlike openai-whisper, FFmpeg does **not** need to be installed on the system. The audio is decoded with the Python library [PyAV](https://github.com/PyAV-Org/PyAV) which bundles the FFmpeg libraries in its package.
 
```
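As a quick illustration of that point, here is a minimal sketch of decoding a file through the bundled PyAV backend without any system FFmpeg binary; the file path is a placeholder:

```python
# Minimal sketch: decode audio with faster-whisper's bundled PyAV-based decoder,
# so no system FFmpeg installation is needed. "audio.mp3" is a placeholder path.
from faster_whisper import decode_audio

samples = decode_audio("audio.mp3", sampling_rate=16000)
print(samples.shape, samples.dtype)  # mono float32 waveform at 16 kHz
```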
Binary file not shown.
faster_whisper/transcribe.py

```diff
@@ -418,34 +418,23 @@ class BatchedInferencePipeline:
                     "Set 'vad_filter' to True or provide 'clip_timestamps'."
                 )
 
-            clip_timestamps_provided = False
             audio_chunks, chunks_metadata = collect_chunks(
                 audio, clip_timestamps, max_duration=chunk_length
             )
-
         else:
-            clip_timestamps_provided = True
             clip_timestamps = [
                 {k: int(v * sampling_rate) for k, v in segment.items()}
                 for segment in clip_timestamps
             ]
 
             audio_chunks, chunks_metadata = [], []
-            for i, clip in enumerate(clip_timestamps):
+            for clip in clip_timestamps:
                 audio_chunks.append(audio[clip["start"] : clip["end"]])
 
-                clip_duration = (clip["end"] - clip["start"]) / sampling_rate
-                if clip_duration > 30:
-                    self.model.logger.warning(
-                        "Segment %d is longer than 30 seconds, "
-                        "only the first 30 seconds will be transcribed",
-                        i,
-                    )
-
                 chunks_metadata.append(
                     {
                         "offset": clip["start"] / sampling_rate,
-                        "duration": clip_duration,
+                        "duration": (clip["end"] - clip["start"]) / sampling_rate,
                         "segments": [clip],
                     }
                 )
@@ -570,10 +559,7 @@ class BatchedInferencePipeline:
             options,
             log_progress,
         )
-        if not clip_timestamps_provided:
-            segments = restore_speech_timestamps(
-                segments, clip_timestamps, sampling_rate
-            )
+        segments = restore_speech_timestamps(segments, clip_timestamps, sampling_rate)
 
         return segments, info
 
```
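For orientation, the dict-based, seconds-valued clip_timestamps handled above is driven from user code roughly as in the sketch below. It mirrors the test change near the end of this comparison rather than a settled public API, exists only on the side of the comparison that keeps the dict handling, and uses placeholder model size and file path:

```python
# Hedged sketch of the dict-based clip_timestamps usage shown in the hunk above and in
# the test change further down. "audio.wav" is a placeholder path, and this form is only
# present on the side of the comparison that carries the clip_timestamps changes.
from faster_whisper import BatchedInferencePipeline, WhisperModel, decode_audio

model = WhisperModel("tiny")
pipeline = BatchedInferencePipeline(model=model)

audio = decode_audio("audio.wav")
clips = [{"start": 0.0, "end": 5.0}, {"start": 6.0, "end": 15.0}]  # seconds

segments, info = pipeline.transcribe(audio, clip_timestamps=clips)
for segment in segments:
    print(segment.start, segment.end, segment.text)
```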
faster_whisper/utils.py

```diff
@@ -5,6 +5,7 @@ import re
 from typing import List, Optional, Union
 
 import huggingface_hub
+import requests
 
 from tqdm.auto import tqdm
 
@@ -105,6 +106,7 @@ def download_model(
 
     if output_dir is not None:
         kwargs["local_dir"] = output_dir
+        kwargs["local_dir_use_symlinks"] = False
 
     if cache_dir is not None:
         kwargs["cache_dir"] = cache_dir
@@ -112,7 +114,24 @@ def download_model(
     if use_auth_token is not None:
         kwargs["token"] = use_auth_token
 
-    return huggingface_hub.snapshot_download(repo_id, **kwargs)
+    try:
+        return huggingface_hub.snapshot_download(repo_id, **kwargs)
+    except (
+        huggingface_hub.utils.HfHubHTTPError,
+        requests.exceptions.ConnectionError,
+    ) as exception:
+        logger = get_logger()
+        logger.warning(
+            "An error occured while synchronizing the model %s from the Hugging Face Hub:\n%s",
+            repo_id,
+            exception,
+        )
+        logger.warning(
+            "Trying to load the model directly from the local cache, if it exists."
+        )
+
+        kwargs["local_files_only"] = True
+        return huggingface_hub.snapshot_download(repo_id, **kwargs)
 
 
 def format_timestamp(
```
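Standing alone, the fallback that this hunk adds looks roughly like the sketch below: if the Hub cannot be reached, the same snapshot_download() call is retried against the local cache only. The repo id is just an example:

```python
# Hedged sketch of the offline fallback added above: on an HTTP or connection error,
# retry snapshot_download() with local_files_only=True so a previously cached copy of
# the model is used. The repo id below is only an example.
import huggingface_hub
import requests


def fetch_snapshot(repo_id: str, **kwargs) -> str:
    try:
        return huggingface_hub.snapshot_download(repo_id, **kwargs)
    except (
        huggingface_hub.utils.HfHubHTTPError,
        requests.exceptions.ConnectionError,
    ):
        # The Hub is unreachable: fall back to whatever is already in the local cache.
        kwargs["local_files_only"] = True
        return huggingface_hub.snapshot_download(repo_id, **kwargs)


model_dir = fetch_snapshot("Systran/faster-whisper-tiny")
print(model_dir)
```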
faster_whisper/vad.py

```diff
@@ -27,15 +27,11 @@ class VadOptions:
     min_speech_duration_ms: Final speech chunks shorter min_speech_duration_ms are thrown out.
     max_speech_duration_s: Maximum duration of speech chunks in seconds. Chunks longer
       than max_speech_duration_s will be split at the timestamp of the last silence that
-      lasts more than min_silence_at_max_speech (if any), to prevent aggressive cutting.
-      Otherwise, they will be split aggressively just before max_speech_duration_s.
+      lasts more than 100ms (if any), to prevent aggressive cutting. Otherwise, they will be
+      split aggressively just before max_speech_duration_s.
     min_silence_duration_ms: In the end of each speech chunk wait for min_silence_duration_ms
       before separating it
     speech_pad_ms: Final speech chunks are padded by speech_pad_ms each side
-    min_silence_at_max_speech: Minimum silence duration in ms which is used to avoid abrupt cuts
-      when max_speech_duration_s is reached.
-    use_max_poss_sil_at_max_speech: Whether to use the maximum possible silence at
-      max_speech_duration_s or not. If not, the last silence is used.
     """
 
     threshold: float = 0.5
@@ -44,8 +40,6 @@ class VadOptions:
     max_speech_duration_s: float = float("inf")
     min_silence_duration_ms: int = 2000
     speech_pad_ms: int = 400
-    min_silence_at_max_speech: int = 98
-    use_max_poss_sil_at_max_speech: bool = True
 
 
 def get_speech_timestamps(
```
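For context, these options are normally supplied through the transcription call rather than constructed by hand. A minimal sketch, using only fields present on both sides of this comparison and a placeholder audio path:

```python
# Hedged sketch: pass VAD options through WhisperModel.transcribe(). Only fields that
# exist on both sides of this comparison are used; "audio.wav" is a placeholder path.
from faster_whisper import WhisperModel

model = WhisperModel("tiny")
segments, info = model.transcribe(
    "audio.wav",
    vad_filter=True,
    vad_parameters={"min_silence_duration_ms": 2000, "speech_pad_ms": 400},
)
for segment in segments:
    print(segment.start, segment.end, segment.text)
```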
```diff
@@ -75,9 +69,6 @@
     min_silence_duration_ms = vad_options.min_silence_duration_ms
     window_size_samples = 512
     speech_pad_ms = vad_options.speech_pad_ms
-    min_silence_at_max_speech = vad_options.min_silence_at_max_speech
-    use_max_poss_sil_at_max_speech = vad_options.use_max_poss_sil_at_max_speech
-
     min_speech_samples = sampling_rate * min_speech_duration_ms / 1000
     speech_pad_samples = sampling_rate * speech_pad_ms / 1000
     max_speech_samples = (
@@ -86,7 +77,7 @@
         - 2 * speech_pad_samples
     )
     min_silence_samples = sampling_rate * min_silence_duration_ms / 1000
-    min_silence_samples_at_max_speech = sampling_rate * min_silence_at_max_speech / 1000
+    min_silence_samples_at_max_speech = sampling_rate * 98 / 1000
 
     audio_length_samples = len(audio)
 
@@ -100,8 +91,6 @@
     triggered = False
     speeches = []
     current_speech = {}
-    possible_ends = []
-
     if neg_threshold is None:
         neg_threshold = max(threshold - 0.15, 0.01)
 
```
```diff
@@ -111,67 +100,45 @@
     prev_end = next_start = 0
 
     for i, speech_prob in enumerate(speech_probs):
-        cur_sample = window_size_samples * i
-
         if (speech_prob >= threshold) and temp_end:
-            sil_dur = cur_sample - temp_end
-            if sil_dur > min_silence_samples_at_max_speech:
-                possible_ends.append((temp_end, sil_dur))
             temp_end = 0
             if next_start < prev_end:
-                next_start = cur_sample
+                next_start = window_size_samples * i
 
         if (speech_prob >= threshold) and not triggered:
            triggered = True
-            current_speech["start"] = cur_sample
+            current_speech["start"] = window_size_samples * i
             continue
 
-        if triggered and (cur_sample - current_speech["start"] > max_speech_samples):
-            if use_max_poss_sil_at_max_speech and possible_ends:
-                prev_end, dur = max(possible_ends, key=lambda x: x[1])
-                current_speech["end"] = prev_end
-                speeches.append(current_speech)
-                current_speech = {}
-                next_start = prev_end + dur
-
-                if next_start < prev_end + cur_sample:
-                    current_speech["start"] = next_start
-                else:
-                    triggered = False
-                prev_end = next_start = temp_end = 0
-                possible_ends = []
-            else:
-                if prev_end:
-                    current_speech["end"] = prev_end
-                    speeches.append(current_speech)
-                    current_speech = {}
-                    if next_start < prev_end:
-                        triggered = False
-                    else:
-                        current_speech["start"] = next_start
-                    prev_end = next_start = temp_end = 0
-                    possible_ends = []
-                else:
-                    current_speech["end"] = cur_sample
-                    speeches.append(current_speech)
-                    current_speech = {}
-                    prev_end = next_start = temp_end = 0
-                    triggered = False
-                    possible_ends = []
-                    continue
+        if (
+            triggered
+            and (window_size_samples * i) - current_speech["start"] > max_speech_samples
+        ):
+            if prev_end:
+                current_speech["end"] = prev_end
+                speeches.append(current_speech)
+                current_speech = {}
+                # previously reached silence (< neg_thres) and is still not speech (< thres)
+                if next_start < prev_end:
+                    triggered = False
+                else:
+                    current_speech["start"] = next_start
+                prev_end = next_start = temp_end = 0
+            else:
+                current_speech["end"] = window_size_samples * i
+                speeches.append(current_speech)
+                current_speech = {}
+                prev_end = next_start = temp_end = 0
+                triggered = False
+                continue
 
         if (speech_prob < neg_threshold) and triggered:
             if not temp_end:
-                temp_end = cur_sample
-            sil_dur_now = cur_sample - temp_end
-
-            if (
-                not use_max_poss_sil_at_max_speech
-                and sil_dur_now > min_silence_samples_at_max_speech
-            ):
-                prev_end = temp_end
-
-            if sil_dur_now < min_silence_samples:
+                temp_end = window_size_samples * i
+            # condition to avoid cutting in very short silence
+            if (window_size_samples * i) - temp_end > min_silence_samples_at_max_speech:
+                prev_end = temp_end
+
+            if (window_size_samples * i) - temp_end < min_silence_samples:
                 continue
             else:
                 current_speech["end"] = temp_end
```
```diff
@@ -182,7 +149,6 @@
                 current_speech = {}
                 prev_end = next_start = temp_end = 0
                 triggered = False
-                possible_ends = []
                 continue
 
     if (
```
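The loop above can also be exercised directly. A small sketch, assuming get_speech_timestamps(audio, vad_options) keeps the signature implied by the options object and returns dicts with sample-indexed "start"/"end" keys as the code suggests, with a placeholder audio path:

```python
# Hedged sketch: run the VAD shown above directly on a decoded waveform. The signature
# and sample-indexed output are assumptions drawn from the code above; "audio.wav" is a
# placeholder path.
from faster_whisper import decode_audio
from faster_whisper.vad import VadOptions, get_speech_timestamps

sampling_rate = 16000
audio = decode_audio("audio.wav", sampling_rate=sampling_rate)
options = VadOptions(threshold=0.5, min_silence_duration_ms=2000)

for chunk in get_speech_timestamps(audio, options):
    # Convert sample indices back to seconds for display.
    print(chunk["start"] / sampling_rate, chunk["end"] / sampling_rate)
```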
faster_whisper/version.py

```diff
@@ -1,3 +1,3 @@
 """Version information."""
 
-__version__ = "1.2.1"
+__version__ = "1.2.0"
```
requirements.txt

```diff
@@ -1,6 +1,6 @@
 ctranslate2>=4.0,<5
-huggingface_hub>=0.23
+huggingface_hub>=0.13
 tokenizers>=0.13,<1
 onnxruntime>=1.14,<2
 av>=11
-tqdm
+tqdm
```
setup.py: 4 changed lines
```diff
@@ -45,13 +45,13 @@ setup(
         "License :: OSI Approved :: MIT License",
         "Programming Language :: Python :: 3",
         "Programming Language :: Python :: 3 :: Only",
-        "Programming Language :: Python :: 3.9",
         "Programming Language :: Python :: 3.10",
         "Programming Language :: Python :: 3.11",
         "Programming Language :: Python :: 3.12",
         "Topic :: Scientific/Engineering :: Artificial Intelligence",
     ],
     keywords="openai whisper speech ctranslate2 inference quantization transformer",
-    python_requires=">=3.9",
+    python_requires=">=3.10",
     install_requires=install_requires,
     extras_require={
         "conversion": conversion_requires,
```
tests/test_transcribe.py

```diff
@@ -245,7 +245,7 @@ def test_transcribe_signature():
 
 
 def test_monotonic_timestamps(physcisworks_path):
-    model = WhisperModel("base")
+    model = WhisperModel("tiny")
     pipeline = BatchedInferencePipeline(model=model)
 
     segments, info = model.transcribe(physcisworks_path, word_timestamps=True)
@@ -290,26 +290,3 @@ def test_cliptimestamps_segments(jfk_path):
         " And so my fellow Americans ask not what your country can do for you, "
         "ask what you can do for your country."
     )
-
-
-def test_cliptimestamps_timings(physcisworks_path):
-    model = WhisperModel("tiny")
-    pipeline = BatchedInferencePipeline(model=model)
-
-    audio = decode_audio(physcisworks_path)
-    clip_timestamps = [{"start": 0.0, "end": 5.0}, {"start": 6.0, "end": 15.0}]
-    transcripts = [
-        " Now I want to return to the conservation of mechanical energy.",
-        (
-            " I have here a pendulum. I have an object that weighs 15 kilograms"
-            " and I can lift it up one meter, which I have done now."
-        ),
-    ]
-    segments, info = pipeline.transcribe(audio, clip_timestamps=clip_timestamps)
-    segments = list(segments)
-
-    assert len(segments) == 2
-    for segment, clip, transcript in zip(segments, clip_timestamps, transcripts):
-        assert clip["start"] == segment.start
-        assert clip["end"] == segment.end
-        assert segment.text == transcript
```