From c26d609974ef7c36715f23f0fbcdb3f9b5f8a663 Mon Sep 17 00:00:00 2001
From: Mahmoud Ashraf
Date: Sat, 16 Aug 2025 14:30:50 +0300
Subject: [PATCH] only merge when `clip_timestamps` are not provided (#1345)

fixes #1340 and allows for batching multiple audio files less than 30s each
---
 faster_whisper/transcribe.py | 18 +++++++++++++++---
 tests/test_transcribe.py     | 21 +++++++++++++++++++++
 2 files changed, 36 insertions(+), 3 deletions(-)

diff --git a/faster_whisper/transcribe.py b/faster_whisper/transcribe.py
index 382c77c..a73b31b 100644
--- a/faster_whisper/transcribe.py
+++ b/faster_whisper/transcribe.py
@@ -417,15 +417,27 @@ class BatchedInferencePipeline:
                     "No clip timestamps found. "
                     "Set 'vad_filter' to True or provide 'clip_timestamps'."
                 )
+
+            audio_chunks, chunks_metadata = collect_chunks(
+                audio, clip_timestamps, max_duration=chunk_length
+            )
+
         else:
             clip_timestamps = [
                 {k: int(v * sampling_rate) for k, v in segment.items()}
                 for segment in clip_timestamps
             ]
 
-        audio_chunks, chunks_metadata = collect_chunks(
-            audio, clip_timestamps, max_duration=chunk_length
-        )
+            audio_chunks, chunks_metadata = [], []
+            for clip in clip_timestamps:
+                audio_chunks.append(audio[clip["start"] : clip["end"]])
+                chunks_metadata.append(
+                    {
+                        "offset": clip["start"] / sampling_rate,
+                        "duration": (clip["end"] - clip["start"]) / sampling_rate,
+                        "segments": [clip],
+                    }
+                )
 
         duration_after_vad = (
             sum((segment["end"] - segment["start"]) for segment in clip_timestamps)

diff --git a/tests/test_transcribe.py b/tests/test_transcribe.py
index d3d9270..48b409e 100644
--- a/tests/test_transcribe.py
+++ b/tests/test_transcribe.py
@@ -269,3 +269,24 @@ def test_monotonic_timestamps(physcisworks_path):
             assert word.start <= word.end
             assert word.end <= segments[i].end
     assert segments[-1].end <= info.duration
+
+
+def test_cliptimestamps_segments(jfk_path):
+    model = WhisperModel("tiny")
+    pipeline = BatchedInferencePipeline(model=model)
+
+    audio = decode_audio(jfk_path)
+    audio = np.concatenate([audio, audio])
+    clip_timestamps = [{"start": 0.0, "end": 11.0}, {"start": 11.0, "end": 22.0}]
+
+    segments, info = pipeline.transcribe(audio, clip_timestamps=clip_timestamps)
+    segments = list(segments)
+
+    assert len(segments) == 2
+    for segment, clip in zip(segments, clip_timestamps):
+        assert segment.start == clip["start"]
+        assert segment.end == clip["end"]
+        assert segment.text == (
+            " And so my fellow Americans ask not what your country can do for you, "
+            "ask what you can do for your country."
+        )
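
---
Usage note (not part of the patch): with this change, explicitly provided
`clip_timestamps` keep each clip as its own chunk instead of being merged into
~30 s windows, so several short recordings can be batched in a single
transcribe() call. A minimal sketch mirroring the test above; the two file
paths are placeholders, the rest uses the public faster_whisper API:

import numpy as np

from faster_whisper import BatchedInferencePipeline, WhisperModel, decode_audio

model = WhisperModel("tiny")
pipeline = BatchedInferencePipeline(model=model)

# Decode each short (<30 s) file and lay them back to back in one array.
first = decode_audio("first.wav")    # placeholder path
second = decode_audio("second.wav")  # placeholder path
audio = np.concatenate([first, second])

# decode_audio resamples to 16 kHz by default.
sampling_rate = 16000
boundary = first.shape[0] / sampling_rate

# One clip per file (start/end in seconds); each clip becomes its own
# batch entry and yields its own segment(s).
clip_timestamps = [
    {"start": 0.0, "end": boundary},
    {"start": boundary, "end": audio.shape[0] / sampling_rate},
]

segments, info = pipeline.transcribe(audio, clip_timestamps=clip_timestamps)
for segment in segments:
    print(f"[{segment.start:.2f}s -> {segment.end:.2f}s] {segment.text}")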