From 409a6919f98e2664794e80a93e421803837bc0fe Mon Sep 17 00:00:00 2001
From: Mahmoud Ashraf
Date: Fri, 31 Oct 2025 14:26:17 +0300
Subject: [PATCH] Prevent timestamps restoration when clip timestamps are
 provided in batched inference (#1376)

---
 faster_whisper/transcribe.py | 20 +++++++++++++++++---
 tests/test_transcribe.py     | 23 +++++++++++++++++++++++
 2 files changed, 40 insertions(+), 3 deletions(-)

diff --git a/faster_whisper/transcribe.py b/faster_whisper/transcribe.py
index ae1d6f1..51eb1c5 100644
--- a/faster_whisper/transcribe.py
+++ b/faster_whisper/transcribe.py
@@ -418,23 +418,34 @@ class BatchedInferencePipeline:
                     "Set 'vad_filter' to True or provide 'clip_timestamps'."
                 )
 
+            clip_timestamps_provided = False
             audio_chunks, chunks_metadata = collect_chunks(
                 audio, clip_timestamps, max_duration=chunk_length
             )
         else:
+            clip_timestamps_provided = True
             clip_timestamps = [
                 {k: int(v * sampling_rate) for k, v in segment.items()}
                 for segment in clip_timestamps
             ]
 
             audio_chunks, chunks_metadata = [], []
-            for clip in clip_timestamps:
+            for i, clip in enumerate(clip_timestamps):
                 audio_chunks.append(audio[clip["start"] : clip["end"]])
+
+                clip_duration = (clip["end"] - clip["start"]) / sampling_rate
+                if clip_duration > 30:
+                    self.model.logger.warning(
+                        "Segment %d is longer than 30 seconds, "
+                        "only the first 30 seconds will be transcribed",
+                        i,
+                    )
+
                 chunks_metadata.append(
                     {
                         "offset": clip["start"] / sampling_rate,
-                        "duration": (clip["end"] - clip["start"]) / sampling_rate,
+                        "duration": clip_duration,
                         "segments": [clip],
                     }
                 )
 
@@ -559,7 +570,10 @@ class BatchedInferencePipeline:
             options,
             log_progress,
         )
-        segments = restore_speech_timestamps(segments, clip_timestamps, sampling_rate)
+        if not clip_timestamps_provided:
+            segments = restore_speech_timestamps(
+                segments, clip_timestamps, sampling_rate
+            )
 
         return segments, info
 
diff --git a/tests/test_transcribe.py b/tests/test_transcribe.py
index 48b409e..ccbaba0 100644
--- a/tests/test_transcribe.py
+++ b/tests/test_transcribe.py
@@ -290,3 +290,26 @@ def test_cliptimestamps_segments(jfk_path):
         " And so my fellow Americans ask not what your country can do for you, "
         "ask what you can do for your country."
     )
+
+
+def test_cliptimestamps_timings(physcisworks_path):
+    model = WhisperModel("tiny")
+    pipeline = BatchedInferencePipeline(model=model)
+
+    audio = decode_audio(physcisworks_path)
+    clip_timestamps = [{"start": 0.0, "end": 5.0}, {"start": 6.0, "end": 15.0}]
+    transcripts = [
+        " Now I want to return to the conservation of mechanical energy.",
+        (
+            " I have here a pendulum. I have an object that weighs 15 kilograms"
+            " and I can lift it up one meter, which I have done now."
+        ),
+    ]
+    segments, info = pipeline.transcribe(audio, clip_timestamps=clip_timestamps)
+    segments = list(segments)
+
+    assert len(segments) == 2
+    for segment, clip, transcript in zip(segments, clip_timestamps, transcripts):
+        assert clip["start"] == segment.start
+        assert clip["end"] == segment.end
+        assert segment.text == transcript
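
Usage note: a minimal caller-side sketch of the behavior this patch changes,
mirroring the new test. The model size, audio path, and clip boundaries below
are illustrative assumptions, not taken from the patch. When clip_timestamps
is passed explicitly, transcribe() now skips restore_speech_timestamps, so the
returned segments keep the caller's clip boundaries:

    from faster_whisper import BatchedInferencePipeline, WhisperModel, decode_audio

    model = WhisperModel("tiny")
    pipeline = BatchedInferencePipeline(model=model)

    audio = decode_audio("lecture.wav")  # hypothetical input file
    clips = [{"start": 0.0, "end": 5.0}, {"start": 6.0, "end": 15.0}]

    segments, _ = pipeline.transcribe(audio, clip_timestamps=clips)
    for segment in segments:
        # With this patch applied, start/end match the requested clip
        # boundaries rather than being shifted a second time.
        print(f"[{segment.start:.2f} -> {segment.end:.2f}]{segment.text}")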