From c26d609974ef7c36715f23f0fbcdb3f9b5f8a663 Mon Sep 17 00:00:00 2001
From: Mahmoud Ashraf
Date: Sat, 16 Aug 2025 14:30:50 +0300
Subject: [PATCH] only merge when `clip_timestamps` are not provided (#1345)

fixes #1340 and allows for batching multiple audio files less than 30s each
---
 faster_whisper/transcribe.py | 18 +++++++++++++++---
 tests/test_transcribe.py     | 21 +++++++++++++++++++++
 2 files changed, 36 insertions(+), 3 deletions(-)

diff --git a/faster_whisper/transcribe.py b/faster_whisper/transcribe.py
index 382c77c..a73b31b 100644
--- a/faster_whisper/transcribe.py
+++ b/faster_whisper/transcribe.py
@@ -417,15 +417,27 @@ class BatchedInferencePipeline:
                     "No clip timestamps found. "
                     "Set 'vad_filter' to True or provide 'clip_timestamps'."
                 )
+
+            audio_chunks, chunks_metadata = collect_chunks(
+                audio, clip_timestamps, max_duration=chunk_length
+            )
+
         else:
             clip_timestamps = [
                 {k: int(v * sampling_rate) for k, v in segment.items()}
                 for segment in clip_timestamps
             ]
 
-        audio_chunks, chunks_metadata = collect_chunks(
-            audio, clip_timestamps, max_duration=chunk_length
-        )
+            audio_chunks, chunks_metadata = [], []
+            for clip in clip_timestamps:
+                audio_chunks.append(audio[clip["start"] : clip["end"]])
+                chunks_metadata.append(
+                    {
+                        "offset": clip["start"] / sampling_rate,
+                        "duration": (clip["end"] - clip["start"]) / sampling_rate,
+                        "segments": [clip],
+                    }
+                )
 
         duration_after_vad = (
             sum((segment["end"] - segment["start"]) for segment in clip_timestamps)

diff --git a/tests/test_transcribe.py b/tests/test_transcribe.py
index d3d9270..48b409e 100644
--- a/tests/test_transcribe.py
+++ b/tests/test_transcribe.py
@@ -269,3 +269,24 @@ def test_monotonic_timestamps(physcisworks_path):
             assert word.start <= word.end
             assert word.end <= segments[i].end
     assert segments[-1].end <= info.duration
+
+
+def test_cliptimestamps_segments(jfk_path):
+    model = WhisperModel("tiny")
+    pipeline = BatchedInferencePipeline(model=model)
+
+    audio = decode_audio(jfk_path)
+    audio = np.concatenate([audio, audio])
+    clip_timestamps = [{"start": 0.0, "end": 11.0}, {"start": 11.0, "end": 22.0}]
+
+    segments, info = pipeline.transcribe(audio, clip_timestamps=clip_timestamps)
+    segments = list(segments)
+
+    assert len(segments) == 2
+    for segment, clip in zip(segments, clip_timestamps):
+        assert segment.start == clip["start"]
+        assert segment.end == clip["end"]
+        assert segment.text == (
+            " And so my fellow Americans ask not what your country can do for you, "
+            "ask what you can do for your country."
+        )
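
---
Usage note (not part of the patch): with this change, explicitly provided
`clip_timestamps` keep each clip as its own chunk instead of being merged into
~30 s windows, so several short recordings can be batched in a single
transcribe() call. A minimal sketch mirroring the test above; the two file
paths are placeholders, the rest uses the public faster_whisper API:

import numpy as np

from faster_whisper import BatchedInferencePipeline, WhisperModel, decode_audio

model = WhisperModel("tiny")
pipeline = BatchedInferencePipeline(model=model)

# Decode each short (<30 s) file and lay them back to back in one array.
first = decode_audio("first.wav")    # placeholder path
second = decode_audio("second.wav")  # placeholder path
audio = np.concatenate([first, second])

# decode_audio resamples to 16 kHz by default.
sampling_rate = 16000
boundary = first.shape[0] / sampling_rate

# One clip per file (start/end in seconds); each clip becomes its own
# batch entry and yields its own segment(s).
clip_timestamps = [
    {"start": 0.0, "end": boundary},
    {"start": boundary, "end": audio.shape[0] / sampling_rate},
]

segments, info = pipeline.transcribe(audio, clip_timestamps=clip_timestamps)
for segment in segments:
    print(f"[{segment.start:.2f}s -> {segment.end:.2f}s] {segment.text}")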