mirror of
https://github.com/SYSTRAN/faster-whisper.git
synced 2026-01-08 13:14:00 -05:00
Prevent timestamps restoration when clip timestamps are provided in batched inference (#1376)
This commit is contained in:
@@ -418,23 +418,34 @@ class BatchedInferencePipeline:
|
||||
"Set 'vad_filter' to True or provide 'clip_timestamps'."
|
||||
)
|
||||
|
||||
clip_timestamps_provided = False
|
||||
audio_chunks, chunks_metadata = collect_chunks(
|
||||
audio, clip_timestamps, max_duration=chunk_length
|
||||
)
|
||||
|
||||
else:
|
||||
clip_timestamps_provided = True
|
||||
clip_timestamps = [
|
||||
{k: int(v * sampling_rate) for k, v in segment.items()}
|
||||
for segment in clip_timestamps
|
||||
]
|
||||
|
||||
audio_chunks, chunks_metadata = [], []
|
||||
for clip in clip_timestamps:
|
||||
for i, clip in enumerate(clip_timestamps):
|
||||
audio_chunks.append(audio[clip["start"] : clip["end"]])
|
||||
|
||||
clip_duration = (clip["end"] - clip["start"]) / sampling_rate
|
||||
if clip_duration > 30:
|
||||
self.model.logger.warning(
|
||||
"Segment %d is longer than 30 seconds, "
|
||||
"only the first 30 seconds will be transcribed",
|
||||
i,
|
||||
)
|
||||
|
||||
chunks_metadata.append(
|
||||
{
|
||||
"offset": clip["start"] / sampling_rate,
|
||||
"duration": (clip["end"] - clip["start"]) / sampling_rate,
|
||||
"duration": clip_duration,
|
||||
"segments": [clip],
|
||||
}
|
||||
)
|
||||
@@ -559,7 +570,10 @@ class BatchedInferencePipeline:
|
||||
options,
|
||||
log_progress,
|
||||
)
|
||||
segments = restore_speech_timestamps(segments, clip_timestamps, sampling_rate)
|
||||
if not clip_timestamps_provided:
|
||||
segments = restore_speech_timestamps(
|
||||
segments, clip_timestamps, sampling_rate
|
||||
)
|
||||
|
||||
return segments, info
|
||||
|
||||
|
||||
@@ -290,3 +290,26 @@ def test_cliptimestamps_segments(jfk_path):
|
||||
" And so my fellow Americans ask not what your country can do for you, "
|
||||
"ask what you can do for your country."
|
||||
)
|
||||
|
||||
|
||||
def test_cliptimestamps_timings(physcisworks_path):
    """Check that batched transcription keeps user-supplied clip timings.

    Every returned segment must start and end exactly on the boundaries of
    the clip it was produced from, and carry the expected transcript.
    """
    # Pair each requested clip with the text expected for it.
    expected = [
        (
            {"start": 0.0, "end": 5.0},
            " Now I want to return to the conservation of mechanical energy.",
        ),
        (
            {"start": 6.0, "end": 15.0},
            " I have here a pendulum. I have an object that weighs 15 kilograms"
            " and I can lift it up one meter, which I have done now.",
        ),
    ]
    clip_timestamps = [clip for clip, _ in expected]

    batched = BatchedInferencePipeline(model=WhisperModel("tiny"))
    waveform = decode_audio(physcisworks_path)

    results, _info = batched.transcribe(waveform, clip_timestamps=clip_timestamps)
    results = list(results)

    # One segment per clip, in the order the clips were given.
    assert len(results) == 2
    for result, (clip, text) in zip(results, expected):
        assert clip["start"] == result.start
        assert clip["end"] == result.end
        assert result.text == text
|
||||
|
||||
Reference in New Issue
Block a user