Files
faster-whisper/tests/test_transcribe.py
Purfview 2eeafe05de Update Silero-VAD weights to v6.2 (#1390)
* Update Silero-VAD weights to v6.2

Overall slight quality improvement (no metrics update);
Higher stability on OOD / rare / strange / unique data;
Significant quality improvements on various known edge cases:

    Unusual voices
    Child voices
    Cartoon voices
    Muted voices
    Muted speech
    Lower quality phone calls

https://github.com/snakers4/silero-vad/releases/tag/v6.2

* Changes: tiny -> base in test_monotonic_timestamps()
2025-11-19 17:14:42 +03:00

316 lines
11 KiB
Python

import inspect
import os
import numpy as np
from faster_whisper import BatchedInferencePipeline, WhisperModel, decode_audio
def test_supported_languages():
model = WhisperModel("tiny.en")
assert model.supported_languages == ["en"]
def test_transcribe(jfk_path):
model = WhisperModel("tiny")
segments, info = model.transcribe(jfk_path, word_timestamps=True)
assert info.all_language_probs is not None
assert info.language == "en"
assert info.language_probability > 0.9
assert info.duration == 11
# Get top language info from all results, which should match the
# already existing metadata
top_lang, top_lang_score = info.all_language_probs[0]
assert info.language == top_lang
assert abs(info.language_probability - top_lang_score) < 1e-16
segments = list(segments)
assert len(segments) == 1
segment = segments[0]
assert segment.text == (
" And so my fellow Americans, ask not what your country can do for you, "
"ask what you can do for your country."
)
assert segment.text == "".join(word.word for word in segment.words)
assert segment.start == segment.words[0].start
assert segment.end == segment.words[-1].end
batched_model = BatchedInferencePipeline(model=model)
result, info = batched_model.transcribe(
jfk_path, word_timestamps=True, vad_filter=False
)
assert info.language == "en"
assert info.language_probability > 0.7
segments = []
for segment in result:
segments.append(
{"start": segment.start, "end": segment.end, "text": segment.text}
)
assert len(segments) == 1
assert segment.text == (
" And so my fellow Americans ask not what your country can do for you, "
"ask what you can do for your country."
)
def test_batched_transcribe(physcisworks_path):
model = WhisperModel("tiny")
batched_model = BatchedInferencePipeline(model=model)
result, info = batched_model.transcribe(physcisworks_path, batch_size=16)
assert info.language == "en"
assert info.language_probability > 0.7
segments = []
for segment in result:
segments.append(
{"start": segment.start, "end": segment.end, "text": segment.text}
)
# number of near 30 sec segments
assert len(segments) == 6
result, info = batched_model.transcribe(
physcisworks_path,
batch_size=16,
without_timestamps=False,
word_timestamps=True,
)
segments = []
for segment in result:
assert segment.words is not None
segments.append(
{"start": segment.start, "end": segment.end, "text": segment.text}
)
assert len(segments) > 7
def test_empty_audio():
audio = np.asarray([], dtype="float32")
model = WhisperModel("tiny")
pipeline = BatchedInferencePipeline(model=model)
assert list(model.transcribe(audio)[0]) == []
assert list(pipeline.transcribe(audio)[0]) == []
model.detect_language(audio)
def test_prefix_with_timestamps(jfk_path):
model = WhisperModel("tiny")
segments, _ = model.transcribe(jfk_path, prefix="And so my fellow Americans")
segments = list(segments)
assert len(segments) == 1
segment = segments[0]
assert segment.text == (
" And so my fellow Americans, ask not what your country can do for you, "
"ask what you can do for your country."
)
assert segment.start == 0
assert 10 < segment.end <= 11
def test_vad(jfk_path):
model = WhisperModel("tiny")
segments, info = model.transcribe(
jfk_path,
vad_filter=True,
vad_parameters=dict(min_silence_duration_ms=500, speech_pad_ms=200),
)
segments = list(segments)
assert len(segments) == 1
segment = segments[0]
assert segment.text == (
" And so my fellow Americans ask not what your country can do for you, "
"ask what you can do for your country."
)
assert 0 < segment.start < 1
assert 10 < segment.end < 11
assert info.vad_options.min_silence_duration_ms == 500
assert info.vad_options.speech_pad_ms == 200
def test_stereo_diarization(data_dir):
model = WhisperModel("tiny")
audio_path = os.path.join(data_dir, "stereo_diarization.wav")
left, right = decode_audio(audio_path, split_stereo=True)
segments, _ = model.transcribe(left)
transcription = "".join(segment.text for segment in segments).strip()
assert transcription == (
"He began a confused complaint against the wizard, "
"who had vanished behind the curtain on the left."
)
segments, _ = model.transcribe(right)
transcription = "".join(segment.text for segment in segments).strip()
assert transcription == "The horizon seems extremely distant."
def test_multilingual_transcription(data_dir):
model = WhisperModel("tiny")
pipeline = BatchedInferencePipeline(model)
audio_path = os.path.join(data_dir, "multilingual.mp3")
audio = decode_audio(audio_path)
segments, info = model.transcribe(
audio,
multilingual=True,
without_timestamps=True,
condition_on_previous_text=False,
)
segments = list(segments)
assert (
segments[0].text
== " Permission is hereby granted, free of charge, to any person obtaining a copy of the"
" software and associated documentation files to deal in the software without restriction,"
" including without limitation the rights to use, copy, modify, merge, publish, distribute"
", sublicence, and or cell copies of the software, and to permit persons to whom the "
"software is furnished to do so, subject to the following conditions. The above copyright"
" notice and this permission notice, shall be included in all copies or substantial "
"portions of the software."
)
assert (
segments[1].text
== " Jedem, der dieses Software und die dazu gehöregen Dokumentationsdatein erhält, wird "
"hiermit unengeltlich die Genehmigung erteilt, wird der Software und eingeschränkt zu "
"verfahren. Dies umfasst insbesondere das Recht, die Software zu verwenden, zu "
"vervielfältigen, zu modifizieren, zu Samenzofügen, zu veröffentlichen, zu verteilen, "
"unterzulizenzieren und oder kopieren der Software zu verkaufen und diese Rechte "
"unterfolgen den Bedingungen anderen zu übertragen."
)
segments, info = pipeline.transcribe(audio, multilingual=True)
segments = list(segments)
assert (
segments[0].text
== " Permission is hereby granted, free of charge, to any person obtaining a copy of the"
" software and associated documentation files to deal in the software without restriction,"
" including without limitation the rights to use, copy, modify, merge, publish, distribute"
", sublicence, and or cell copies of the software, and to permit persons to whom the "
"software is furnished to do so, subject to the following conditions. The above copyright"
" notice and this permission notice, shall be included in all copies or substantial "
"portions of the software."
)
assert (
"Dokumentationsdatein erhält, wird hiermit unengeltlich die Genehmigung erteilt,"
" wird der Software und eingeschränkt zu verfahren. Dies umfasst insbesondere das Recht,"
" die Software zu verwenden, zu vervielfältigen, zu modifizieren"
in segments[1].text
)
def test_hotwords(data_dir):
model = WhisperModel("tiny")
pipeline = BatchedInferencePipeline(model)
audio_path = os.path.join(data_dir, "hotwords.mp3")
audio = decode_audio(audio_path)
segments, info = model.transcribe(audio, hotwords="ComfyUI")
segments = list(segments)
assert "ComfyUI" in segments[0].text
assert info.transcription_options.hotwords == "ComfyUI"
segments, info = pipeline.transcribe(audio, hotwords="ComfyUI")
segments = list(segments)
assert "ComfyUI" in segments[0].text
assert info.transcription_options.hotwords == "ComfyUI"
def test_transcribe_signature():
model_transcribe_args = set(inspect.getargs(WhisperModel.transcribe.__code__).args)
pipeline_transcribe_args = set(
inspect.getargs(BatchedInferencePipeline.transcribe.__code__).args
)
pipeline_transcribe_args.remove("batch_size")
assert model_transcribe_args == pipeline_transcribe_args
def test_monotonic_timestamps(physcisworks_path):
model = WhisperModel("base")
pipeline = BatchedInferencePipeline(model=model)
segments, info = model.transcribe(physcisworks_path, word_timestamps=True)
segments = list(segments)
for i in range(len(segments) - 1):
assert segments[i].start <= segments[i].end
assert segments[i].end <= segments[i + 1].start
for word in segments[i].words:
assert word.start <= word.end
assert word.end <= segments[i].end
assert segments[-1].end <= info.duration
segments, info = pipeline.transcribe(physcisworks_path, word_timestamps=True)
segments = list(segments)
for i in range(len(segments) - 1):
assert segments[i].start <= segments[i].end
assert segments[i].end <= segments[i + 1].start
for word in segments[i].words:
assert word.start <= word.end
assert word.end <= segments[i].end
assert segments[-1].end <= info.duration
def test_cliptimestamps_segments(jfk_path):
model = WhisperModel("tiny")
pipeline = BatchedInferencePipeline(model=model)
audio = decode_audio(jfk_path)
audio = np.concatenate([audio, audio])
clip_timestamps = [{"start": 0.0, "end": 11.0}, {"start": 11.0, "end": 22.0}]
segments, info = pipeline.transcribe(audio, clip_timestamps=clip_timestamps)
segments = list(segments)
assert len(segments) == 2
for segment, clip in zip(segments, clip_timestamps):
assert segment.start == clip["start"]
assert segment.end == clip["end"]
assert segment.text == (
" And so my fellow Americans ask not what your country can do for you, "
"ask what you can do for your country."
)
def test_cliptimestamps_timings(physcisworks_path):
model = WhisperModel("tiny")
pipeline = BatchedInferencePipeline(model=model)
audio = decode_audio(physcisworks_path)
clip_timestamps = [{"start": 0.0, "end": 5.0}, {"start": 6.0, "end": 15.0}]
transcripts = [
" Now I want to return to the conservation of mechanical energy.",
(
" I have here a pendulum. I have an object that weighs 15 kilograms"
" and I can lift it up one meter, which I have done now."
),
]
segments, info = pipeline.transcribe(audio, clip_timestamps=clip_timestamps)
segments = list(segments)
assert len(segments) == 2
for segment, clip, transcript in zip(segments, clip_timestamps, transcripts):
assert clip["start"] == segment.start
assert clip["end"] == segment.end
assert segment.text == transcript