Mirror of https://github.com/SYSTRAN/faster-whisper.git, synced 2026-01-12 23:18:06 -05:00

Compare commits (4 commits)

| SHA1 |
|---|
| ba812f55a2 |
| 44466c7535 |
| e3e46675b2 |
| 14ad587c98 |
.github/workflows/ci.yml (vendored): 12 changed lines
```diff
@@ -17,10 +17,10 @@ jobs:
     steps:
       - uses: actions/checkout@v4
 
-      - name: Set up Python 3.9
+      - name: Set up Python 3.10
         uses: actions/setup-python@v5
         with:
-          python-version: 3.9
+          python-version: '3.10'
 
       - name: Install module
         run: |
@@ -47,10 +47,10 @@ jobs:
     steps:
       - uses: actions/checkout@v4
 
-      - name: Set up Python 3.9
+      - name: Set up Python 3.10
         uses: actions/setup-python@v5
         with:
-          python-version: 3.9
+          python-version: '3.10'
 
       - name: Install module
         run: |
@@ -69,10 +69,10 @@ jobs:
     steps:
       - uses: actions/checkout@v4
 
-      - name: Set up Python 3.9
+      - name: Set up Python 3.10
         uses: actions/setup-python@v5
         with:
-          python-version: 3.9
+          python-version: '3.10'
 
       - name: Install dependencies
         run: |
```
README.md

```diff
@@ -56,7 +56,7 @@ For reference, here's the time and memory usage that are required to transcribe
 
 ## Requirements
 
-* Python 3.9 or greater
+* Python 3.10 or greater
 
 Unlike openai-whisper, FFmpeg does **not** need to be installed on the system. The audio is decoded with the Python library [PyAV](https://github.com/PyAV-Org/PyAV) which bundles the FFmpeg libraries in its package.
 
```
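As a quick illustration of that point, here is a minimal sketch of decoding a file through the bundled PyAV backend without any system FFmpeg binary; the file path is a placeholder:

```python
# Minimal sketch: decode audio with faster-whisper's bundled PyAV-based decoder,
# so no system FFmpeg installation is needed. "audio.mp3" is a placeholder path.
from faster_whisper import decode_audio

samples = decode_audio("audio.mp3", sampling_rate=16000)
print(samples.shape, samples.dtype)  # mono float32 waveform at 16 kHz
```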
Binary file not shown.
faster_whisper/transcribe.py

```diff
@@ -418,34 +418,23 @@ class BatchedInferencePipeline:
                     "Set 'vad_filter' to True or provide 'clip_timestamps'."
                 )
 
-            clip_timestamps_provided = False
             audio_chunks, chunks_metadata = collect_chunks(
                 audio, clip_timestamps, max_duration=chunk_length
             )
-
         else:
-            clip_timestamps_provided = True
             clip_timestamps = [
                 {k: int(v * sampling_rate) for k, v in segment.items()}
                 for segment in clip_timestamps
             ]
 
             audio_chunks, chunks_metadata = [], []
-            for i, clip in enumerate(clip_timestamps):
+            for clip in clip_timestamps:
                 audio_chunks.append(audio[clip["start"] : clip["end"]])
 
-                clip_duration = (clip["end"] - clip["start"]) / sampling_rate
-                if clip_duration > 30:
-                    self.model.logger.warning(
-                        "Segment %d is longer than 30 seconds, "
-                        "only the first 30 seconds will be transcribed",
-                        i,
-                    )
-
                 chunks_metadata.append(
                     {
                         "offset": clip["start"] / sampling_rate,
-                        "duration": clip_duration,
+                        "duration": (clip["end"] - clip["start"]) / sampling_rate,
                         "segments": [clip],
                     }
                 )
@@ -570,10 +559,7 @@ class BatchedInferencePipeline:
             options,
             log_progress,
         )
-        if not clip_timestamps_provided:
-            segments = restore_speech_timestamps(
-                segments, clip_timestamps, sampling_rate
-            )
+        segments = restore_speech_timestamps(segments, clip_timestamps, sampling_rate)
 
         return segments, info
 
```
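For orientation, the dict-based, seconds-valued clip_timestamps handled above is driven from user code roughly as in the sketch below. It mirrors the test change near the end of this comparison rather than a settled public API, exists only on the side of the comparison that keeps the dict handling, and uses placeholder model size and file path:

```python
# Hedged sketch of the dict-based clip_timestamps usage shown in the hunk above and in
# the test change further down. "audio.wav" is a placeholder path, and this form is only
# present on the side of the comparison that carries the clip_timestamps changes.
from faster_whisper import BatchedInferencePipeline, WhisperModel, decode_audio

model = WhisperModel("tiny")
pipeline = BatchedInferencePipeline(model=model)

audio = decode_audio("audio.wav")
clips = [{"start": 0.0, "end": 5.0}, {"start": 6.0, "end": 15.0}]  # seconds

segments, info = pipeline.transcribe(audio, clip_timestamps=clips)
for segment in segments:
    print(segment.start, segment.end, segment.text)
```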
faster_whisper/utils.py

```diff
@@ -5,6 +5,7 @@ import re
 from typing import List, Optional, Union
 
 import huggingface_hub
+import requests
 
 from tqdm.auto import tqdm
 
@@ -105,6 +106,7 @@ def download_model(
 
     if output_dir is not None:
         kwargs["local_dir"] = output_dir
+        kwargs["local_dir_use_symlinks"] = False
 
     if cache_dir is not None:
         kwargs["cache_dir"] = cache_dir
@@ -112,7 +114,24 @@ def download_model(
     if use_auth_token is not None:
         kwargs["token"] = use_auth_token
 
-    return huggingface_hub.snapshot_download(repo_id, **kwargs)
+    try:
+        return huggingface_hub.snapshot_download(repo_id, **kwargs)
+    except (
+        huggingface_hub.utils.HfHubHTTPError,
+        requests.exceptions.ConnectionError,
+    ) as exception:
+        logger = get_logger()
+        logger.warning(
+            "An error occured while synchronizing the model %s from the Hugging Face Hub:\n%s",
+            repo_id,
+            exception,
+        )
+        logger.warning(
+            "Trying to load the model directly from the local cache, if it exists."
+        )
+
+        kwargs["local_files_only"] = True
+        return huggingface_hub.snapshot_download(repo_id, **kwargs)
 
 
 def format_timestamp(
```
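Standing alone, the fallback that this hunk adds looks roughly like the sketch below: if the Hub cannot be reached, the same snapshot_download() call is retried against the local cache only. The repo id is just an example:

```python
# Hedged sketch of the offline fallback added above: on an HTTP or connection error,
# retry snapshot_download() with local_files_only=True so a previously cached copy of
# the model is used. The repo id below is only an example.
import huggingface_hub
import requests


def fetch_snapshot(repo_id: str, **kwargs) -> str:
    try:
        return huggingface_hub.snapshot_download(repo_id, **kwargs)
    except (
        huggingface_hub.utils.HfHubHTTPError,
        requests.exceptions.ConnectionError,
    ):
        # The Hub is unreachable: fall back to whatever is already in the local cache.
        kwargs["local_files_only"] = True
        return huggingface_hub.snapshot_download(repo_id, **kwargs)


model_dir = fetch_snapshot("Systran/faster-whisper-tiny")
print(model_dir)
```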
faster_whisper/vad.py

```diff
@@ -27,15 +27,11 @@ class VadOptions:
     min_speech_duration_ms: Final speech chunks shorter min_speech_duration_ms are thrown out.
     max_speech_duration_s: Maximum duration of speech chunks in seconds. Chunks longer
       than max_speech_duration_s will be split at the timestamp of the last silence that
-      lasts more than min_silence_at_max_speech (if any), to prevent aggressive cutting.
-      Otherwise, they will be split aggressively just before max_speech_duration_s.
+      lasts more than 100ms (if any), to prevent aggressive cutting. Otherwise, they will be
+      split aggressively just before max_speech_duration_s.
     min_silence_duration_ms: In the end of each speech chunk wait for min_silence_duration_ms
       before separating it
     speech_pad_ms: Final speech chunks are padded by speech_pad_ms each side
-    min_silence_at_max_speech: Minimum silence duration in ms which is used to avoid abrupt cuts
-      when max_speech_duration_s is reached.
-    use_max_poss_sil_at_max_speech: Whether to use the maximum possible silence at
-      max_speech_duration_s or not. If not, the last silence is used.
     """
 
     threshold: float = 0.5
@@ -44,8 +40,6 @@ class VadOptions:
     max_speech_duration_s: float = float("inf")
     min_silence_duration_ms: int = 2000
     speech_pad_ms: int = 400
-    min_silence_at_max_speech: int = 98
-    use_max_poss_sil_at_max_speech: bool = True
 
 
 def get_speech_timestamps(
```
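For context, these options are normally supplied through the transcription call rather than constructed by hand. A minimal sketch, using only fields present on both sides of this comparison and a placeholder audio path:

```python
# Hedged sketch: pass VAD options through WhisperModel.transcribe(). Only fields that
# exist on both sides of this comparison are used; "audio.wav" is a placeholder path.
from faster_whisper import WhisperModel

model = WhisperModel("tiny")
segments, info = model.transcribe(
    "audio.wav",
    vad_filter=True,
    vad_parameters={"min_silence_duration_ms": 2000, "speech_pad_ms": 400},
)
for segment in segments:
    print(segment.start, segment.end, segment.text)
```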
```diff
@@ -75,9 +69,6 @@
     min_silence_duration_ms = vad_options.min_silence_duration_ms
     window_size_samples = 512
     speech_pad_ms = vad_options.speech_pad_ms
-    min_silence_at_max_speech = vad_options.min_silence_at_max_speech
-    use_max_poss_sil_at_max_speech = vad_options.use_max_poss_sil_at_max_speech
-
     min_speech_samples = sampling_rate * min_speech_duration_ms / 1000
     speech_pad_samples = sampling_rate * speech_pad_ms / 1000
     max_speech_samples = (
@@ -86,7 +77,7 @@
         - 2 * speech_pad_samples
     )
     min_silence_samples = sampling_rate * min_silence_duration_ms / 1000
-    min_silence_samples_at_max_speech = sampling_rate * min_silence_at_max_speech / 1000
+    min_silence_samples_at_max_speech = sampling_rate * 98 / 1000
 
     audio_length_samples = len(audio)
 
@@ -100,8 +91,6 @@
     triggered = False
     speeches = []
     current_speech = {}
-    possible_ends = []
-
     if neg_threshold is None:
         neg_threshold = max(threshold - 0.15, 0.01)
 
```
```diff
@@ -111,67 +100,45 @@
     prev_end = next_start = 0
 
     for i, speech_prob in enumerate(speech_probs):
-        cur_sample = window_size_samples * i
-
         if (speech_prob >= threshold) and temp_end:
-            sil_dur = cur_sample - temp_end
-            if sil_dur > min_silence_samples_at_max_speech:
-                possible_ends.append((temp_end, sil_dur))
             temp_end = 0
             if next_start < prev_end:
-                next_start = cur_sample
+                next_start = window_size_samples * i
 
         if (speech_prob >= threshold) and not triggered:
            triggered = True
-            current_speech["start"] = cur_sample
+            current_speech["start"] = window_size_samples * i
             continue
 
-        if triggered and (cur_sample - current_speech["start"] > max_speech_samples):
-            if use_max_poss_sil_at_max_speech and possible_ends:
-                prev_end, dur = max(possible_ends, key=lambda x: x[1])
-                current_speech["end"] = prev_end
-                speeches.append(current_speech)
-                current_speech = {}
-                next_start = prev_end + dur
-
-                if next_start < prev_end + cur_sample:
-                    current_speech["start"] = next_start
-                else:
-                    triggered = False
-                prev_end = next_start = temp_end = 0
-                possible_ends = []
-            else:
-                if prev_end:
-                    current_speech["end"] = prev_end
-                    speeches.append(current_speech)
-                    current_speech = {}
-                    if next_start < prev_end:
-                        triggered = False
-                    else:
-                        current_speech["start"] = next_start
-                    prev_end = next_start = temp_end = 0
-                    possible_ends = []
-                else:
-                    current_speech["end"] = cur_sample
-                    speeches.append(current_speech)
-                    current_speech = {}
-                    prev_end = next_start = temp_end = 0
-                    triggered = False
-                    possible_ends = []
-                    continue
+        if (
+            triggered
+            and (window_size_samples * i) - current_speech["start"] > max_speech_samples
+        ):
+            if prev_end:
+                current_speech["end"] = prev_end
+                speeches.append(current_speech)
+                current_speech = {}
+                # previously reached silence (< neg_thres) and is still not speech (< thres)
+                if next_start < prev_end:
+                    triggered = False
+                else:
+                    current_speech["start"] = next_start
+                prev_end = next_start = temp_end = 0
+            else:
+                current_speech["end"] = window_size_samples * i
+                speeches.append(current_speech)
+                current_speech = {}
+                prev_end = next_start = temp_end = 0
+                triggered = False
+                continue
 
         if (speech_prob < neg_threshold) and triggered:
             if not temp_end:
-                temp_end = cur_sample
-            sil_dur_now = cur_sample - temp_end
-
-            if (
-                not use_max_poss_sil_at_max_speech
-                and sil_dur_now > min_silence_samples_at_max_speech
-            ):
-                prev_end = temp_end
-
-            if sil_dur_now < min_silence_samples:
+                temp_end = window_size_samples * i
+            # condition to avoid cutting in very short silence
+            if (window_size_samples * i) - temp_end > min_silence_samples_at_max_speech:
+                prev_end = temp_end
+
+            if (window_size_samples * i) - temp_end < min_silence_samples:
                 continue
             else:
                 current_speech["end"] = temp_end
```
```diff
@@ -182,7 +149,6 @@
                 current_speech = {}
                 prev_end = next_start = temp_end = 0
                 triggered = False
-                possible_ends = []
                 continue
 
     if (
```
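The loop above can also be exercised directly. A small sketch, assuming get_speech_timestamps(audio, vad_options) keeps the signature implied by the options object and returns dicts with sample-indexed "start"/"end" keys as the code suggests, with a placeholder audio path:

```python
# Hedged sketch: run the VAD shown above directly on a decoded waveform. The signature
# and sample-indexed output are assumptions drawn from the code above; "audio.wav" is a
# placeholder path.
from faster_whisper import decode_audio
from faster_whisper.vad import VadOptions, get_speech_timestamps

sampling_rate = 16000
audio = decode_audio("audio.wav", sampling_rate=sampling_rate)
options = VadOptions(threshold=0.5, min_silence_duration_ms=2000)

for chunk in get_speech_timestamps(audio, options):
    # Convert sample indices back to seconds for display.
    print(chunk["start"] / sampling_rate, chunk["end"] / sampling_rate)
```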
faster_whisper/version.py

```diff
@@ -1,3 +1,3 @@
 """Version information."""
 
-__version__ = "1.2.1"
+__version__ = "1.2.0"
```
requirements.txt

```diff
@@ -1,6 +1,6 @@
 ctranslate2>=4.0,<5
-huggingface_hub>=0.23
+huggingface_hub>=0.13
 tokenizers>=0.13,<1
 onnxruntime>=1.14,<2
 av>=11
-tqdm
+tqdm
```
setup.py: 4 changed lines
```diff
@@ -45,13 +45,13 @@ setup(
         "License :: OSI Approved :: MIT License",
         "Programming Language :: Python :: 3",
         "Programming Language :: Python :: 3 :: Only",
-        "Programming Language :: Python :: 3.9",
         "Programming Language :: Python :: 3.10",
         "Programming Language :: Python :: 3.11",
         "Programming Language :: Python :: 3.12",
         "Topic :: Scientific/Engineering :: Artificial Intelligence",
     ],
     keywords="openai whisper speech ctranslate2 inference quantization transformer",
-    python_requires=">=3.9",
+    python_requires=">=3.10",
     install_requires=install_requires,
     extras_require={
         "conversion": conversion_requires,
```
tests/test_transcribe.py

```diff
@@ -245,7 +245,7 @@ def test_transcribe_signature():
 
 
 def test_monotonic_timestamps(physcisworks_path):
-    model = WhisperModel("base")
+    model = WhisperModel("tiny")
     pipeline = BatchedInferencePipeline(model=model)
 
     segments, info = model.transcribe(physcisworks_path, word_timestamps=True)
@@ -290,26 +290,3 @@ def test_cliptimestamps_segments(jfk_path):
         " And so my fellow Americans ask not what your country can do for you, "
         "ask what you can do for your country."
     )
-
-
-def test_cliptimestamps_timings(physcisworks_path):
-    model = WhisperModel("tiny")
-    pipeline = BatchedInferencePipeline(model=model)
-
-    audio = decode_audio(physcisworks_path)
-    clip_timestamps = [{"start": 0.0, "end": 5.0}, {"start": 6.0, "end": 15.0}]
-    transcripts = [
-        " Now I want to return to the conservation of mechanical energy.",
-        (
-            " I have here a pendulum. I have an object that weighs 15 kilograms"
-            " and I can lift it up one meter, which I have done now."
-        ),
-    ]
-    segments, info = pipeline.transcribe(audio, clip_timestamps=clip_timestamps)
-    segments = list(segments)
-
-    assert len(segments) == 2
-    for segment, clip, transcript in zip(segments, clip_timestamps, transcripts):
-        assert clip["start"] == segment.start
-        assert clip["end"] == segment.end
-        assert segment.text == transcript
```