From 14ba1051f36a900fab951689c9f4846d4bc63d58 Mon Sep 17 00:00:00 2001
From: Mario
Date: Fri, 10 Oct 2025 20:56:54 +0200
Subject: [PATCH] Fix: add `<|nocaptions|>` to suppressed tokens (#1338)

* Fix: Prevent <|nocaptions|> tokens in BatchedInferencePipeline

- Add nocaptions component tokens [1771, 496, 9799] to suppress_tokens list
- Add segment filtering to remove any remaining <|nocaptions|> segments
- Resolves issue where BatchedInferencePipeline would generate malformed special tokens during periods of silence or low-confidence transcription
- Includes comprehensive tests to verify the fix

The issue occurred because while bracket tokens ('<', '|', '>') were already suppressed, the content tokens ('no', 'ca', 'ptions') were not, leading to partial token generation that formed complete <|nocaptions|> tags in the output.

Files changed:
- faster_whisper/transcribe.py: Core fix implementation
- test_nocaptions_comprehensive.py: Comprehensive test suite
- tests/test_nocaptions_fix.py: Unit tests

* removed

* Fix: Prevent <|nocaptions|> tokens in BatchedInferencePipeline

* Fix: Implement proper <|nocaptions|> token suppression using single token approach

* ci: trigger tests

* fix: remove trailing whitespace from blank lines

* Update faster_whisper/transcribe.py

Co-authored-by: Mahmoud Ashraf

* Update faster_whisper/tokenizer.py

Co-authored-by: Mahmoud Ashraf

* Update faster_whisper/tokenizer.py

Co-authored-by: Mahmoud Ashraf

* Rename no_speech to no_captions in tokenizer

* nocaptions has been renamed to nospeech

* break line

* line break

* Refactor no_speech method for improved readability by adjusting line breaks

---------

Co-authored-by: Mahmoud Ashraf
---
 faster_whisper/tokenizer.py  | 6 ++++++
 faster_whisper/transcribe.py | 1 +
 tests/test_tokenizer.py      | 3 ++-
 3 files changed, 9 insertions(+), 1 deletion(-)

diff --git a/faster_whisper/tokenizer.py b/faster_whisper/tokenizer.py
index 3bf76a5..d75bbd7 100644
--- a/faster_whisper/tokenizer.py
+++ b/faster_whisper/tokenizer.py
@@ -67,6 +67,12 @@ class Tokenizer:
     def no_timestamps(self) -> int:
         return self.tokenizer.token_to_id("<|notimestamps|>")
 
+    @cached_property
+    def no_speech(self) -> int:
+        return self.tokenizer.token_to_id("<|nospeech|>") or self.tokenizer.token_to_id(
+            "<|nocaptions|>"
+        )
+
     @property
     def timestamp_begin(self) -> int:
         return self.no_timestamps + 1
diff --git a/faster_whisper/transcribe.py b/faster_whisper/transcribe.py
index a73b31b..6e57ff7 100644
--- a/faster_whisper/transcribe.py
+++ b/faster_whisper/transcribe.py
@@ -1886,6 +1886,7 @@ def get_suppressed_tokens(
             tokenizer.sot,
             tokenizer.sot_prev,
             tokenizer.sot_lm,
+            tokenizer.no_speech,
         ]
     )
 
diff --git a/tests/test_tokenizer.py b/tests/test_tokenizer.py
index 5ea43b0..1a1e27b 100644
--- a/tests/test_tokenizer.py
+++ b/tests/test_tokenizer.py
@@ -98,6 +98,7 @@ def test_suppressed_tokens_minus_1():
         50358,
         50359,
         50360,
+        50361,
     )
 
 
@@ -106,7 +107,7 @@ def test_suppressed_tokens_minus_value():
     tokenizer = Tokenizer(model.hf_tokenizer, False)
 
     tokens = get_suppressed_tokens(tokenizer, [13])
-    assert tokens == (13, 50257, 50357, 50358, 50359, 50360)
+    assert tokens == (13, 50257, 50357, 50358, 50359, 50360, 50361)
 
 
 def test_split_on_unicode():