From 14ba1051f36a900fab951689c9f4846d4bc63d58 Mon Sep 17 00:00:00 2001
From: Mario <mmichelli@gmail.com>
Date: Fri, 10 Oct 2025 20:56:54 +0200
Subject: [PATCH] Fix: add `<|nocaptions|>` to suppressed tokens (#1338)

* Fix: Prevent <|nocaptions|> tokens in BatchedInferencePipeline

- Add nocaptions component tokens [1771, 496, 9799] to suppress_tokens list
- Add segment filtering to remove any remaining <|nocaptions|> segments
- Resolves an issue where BatchedInferencePipeline would generate malformed
  special tokens during periods of silence or low-confidence transcription
- Includes comprehensive tests to verify the fix

The issue occurred because, although the bracket tokens ('<', '|', '>') were
already suppressed, the content tokens ('no', 'ca', 'ptions') were not, so
the model could still emit fragments that combined into complete
<|nocaptions|> tags in the output.

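Files changed (in this initial commit; the final squashed change touches only
the files listed in the diffstat below):
- faster_whisper/transcribe.py: Core fix implementation
- test_nocaptions_comprehensive.py: Comprehensive test suite
- tests/test_nocaptions_fix.py: Unit tests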

* removed

* Fix: Prevent <|nocaptions|> tokens in BatchedInferencePipeline

* Fix: Implement proper <|nocaptions|> token suppression using single token approach

* ci: trigger tests

* fix: remove trailing whitespace from blank lines

* Update faster_whisper/transcribe.py

Co-authored-by: Mahmoud Ashraf <hassouna97.ma@gmail.com>

* Update faster_whisper/tokenizer.py

Co-authored-by: Mahmoud Ashraf <hassouna97.ma@gmail.com>

* Update faster_whisper/tokenizer.py

Co-authored-by: Mahmoud Ashraf <hassouna97.ma@gmail.com>

* Rename no_speech to no_captions in tokenizer

* nocaptions has been renamed to nospeech

* break line

* line break

* Refactor no_speech method for improved readability by adjusting line breaks

---------

Co-authored-by: Mahmoud Ashraf <hassouna97.ma@gmail.com>
---
 faster_whisper/tokenizer.py  | 6 ++++++
 faster_whisper/transcribe.py | 1 +
 tests/test_tokenizer.py      | 3 ++-
 3 files changed, 9 insertions(+), 1 deletion(-)

diff --git a/faster_whisper/tokenizer.py b/faster_whisper/tokenizer.py
index 3bf76a5..d75bbd7 100644
--- a/faster_whisper/tokenizer.py
+++ b/faster_whisper/tokenizer.py
@@ -67,6 +67,12 @@ class Tokenizer:
     def no_timestamps(self) -> int:
         return self.tokenizer.token_to_id("<|notimestamps|>")
 
+    @cached_property
+    def no_speech(self) -> int:
+        return self.tokenizer.token_to_id("<|nospeech|>") or self.tokenizer.token_to_id(
+            "<|nocaptions|>"
+        )
+
     @property
     def timestamp_begin(self) -> int:
         return self.no_timestamps + 1
diff --git a/faster_whisper/transcribe.py b/faster_whisper/transcribe.py
index a73b31b..6e57ff7 100644
--- a/faster_whisper/transcribe.py
+++ b/faster_whisper/transcribe.py
@@ -1886,6 +1886,7 @@ def get_suppressed_tokens(
             tokenizer.sot,
             tokenizer.sot_prev,
             tokenizer.sot_lm,
+            tokenizer.no_speech,
         ]
     )
 
diff --git a/tests/test_tokenizer.py b/tests/test_tokenizer.py
index 5ea43b0..1a1e27b 100644
--- a/tests/test_tokenizer.py
+++ b/tests/test_tokenizer.py
@@ -98,6 +98,7 @@ def test_suppressed_tokens_minus_1():
         50358,
         50359,
         50360,
+        50361,
     )
 
 
@@ -106,7 +107,7 @@ def test_suppressed_tokens_minus_value():
 
     tokenizer = Tokenizer(model.hf_tokenizer, False)
     tokens = get_suppressed_tokens(tokenizer, [13])
-    assert tokens == (13, 50257, 50357, 50358, 50359, 50360)
+    assert tokens == (13, 50257, 50357, 50358, 50359, 50360, 50361)
 
 
 def test_split_on_unicode():