diff --git a/faster_whisper/tokenizer.py b/faster_whisper/tokenizer.py
index 3bf76a5..d75bbd7 100644
--- a/faster_whisper/tokenizer.py
+++ b/faster_whisper/tokenizer.py
@@ -67,6 +67,12 @@ class Tokenizer:
     def no_timestamps(self) -> int:
         return self.tokenizer.token_to_id("<|notimestamps|>")
 
+    @cached_property
+    def no_speech(self) -> int:
+        return self.tokenizer.token_to_id("<|nospeech|>") or self.tokenizer.token_to_id(
+            "<|nocaptions|>"
+        )
+
     @property
     def timestamp_begin(self) -> int:
         return self.no_timestamps + 1
diff --git a/faster_whisper/transcribe.py b/faster_whisper/transcribe.py
index a73b31b..6e57ff7 100644
--- a/faster_whisper/transcribe.py
+++ b/faster_whisper/transcribe.py
@@ -1886,6 +1886,7 @@ def get_suppressed_tokens(
             tokenizer.sot,
             tokenizer.sot_prev,
             tokenizer.sot_lm,
+            tokenizer.no_speech,
         ]
     )
 
diff --git a/tests/test_tokenizer.py b/tests/test_tokenizer.py
index 5ea43b0..1a1e27b 100644
--- a/tests/test_tokenizer.py
+++ b/tests/test_tokenizer.py
@@ -98,6 +98,7 @@ def test_suppressed_tokens_minus_1():
         50358,
         50359,
         50360,
+        50361,
     )
 
 
@@ -106,7 +107,7 @@ def test_suppressed_tokens_minus_value():
     tokenizer = Tokenizer(model.hf_tokenizer, False)
 
     tokens = get_suppressed_tokens(tokenizer, [13])
-    assert tokens == (13, 50257, 50357, 50358, 50359, 50360)
+    assert tokens == (13, 50257, 50357, 50358, 50359, 50360, 50361)
 
 
 def test_split_on_unicode():
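A quick way to sanity-check the change; a minimal sketch, assuming a local install of this branch and the tiny.en model that the tests already use (the expected id 50361 is taken from the updated test expectations, not derived independently):

```python
from faster_whisper import WhisperModel
from faster_whisper.tokenizer import Tokenizer
from faster_whisper.transcribe import get_suppressed_tokens

model = WhisperModel("tiny.en")
tokenizer = Tokenizer(model.hf_tokenizer, False)

# English-only vocabularies name the token <|nocaptions|>, multilingual ones
# <|nospeech|>; the `or` fallback in the new no_speech property covers both,
# since token_to_id returns None for an unknown token.
print(tokenizer.no_speech)  # 50361 for tiny.en, per the updated tests

# The id should now appear in the default suppression list in both cases
# exercised by the tests: suppress_tokens=[-1] and an explicit list.
assert tokenizer.no_speech in get_suppressed_tokens(tokenizer, [-1])
assert tokenizer.no_speech in get_suppressed_tokens(tokenizer, [13])
```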