mirror of
https://github.com/SYSTRAN/faster-whisper.git
synced 2026-01-08 13:14:00 -05:00
Fix: add <|nocaptions|> to suppressed tokens (#1338)
* Fix: Prevent <|nocaptions|> tokens in BatchedInferencePipeline
- Add nocaptions component tokens [1771, 496, 9799] to suppress_tokens list
- Add segment filtering to remove any remaining <|nocaptions|> segments
- Resolves issue where BatchedInferencePipeline would generate malformed
special tokens during periods of silence or low-confidence transcription
- Includes comprehensive tests to verify the fix
The issue occurred because while bracket tokens ('<', '|', '>') were
already suppressed, the content tokens ('no', 'ca', 'ptions') were not,
leading to partial token generation that formed complete <|nocaptions|>
tags in the output.
Files changed:
- faster_whisper/transcribe.py: Core fix implementation
- test_nocaptions_comprehensive.py: Comprehensive test suite
- tests/test_nocaptions_fix.py: Unit tests
* removed
* Fix: Prevent <|nocaptions|> tokens in BatchedInferencePipeline
* Fix: Implement proper <|nocaptions|> token suppression using single token approach
* ci: trigger tests
* fix: remove trailing whitespace from blank lines
* Update faster_whisper/transcribe.py
Co-authored-by: Mahmoud Ashraf <hassouna97.ma@gmail.com>
* Update faster_whisper/tokenizer.py
Co-authored-by: Mahmoud Ashraf <hassouna97.ma@gmail.com>
* Update faster_whisper/tokenizer.py
Co-authored-by: Mahmoud Ashraf <hassouna97.ma@gmail.com>
* Rename no_speech to no_captions in tokenizer
* nocaptions has been renamed to nospeech
* break line
* line break
* Refactor no_speech method for improved readability by adjusting line breaks
---------
Co-authored-by: Mahmoud Ashraf <hassouna97.ma@gmail.com>
This commit is contained in:
@@ -67,6 +67,12 @@ class Tokenizer:
|
||||
def no_timestamps(self) -> int:
    """Return the vocabulary id of the ``<|notimestamps|>`` special token."""
    marker = "<|notimestamps|>"
    return self.tokenizer.token_to_id(marker)
@cached_property
def no_speech(self) -> int:
    """Return the vocabulary id of the no-speech special token.

    Newer Whisper vocabularies name this token ``<|nospeech|>``; older
    ones use ``<|nocaptions|>``, so fall back to the legacy name when
    the new one is absent.

    NOTE: the fallback checks ``is None`` explicitly instead of using
    ``or`` — ``token_to_id`` returns ``None`` for unknown tokens, and an
    ``or`` chain would also (incorrectly) skip a legitimate token id of 0.
    Returns ``None`` if neither spelling exists in the vocabulary.
    """
    token_id = self.tokenizer.token_to_id("<|nospeech|>")
    if token_id is None:
        token_id = self.tokenizer.token_to_id("<|nocaptions|>")
    return token_id
@property
def timestamp_begin(self) -> int:
    """Id of the first timestamp token (right after ``<|notimestamps|>``)."""
    return 1 + self.no_timestamps
@@ -1886,6 +1886,7 @@ def get_suppressed_tokens(
                 tokenizer.sot,
                 tokenizer.sot_prev,
                 tokenizer.sot_lm,
+                tokenizer.no_speech,
             ]
         )
@@ -98,6 +98,7 @@ def test_suppressed_tokens_minus_1():
         50358,
         50359,
         50360,
+        50361,
     )
@@ -106,7 +107,7 @@ def test_suppressed_tokens_minus_value():

     tokenizer = Tokenizer(model.hf_tokenizer, False)
     tokens = get_suppressed_tokens(tokenizer, [13])
-    assert tokens == (13, 50257, 50357, 50358, 50359, 50360)
+    assert tokens == (13, 50257, 50357, 50358, 50359, 50360, 50361)


 def test_split_on_unicode():
Reference in New Issue
Block a user