#include #include #include #include #include #include #include #include #ifdef _WIN32 #define NOMINMAX #include #endif #include #include "plugin-support.h" #include "transcription-filter.h" #include "transcription-filter-callbacks.h" #include "transcription-filter-data.h" #include "transcription-filter-utils.h" #include "transcription-utils.h" #include "model-utils/model-downloader.h" #include "whisper-utils/whisper-processing.h" #include "whisper-utils/whisper-language.h" #include "whisper-utils/whisper-model-utils.h" #include "whisper-utils/whisper-utils.h" #include "translation/language_codes.h" #include "translation/translation-utils.h" #include "translation/translation.h" #include "translation/translation-includes.h" #include "ui/filter-replace-dialog.h" void set_source_signals(transcription_filter_data *gf, obs_source_t *parent_source) { signal_handler_t *sh = obs_source_get_signal_handler(parent_source); signal_handler_connect(sh, "media_play", media_play_callback, gf); signal_handler_connect(sh, "media_started", media_started_callback, gf); signal_handler_connect(sh, "media_pause", media_pause_callback, gf); signal_handler_connect(sh, "media_restart", media_restart_callback, gf); signal_handler_connect(sh, "media_stopped", media_stopped_callback, gf); gf->source_signals_set = true; } void disconnect_source_signals(transcription_filter_data *gf, obs_source_t *parent_source) { signal_handler_t *sh = obs_source_get_signal_handler(parent_source); signal_handler_disconnect(sh, "media_play", media_play_callback, gf); signal_handler_disconnect(sh, "media_started", media_started_callback, gf); signal_handler_disconnect(sh, "media_pause", media_pause_callback, gf); signal_handler_disconnect(sh, "media_restart", media_restart_callback, gf); signal_handler_disconnect(sh, "media_stopped", media_stopped_callback, gf); gf->source_signals_set = false; } struct obs_audio_data *transcription_filter_filter_audio(void *data, struct obs_audio_data *audio) { if (!audio) { return nullptr; } if (data == nullptr) { return audio; } struct transcription_filter_data *gf = static_cast(data); // Lazy initialization of source signals if (!gf->source_signals_set) { // obs_filter_get_parent only works in the filter function obs_source_t *parent_source = obs_filter_get_parent(gf->context); if (parent_source != nullptr) { set_source_signals(gf, parent_source); } } if (!gf->active) { return audio; } if (gf->whisper_context == nullptr) { // Whisper not initialized, just pass through return audio; } // Check if process while muted is not enabled (e.g. the user wants to avoid processing audio // when the source is muted) if (!gf->process_while_muted) { // Check if the parent source is muted obs_source_t *parent_source = obs_filter_get_parent(gf->context); if (parent_source != nullptr && obs_source_muted(parent_source)) { // Source is muted, do not process audio return audio; } } { std::lock_guard lock(gf->whisper_buf_mutex); // scoped lock // push back current audio data to input circlebuf for (size_t c = 0; c < gf->channels; c++) { circlebuf_push_back(&gf->input_buffers[c], audio->data[c], audio->frames * sizeof(float)); } // push audio packet info (timestamp/frame count) to info circlebuf struct transcription_filter_audio_info info = {0}; info.frames = audio->frames; // number of frames in this packet // check if the timestamp is a false "negative" value for uint64_t if (audio->timestamp > (std::numeric_limits::max() - 100000000)) { // set the timestamp to the current time info.timestamp_offset_ns = 0; } else { info.timestamp_offset_ns = audio->timestamp; // timestamp of this packet } circlebuf_push_back(&gf->info_buffer, &info, sizeof(info)); } return audio; } const char *transcription_filter_name(void *unused) { UNUSED_PARAMETER(unused); return MT_("transcription_filterAudioFilter"); } void transcription_filter_remove(void *data, obs_source_t *source) { struct transcription_filter_data *gf = static_cast(data); obs_log(gf->log_level, "filter remove"); disconnect_source_signals(gf, source); } void transcription_filter_destroy(void *data) { struct transcription_filter_data *gf = static_cast(data); signal_handler_t *sh_filter = obs_source_get_signal_handler(gf->context); signal_handler_disconnect(sh_filter, "enable", enable_callback, gf); obs_log(gf->log_level, "filter destroy"); shutdown_whisper_thread(gf); if (gf->resampler_to_whisper) { audio_resampler_destroy(gf->resampler_to_whisper); } { std::lock_guard lockbuf(gf->whisper_buf_mutex); bfree(gf->copy_buffers[0]); gf->copy_buffers[0] = nullptr; for (size_t i = 0; i < gf->channels; i++) { circlebuf_free(&gf->input_buffers[i]); } } circlebuf_free(&gf->info_buffer); if (gf->captions_monitor.isEnabled()) { gf->captions_monitor.stopThread(); } if (gf->translation_monitor.isEnabled()) { gf->translation_monitor.stopThread(); } bfree(gf); } void transcription_filter_update(void *data, obs_data_t *s) { struct transcription_filter_data *gf = static_cast(data); obs_log(gf->log_level, "LocalVocal filter update"); gf->log_level = (int)obs_data_get_int(s, "log_level"); gf->vad_enabled = obs_data_get_bool(s, "vad_enabled"); gf->log_words = obs_data_get_bool(s, "log_words"); gf->caption_to_stream = obs_data_get_bool(s, "caption_to_stream"); gf->save_to_file = obs_data_get_bool(s, "file_output_enable"); gf->save_srt = obs_data_get_bool(s, "subtitle_save_srt"); gf->truncate_output_file = obs_data_get_bool(s, "truncate_output_file"); gf->save_only_while_recording = obs_data_get_bool(s, "only_while_recording"); gf->rename_file_to_match_recording = obs_data_get_bool(s, "rename_file_to_match_recording"); // Get the current timestamp using the system clock gf->start_timestamp_ms = now_ms(); gf->sentence_number = 1; gf->process_while_muted = obs_data_get_bool(s, "process_while_muted"); gf->min_sub_duration = (int)obs_data_get_int(s, "min_sub_duration"); gf->last_sub_render_time = now_ms(); bool new_buffered_output = obs_data_get_bool(s, "buffered_output"); int new_buffer_num_lines = (int)obs_data_get_int(s, "buffer_num_lines"); int new_buffer_num_chars_per_line = (int)obs_data_get_int(s, "buffer_num_chars_per_line"); TokenBufferSegmentation new_buffer_output_type = (TokenBufferSegmentation)obs_data_get_int(s, "buffer_output_type"); const char *filter_words_replace = obs_data_get_string(s, "filter_words_replace"); if (filter_words_replace != nullptr && strlen(filter_words_replace) > 0) { obs_log(gf->log_level, "filter_words_replace: %s", filter_words_replace); // deserialize the filter words replace gf->filter_words_replace = deserialize_filter_words_replace(filter_words_replace); } else { // clear the filter words replace gf->filter_words_replace.clear(); } if (gf->save_to_file) { gf->output_file_path = ""; // set the output file path const char *output_file_path = obs_data_get_string(s, "subtitle_output_filename"); if (output_file_path != nullptr && strlen(output_file_path) > 0) { gf->output_file_path = output_file_path; } else { obs_log(gf->log_level, "output file path is empty, but selected to save"); } } if (new_buffered_output) { obs_log(gf->log_level, "buffered_output enable"); if (!gf->buffered_output || !gf->captions_monitor.isEnabled()) { obs_log(gf->log_level, "buffered_output currently disabled, enabling"); gf->buffered_output = true; gf->captions_monitor.initialize( gf, [gf](const std::string &text) { if (gf->buffered_output) { send_caption_to_source(gf->text_source_name, text, gf); } }, [gf](const std::string &) {}, new_buffer_num_lines, new_buffer_num_chars_per_line, std::chrono::seconds(3), new_buffer_output_type); gf->translation_monitor.initialize( gf, [gf](const std::string &translated_text) { if (gf->buffered_output && gf->translation_output != "none") { send_caption_to_source(gf->translation_output, translated_text, gf); } }, [gf](const std::string &) {}, new_buffer_num_lines, new_buffer_num_chars_per_line, std::chrono::seconds(3), new_buffer_output_type); } else { if (new_buffer_num_lines != gf->buffered_output_num_lines || new_buffer_num_chars_per_line != gf->buffered_output_num_chars || new_buffer_output_type != gf->buffered_output_output_type) { obs_log(gf->log_level, "buffered_output parameters changed, updating"); gf->captions_monitor.clear(); gf->captions_monitor.setNumSentences(new_buffer_num_lines); gf->captions_monitor.setNumPerSentence( new_buffer_num_chars_per_line); gf->captions_monitor.setSegmentation(new_buffer_output_type); gf->translation_monitor.clear(); gf->translation_monitor.setNumSentences(new_buffer_num_lines); gf->translation_monitor.setNumPerSentence( new_buffer_num_chars_per_line); gf->translation_monitor.setSegmentation(new_buffer_output_type); } } gf->buffered_output_num_lines = new_buffer_num_lines; gf->buffered_output_num_chars = new_buffer_num_chars_per_line; gf->buffered_output_output_type = new_buffer_output_type; } else { obs_log(gf->log_level, "buffered_output disable"); if (gf->buffered_output) { obs_log(gf->log_level, "buffered_output currently enabled, disabling"); if (gf->captions_monitor.isEnabled()) { gf->captions_monitor.clear(); gf->captions_monitor.stopThread(); gf->translation_monitor.clear(); gf->translation_monitor.stopThread(); } gf->buffered_output = false; } } bool new_translate = obs_data_get_bool(s, "translate"); gf->source_lang = obs_data_get_string(s, "translate_source_language"); gf->target_lang = obs_data_get_string(s, "translate_target_language"); gf->translation_ctx.add_context = obs_data_get_bool(s, "translate_add_context"); gf->translation_ctx.input_tokenization_style = (InputTokenizationStyle)obs_data_get_int(s, "translate_input_tokenization_style"); gf->translation_output = obs_data_get_string(s, "translate_output"); std::string new_translate_model_index = obs_data_get_string(s, "translate_model"); std::string new_translation_model_path_external = obs_data_get_string(s, "translation_model_path_external"); if (new_translate) { if (new_translate != gf->translate || new_translate_model_index != gf->translation_model_index || new_translation_model_path_external != gf->translation_model_path_external) { // translation settings changed gf->translation_model_index = new_translate_model_index; gf->translation_model_path_external = new_translation_model_path_external; if (gf->translation_model_index != "whisper-based-translation") { start_translation(gf); } else { // whisper-based translation obs_log(gf->log_level, "Starting whisper-based translation..."); gf->translate = false; } } } else { gf->translate = false; } // translation options if (gf->translate) { if (gf->translation_ctx.options) { gf->translation_ctx.options->sampling_temperature = (float)obs_data_get_double(s, "translation_sampling_temperature"); gf->translation_ctx.options->repetition_penalty = (float)obs_data_get_double(s, "translation_repetition_penalty"); gf->translation_ctx.options->beam_size = (int)obs_data_get_int(s, "translation_beam_size"); gf->translation_ctx.options->max_decoding_length = (int)obs_data_get_int(s, "translation_max_decoding_length"); gf->translation_ctx.options->no_repeat_ngram_size = (int)obs_data_get_int(s, "translation_no_repeat_ngram_size"); gf->translation_ctx.options->max_input_length = (int)obs_data_get_int(s, "translation_max_input_length"); } } obs_log(gf->log_level, "update text source"); // update the text source const char *new_text_source_name = obs_data_get_string(s, "subtitle_sources"); if (new_text_source_name == nullptr || strcmp(new_text_source_name, "none") == 0 || strcmp(new_text_source_name, "(null)") == 0 || strlen(new_text_source_name) == 0) { // new selected text source is not valid, release the old one gf->text_source_name.clear(); } else { gf->text_source_name = new_text_source_name; } obs_log(gf->log_level, "update whisper params"); { std::lock_guard lock(gf->whisper_ctx_mutex); gf->sentence_psum_accept_thresh = (float)obs_data_get_double(s, "sentence_psum_accept_thresh"); gf->whisper_params = whisper_full_default_params( (whisper_sampling_strategy)obs_data_get_int(s, "whisper_sampling_method")); gf->whisper_params.duration_ms = (int)obs_data_get_int(s, "buffer_size_msec"); if (!new_translate || gf->translation_model_index != "whisper-based-translation") { const char *whisper_language_select = obs_data_get_string(s, "whisper_language_select"); gf->whisper_params.language = (whisper_language_select != nullptr && strlen(whisper_language_select) > 0) ? whisper_language_select : "auto"; } else { // take the language from gf->target_lang if (language_codes_2_reverse.count(gf->target_lang) > 0) { gf->whisper_params.language = language_codes_2_reverse[gf->target_lang].c_str(); } else { gf->whisper_params.language = "auto"; } } gf->whisper_params.initial_prompt = obs_data_get_string(s, "initial_prompt") != nullptr ? obs_data_get_string(s, "initial_prompt") : ""; gf->whisper_params.n_threads = (int)obs_data_get_int(s, "n_threads"); gf->whisper_params.n_max_text_ctx = (int)obs_data_get_int(s, "n_max_text_ctx"); gf->whisper_params.translate = obs_data_get_bool(s, "whisper_translate"); gf->whisper_params.no_context = obs_data_get_bool(s, "no_context"); gf->whisper_params.single_segment = obs_data_get_bool(s, "single_segment"); gf->whisper_params.print_special = obs_data_get_bool(s, "print_special"); gf->whisper_params.print_progress = obs_data_get_bool(s, "print_progress"); gf->whisper_params.print_realtime = obs_data_get_bool(s, "print_realtime"); gf->whisper_params.print_timestamps = obs_data_get_bool(s, "print_timestamps"); gf->whisper_params.token_timestamps = obs_data_get_bool(s, "token_timestamps"); gf->whisper_params.thold_pt = (float)obs_data_get_double(s, "thold_pt"); gf->whisper_params.thold_ptsum = (float)obs_data_get_double(s, "thold_ptsum"); gf->whisper_params.max_len = (int)obs_data_get_int(s, "max_len"); gf->whisper_params.split_on_word = obs_data_get_bool(s, "split_on_word"); gf->whisper_params.max_tokens = (int)obs_data_get_int(s, "max_tokens"); gf->whisper_params.speed_up = obs_data_get_bool(s, "speed_up"); gf->whisper_params.suppress_blank = obs_data_get_bool(s, "suppress_blank"); gf->whisper_params.suppress_non_speech_tokens = obs_data_get_bool(s, "suppress_non_speech_tokens"); gf->whisper_params.temperature = (float)obs_data_get_double(s, "temperature"); gf->whisper_params.max_initial_ts = (float)obs_data_get_double(s, "max_initial_ts"); gf->whisper_params.length_penalty = (float)obs_data_get_double(s, "length_penalty"); if (gf->vad_enabled && gf->vad) { const float vad_threshold = (float)obs_data_get_double(s, "vad_threshold"); gf->vad->set_threshold(vad_threshold); } } if (gf->context != nullptr && obs_source_enabled(gf->context)) { if (gf->initial_creation) { obs_log(LOG_INFO, "Initial filter creation and source enabled"); // source was enabled on creation update_whisper_model(gf); gf->active = true; gf->initial_creation = false; } else { // check if the whisper model selection has changed const std::string new_model_path = obs_data_get_string(s, "whisper_model_path") != nullptr ? obs_data_get_string(s, "whisper_model_path") : "Whisper Tiny English (74Mb)"; if (gf->whisper_model_path != new_model_path) { obs_log(LOG_INFO, "New model selected: %s", new_model_path.c_str()); update_whisper_model(gf); } } } } void *transcription_filter_create(obs_data_t *settings, obs_source_t *filter) { obs_log(LOG_INFO, "LocalVocal filter create"); void *data = bmalloc(sizeof(struct transcription_filter_data)); struct transcription_filter_data *gf = new (data) transcription_filter_data(); // Get the number of channels for the input source gf->channels = audio_output_get_channels(obs_get_audio()); gf->sample_rate = audio_output_get_sample_rate(obs_get_audio()); gf->frames = (size_t)((float)gf->sample_rate / (1000.0f / MAX_MS_WORK_BUFFER)); gf->last_num_frames = 0; gf->min_sub_duration = (int)obs_data_get_int(settings, "min_sub_duration"); gf->last_sub_render_time = now_ms(); gf->log_level = (int)obs_data_get_int(settings, "log_level"); gf->save_srt = obs_data_get_bool(settings, "subtitle_save_srt"); gf->truncate_output_file = obs_data_get_bool(settings, "truncate_output_file"); gf->save_only_while_recording = obs_data_get_bool(settings, "only_while_recording"); gf->rename_file_to_match_recording = obs_data_get_bool(settings, "rename_file_to_match_recording"); gf->process_while_muted = obs_data_get_bool(settings, "process_while_muted"); gf->buffered_output = obs_data_get_bool(settings, "buffered_output"); for (size_t i = 0; i < gf->channels; i++) { circlebuf_init(&gf->input_buffers[i]); } circlebuf_init(&gf->info_buffer); circlebuf_init(&gf->whisper_buffer); // allocate copy buffers gf->copy_buffers[0] = static_cast(bzalloc(gf->channels * gf->frames * sizeof(float))); if (gf->copy_buffers[0] == nullptr) { obs_log(LOG_ERROR, "Failed to allocate copy buffer"); gf->active = false; return nullptr; } for (size_t c = 1; c < gf->channels; c++) { // set the channel pointers gf->copy_buffers[c] = gf->copy_buffers[0] + c * gf->frames; } memset(gf->copy_buffers[0], 0, gf->channels * gf->frames * sizeof(float)); gf->context = filter; obs_log(gf->log_level, "channels %d, frames %d, sample_rate %d", (int)gf->channels, (int)gf->frames, gf->sample_rate); obs_log(gf->log_level, "setup audio resampler"); struct resample_info src, dst; src.samples_per_sec = gf->sample_rate; src.format = AUDIO_FORMAT_FLOAT_PLANAR; src.speakers = convert_speaker_layout((uint8_t)gf->channels); dst.samples_per_sec = WHISPER_SAMPLE_RATE; dst.format = AUDIO_FORMAT_FLOAT_PLANAR; dst.speakers = convert_speaker_layout((uint8_t)1); gf->resampler_to_whisper = audio_resampler_create(&dst, &src); if (!gf->resampler_to_whisper) { obs_log(LOG_ERROR, "Failed to create resampler"); gf->active = false; return nullptr; } obs_log(gf->log_level, "clear text source data"); const char *subtitle_sources = obs_data_get_string(settings, "subtitle_sources"); if (subtitle_sources == nullptr || strlen(subtitle_sources) == 0 || strcmp(subtitle_sources, "none") == 0 || strcmp(subtitle_sources, "(null)") == 0) { obs_log(gf->log_level, "Create text source"); create_obs_text_source_if_needed(); gf->text_source_name = "LocalVocal Subtitles"; obs_data_set_string(settings, "subtitle_sources", "LocalVocal Subtitles"); } else { // set the text source name gf->text_source_name = subtitle_sources; } obs_log(gf->log_level, "clear paths and whisper context"); gf->whisper_model_file_currently_loaded = ""; gf->output_file_path = std::string(""); gf->whisper_model_path = std::string(""); // The update function will set the model path gf->whisper_context = nullptr; signal_handler_t *sh_filter = obs_source_get_signal_handler(gf->context); if (sh_filter == nullptr) { obs_log(LOG_ERROR, "Failed to get signal handler"); gf->active = false; return nullptr; } signal_handler_connect(sh_filter, "enable", enable_callback, gf); obs_log(gf->log_level, "run update"); // get the settings updated on the filter data struct transcription_filter_update(gf, settings); // handle the event OBS_FRONTEND_EVENT_RECORDING_STARTING to reset the srt sentence number // to match the subtitles with the recording obs_frontend_add_event_callback(recording_state_callback, gf); obs_log(gf->log_level, "filter created."); return gf; } void transcription_filter_activate(void *data) { struct transcription_filter_data *gf = static_cast(data); obs_log(gf->log_level, "filter activated"); gf->active = true; } void transcription_filter_deactivate(void *data) { struct transcription_filter_data *gf = static_cast(data); obs_log(gf->log_level, "filter deactivated"); gf->active = false; } void transcription_filter_show(void *data) { struct transcription_filter_data *gf = static_cast(data); obs_log(gf->log_level, "filter show"); } void transcription_filter_hide(void *data) { struct transcription_filter_data *gf = static_cast(data); obs_log(gf->log_level, "filter hide"); } void transcription_filter_defaults(obs_data_t *s) { obs_log(LOG_DEBUG, "filter defaults"); obs_data_set_default_bool(s, "buffered_output", false); obs_data_set_default_int(s, "buffer_num_lines", 2); obs_data_set_default_int(s, "buffer_num_chars_per_line", 30); obs_data_set_default_int(s, "buffer_output_type", (int)TokenBufferSegmentation::SEGMENTATION_TOKEN); obs_data_set_default_bool(s, "vad_enabled", true); obs_data_set_default_double(s, "vad_threshold", 0.65); obs_data_set_default_int(s, "log_level", LOG_DEBUG); obs_data_set_default_bool(s, "log_words", false); obs_data_set_default_bool(s, "caption_to_stream", false); obs_data_set_default_string(s, "whisper_model_path", "Whisper Tiny English (74Mb)"); obs_data_set_default_string(s, "whisper_language_select", "en"); obs_data_set_default_string(s, "subtitle_sources", "none"); obs_data_set_default_bool(s, "process_while_muted", false); obs_data_set_default_bool(s, "subtitle_save_srt", false); obs_data_set_default_bool(s, "truncate_output_file", false); obs_data_set_default_bool(s, "only_while_recording", false); obs_data_set_default_bool(s, "rename_file_to_match_recording", true); obs_data_set_default_int(s, "min_sub_duration", 3000); obs_data_set_default_bool(s, "advanced_settings", false); obs_data_set_default_bool(s, "translate", false); obs_data_set_default_string(s, "translate_target_language", "__es__"); obs_data_set_default_string(s, "translate_source_language", "__en__"); obs_data_set_default_bool(s, "translate_add_context", true); obs_data_set_default_string(s, "translate_model", "whisper-based-translation"); obs_data_set_default_string(s, "translation_model_path_external", ""); obs_data_set_default_int(s, "translate_input_tokenization_style", INPUT_TOKENIZAION_M2M100); obs_data_set_default_double(s, "sentence_psum_accept_thresh", 0.4); // translation options obs_data_set_default_double(s, "translation_sampling_temperature", 0.1); obs_data_set_default_double(s, "translation_repetition_penalty", 2.0); obs_data_set_default_int(s, "translation_beam_size", 1); obs_data_set_default_int(s, "translation_max_decoding_length", 65); obs_data_set_default_int(s, "translation_no_repeat_ngram_size", 1); obs_data_set_default_int(s, "translation_max_input_length", 65); // Whisper parameters obs_data_set_default_int(s, "whisper_sampling_method", WHISPER_SAMPLING_BEAM_SEARCH); obs_data_set_default_string(s, "initial_prompt", ""); obs_data_set_default_int(s, "n_threads", 4); obs_data_set_default_int(s, "n_max_text_ctx", 16384); obs_data_set_default_bool(s, "whisper_translate", false); obs_data_set_default_bool(s, "no_context", true); obs_data_set_default_bool(s, "single_segment", true); obs_data_set_default_bool(s, "print_special", false); obs_data_set_default_bool(s, "print_progress", false); obs_data_set_default_bool(s, "print_realtime", false); obs_data_set_default_bool(s, "print_timestamps", false); obs_data_set_default_bool(s, "token_timestamps", false); obs_data_set_default_bool(s, "dtw_token_timestamps", false); obs_data_set_default_double(s, "thold_pt", 0.01); obs_data_set_default_double(s, "thold_ptsum", 0.01); obs_data_set_default_int(s, "max_len", 0); obs_data_set_default_bool(s, "split_on_word", true); obs_data_set_default_int(s, "max_tokens", 0); obs_data_set_default_bool(s, "speed_up", false); obs_data_set_default_bool(s, "suppress_blank", false); obs_data_set_default_bool(s, "suppress_non_speech_tokens", true); obs_data_set_default_double(s, "temperature", 0.1); obs_data_set_default_double(s, "max_initial_ts", 1.0); obs_data_set_default_double(s, "length_penalty", -1.0); }