From 7bccfd96d6f6728b9052b5a428a6fcaefa6f634a Mon Sep 17 00:00:00 2001
From: Roy Shilkrot
Date: Mon, 3 Jun 2024 00:34:38 -0400
Subject: [PATCH] refactor: Remove unused utils.h and utils.cpp files

Fold split_words() into transcription-utils and move convert_speaker_layout()
plus the "LocalVocal Subtitles" text-source creation into a new
transcription-filter-utils module. Also extract the recording-state frontend
callback, tie the buffered-output caption thread lifecycle to its setting,
react to parent media-source signals, and pad whisper inference input with
leading/trailing silence.

---
 CMakeLists.txt                            |   7 +-
 src/tests/localvocal-offline-test.cpp     |  18 +-
 src/transcription-filter-callbacks.cpp    |  38 +++-
 src/transcription-filter-callbacks.h      |   2 +
 src/transcription-filter-data.h           |   1 +
 src/transcription-filter-utils.cpp        |  55 +++++
 src/transcription-filter-utils.h          |  33 +++
 src/transcription-filter.cpp              | 248 ++++++++++------------
 src/transcription-utils.cpp               |  20 ++
 src/transcription-utils.h                 |  34 +--
 src/utils.cpp                             |  21 --
 src/utils.h                               |   9 -
 src/whisper-utils/token-buffer-thread.cpp |  35 ++-
 src/whisper-utils/token-buffer-thread.h   |   7 +-
 src/whisper-utils/whisper-model-utils.cpp |   4 -
 src/whisper-utils/whisper-processing.cpp  |  14 +-
 16 files changed, 330 insertions(+), 216 deletions(-)
 create mode 100644 src/transcription-filter-utils.cpp
 create mode 100644 src/transcription-filter-utils.h
 delete mode 100644 src/utils.cpp
 delete mode 100644 src/utils.h

diff --git a/CMakeLists.txt b/CMakeLists.txt
index 063ec64..8b58b07 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -88,6 +88,7 @@ target_sources(
   src/transcription-filter.cpp
   src/transcription-filter.c
   src/transcription-filter-callbacks.cpp
+  src/transcription-filter-utils.cpp
   src/transcription-utils.cpp
   src/model-utils/model-downloader.cpp
   src/model-utils/model-downloader-ui.cpp
@@ -100,8 +101,7 @@ target_sources(
   src/whisper-utils/token-buffer-thread.cpp
   src/translation/language_codes.cpp
   src/translation/translation.cpp
-  src/translation/translation-utils.cpp
-  src/utils.cpp)
+  src/translation/translation-utils.cpp)

 set_target_properties_plugin(${CMAKE_PROJECT_NAME} PROPERTIES OUTPUT_NAME ${_name})

@@ -122,8 +122,7 @@ if(ENABLE_TESTS)
     src/whisper-utils/silero-vad-onnx.cpp
     src/whisper-utils/token-buffer-thread.cpp
     src/translation/language_codes.cpp
-    src/translation/translation.cpp
-    src/utils.cpp)
+    src/translation/translation.cpp)

   find_libav(${CMAKE_PROJECT_NAME}-tests)

diff --git a/src/tests/localvocal-offline-test.cpp b/src/tests/localvocal-offline-test.cpp
index f36c12f..7bfd777 100644
--- a/src/tests/localvocal-offline-test.cpp
+++ b/src/tests/localvocal-offline-test.cpp
@@ -13,6 +13,7 @@
 #include
 #include "transcription-filter-data.h"
+#include "transcription-filter-utils.h"
 #include "transcription-filter.h"
 #include "transcription-utils.h"
 #include "whisper-utils/whisper-utils.h"
@@ -155,11 +156,12 @@ create_context(int sample_rate, int channels, const std::string &whisper_model_p
 	gf->whisper_params = whisper_full_default_params(whisper_sampling_method);
 	gf->whisper_params.duration_ms = 3000;
 	gf->whisper_params.language = "en";
+	gf->whisper_params.detect_language = false;
 	gf->whisper_params.initial_prompt = "";
 	gf->whisper_params.n_threads = 4;
 	gf->whisper_params.n_max_text_ctx = 16384;
 	gf->whisper_params.translate = false;
-	gf->whisper_params.no_context = true;
+	gf->whisper_params.no_context = false;
 	gf->whisper_params.single_segment = true;
 	gf->whisper_params.print_special = false;
 	gf->whisper_params.print_progress = false;
@@ -174,7 +176,7 @@
 	gf->whisper_params.speed_up = false;
 	gf->whisper_params.suppress_blank = true;
 	gf->whisper_params.suppress_non_speech_tokens = true;
-	gf->whisper_params.temperature = 0.1;
+	gf->whisper_params.temperature = 0.0;
 	gf->whisper_params.max_initial_ts = 1.0;
 	gf->whisper_params.length_penalty = -1;
 	gf->active = true;
@@ -201,7 +203,7 @@ void audio_chunk_callback(struct transcription_filter_data *gf, const float *pcm
 	// 	numeral = "0" + numeral;
 	// }

-	// save the audio to a .wav file
+	// // save the audio to a .wav file
 	// std::string filename = "audio_chunk_" + numeral + vad_state_str + ".wav";
 	// obs_log(gf->log_level, "Saving %lu frames to %s", frames, filename.c_str());
 	// write_audio_wav_file(filename.c_str(), pcm32f_data, frames);
@@ -388,6 +390,16 @@ int wmain(int argc, wchar_t *argv[])
 		gf->enable_audio_chunks_callback = config["enable_audio_chunks_callback"];
 	}
+	if (config.contains("temperature")) {
+		obs_log(LOG_INFO, "Setting temperature to %f",
+			config["temperature"].get<float>());
+		gf->whisper_params.temperature = config["temperature"].get<float>();
+	}
+	if (config.contains("no_context")) {
+		obs_log(LOG_INFO, "Setting no_context to %s",
+			config["no_context"] ? "true" : "false");
+		gf->whisper_params.no_context = config["no_context"];
+	}
 	// set log level
 	if (logLevelStr == "debug") {
 		gf->log_level = LOG_DEBUG;
diff --git a/src/transcription-filter-callbacks.cpp b/src/transcription-filter-callbacks.cpp
index 8fddafd..f386e16 100644
--- a/src/transcription-filter-callbacks.cpp
+++ b/src/transcription-filter-callbacks.cpp
@@ -20,16 +20,19 @@

 #define SEND_TIMED_METADATA_URL "http://localhost:8080/timed-metadata"

-void send_caption_to_source(const std::string &target_source_name, const std::string &str_copy,
+void send_caption_to_source(const std::string &target_source_name, const std::string &caption,
 			    struct transcription_filter_data *gf)
 {
+	if (target_source_name.empty()) {
+		return;
+	}
 	auto target = obs_get_source_by_name(target_source_name.c_str());
 	if (!target) {
 		obs_log(gf->log_level, "text_source target is null");
 		return;
 	}
 	auto text_settings = obs_source_get_settings(target);
-	obs_data_set_string(text_settings, "text", str_copy.c_str());
+	obs_data_set_string(text_settings, "text", caption.c_str());
 	obs_source_update(target, text_settings);
 	obs_source_release(target);
 }
@@ -228,3 +231,34 @@ void set_text_callback(struct transcription_filter_data *gf,
 		}
 	}
 };
+
+void recording_state_callback(enum obs_frontend_event event, void *data)
+{
+	struct transcription_filter_data *gf_ =
+		static_cast<struct transcription_filter_data *>(data);
+	if (event == OBS_FRONTEND_EVENT_RECORDING_STARTING) {
+		if (gf_->save_srt && gf_->save_only_while_recording) {
+			obs_log(gf_->log_level, "Recording started. Resetting srt file.");
+			// truncate file if it exists
+			std::ofstream output_file(gf_->output_file_path,
+						  std::ios::out | std::ios::trunc);
+			output_file.close();
+			gf_->sentence_number = 1;
+			gf_->start_timestamp_ms = now_ms();
+		}
+	} else if (event == OBS_FRONTEND_EVENT_RECORDING_STOPPED) {
+		if (gf_->save_srt && gf_->save_only_while_recording &&
+		    gf_->rename_file_to_match_recording) {
+			obs_log(gf_->log_level, "Recording stopped. Rename srt file.");
+			// rename file to match the recording file name with .srt extension
+			// use obs_frontend_get_last_recording to get the last recording file name
+			std::string recording_file_name = obs_frontend_get_last_recording();
+			// remove the extension
+			recording_file_name = recording_file_name.substr(
+				0, recording_file_name.find_last_of("."));
+			std::string srt_file_name = recording_file_name + ".srt";
+			// rename the file
+			std::rename(gf_->output_file_path.c_str(), srt_file_name.c_str());
+		}
+	}
+}
diff --git a/src/transcription-filter-callbacks.h b/src/transcription-filter-callbacks.h
index 656b140..481af9f 100644
--- a/src/transcription-filter-callbacks.h
+++ b/src/transcription-filter-callbacks.h
@@ -15,4 +15,6 @@ void audio_chunk_callback(struct transcription_filter_data *gf, const float *pcm
 void set_text_callback(struct transcription_filter_data *gf,
 		       const DetectionResultWithText &resultIn);

+void recording_state_callback(enum obs_frontend_event event, void *data);
+
 #endif /* TRANSCRIPTION_FILTER_CALLBACKS_H */
diff --git a/src/transcription-filter-data.h b/src/transcription-filter-data.h
index 572e596..3ce5ae3 100644
--- a/src/transcription-filter-data.h
+++ b/src/transcription-filter-data.h
@@ -80,6 +80,7 @@ struct transcription_filter_data {
 	bool fix_utf8 = true;
 	bool enable_audio_chunks_callback = false;
 	bool send_timed_metadata = false;
+	bool source_signals_set = false;

 	// Last transcription result
 	std::string last_text;
diff --git a/src/transcription-filter-utils.cpp b/src/transcription-filter-utils.cpp
new file mode 100644
index 0000000..72f313c
--- /dev/null
+++ b/src/transcription-filter-utils.cpp
@@ -0,0 +1,55 @@
+#include "transcription-filter-utils.h"
+
+#include
+#include
+#include
+
+void create_obs_text_source()
+{
+	// create a new OBS text source called "LocalVocal Subtitles"
+	obs_source_t *scene_as_source = obs_frontend_get_current_scene();
+	obs_scene_t *scene = obs_scene_from_source(scene_as_source);
+#ifdef _WIN32
+	obs_source_t *source =
+		obs_source_create("text_gdiplus_v2", "LocalVocal Subtitles", nullptr, nullptr);
+#else
+	obs_source_t *source =
+		obs_source_create("text_ft2_source_v2", "LocalVocal Subtitles", nullptr, nullptr);
+#endif
+	if (source) {
+		// add source to the current scene
+		obs_scene_add(scene, source);
+		// set source settings
+		obs_data_t *source_settings = obs_source_get_settings(source);
+		obs_data_set_bool(source_settings, "word_wrap", true);
+		obs_data_set_int(source_settings, "custom_width", 1760);
+		obs_data_t *font_data = obs_data_create();
+		obs_data_set_string(font_data, "face", "Arial");
+		obs_data_set_string(font_data, "style", "Regular");
+		obs_data_set_int(font_data, "size", 72);
+		obs_data_set_int(font_data, "flags", 0);
+		obs_data_set_obj(source_settings, "font", font_data);
+		obs_data_release(font_data);
+		obs_source_update(source, source_settings);
+		obs_data_release(source_settings);
+
+		// set transform settings
+		obs_transform_info transform_info;
+		transform_info.pos.x = 962.0;
+		transform_info.pos.y = 959.0;
+		transform_info.bounds.x = 1769.0;
+		transform_info.bounds.y = 145.0;
+		transform_info.bounds_type = obs_bounds_type::OBS_BOUNDS_SCALE_INNER;
+		transform_info.bounds_alignment = OBS_ALIGN_CENTER;
+		transform_info.alignment = OBS_ALIGN_CENTER;
+		transform_info.scale.x = 1.0;
+		transform_info.scale.y = 1.0;
+		transform_info.rot = 0.0;
+		obs_sceneitem_t *source_sceneitem = obs_scene_sceneitem_from_source(scene, source);
+		obs_sceneitem_set_info(source_sceneitem, &transform_info);
+		obs_sceneitem_release(source_sceneitem);
+
+		obs_source_release(source);
+	}
+	obs_source_release(scene_as_source);
+}
diff --git a/src/transcription-filter-utils.h b/src/transcription-filter-utils.h
new file mode 100644
index 0000000..9f24d55
--- /dev/null
+++ b/src/transcription-filter-utils.h
@@ -0,0 +1,33 @@
+#ifndef TRANSCRIPTION_FILTER_UTILS_H
+#define TRANSCRIPTION_FILTER_UTILS_H
+
+#include
+
+// Convert channels number to a speaker layout
+inline enum speaker_layout convert_speaker_layout(uint8_t channels)
+{
+	switch (channels) {
+	case 0:
+		return SPEAKERS_UNKNOWN;
+	case 1:
+		return SPEAKERS_MONO;
+	case 2:
+		return SPEAKERS_STEREO;
+	case 3:
+		return SPEAKERS_2POINT1;
+	case 4:
+		return SPEAKERS_4POINT0;
+	case 5:
+		return SPEAKERS_4POINT1;
+	case 6:
+		return SPEAKERS_5POINT1;
+	case 8:
+		return SPEAKERS_7POINT1;
+	default:
+		return SPEAKERS_UNKNOWN;
+	}
+}
+
+void create_obs_text_source();
+
+#endif // TRANSCRIPTION_FILTER_UTILS_H
diff --git a/src/transcription-filter.cpp b/src/transcription-filter.cpp
index 5e3a5dc..319573a 100644
--- a/src/transcription-filter.cpp
+++ b/src/transcription-filter.cpp
@@ -18,6 +18,7 @@
 #include "transcription-filter.h"
 #include "transcription-filter-callbacks.h"
 #include "transcription-filter-data.h"
+#include "transcription-filter-utils.h"
 #include "transcription-utils.h"
 #include "model-utils/model-downloader.h"
 #include "whisper-utils/whisper-processing.h"
@@ -28,7 +29,6 @@
 #include "translation/translation-utils.h"
 #include "translation/translation.h"
 #include "translation/translation-includes.h"
-#include "utils.h"

 bool add_sources_to_list(void *list_property, obs_source_t *source)
 {
@@ -44,6 +44,71 @@ bool add_sources_to_list(void *list_property, obs_source_t *source)
 	return true;
 }

+void set_source_signals(transcription_filter_data *gf, obs_source_t *parent_source)
+{
+	obs_log(LOG_INFO, "parent source name: %s", obs_source_get_name(parent_source));
+	signal_handler_t *sh = obs_source_get_signal_handler(parent_source);
+	signal_handler_connect(
+		sh, "media_play",
+		[](void *data_, calldata_t *cd) {
+			obs_log(LOG_INFO, "media_play");
+			transcription_filter_data *gf_ =
+				static_cast<transcription_filter_data *>(data_);
+			gf_->active = true;
+		},
+		gf);
+	signal_handler_connect(
+		sh, "media_started",
+		[](void *data_, calldata_t *cd) {
+			obs_log(LOG_INFO, "media_started");
+			transcription_filter_data *gf_ =
+				static_cast<transcription_filter_data *>(data_);
+			gf_->active = true;
+		},
+		gf);
+	signal_handler_connect(
+		sh, "media_pause",
+		[](void *data_, calldata_t *cd) {
+			obs_log(LOG_INFO, "media_pause");
+			transcription_filter_data *gf_ =
+				static_cast<transcription_filter_data *>(data_);
+			gf_->active = false;
+		},
+		gf);
+	signal_handler_connect(
+		sh, "media_restart",
+		[](void *data_, calldata_t *cd) {
+			obs_log(LOG_INFO, "media_restart");
+			transcription_filter_data *gf_ =
+				static_cast<transcription_filter_data *>(data_);
+			gf_->active = true;
+			gf_->captions_monitor.clear();
+			send_caption_to_source(gf_->text_source_name, "", gf_);
+		},
+		gf);
+	signal_handler_connect(
+		sh, "media_stopped",
+		[](void *data_, calldata_t *cd) {
+			obs_log(LOG_INFO, "media_stopped");
+			transcription_filter_data *gf_ =
+				static_cast<transcription_filter_data *>(data_);
+			gf_->active = false;
+			gf_->captions_monitor.clear();
+			send_caption_to_source(gf_->text_source_name, "", gf_);
+			// flush the buffer
+			{
+				std::lock_guard<std::mutex> lock(gf_->whisper_buf_mutex);
+				for (size_t c = 0; c < gf_->channels; c++) {
+					circlebuf_free(&gf_->input_buffers[c]);
+				}
+				circlebuf_free(&gf_->info_buffer);
+				circlebuf_free(&gf_->whisper_buffer);
+			}
+		},
+		gf);
+	gf->source_signals_set = true;
+}
+
 struct obs_audio_data *transcription_filter_filter_audio(void *data, struct obs_audio_data *audio)
 {
 	if (!audio) {
@@ -56,14 +121,16 @@ struct obs_audio_data *transcription_filter_filter_audio(void *data, struct obs_
 	struct transcription_filter_data *gf =
 		static_cast<struct transcription_filter_data *>(data);

-	if (!gf->active) {
-		return audio;
+	// Lazy initialization of source signals
+	if (!gf->source_signals_set) {
+		// obs_filter_get_parent only works in the filter function
+		obs_source_t *parent_source = obs_filter_get_parent(gf->context);
+		if (parent_source != nullptr) {
+			set_source_signals(gf, parent_source);
+		}
 	}

-	// Check if the parent source is muted
-	obs_source_t *parent_source = obs_filter_get_parent(gf->context);
-	if (gf->process_while_muted == false && obs_source_muted(parent_source)) {
-		// Source is muted, do not process audio
+	if (!gf->active) {
 		return audio;
 	}

@@ -72,6 +139,17 @@
 		return audio;
 	}

+	// Check if process while muted is not enabled (e.g. the user wants to avoid processing audio
+	// when the source is muted)
+	if (!gf->process_while_muted) {
+		// Check if the parent source is muted
+		obs_source_t *parent_source = obs_filter_get_parent(gf->context);
+		if (parent_source != nullptr && obs_source_muted(parent_source)) {
+			// Source is muted, do not process audio
+			return audio;
+		}
+	}
+
 	{
 		std::lock_guard<std::mutex> lock(gf->whisper_buf_mutex); // scoped lock
 		// push back current audio data to input circlebuf
@@ -122,12 +200,11 @@ void transcription_filter_destroy(void *data)

 void transcription_filter_update(void *data, obs_data_t *s)
 {
+	obs_log(LOG_INFO, "LocalVocal filter update");
 	struct transcription_filter_data *gf =
 		static_cast<struct transcription_filter_data *>(data);

 	gf->log_level = (int)obs_data_get_int(s, "log_level");
-	obs_log(gf->log_level, "filter update");
-
 	gf->vad_enabled = obs_data_get_bool(s, "vad_enabled");
 	gf->log_words = obs_data_get_bool(s, "log_words");
 	gf->caption_to_stream = obs_data_get_bool(s, "caption_to_stream");
@@ -142,7 +219,34 @@ void transcription_filter_update(void *data, obs_data_t *s)
 	gf->process_while_muted = obs_data_get_bool(s, "process_while_muted");
 	gf->min_sub_duration = (int)obs_data_get_int(s, "min_sub_duration");
 	gf->last_sub_render_time = 0;
-	gf->buffered_output = obs_data_get_bool(s, "buffered_output");
+	bool new_buffered_output = obs_data_get_bool(s, "buffered_output");
+
+	if (new_buffered_output) {
+		obs_log(LOG_INFO, "buffered_output enable");
+		if (!gf->buffered_output || !gf->captions_monitor.isEnabled()) {
+			obs_log(LOG_INFO, "buffered_output currently disabled, enabling");
+			gf->buffered_output = true;
+			gf->captions_monitor.initialize(
+				gf,
+				[gf](const std::string &text) {
+					if (gf->buffered_output) {
+						send_caption_to_source(gf->text_source_name, text,
+								       gf);
+					}
+				},
+				2, 30, std::chrono::seconds(10));
+		}
+	} else {
+		obs_log(LOG_INFO, "buffered_output disable");
+		if (gf->buffered_output) {
+			obs_log(LOG_INFO, "buffered_output currently enabled, disabling");
+			if (gf->captions_monitor.isEnabled()) {
+				gf->captions_monitor.clear();
+				gf->captions_monitor.stopThread();
+			}
+			gf->buffered_output = false;
+		}
+	}

 	bool new_translate = obs_data_get_bool(s, "translate");
 	gf->source_lang = obs_data_get_string(s, "translate_source_language");
@@ -195,7 +299,6 @@
 	obs_log(gf->log_level, "update text source");
 	// update the text source
 	const char *new_text_source_name = obs_data_get_string(s, "subtitle_sources");
-	obs_weak_source_t *old_weak_text_source = NULL;

 	if (new_text_source_name == nullptr || strcmp(new_text_source_name, "none") == 0 ||
 	    strcmp(new_text_source_name, "(null)") == 0 ||
@@ -212,16 +315,7 @@
 			}
 		}
 	} else {
-		// new selected text source is valid, check if it's different from the old one
-		if (gf->text_source_name != new_text_source_name) {
-			// new text source is different from the old one, release the old one
-			gf->text_source_name = new_text_source_name;
-		}
-	}
-
-	if (old_weak_text_source) {
-		obs_log(gf->log_level, "releasing old text source");
-		obs_weak_source_release(old_weak_text_source);
+		gf->text_source_name = new_text_source_name;
 	}

 	obs_log(gf->log_level, "update whisper model");
@@ -333,53 +427,7 @@ void *transcription_filter_create(obs_data_t *settings, obs_source_t *filter)
 		obs_source_release(source);
 	} else {
 		// create a new OBS text source called "LocalVocal Subtitles"
-		obs_source_t *scene_as_source = obs_frontend_get_current_scene();
-		obs_scene_t *scene = obs_scene_from_source(scene_as_source);
-#ifdef _WIN32
-		source = obs_source_create("text_gdiplus_v2", "LocalVocal Subtitles",
-					   nullptr, nullptr);
-#else
-		source = obs_source_create("text_ft2_source_v2", "LocalVocal Subtitles",
-					   nullptr, nullptr);
-#endif
-		if (source) {
-			// add source to the current scene
-			obs_scene_add(scene, source);
-			// set source settings
-			obs_data_t *source_settings = obs_source_get_settings(source);
-			obs_data_set_bool(source_settings, "word_wrap", true);
-			obs_data_set_int(source_settings, "custom_width", 1760);
-			obs_data_t *font_data = obs_data_create();
-			obs_data_set_string(font_data, "face", "Arial");
-			obs_data_set_string(font_data, "style", "Regular");
-			obs_data_set_int(font_data, "size", 72);
-			obs_data_set_int(font_data, "flags", 0);
-			obs_data_set_obj(source_settings, "font", font_data);
-			obs_data_release(font_data);
-			obs_source_update(source, source_settings);
-			obs_data_release(source_settings);
-
-			// set transform settings
-			obs_transform_info transform_info;
-			transform_info.pos.x = 962.0;
-			transform_info.pos.y = 959.0;
-			transform_info.bounds.x = 1769.0;
-			transform_info.bounds.y = 145.0;
-			transform_info.bounds_type =
-				obs_bounds_type::OBS_BOUNDS_SCALE_INNER;
-			transform_info.bounds_alignment = OBS_ALIGN_CENTER;
-			transform_info.alignment = OBS_ALIGN_CENTER;
-			transform_info.scale.x = 1.0;
-			transform_info.scale.y = 1.0;
-			transform_info.rot = 0.0;
-			obs_sceneitem_t *source_sceneitem =
-				obs_scene_sceneitem_from_source(scene, source);
-			obs_sceneitem_set_info(source_sceneitem, &transform_info);
-			obs_sceneitem_release(source_sceneitem);
-
-			obs_source_release(source);
-		}
-		obs_source_release(scene_as_source);
+		create_obs_text_source();
 	}
 	gf->text_source_name = "LocalVocal Subtitles";
 	obs_data_set_string(settings, "subtitle_sources", "LocalVocal Subtitles");
@@ -393,15 +441,6 @@ void *transcription_filter_create(obs_data_t *settings, obs_source_t *filter)
 	gf->whisper_model_path = std::string(""); // The update function will set the model path
 	gf->whisper_context = nullptr;

-	gf->captions_monitor.initialize(
-		gf,
-		[gf](const std::string &text) {
-			if (gf->buffered_output) {
-				send_caption_to_source(gf->text_source_name, text, gf);
-			}
-		},
-		2, 30, std::chrono::seconds(10));
-
 	obs_log(gf->log_level, "run update");
 	// get the settings updated on the filter data struct
 	transcription_filter_update(gf, settings);
@@ -410,45 +449,7 @@
 	// handle the event OBS_FRONTEND_EVENT_RECORDING_STARTING to reset the srt sentence number
 	// to match the subtitles with the recording
-	obs_frontend_add_event_callback(
-		[](enum obs_frontend_event event, void *private_data) {
-			if (event == OBS_FRONTEND_EVENT_RECORDING_STARTING) {
-				struct transcription_filter_data *gf_ =
-					static_cast<struct transcription_filter_data *>(
-						private_data);
-				if (gf_->save_srt && gf_->save_only_while_recording) {
-					obs_log(gf_->log_level,
-						"Recording started. Resetting srt file.");
-					// truncate file if it exists
-					std::ofstream output_file(gf_->output_file_path,
-								  std::ios::out | std::ios::trunc);
-					output_file.close();
-					gf_->sentence_number = 1;
-					gf_->start_timestamp_ms = now_ms();
-				}
-			} else if (event == OBS_FRONTEND_EVENT_RECORDING_STOPPED) {
-				struct transcription_filter_data *gf_ =
-					static_cast<struct transcription_filter_data *>(
-						private_data);
-				if (gf_->save_srt && gf_->save_only_while_recording &&
-				    gf_->rename_file_to_match_recording) {
-					obs_log(gf_->log_level,
-						"Recording stopped. Rename srt file.");
-					// rename file to match the recording file name with .srt extension
-					// use obs_frontend_get_last_recording to get the last recording file name
-					std::string recording_file_name =
-						obs_frontend_get_last_recording();
-					// remove the extension
-					recording_file_name = recording_file_name.substr(
-						0, recording_file_name.find_last_of("."));
-					std::string srt_file_name = recording_file_name + ".srt";
-					// rename the file
-					std::rename(gf_->output_file_path.c_str(),
-						    srt_file_name.c_str());
-				}
-			}
-		},
-		gf);
+	obs_frontend_add_event_callback(recording_state_callback, gf);

 	obs_log(gf->log_level, "filter created.");
 	return gf;
@@ -541,7 +542,7 @@ void transcription_filter_defaults(obs_data_t *s)
 	obs_data_set_default_double(s, "thold_ptsum", 0.01);
 	obs_data_set_default_int(s, "max_len", 0);
 	obs_data_set_default_bool(s, "split_on_word", true);
-	obs_data_set_default_int(s, "max_tokens", 32);
+	obs_data_set_default_int(s, "max_tokens", 0);
 	obs_data_set_default_bool(s, "speed_up", false);
 	obs_data_set_default_bool(s, "suppress_blank", false);
 	obs_data_set_default_bool(s, "suppress_non_speech_tokens", true);
@@ -776,19 +777,6 @@ obs_properties_t *transcription_filter_properties(void *data)
 	obs_property_t *buffered_output_prop =
 		obs_properties_add_bool(ppts, "buffered_output", MT_("buffered_output"));

-	// add on-change handler for buffered_output
-	obs_property_set_modified_callback(buffered_output_prop, [](obs_properties_t *props,
-								     obs_property_t *property,
-								     obs_data_t *settings) {
-		UNUSED_PARAMETER(property);
-		UNUSED_PARAMETER(props);
-		// if buffered output is enabled set the overlap to max else set it to default
-		obs_data_set_int(settings, "overlap_size_msec",
-				 obs_data_get_bool(settings, "buffered_output")
-					 ? MAX_OVERLAP_SIZE_MSEC
-					 : DEFAULT_OVERLAP_SIZE_MSEC);
-		return true;
-	});

 	obs_properties_add_bool(ppts, "log_words", MT_("log_words"));
 	obs_properties_add_bool(ppts, "caption_to_stream", MT_("caption_to_stream"));
diff --git a/src/transcription-utils.cpp b/src/transcription-utils.cpp
index ca9e0f1..415b47b 100644
--- a/src/transcription-utils.cpp
+++ b/src/transcription-utils.cpp
@@ -117,3 +117,23 @@ std::vector<std::string> split(const std::string &string, char delimiter)
 	}
 	return tokens;
 }
+
+std::vector<std::string> split_words(const std::string &str_copy)
+{
+	std::vector<std::string> words;
+	std::string word;
+	for (char c : str_copy) {
+		if (std::isspace(c)) {
+			if (!word.empty()) {
+				words.push_back(word);
+				word.clear();
+			}
+		} else {
+			word += c;
+		}
+	}
+	if (!word.empty()) {
+		words.push_back(word);
+	}
+	return words;
+}
diff --git a/src/transcription-utils.h b/src/transcription-utils.h
index e5eb274..4e7f39c 100644
--- a/src/transcription-utils.h
+++ b/src/transcription-utils.h
@@ -4,36 +4,17 @@
 #include
 #include
 #include
-#include

+// Fix UTF8 string for Windows
 std::string fix_utf8(const std::string &str);
+
+// Remove leading and trailing non-alphabetic characters
 std::string remove_leading_trailing_nonalpha(const std::string &str);
+
+// Split a string by a delimiter
 std::vector<std::string> split(const std::string &string, char delimiter);

-inline enum speaker_layout convert_speaker_layout(uint8_t channels)
-{
-	switch (channels) {
-	case 0:
-		return SPEAKERS_UNKNOWN;
-	case 1:
-		return SPEAKERS_MONO;
-	case 2:
-		return SPEAKERS_STEREO;
-	case 3:
-		return SPEAKERS_2POINT1;
-	case 4:
-		return SPEAKERS_4POINT0;
-	case 5:
-		return SPEAKERS_4POINT1;
-	case 6:
-		return SPEAKERS_5POINT1;
-	case 8:
-		return SPEAKERS_7POINT1;
-	default:
-		return SPEAKERS_UNKNOWN;
-	}
-}
-
+// Get the current timestamp in milliseconds since epoch
 inline uint64_t now_ms()
 {
 	return std::chrono::duration_cast<std::chrono::milliseconds>(
@@ -41,4 +22,7 @@ inline uint64_t now_ms()
 		.count();
 }

+// Split a string into words based on spaces
+std::vector<std::string> split_words(const std::string &str_copy);
+
 #endif // TRANSCRIPTION_UTILS_H
diff --git a/src/utils.cpp b/src/utils.cpp
deleted file mode 100644
index 6639ae7..0000000
--- a/src/utils.cpp
+++ /dev/null
@@ -1,21 +0,0 @@
-#include "utils.h"
-
-std::vector<std::string> split_words(const std::string &str_copy)
-{
-	std::vector<std::string> words;
-	std::string word;
-	for (char c : str_copy) {
-		if (std::isspace(c)) {
-			if (!word.empty()) {
-				words.push_back(word);
-				word.clear();
-			}
-		} else {
-			word += c;
-		}
-	}
-	if (!word.empty()) {
-		words.push_back(word);
-	}
-	return words;
-}
diff --git a/src/utils.h b/src/utils.h
deleted file mode 100644
index 9348417..0000000
--- a/src/utils.h
+++ /dev/null
@@ -1,9 +0,0 @@
-#ifndef UTILS_H
-#define UTILS_H
-
-#include
-#include
-
-std::vector<std::string> split_words(const std::string &str_copy);
-
-#endif // UTILS_H
diff --git a/src/whisper-utils/token-buffer-thread.cpp b/src/whisper-utils/token-buffer-thread.cpp
index a6b1110..aa4db2e 100644
--- a/src/whisper-utils/token-buffer-thread.cpp
+++ b/src/whisper-utils/token-buffer-thread.cpp
@@ -23,7 +23,9 @@ TokenBufferThread::~TokenBufferThread()
 		stop = true;
 	}
 	condVar.notify_all();
-	workerThread.join();
+	if (workerThread.joinable()) {
+		workerThread.join();
+	}
 }

 void TokenBufferThread::initialize(struct transcription_filter_data *gf_,
@@ -38,10 +40,20 @@ void TokenBufferThread::initialize(struct transcription_filter_data *gf_,
 	this->numPerSentence = numPerSentence_;
 	this->segmentation = segmentation_;
 	this->maxTime = maxTime_;
-	this->initialized = true;
+	this->stop = false;
 	this->workerThread = std::thread(&TokenBufferThread::monitor, this);
 }

+void TokenBufferThread::stopThread()
+{
+	std::lock_guard<std::mutex> lock(queueMutex);
+	stop = true;
+	condVar.notify_all();
+	if (workerThread.joinable()) {
+		workerThread.join();
+	}
+}
+
 void TokenBufferThread::log_token_vector(const std::vector<std::string> &tokens)
 {
 	std::string output;
@@ -81,21 +93,22 @@ void TokenBufferThread::addSentence(const std::string &sentence)
 	}
 }

+void TokenBufferThread::clear()
+{
+	obs_log(LOG_INFO, "TokenBufferThread::clear");
+	std::lock_guard<std::mutex> lock(queueMutex);
+	inputQueue.clear();
+	presentationQueue.clear();
+	this->callback("");
+}
+
 void TokenBufferThread::monitor()
 {
 	obs_log(LOG_INFO, "TokenBufferThread::monitor");
 	this->callback("");

-	while (this->initialized && !this->stop) {
-		if (this->stop) {
-			break;
-		}
-
-		if (this->gf->whisper_context == nullptr) {
-			continue;
-		}
-
+	while (!this->stop) {
 		// condition presentation queue
 		if (presentationQueue.size() == this->numSentences * this->numPerSentence) {
 			// pop a whole sentence from the presentation queue front
diff --git a/src/whisper-utils/token-buffer-thread.h b/src/whisper-utils/token-buffer-thread.h
index 223d5b0..ce02491 100644
--- a/src/whisper-utils/token-buffer-thread.h
+++ b/src/whisper-utils/token-buffer-thread.h
@@ -36,6 +36,10 @@ public:
 			TokenBufferSegmentation segmentation_ = SEGMENTATION_TOKEN);

 	void addSentence(const std::string &sentence);
+	void clear();
+	void stopThread();
+
+	bool isEnabled() const { return !stop; }

 private:
 	void monitor();
@@ -48,8 +52,7 @@ private:
 	std::condition_variable condVar;
 	std::function<void(const std::string &)> callback;
 	std::chrono::seconds maxTime;
-	bool stop;
-	bool initialized = false;
+	bool stop = true;
 	bool newDataAvailable = false;
 	size_t numSentences;
 	size_t numPerSentence;
diff --git a/src/whisper-utils/whisper-model-utils.cpp b/src/whisper-utils/whisper-model-utils.cpp
index 35213d3..c9620c8 100644
--- a/src/whisper-utils/whisper-model-utils.cpp
+++ b/src/whisper-utils/whisper-model-utils.cpp
@@ -102,9 +102,5 @@ void update_whisper_model(struct transcription_filter_data *gf, obs_data_t *s)
 		gf->enable_token_ts_dtw = obs_data_get_bool(s, "dtw_token_timestamps");
 		shutdown_whisper_thread(gf);
 		start_whisper_thread_with_path(gf, gf->whisper_model_path, silero_vad_model_file);
-	} else {
-		// dtw_token_timestamps did not change
-		obs_log(gf->log_level, "dtw_token_timestamps did not change: %d == %d",
-			gf->enable_token_ts_dtw, new_dtw_timestamps);
 	}
 }
diff --git a/src/whisper-utils/whisper-processing.cpp b/src/whisper-utils/whisper-processing.cpp
index a7a42e7..c3d06ae 100644
--- a/src/whisper-utils/whisper-processing.cpp
+++ b/src/whisper-utils/whisper-processing.cpp
@@ -283,18 +283,22 @@ void run_inference_and_callbacks(transcription_filter_data *gf, uint64_t start_o
 				 uint64_t end_offset_ms, int vad_state)
 {
 	// get the data from the entire whisper buffer
+	// add 10ms of silence (WHISPER_SAMPLE_RATE / 100 samples) to the beginning and end of the buffer
 	const size_t pcm32f_size = gf->whisper_buffer.size / sizeof(float);
+	const size_t pcm32f_size_with_silence = pcm32f_size + 2 * WHISPER_SAMPLE_RATE / 100;
 	// allocate a new buffer and copy the data to it
-	float *pcm32f_data = (float *)bzalloc(pcm32f_size * sizeof(float));
-	circlebuf_pop_back(&gf->whisper_buffer, pcm32f_data, pcm32f_size * sizeof(float));
+	float *pcm32f_data = (float *)bzalloc(pcm32f_size_with_silence * sizeof(float));
+	circlebuf_pop_back(&gf->whisper_buffer, pcm32f_data + WHISPER_SAMPLE_RATE / 100,
+			   pcm32f_size * sizeof(float));

-	struct DetectionResultWithText inference_result =
-		run_whisper_inference(gf, pcm32f_data, pcm32f_size, start_offset_ms, end_offset_ms);
+	struct DetectionResultWithText inference_result = run_whisper_inference(
+		gf, pcm32f_data, pcm32f_size_with_silence, start_offset_ms, end_offset_ms);
 	// output inference result to a text source
 	set_text_callback(gf, inference_result);

 	if (gf->enable_audio_chunks_callback) {
-		audio_chunk_callback(gf, pcm32f_data, pcm32f_size, vad_state, inference_result);
+		audio_chunk_callback(gf, pcm32f_data, pcm32f_size_with_silence, vad_state,
+				     inference_result);
 	}

 	// free the buffer