diff --git a/.github/actions/package-plugin/action.yaml b/.github/actions/package-plugin/action.yaml index b3af301..5d31e66 100644 --- a/.github/actions/package-plugin/action.yaml +++ b/.github/actions/package-plugin/action.yaml @@ -24,6 +24,10 @@ inputs: description: 'Developer ID for installer package codesigning (macOS only)' required: false default: '' + codesignTeam: + description: 'Developer team for codesigning (macOS only)' + required: false + default: '' codesignUser: description: 'Apple ID username for notarization (macOS only)' required: false @@ -50,6 +54,7 @@ runs: env: CODESIGN_IDENT: ${{ inputs.codesignIdent }} CODESIGN_IDENT_INSTALLER: ${{ inputs.installerIdent }} + CODESIGN_TEAM: ${{ inputs.codesignTeam }} CODESIGN_IDENT_USER: ${{ inputs.codesignUser }} CODESIGN_IDENT_PASS: ${{ inputs.codesignPass }} run: | diff --git a/.github/workflows/build-project.yaml b/.github/workflows/build-project.yaml index 9f85f2f..556ec4f 100644 --- a/.github/workflows/build-project.yaml +++ b/.github/workflows/build-project.yaml @@ -129,6 +129,7 @@ jobs: codesign: ${{ fromJSON(needs.check-event.outputs.codesign) && fromJSON(steps.codesign.outputs.haveCodesignIdent) }} codesignIdent: ${{ steps.codesign.outputs.codesignIdent }} installerIdent: ${{ steps.codesign.outputs.installerIdent }} + codesignTeam: ${{ steps.codesign.outputs.codesignTeam }} notarize: ${{ fromJSON(needs.check-event.outputs.notarize) && fromJSON(steps.codesign.outputs.haveNotarizationUser) }} codesignUser: ${{ secrets.MACOS_NOTARIZATION_USERNAME }} codesignPass: ${{ secrets.MACOS_NOTARIZATION_PASSWORD }} diff --git a/.gitignore b/.gitignore index 10d3b1f..40a8b2d 100644 --- a/.gitignore +++ b/.gitignore @@ -15,6 +15,7 @@ !CMakePresets.json !LICENSE !README.md +!/vendor # Exclude lock files *.lock.json diff --git a/.gitmodules b/.gitmodules new file mode 100644 index 0000000..3c6156a --- /dev/null +++ b/.gitmodules @@ -0,0 +1,3 @@ +[submodule "vendor/curl"] + path = vendor/curl + url = 
https://github.com/curl/curl.git diff --git a/CMakeLists.txt b/CMakeLists.txt index b4fde9c..88f2875 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -34,6 +34,23 @@ if(ENABLE_QT) AUTORCC ON) endif() -target_sources(${CMAKE_PROJECT_NAME} PRIVATE src/plugin-main.c) +set(USE_SYSTEM_CURL + OFF + CACHE STRING "Use system cURL") + +if(USE_SYSTEM_CURL) + find_package(CURL REQUIRED) + target_link_libraries(${CMAKE_PROJECT_NAME} PRIVATE "${CURL_LIBRARIES}") + target_include_directories(${CMAKE_PROJECT_NAME} SYSTEM PUBLIC "${CURL_INCLUDE_DIRS}") +else() + include(cmake/BuildMyCurl.cmake) + target_link_libraries(${CMAKE_PROJECT_NAME} PRIVATE libcurl) +endif() + +include(cmake/BuildWhispercpp.cmake) +target_link_libraries(${CMAKE_PROJECT_NAME} PRIVATE Whispercpp) + +target_sources(${CMAKE_PROJECT_NAME} PRIVATE src/plugin-main.c src/transcription-filter.cpp src/transcription-filter.c + src/whisper-processing.cpp) set_target_properties_plugin(${CMAKE_PROJECT_NAME} PROPERTIES OUTPUT_NAME ${_name}) diff --git a/buildspec.json b/buildspec.json index 8a10e3f..b3af2b7 100644 --- a/buildspec.json +++ b/buildspec.json @@ -41,14 +41,14 @@ }, "platformConfig": { "macos": { - "bundleId": "com.example.obs-plugintemplate" + "bundleId": "com.royshilkrot.obs-localvocal" } }, - "name": "obs-plugintemplate", - "version": "1.0.0", - "author": "Your Name Here", - "website": "https://example.com", - "email": "me@example.com", + "name": "obs-localvocal", + "version": "0.0.1", + "author": "Roy Shilkrot", + "website": "https://github.com/royshil/obs-localvocal", + "email": "roy.shil@gmail.com", "uuids": { "macosPackage": "00000000-0000-0000-0000-000000000000", "macosInstaller": "00000000-0000-0000-0000-000000000000", diff --git a/cmake/BuildMyCurl.cmake b/cmake/BuildMyCurl.cmake new file mode 100644 index 0000000..26b26bd --- /dev/null +++ b/cmake/BuildMyCurl.cmake @@ -0,0 +1,29 @@ +set(LIBCURL_SOURCE_DIR ${CMAKE_SOURCE_DIR}/vendor/curl) + +find_package(Git QUIET) +execute_process( + COMMAND 
${GIT_EXECUTABLE} checkout curl-8_2_0 + WORKING_DIRECTORY ${LIBCURL_SOURCE_DIR} + RESULT_VARIABLE GIT_SUBMOD_RESULT) + +if(OS_MACOS) + set(CURL_USE_OPENSSL OFF) + set(CURL_USE_SECTRANSP ON) +elseif(OS_WINDOWS) + set(CURL_USE_OPENSSL OFF) + set(CURL_USE_SCHANNEL ON) +elseif(OS_LINUX) + add_compile_options(-fPIC) + set(CURL_USE_OPENSSL ON) +endif() +set(BUILD_CURL_EXE OFF) +set(BUILD_SHARED_LIBS OFF) +set(HTTP_ONLY OFF) +set(CURL_USE_LIBSSH2 OFF) +add_subdirectory(${LIBCURL_SOURCE_DIR} EXCLUDE_FROM_ALL) +if(OS_MACOS) + target_compile_options( + libcurl PRIVATE -Wno-error=ambiguous-macro -Wno-error=deprecated-declarations -Wno-error=unreachable-code + -Wno-error=unused-parameter -Wno-error=unused-variable) +endif() +include_directories(SYSTEM ${LIBCURL_SOURCE_DIR}/include) diff --git a/cmake/BuildWhispercpp.cmake b/cmake/BuildWhispercpp.cmake new file mode 100644 index 0000000..8cce7f1 --- /dev/null +++ b/cmake/BuildWhispercpp.cmake @@ -0,0 +1,51 @@ +include(ExternalProject) + +string(REPLACE ";" "$" CMAKE_OSX_ARCHITECTURES_ "${CMAKE_OSX_ARCHITECTURES}") + +if(${CMAKE_BUILD_TYPE} STREQUAL Release OR ${CMAKE_BUILD_TYPE} STREQUAL RelWithDebInfo) + set(Whispercpp_BUILD_TYPE Release) +else() + set(Whispercpp_BUILD_TYPE Debug) +endif() + +# On linux add the `-fPIC` flag to the compiler +if(UNIX AND NOT APPLE) + set(WHISPER_EXTRA_CXX_FLAGS "-fPIC") +endif() + +ExternalProject_Add( + Whispercpp_Build + DOWNLOAD_EXTRACT_TIMESTAMP true + GIT_REPOSITORY https://github.com/ggerganov/whisper.cpp.git + GIT_TAG 7b374c9ac9b9861bb737eec060e4dfa29d229259 + BUILD_COMMAND ${CMAKE_COMMAND} --build --config ${Whispercpp_BUILD_TYPE} + BUILD_BYPRODUCTS /lib/static/${CMAKE_STATIC_LIBRARY_PREFIX}whisper${CMAKE_STATIC_LIBRARY_SUFFIX} + CMAKE_GENERATOR ${CMAKE_GENERATOR} + INSTALL_COMMAND ${CMAKE_COMMAND} --install --config ${Whispercpp_BUILD_TYPE} + CMAKE_ARGS -DCMAKE_INSTALL_PREFIX= + -DCMAKE_BUILD_TYPE=${Whispercpp_BUILD_TYPE} + -DCMAKE_GENERATOR_PLATFORM=${CMAKE_GENERATOR_PLATFORM} + 
-DCMAKE_OSX_DEPLOYMENT_TARGET=10.13 + -DCMAKE_OSX_ARCHITECTURES=${CMAKE_OSX_ARCHITECTURES_} + -DCMAKE_CXX_FLAGS=${WHISPER_EXTRA_CXX_FLAGS} + -DCMAKE_C_FLAGS=${WHISPER_EXTRA_CXX_FLAGS} + -DBUILD_SHARED_LIBS=OFF + -DWHISPER_BUILD_TESTS=OFF + -DWHISPER_BUILD_EXAMPLES=OFF + -DWHISPER_OPENBLAS=ON) + +ExternalProject_Get_Property(Whispercpp_Build INSTALL_DIR) + +add_library(Whispercpp::Whisper STATIC IMPORTED) +set_target_properties( + Whispercpp::Whisper + PROPERTIES IMPORTED_LOCATION + ${INSTALL_DIR}/lib/static/${CMAKE_STATIC_LIBRARY_PREFIX}whisper${CMAKE_STATIC_LIBRARY_SUFFIX}) + +add_library(Whispercpp INTERFACE) +add_dependencies(Whispercpp Whispercpp_Build) +target_link_libraries(Whispercpp INTERFACE Whispercpp::Whisper) +set_target_properties(Whispercpp::Whisper PROPERTIES INTERFACE_INCLUDE_DIRECTORIES ${INSTALL_DIR}/include) +if(APPLE) + target_link_libraries(Whispercpp INTERFACE "-framework Accelerate") +endif(APPLE) diff --git a/data/locale/en-US.ini b/data/locale/en-US.ini index e69de29..04f4f44 100644 --- a/data/locale/en-US.ini +++ b/data/locale/en-US.ini @@ -0,0 +1 @@ +transcription_filterAudioFilter=LocalVocal Transcription diff --git a/data/models/ggml-tiny.en.bin b/data/models/ggml-tiny.en.bin new file mode 100644 index 0000000..9dd1a6b Binary files /dev/null and b/data/models/ggml-tiny.en.bin differ diff --git a/src/plugin-main.c b/src/plugin-main.c index bd4976c..c2f8445 100644 --- a/src/plugin-main.c +++ b/src/plugin-main.c @@ -22,10 +22,18 @@ with this program. 
If not, see OBS_DECLARE_MODULE() OBS_MODULE_USE_DEFAULT_LOCALE(PLUGIN_NAME, "en-US") +MODULE_EXPORT const char *obs_module_description(void) +{ + return obs_module_text("LocalVocalPlugin"); +} + +extern struct obs_source_info transcription_filter_info; + bool obs_module_load(void) { - obs_log(LOG_INFO, "plugin loaded successfully (version %s)", - PLUGIN_VERSION); + obs_register_source(&transcription_filter_info); + blog(LOG_INFO, "plugin loaded successfully (version %s)", + PLUGIN_VERSION); return true; } diff --git a/src/plugin-support.c.in b/src/plugin-support.c.in index acf56ee..f78a593 100644 --- a/src/plugin-support.c.in +++ b/src/plugin-support.c.in @@ -21,6 +21,8 @@ with this program. If not, see const char *PLUGIN_NAME = "@CMAKE_PROJECT_NAME@"; const char *PLUGIN_VERSION = "@CMAKE_PROJECT_VERSION@"; +extern void blogva(int log_level, const char *format, va_list args); + void obs_log(int log_level, const char *format, ...) { size_t length = 4 + strlen(PLUGIN_NAME) + strlen(format); diff --git a/src/plugin-support.h b/src/plugin-support.h index 8ffb57c..6959fcf 100644 --- a/src/plugin-support.h +++ b/src/plugin-support.h @@ -31,7 +31,6 @@ extern const char *PLUGIN_NAME; extern const char *PLUGIN_VERSION; void obs_log(int log_level, const char *format, ...); -extern void blogva(int log_level, const char *format, va_list args); #ifdef __cplusplus } diff --git a/src/transcription-filter-data.h b/src/transcription-filter-data.h new file mode 100644 index 0000000..fa302ac --- /dev/null +++ b/src/transcription-filter-data.h @@ -0,0 +1,73 @@ +#ifndef TRANSCRIPTION_FILTER_DATA_H +#define TRANSCRIPTION_FILTER_DATA_H + +#include +#include +#include +#include + +#include + +#include +#include + +#define MAX_PREPROC_CHANNELS 2 + +// buffer size in msec +#define BUFFER_SIZE_MSEC 1010 +// at 16Khz, 1010 msec is 16160 frames +#define WHISPER_FRAME_SIZE 16160 +// overlap in msec +#define OVERLAP_SIZE_MSEC 340 + +#define MT_ obs_module_text + +struct transcription_filter_data 
{ + obs_source_t *context; // obs input source + size_t channels; // number of channels + uint32_t sample_rate; // input sample rate + // How many input frames (in input sample rate) are needed for the next whisper frame + size_t frames; + // How many ms/frames are needed to overlap with the next whisper frame + size_t overlap_frames; + size_t overlap_ms; + // How many frames were processed in the last whisper frame (this is dynamic) + size_t last_num_frames; + + /* PCM buffers */ + float *copy_buffers[MAX_PREPROC_CHANNELS]; + DARRAY(float) copy_output_buffers[MAX_PREPROC_CHANNELS]; + struct circlebuf info_buffer; + struct circlebuf input_buffers[MAX_PREPROC_CHANNELS]; + + /* Resampler */ + audio_resampler_t *resampler; + + /* whisper */ + std::string whisper_model_path = "models/ggml-tiny.en.bin"; + struct whisper_context *whisper_context; + whisper_full_params whisper_params; + + float filler_p_threshold; + + bool do_silence; + bool vad_enabled; + int log_level; + bool log_words; + bool active; + + // Use std for thread and mutex + std::thread whisper_thread; + + std::unique_ptr whisper_buf_mutex; + std::unique_ptr whisper_ctx_mutex; + std::unique_ptr wshiper_thread_cv; +}; + +// Audio packet info +struct transcription_filter_audio_info { + uint32_t frames; + uint64_t timestamp; +}; + +#endif /* TRANSCRIPTION_FILTER_DATA_H */ diff --git a/src/transcription-filter.c b/src/transcription-filter.c new file mode 100644 index 0000000..505a106 --- /dev/null +++ b/src/transcription-filter.c @@ -0,0 +1,16 @@ +#include "transcription-filter.h" + +struct obs_source_info transcription_filter_info = { + .id = "transcription_filter_audio_filter", + .type = OBS_SOURCE_TYPE_FILTER, + .output_flags = OBS_SOURCE_AUDIO, + .get_name = transcription_filter_name, + .create = transcription_filter_create, + .destroy = transcription_filter_destroy, + .get_defaults = transcription_filter_defaults, + .get_properties = transcription_filter_properties, + .update = 
transcription_filter_update, + .activate = transcription_filter_activate, + .deactivate = transcription_filter_deactivate, + .filter_audio = transcription_filter_filter_audio, +}; diff --git a/src/transcription-filter.cpp b/src/transcription-filter.cpp new file mode 100644 index 0000000..a784d93 --- /dev/null +++ b/src/transcription-filter.cpp @@ -0,0 +1,468 @@ +#include <new> +#include + +#include "plugin-support.h" +#include "transcription-filter.h" +#include "transcription-filter-data.h" +#include "whisper-processing.h" +#include "whisper-language.h" + +inline enum speaker_layout convert_speaker_layout(uint8_t channels) +{ + switch (channels) { + case 0: + return SPEAKERS_UNKNOWN; + case 1: + return SPEAKERS_MONO; + case 2: + return SPEAKERS_STEREO; + case 3: + return SPEAKERS_2POINT1; + case 4: + return SPEAKERS_4POINT0; + case 5: + return SPEAKERS_4POINT1; + case 6: + return SPEAKERS_5POINT1; + case 8: + return SPEAKERS_7POINT1; + default: + return SPEAKERS_UNKNOWN; + } +} + +struct obs_audio_data * +transcription_filter_filter_audio(void *data, struct obs_audio_data *audio) +{ + if (!audio) { + return nullptr; + } + if (data == nullptr) { + return audio; + } + + struct transcription_filter_data *gf = + static_cast(data); + + if (!gf->active) { + return audio; + } + + if (gf->whisper_context == nullptr) { + // Whisper not initialized, just pass through + return audio; + } + + { + std::lock_guard lock( + *gf->whisper_buf_mutex); // scoped lock + obs_log(gf->log_level, + "pushing %lu frames to input buffer. 
current size: %lu (bytes)", + (size_t)(audio->frames), gf->input_buffers[0].size); + // push back current audio data to input circlebuf + for (size_t c = 0; c < gf->channels; c++) { + circlebuf_push_back(&gf->input_buffers[c], + audio->data[c], + audio->frames * sizeof(float)); + } + // push audio packet info (timestamp/frame count) to info circlebuf + struct transcription_filter_audio_info info = {0}; + info.frames = audio->frames; // number of frames in this packet + info.timestamp = audio->timestamp; // timestamp of this packet + circlebuf_push_back(&gf->info_buffer, &info, sizeof(info)); + } + + return audio; +} + +const char *transcription_filter_name(void *unused) +{ + UNUSED_PARAMETER(unused); + return MT_("transcription_filterAudioFilter"); +} + +// Tears down a filter instance: frees the whisper context, joins the worker +// thread, destroys the resampler, releases the PCM buffers and finally +// destroys and frees the filter data itself. +void transcription_filter_destroy(void *data) +{ + struct transcription_filter_data *gf = + static_cast(data); + + obs_log(LOG_INFO, "transcription_filter_destroy"); + { + std::lock_guard lock(*gf->whisper_ctx_mutex); + if (gf->whisper_context != nullptr) { + whisper_free(gf->whisper_context); + gf->whisper_context = nullptr; + } + } + + // join the thread + if (gf->whisper_thread.joinable()) { + gf->whisper_thread.join(); + } + + if (gf->resampler) { + audio_resampler_destroy(gf->resampler); + } + + { + std::lock_guard lockbuf(*gf->whisper_buf_mutex); + bfree(gf->copy_buffers[0]); + gf->copy_buffers[0] = nullptr; + for (size_t i = 0; i < gf->channels; i++) { + circlebuf_free(&gf->input_buffers[i]); + } + } + circlebuf_free(&gf->info_buffer); + + // bfree() only returns the raw storage: run the destructor first so the + // std::string, std::thread and std::unique_ptr members release what they own + gf->~transcription_filter_data(); + bfree(gf); +} + +void transcription_filter_update(void *data, obs_data_t *s) +{ + struct transcription_filter_data *gf = + static_cast(data); + + gf->filler_p_threshold = + (float)obs_data_get_double(s, "filler_p_threshold"); + gf->log_level = (int)obs_data_get_int(s, "log_level"); + gf->do_silence = obs_data_get_bool(s, "do_silence"); + gf->vad_enabled = obs_data_get_bool(s, "vad_enabled"); + gf->log_words = obs_data_get_bool(s, "log_words"); + + const char 
*new_model_path = + obs_data_get_string(s, "whisper_model_path"); + if (strcmp(new_model_path, gf->whisper_model_path.c_str()) != 0) { + // model path changed, reload the model + obs_log(LOG_INFO, "model path changed, reloading model"); + if (gf->whisper_context != nullptr) { + // acquire the mutex before freeing the context + std::lock_guard lock( + *gf->whisper_ctx_mutex); + whisper_free(gf->whisper_context); + gf->whisper_context = nullptr; + } + if (gf->whisper_thread.joinable()) { + gf->whisper_thread.join(); + } + // std::string copies the characters itself; wrapping the value in + // bstrdup() would allocate a duplicate that is never bfree()'d + gf->whisper_model_path = new_model_path; + + // check if the model exists, if not, download it + // if (!check_if_model_exists(gf->whisper_model_path)) { + // obs_log(LOG_ERROR, "Whisper model does not exist"); + // download_model_with_ui_dialog( + // gf->whisper_model_path, [gf](int download_status) { + // if (download_status == 0) { + // obs_log(LOG_INFO, "Model download complete"); + // gf->whisper_context = init_whisper_context( + // gf->whisper_model_path); + // gf->whisper_thread = std::thread(whisper_loop, gf); + // } else { + // obs_log(LOG_ERROR, "Model download failed"); + // } + // }); + // } else { + // Model exists, just load it + gf->whisper_context = + init_whisper_context(gf->whisper_model_path); + gf->whisper_thread = std::thread(whisper_loop, gf); + // } + } + + std::lock_guard lock(*gf->whisper_ctx_mutex); + + gf->whisper_params = whisper_full_default_params( + (whisper_sampling_strategy)obs_data_get_int( + s, "whisper_sampling_method")); + gf->whisper_params.duration_ms = BUFFER_SIZE_MSEC; + gf->whisper_params.language = + obs_data_get_string(s, "whisper_language_select"); + gf->whisper_params.translate = false; + gf->whisper_params.initial_prompt = + obs_data_get_string(s, "initial_prompt"); + gf->whisper_params.n_threads = (int)obs_data_get_int(s, "n_threads"); + gf->whisper_params.n_max_text_ctx = + (int)obs_data_get_int(s, "n_max_text_ctx"); + gf->whisper_params.no_context = obs_data_get_bool(s, "no_context"); + 
gf->whisper_params.single_segment = + obs_data_get_bool(s, "single_segment"); + gf->whisper_params.print_special = + obs_data_get_bool(s, "print_special"); + gf->whisper_params.print_progress = + obs_data_get_bool(s, "print_progress"); + gf->whisper_params.print_realtime = + obs_data_get_bool(s, "print_realtime"); + gf->whisper_params.print_timestamps = + obs_data_get_bool(s, "print_timestamps"); + gf->whisper_params.token_timestamps = + obs_data_get_bool(s, "token_timestamps"); + gf->whisper_params.thold_pt = (float)obs_data_get_double(s, "thold_pt"); + gf->whisper_params.thold_ptsum = + (float)obs_data_get_double(s, "thold_ptsum"); + gf->whisper_params.max_len = (int)obs_data_get_int(s, "max_len"); + gf->whisper_params.split_on_word = + obs_data_get_bool(s, "split_on_word"); + gf->whisper_params.max_tokens = (int)obs_data_get_int(s, "max_tokens"); + gf->whisper_params.speed_up = obs_data_get_bool(s, "speed_up"); + gf->whisper_params.suppress_blank = + obs_data_get_bool(s, "suppress_blank"); + gf->whisper_params.suppress_non_speech_tokens = + obs_data_get_bool(s, "suppress_non_speech_tokens"); + gf->whisper_params.temperature = + (float)obs_data_get_double(s, "temperature"); + gf->whisper_params.max_initial_ts = + (float)obs_data_get_double(s, "max_initial_ts"); + gf->whisper_params.length_penalty = + (float)obs_data_get_double(s, "length_penalty"); +} + +// Creates a filter instance: allocates the filter data, reads the channel +// count and sample rate from obs_get_audio(), sets up the PCM buffers, loads +// the whisper model, creates the resampler and starts the whisper thread. +// Returns the filter data, or nullptr when the whisper model fails to load. +void *transcription_filter_create(obs_data_t *settings, obs_source_t *filter) +{ + // transcription_filter_data has non-trivial C++ members (std::string, + // std::thread, std::unique_ptr): construct it in the bmalloc'ed storage + // so those members are valid objects before anything is assigned to them + void *mem = bmalloc(sizeof(struct transcription_filter_data)); + struct transcription_filter_data *gf = + new (mem) transcription_filter_data(); + + // Get the number of channels for the input source + gf->channels = audio_output_get_channels(obs_get_audio()); + gf->sample_rate = audio_output_get_sample_rate(obs_get_audio()); + gf->frames = (size_t)((float)gf->sample_rate / + (1000.0f / (float)BUFFER_SIZE_MSEC)); + gf->last_num_frames = 0; + + // input_buffers is declared with MAX_PREPROC_CHANNELS entries; iterating + // up to MAX_AUDIO_CHANNELS would write past the end of the array + for (size_t i = 0; i < MAX_PREPROC_CHANNELS; i++) { + circlebuf_init(&gf->input_buffers[i]); + } + 
circlebuf_init(&gf->info_buffer); + + // allocate copy buffers + gf->copy_buffers[0] = static_cast( + bzalloc(gf->channels * gf->frames * sizeof(float))); + for (size_t c = 1; c < gf->channels; c++) { // set the channel pointers + gf->copy_buffers[c] = gf->copy_buffers[0] + c * gf->frames; + } + + gf->context = filter; + gf->whisper_model_path = + obs_data_get_string(settings, "whisper_model_path"); + gf->whisper_context = init_whisper_context(gf->whisper_model_path); + if (gf->whisper_context == nullptr) { + obs_log(LOG_ERROR, "Failed to load whisper model"); + return nullptr; + } + + gf->overlap_ms = OVERLAP_SIZE_MSEC; + gf->overlap_frames = (size_t)((float)gf->sample_rate / + (1000.0f / (float)gf->overlap_ms)); + obs_log(LOG_INFO, + "transcription_filter filter: channels %d, frames %d, sample_rate %d", + (int)gf->channels, (int)gf->frames, gf->sample_rate); + + struct resample_info src, dst; + src.samples_per_sec = gf->sample_rate; + src.format = AUDIO_FORMAT_FLOAT_PLANAR; + src.speakers = convert_speaker_layout((uint8_t)gf->channels); + + dst.samples_per_sec = WHISPER_SAMPLE_RATE; + dst.format = AUDIO_FORMAT_FLOAT_PLANAR; + dst.speakers = convert_speaker_layout((uint8_t)1); + + gf->resampler = audio_resampler_create(&dst, &src); + + gf->active = true; + + gf->whisper_buf_mutex = std::unique_ptr(new std::mutex()); + gf->whisper_ctx_mutex = std::unique_ptr(new std::mutex()); + gf->wshiper_thread_cv = + std::unique_ptr(new std::condition_variable()); + + // get the settings updated on the filter data struct + transcription_filter_update(gf, settings); + + // start the thread + gf->whisper_thread = std::thread(whisper_loop, gf); + + return gf; +} + +void transcription_filter_activate(void *data) +{ + struct transcription_filter_data *gf = + static_cast(data); + obs_log(LOG_INFO, "transcription_filter filter activated"); + gf->active = true; +} + +void transcription_filter_deactivate(void *data) +{ + struct transcription_filter_data *gf = + static_cast(data); + 
obs_log(LOG_INFO, "transcription_filter filter deactivated"); + gf->active = false; +} + +void transcription_filter_defaults(obs_data_t *s) +{ + obs_data_set_default_double(s, "filler_p_threshold", 0.75); + obs_data_set_default_bool(s, "do_silence", true); + obs_data_set_default_bool(s, "vad_enabled", true); + obs_data_set_default_int(s, "log_level", LOG_DEBUG); + obs_data_set_default_bool(s, "log_words", true); + obs_data_set_default_string(s, "whisper_model_path", + "models/ggml-tiny.en.bin"); + obs_data_set_default_string(s, "whisper_language_select", "en"); + + // Whisper parameters + obs_data_set_default_int(s, "whisper_sampling_method", + WHISPER_SAMPLING_BEAM_SEARCH); + obs_data_set_default_string(s, "initial_prompt", ""); + obs_data_set_default_int(s, "n_threads", 4); + obs_data_set_default_int(s, "n_max_text_ctx", 16384); + obs_data_set_default_bool(s, "no_context", true); + obs_data_set_default_bool(s, "single_segment", true); + obs_data_set_default_bool(s, "print_special", false); + obs_data_set_default_bool(s, "print_progress", false); + obs_data_set_default_bool(s, "print_realtime", false); + obs_data_set_default_bool(s, "print_timestamps", false); + obs_data_set_default_bool(s, "token_timestamps", false); + obs_data_set_default_double(s, "thold_pt", 0.01); + obs_data_set_default_double(s, "thold_ptsum", 0.01); + obs_data_set_default_int(s, "max_len", 0); + obs_data_set_default_bool(s, "split_on_word", false); + obs_data_set_default_int(s, "max_tokens", 3); + obs_data_set_default_bool(s, "speed_up", false); + obs_data_set_default_bool(s, "suppress_blank", false); + obs_data_set_default_bool(s, "suppress_non_speech_tokens", true); + obs_data_set_default_double(s, "temperature", 0.5); + obs_data_set_default_double(s, "max_initial_ts", 1.0); + obs_data_set_default_double(s, "length_penalty", -1.0); +} + +obs_properties_t *transcription_filter_properties(void *data) +{ + obs_properties_t *ppts = obs_properties_create(); + + 
obs_properties_add_float_slider(ppts, "filler_p_threshold", + "filler_p_threshold", 0.0f, 1.0f, + 0.05f); + obs_properties_add_bool(ppts, "do_silence", "do_silence"); + obs_properties_add_bool(ppts, "vad_enabled", "vad_enabled"); + obs_property_t *list = obs_properties_add_list(ppts, "log_level", + "log_level", + OBS_COMBO_TYPE_LIST, + OBS_COMBO_FORMAT_INT); + obs_property_list_add_int(list, "DEBUG", LOG_DEBUG); + obs_property_list_add_int(list, "INFO", LOG_INFO); + obs_property_list_add_int(list, "WARNING", LOG_WARNING); + obs_properties_add_bool(ppts, "log_words", "log_words"); + + // Add a list of available whisper models to download + obs_property_t *whisper_models_list = obs_properties_add_list( + ppts, "whisper_model_path", "Whisper Model", + OBS_COMBO_TYPE_LIST, OBS_COMBO_FORMAT_STRING); + + obs_property_list_add_string(whisper_models_list, "Tiny (Eng) 75Mb", + "models/ggml-tiny.en.bin"); + obs_property_list_add_string(whisper_models_list, "Tiny 75Mb", + "models/ggml-tiny.bin"); + obs_property_list_add_string(whisper_models_list, "Base (Eng) 142Mb", + "models/ggml-base.en.bin"); + obs_property_list_add_string(whisper_models_list, "Base 142Mb", + "models/ggml-base.bin"); + obs_property_list_add_string(whisper_models_list, "Small (Eng) 466Mb", + "models/ggml-small.en.bin"); + obs_property_list_add_string(whisper_models_list, "Small 466Mb", + "models/ggml-small.bin"); + + obs_properties_t *whisper_params_group = obs_properties_create(); + obs_properties_add_group(ppts, "whisper_params_group", + "Whisper Parameters", OBS_GROUP_NORMAL, + whisper_params_group); + + // Add language selector + obs_property_t *whisper_language_select_list = obs_properties_add_list( + whisper_params_group, "whisper_language_select", "Language", + OBS_COMBO_TYPE_LIST, OBS_COMBO_FORMAT_STRING); + // iterate over all available languages in whisper_available_lang map + for (auto const &pair : whisper_available_lang) { + obs_property_list_add_string(whisper_language_select_list, + 
pair.second.c_str(), + pair.first.c_str()); + } + + obs_property_t *whisper_sampling_method_list = obs_properties_add_list( + whisper_params_group, "whisper_sampling_method", + "whisper_sampling_method", OBS_COMBO_TYPE_LIST, + OBS_COMBO_FORMAT_INT); + obs_property_list_add_int(whisper_sampling_method_list, "Beam search", + WHISPER_SAMPLING_BEAM_SEARCH); + obs_property_list_add_int(whisper_sampling_method_list, "Greedy", + WHISPER_SAMPLING_GREEDY); + + // int n_threads; + obs_properties_add_int_slider(whisper_params_group, "n_threads", + "n_threads", 1, 8, 1); + // int n_max_text_ctx; // max tokens to use from past text as prompt for the decoder + obs_properties_add_int_slider(whisper_params_group, "n_max_text_ctx", + "n_max_text_ctx", 0, 16384, 100); + // int offset_ms; // start offset in ms + // int duration_ms; // audio duration to process in ms + // bool translate; + // bool no_context; // do not use past transcription (if any) as initial prompt for the decoder + obs_properties_add_bool(whisper_params_group, "no_context", + "no_context"); + // bool single_segment; // force single segment output (useful for streaming) + obs_properties_add_bool(whisper_params_group, "single_segment", + "single_segment"); + // bool print_special; // print special tokens (e.g. , , , etc.) 
+ obs_properties_add_bool(whisper_params_group, "print_special", + "print_special"); + // bool print_progress; // print progress information + obs_properties_add_bool(whisper_params_group, "print_progress", + "print_progress"); + // bool print_realtime; // print results from within whisper.cpp (avoid it, use callback instead) + obs_properties_add_bool(whisper_params_group, "print_realtime", + "print_realtime"); + // bool print_timestamps; // print timestamps for each text segment when printing realtime + obs_properties_add_bool(whisper_params_group, "print_timestamps", + "print_timestamps"); + // bool token_timestamps; // enable token-level timestamps + obs_properties_add_bool(whisper_params_group, "token_timestamps", + "token_timestamps"); + // float thold_pt; // timestamp token probability threshold (~0.01) + obs_properties_add_float_slider(whisper_params_group, "thold_pt", + "thold_pt", 0.0f, 1.0f, 0.05f); + // float thold_ptsum; // timestamp token sum probability threshold (~0.01) + obs_properties_add_float_slider(whisper_params_group, "thold_ptsum", + "thold_ptsum", 0.0f, 1.0f, 0.05f); + // int max_len; // max segment length in characters + obs_properties_add_int_slider(whisper_params_group, "max_len", + "max_len", 0, 100, 1); + // bool split_on_word; // split on word rather than on token (when used with max_len) + obs_properties_add_bool(whisper_params_group, "split_on_word", + "split_on_word"); + // int max_tokens; // max tokens per segment (0 = no limit) + obs_properties_add_int_slider(whisper_params_group, "max_tokens", + "max_tokens", 0, 100, 1); + // bool speed_up; // speed-up the audio by 2x using Phase Vocoder + obs_properties_add_bool(whisper_params_group, "speed_up", "speed_up"); + // const char * initial_prompt; + obs_properties_add_text(whisper_params_group, "initial_prompt", + "initial_prompt", OBS_TEXT_DEFAULT); + // bool suppress_blank + obs_properties_add_bool(whisper_params_group, "suppress_blank", + "suppress_blank"); + // bool 
suppress_non_speech_tokens + obs_properties_add_bool(whisper_params_group, + "suppress_non_speech_tokens", + "suppress_non_speech_tokens"); + // float temperature + obs_properties_add_float_slider(whisper_params_group, "temperature", + "temperature", 0.0f, 1.0f, 0.05f); + // float max_initial_ts + obs_properties_add_float_slider(whisper_params_group, "max_initial_ts", + "max_initial_ts", 0.0f, 1.0f, 0.05f); + // float length_penalty + obs_properties_add_float_slider(whisper_params_group, "length_penalty", + "length_penalty", -1.0f, 1.0f, 0.1f); + + UNUSED_PARAMETER(data); + return ppts; +} diff --git a/src/transcription-filter.h b/src/transcription-filter.h new file mode 100644 index 0000000..a2311a5 --- /dev/null +++ b/src/transcription-filter.h @@ -0,0 +1,20 @@ +#include + +#ifdef __cplusplus +extern "C" { +#endif + +void transcription_filter_activate(void *data); +void *transcription_filter_create(obs_data_t *settings, obs_source_t *filter); +void transcription_filter_update(void *data, obs_data_t *s); +void transcription_filter_destroy(void *data); +const char *transcription_filter_name(void *unused); +struct obs_audio_data * +transcription_filter_filter_audio(void *data, struct obs_audio_data *audio); +void transcription_filter_deactivate(void *data); +void transcription_filter_defaults(obs_data_t *s); +obs_properties_t *transcription_filter_properties(void *data); + +#ifdef __cplusplus +} +#endif diff --git a/src/whisper-language.h b/src/whisper-language.h new file mode 100644 index 0000000..2d69c65 --- /dev/null +++ b/src/whisper-language.h @@ -0,0 +1,406 @@ +#ifndef WHISPER_LANGUAGE_H +#define WHISPER_LANGUAGE_H + +#include +#include + +static const std::map whisper_available_lang = { + { + "en", + "english", + }, + { + "zh", + "chinese", + }, + { + "de", + "german", + }, + { + "es", + "spanish", + }, + { + "ru", + "russian", + }, + { + "ko", + "korean", + }, + { + "fr", + "french", + }, + { + "ja", + "japanese", + }, + { + "pt", + "portuguese", + }, + { + 
"tr", + "turkish", + }, + { + "pl", + "polish", + }, + { + "ca", + "catalan", + }, + { + "nl", + "dutch", + }, + { + "ar", + "arabic", + }, + { + "sv", + "swedish", + }, + { + "it", + "italian", + }, + { + "id", + "indonesian", + }, + { + "hi", + "hindi", + }, + { + "fi", + "finnish", + }, + { + "vi", + "vietnamese", + }, + { + "he", + "hebrew", + }, + { + "uk", + "ukrainian", + }, + { + "el", + "greek", + }, + { + "ms", + "malay", + }, + { + "cs", + "czech", + }, + { + "ro", + "romanian", + }, + { + "da", + "danish", + }, + { + "hu", + "hungarian", + }, + { + "ta", + "tamil", + }, + { + "no", + "norwegian", + }, + { + "th", + "thai", + }, + { + "ur", + "urdu", + }, + { + "hr", + "croatian", + }, + { + "bg", + "bulgarian", + }, + { + "lt", + "lithuanian", + }, + { + "la", + "latin", + }, + { + "mi", + "maori", + }, + { + "ml", + "malayalam", + }, + { + "cy", + "welsh", + }, + { + "sk", + "slovak", + }, + { + "te", + "telugu", + }, + { + "fa", + "persian", + }, + { + "lv", + "latvian", + }, + { + "bn", + "bengali", + }, + { + "sr", + "serbian", + }, + { + "az", + "azerbaijani", + }, + { + "sl", + "slovenian", + }, + { + "kn", + "kannada", + }, + { + "et", + "estonian", + }, + { + "mk", + "macedonian", + }, + { + "br", + "breton", + }, + { + "eu", + "basque", + }, + { + "is", + "icelandic", + }, + { + "hy", + "armenian", + }, + { + "ne", + "nepali", + }, + { + "mn", + "mongolian", + }, + { + "bs", + "bosnian", + }, + { + "kk", + "kazakh", + }, + { + "sq", + "albanian", + }, + { + "sw", + "swahili", + }, + { + "gl", + "galician", + }, + { + "mr", + "marathi", + }, + { + "pa", + "punjabi", + }, + { + "si", + "sinhala", + }, + { + "km", + "khmer", + }, + { + "sn", + "shona", + }, + { + "yo", + "yoruba", + }, + { + "so", + "somali", + }, + { + "af", + "afrikaans", + }, + { + "oc", + "occitan", + }, + { + "ka", + "georgian", + }, + { + "be", + "belarusian", + }, + { + "tg", + "tajik", + }, + { + "sd", + "sindhi", + }, + { + "gu", + "gujarati", + }, + { + "am", + 
"amharic", + }, + { + "yi", + "yiddish", + }, + { + "lo", + "lao", + }, + { + "uz", + "uzbek", + }, + { + "fo", + "faroese", + }, + { + "ht", + "haitian", + }, + { + "ps", + "pashto", + }, + { + "tk", + "turkmen", + }, + { + "nn", + "nynorsk", + }, + { + "mt", + "maltese", + }, + { + "sa", + "sanskrit", + }, + { + "lb", + "luxembourgish", + }, + { + "my", + "myanmar", + }, + { + "bo", + "tibetan", + }, + { + "tl", + "tagalog", + }, + { + "mg", + "malagasy", + }, + { + "as", + "assamese", + }, + { + "tt", + "tatar", + }, + { + "haw", + "hawaiian", + }, + { + "ln", + "lingala", + }, + { + "ha", + "hausa", + }, + { + "ba", + "bashkir", + }, + { + "jw", + "javanese", + }, + { + "su", + "sundanese", + }, +}; + +#endif // WHISPER_LANGUAGE_H diff --git a/src/whisper-processing.cpp b/src/whisper-processing.cpp new file mode 100644 index 0000000..03c1ee4 --- /dev/null +++ b/src/whisper-processing.cpp @@ -0,0 +1,376 @@ +#include + +#include + +#include "plugin-support.h" +#include "transcription-filter-data.h" + +#define VAD_THOLD 0.0001f +#define FREQ_THOLD 100.0f + +std::string to_timestamp(int64_t t) +{ + int64_t sec = t / 100; + int64_t msec = t - sec * 100; + int64_t min = sec / 60; + sec = sec - min * 60; + + char buf[32]; + snprintf(buf, sizeof(buf), "%02d:%02d.%03d", (int)min, (int)sec, + (int)msec); + + return std::string(buf); +} + +void high_pass_filter(float *pcmf32, size_t pcm32f_size, float cutoff, + uint32_t sample_rate) +{ + const float rc = 1.0f / (2.0f * (float)M_PI * cutoff); + const float dt = 1.0f / (float)sample_rate; + const float alpha = dt / (rc + dt); + + float y = pcmf32[0]; + + for (size_t i = 1; i < pcm32f_size; i++) { + y = alpha * (y + pcmf32[i] - pcmf32[i - 1]); + pcmf32[i] = y; + } +} + +// VAD (voice activity detection), return true if speech detected +bool vad_simple(float *pcmf32, size_t pcm32f_size, uint32_t sample_rate, + float vad_thold, float freq_thold, bool verbose) +{ + const uint64_t n_samples = pcm32f_size; + + if (freq_thold > 
0.0f) { + high_pass_filter(pcmf32, pcm32f_size, freq_thold, sample_rate); + } + + float energy_all = 0.0f; + + for (uint64_t i = 0; i < n_samples; i++) { + energy_all += fabsf(pcmf32[i]); + } + + energy_all /= (float)n_samples; + + if (verbose) { + blog(LOG_INFO, + "%s: energy_all: %f, vad_thold: %f, freq_thold: %f", + __func__, energy_all, vad_thold, freq_thold); + } + + if (energy_all < vad_thold) { + return false; + } + + return true; +} + +struct whisper_context *init_whisper_context(const std::string &model_path) +{ + struct whisper_context *ctx = + whisper_init_from_file(obs_module_file(model_path.c_str())); + if (ctx == nullptr) { + obs_log(LOG_ERROR, "Failed to load whisper model"); + return nullptr; + } + return ctx; +} + +enum DetectionResult { + DETECTION_RESULT_UNKNOWN = 0, + DETECTION_RESULT_SILENCE = 1, + DETECTION_RESULT_SPEECH = 2, + DETECTION_RESULT_FILLER = 3, + DETECTION_RESULT_BEEP = 4, +}; + +int run_whisper_inference(struct transcription_filter_data *gf, + const float *pcm32f_data, size_t pcm32f_size) +{ + obs_log(gf->log_level, + "%s: processing %d samples, %.3f sec, %d threads", __func__, + int(pcm32f_size), float(pcm32f_size) / WHISPER_SAMPLE_RATE, + gf->whisper_params.n_threads); + + std::lock_guard lock(*gf->whisper_ctx_mutex); + if (gf->whisper_context == nullptr) { + obs_log(LOG_WARNING, "whisper context is null"); + return DETECTION_RESULT_UNKNOWN; + } + + // run the inference + int whisper_full_result = -1; + try { + whisper_full_result = + whisper_full(gf->whisper_context, gf->whisper_params, + pcm32f_data, (int)pcm32f_size); + } catch (const std::exception &e) { + obs_log(LOG_ERROR, + "Whisper exception: %s. 
Filter restart is required", + e.what()); + whisper_free(gf->whisper_context); + gf->whisper_context = nullptr; + return DETECTION_RESULT_UNKNOWN; + } + + if (whisper_full_result != 0) { + obs_log(LOG_WARNING, "failed to process audio, error %d", + whisper_full_result); + return DETECTION_RESULT_UNKNOWN; + } else { + const int n_segment = 0; + const char *text = whisper_full_get_segment_text( + gf->whisper_context, n_segment); + const int64_t t0 = whisper_full_get_segment_t0( + gf->whisper_context, n_segment); + const int64_t t1 = whisper_full_get_segment_t1( + gf->whisper_context, n_segment); + + float sentence_p = 0.0f; + const int n_tokens = + whisper_full_n_tokens(gf->whisper_context, n_segment); + for (int j = 0; j < n_tokens; ++j) { + sentence_p += whisper_full_get_token_p( + gf->whisper_context, n_segment, j); + } + sentence_p /= (float)n_tokens; + + // convert text to lowercase + std::string text_lower(text); + std::transform(text_lower.begin(), text_lower.end(), + text_lower.begin(), ::tolower); + // trim whitespace (use lambda) + text_lower.erase(std::find_if(text_lower.rbegin(), + text_lower.rend(), + [](unsigned char ch) { + return !std::isspace(ch); + }) + .base(), + text_lower.end()); + + if (gf->log_words) { + obs_log(LOG_INFO, "[%s --> %s] (%.3f) %s", + to_timestamp(t0).c_str(), + to_timestamp(t1).c_str(), sentence_p, + text_lower.c_str()); + } + + if (text_lower.empty()) { + return DETECTION_RESULT_SILENCE; + } + } + + return DETECTION_RESULT_SPEECH; +} + +void process_audio_from_buffer(struct transcription_filter_data *gf) +{ + uint32_t num_new_frames_from_infos = 0; + uint64_t start_timestamp = 0; + + { + // scoped lock the buffer mutex + std::lock_guard lock(*gf->whisper_buf_mutex); + + // We need (gf->frames - gf->overlap_frames) new frames to run inference, + // except for the first segment, where we need the whole gf->frames frames + size_t how_many_frames_needed = gf->frames - gf->overlap_frames; + if (gf->last_num_frames == 0) { + 
how_many_frames_needed = gf->frames; + } + + // pop infos from the info buffer and mark the beginning timestamp from the first + // info as the beginning timestamp of the segment + struct transcription_filter_audio_info info_from_buf = {0}; + while (gf->info_buffer.size >= + sizeof(struct transcription_filter_audio_info)) { + circlebuf_pop_front( + &gf->info_buffer, &info_from_buf, + sizeof(struct transcription_filter_audio_info)); + num_new_frames_from_infos += info_from_buf.frames; + if (start_timestamp == 0) { + start_timestamp = info_from_buf.timestamp; + } + obs_log(gf->log_level, + "popped %d frames from info buffer, %lu needed", + num_new_frames_from_infos, + how_many_frames_needed); + // Check if we're within the needed segment length + if (num_new_frames_from_infos > + how_many_frames_needed) { + // too big, push the last info into the buffer's front where it was + num_new_frames_from_infos -= + info_from_buf.frames; + circlebuf_push_front( + &gf->info_buffer, &info_from_buf, + sizeof(struct transcription_filter_audio_info)); + break; + } + } + + /* Pop from input circlebuf */ + for (size_t c = 0; c < gf->channels; c++) { + if (gf->last_num_frames > 0) { + // move overlap frames from the end of the last copy_buffers to the beginning + memcpy(gf->copy_buffers[c], + gf->copy_buffers[c] + + gf->last_num_frames - + gf->overlap_frames, + gf->overlap_frames * sizeof(float)); + // copy new data to the end of copy_buffers[c] + circlebuf_pop_front(&gf->input_buffers[c], + gf->copy_buffers[c] + + gf->overlap_frames, + num_new_frames_from_infos * + sizeof(float)); + } else { + // Very first time, just copy data to copy_buffers[c] + circlebuf_pop_front(&gf->input_buffers[c], + gf->copy_buffers[c], + num_new_frames_from_infos * + sizeof(float)); + } + } + obs_log(gf->log_level, + "popped %u frames from input buffer. 
input_buffer[0] size is %lu", + num_new_frames_from_infos, gf->input_buffers[0].size); + + if (gf->last_num_frames > 0) { + gf->last_num_frames = + num_new_frames_from_infos + gf->overlap_frames; + } else { + gf->last_num_frames = num_new_frames_from_infos; + } + } + + obs_log(gf->log_level, + "processing %d frames (%d ms), start timestamp %llu ", + (int)gf->last_num_frames, + (int)(gf->last_num_frames * 1000 / gf->sample_rate), + start_timestamp); + + // time the audio processing + auto start = std::chrono::high_resolution_clock::now(); + + // resample to 16kHz + float *output[MAX_PREPROC_CHANNELS]; + uint32_t out_frames; + uint64_t ts_offset; + audio_resampler_resample(gf->resampler, (uint8_t **)output, &out_frames, + &ts_offset, (const uint8_t **)gf->copy_buffers, + (uint32_t)gf->last_num_frames); + + obs_log(gf->log_level, "%d channels, %d frames, %f ms", + (int)gf->channels, (int)out_frames, + (float)out_frames / WHISPER_SAMPLE_RATE * 1000.0f); + + bool skipped_inference = false; + + if (gf->vad_enabled) { + skipped_inference = !::vad_simple(output[0], out_frames, + WHISPER_SAMPLE_RATE, + VAD_THOLD, FREQ_THOLD, + gf->log_level != LOG_DEBUG); + } + + // copy output buffer before potentially modifying it + for (size_t c = 0; c < gf->channels; c++) { + da_copy_array(gf->copy_output_buffers[c], gf->copy_buffers[c], + gf->last_num_frames); + } + + if (!skipped_inference) { + // run inference + const int inference_result = + run_whisper_inference(gf, output[0], out_frames); + UNUSED_PARAMETER(inference_result); + // TODO: output inference result to a text source + } else { + if (gf->log_words) { + obs_log(LOG_INFO, "skipping inference"); + } + } + + // end of timer + auto end = std::chrono::high_resolution_clock::now(); + auto duration = std::chrono::duration_cast( + end - start) + .count(); + const uint32_t new_frames_from_infos_ms = + num_new_frames_from_infos * 1000 / + gf->sample_rate; // number of frames in this packet + obs_log(gf->log_level, "audio processing 
of %u ms new data took %d ms", + new_frames_from_infos_ms, (int)duration); + + if (duration > new_frames_from_infos_ms) { + // try to decrease overlap down to minimum of 100 ms + gf->overlap_ms = + std::max((uint64_t)gf->overlap_ms - 10, (uint64_t)100); + gf->overlap_frames = gf->overlap_ms * gf->sample_rate / 1000; + obs_log(gf->log_level, + "audio processing took too long (%d ms), reducing overlap to %lu ms", + (int)duration, gf->overlap_ms); + } else if (!skipped_inference) { + // try to increase overlap up to 75% of the segment + gf->overlap_ms = std::min( + (uint64_t)gf->overlap_ms + 10, + (uint64_t)((float)new_frames_from_infos_ms * 0.75f)); + gf->overlap_frames = gf->overlap_ms * gf->sample_rate / 1000; + obs_log(gf->log_level, + "audio processing took %d ms, increasing overlap to %lu ms", + (int)duration, gf->overlap_ms); + } +} + +void whisper_loop(void *data) +{ + struct transcription_filter_data *gf = + static_cast(data); + const size_t segment_size = gf->frames * sizeof(float); + + obs_log(LOG_INFO, "starting whisper thread"); + + // Thread main loop + while (true) { + { + std::lock_guard lock( + *gf->whisper_ctx_mutex); + if (gf->whisper_context == nullptr) { + obs_log(LOG_WARNING, + "Whisper context is null, exiting thread"); + break; + } + } + + // Check if we have enough data to process + while (true) { + size_t input_buf_size = 0; + { + std::lock_guard lock( + *gf->whisper_buf_mutex); + input_buf_size = gf->input_buffers[0].size; + } + + if (input_buf_size >= segment_size) { + obs_log(gf->log_level, + "found %lu bytes, %lu frames in input buffer, need >= %lu, processing", + input_buf_size, + (size_t)(input_buf_size / + sizeof(float)), + segment_size); + + // Process the audio. This will also remove the processed data from the input buffer. + // Mutex is locked inside process_audio_from_buffer. 
+ process_audio_from_buffer(gf); + } else { + break; + } + } + std::this_thread::sleep_for(std::chrono::milliseconds(10)); + } + + obs_log(LOG_INFO, "exiting whisper thread"); +} diff --git a/src/whisper-processing.h b/src/whisper-processing.h new file mode 100644 index 0000000..f71b098 --- /dev/null +++ b/src/whisper-processing.h @@ -0,0 +1,7 @@ +#ifndef WHISPER_PROCESSING_H +#define WHISPER_PROCESSING_H + +void whisper_loop(void *data); +struct whisper_context *init_whisper_context(const std::string &model_path); + +#endif // WHISPER_PROCESSING_H diff --git a/vendor/curl b/vendor/curl new file mode 160000 index 0000000..439ff20 --- /dev/null +++ b/vendor/curl @@ -0,0 +1 @@ +Subproject commit 439ff2052e219162708faddedacdf6f1242bb8c8