This commit is contained in:
Roy Shilkrot
2023-08-12 23:51:51 +03:00
parent ad7cb94c55
commit 7023ec5152
21 changed files with 1494 additions and 10 deletions

View File

@@ -24,6 +24,10 @@ inputs:
description: 'Developer ID for installer package codesigning (macOS only)'
required: false
default: ''
codesignTeam:
description: 'Developer team for codesigning (macOS only)'
required: false
default: ''
codesignUser:
description: 'Apple ID username for notarization (macOS only)'
required: false
@@ -50,6 +54,7 @@ runs:
env:
CODESIGN_IDENT: ${{ inputs.codesignIdent }}
CODESIGN_IDENT_INSTALLER: ${{ inputs.installerIdent }}
CODESIGN_TEAM: ${{ inputs.codesignTeam }}
CODESIGN_IDENT_USER: ${{ inputs.codesignUser }}
CODESIGN_IDENT_PASS: ${{ inputs.codesignPass }}
run: |

View File

@@ -129,6 +129,7 @@ jobs:
codesign: ${{ fromJSON(needs.check-event.outputs.codesign) && fromJSON(steps.codesign.outputs.haveCodesignIdent) }}
codesignIdent: ${{ steps.codesign.outputs.codesignIdent }}
installerIdent: ${{ steps.codesign.outputs.installerIdent }}
codesignTeam: ${{ steps.codesign.outputs.codesignTeam }}
notarize: ${{ fromJSON(needs.check-event.outputs.notarize) && fromJSON(steps.codesign.outputs.haveNotarizationUser) }}
codesignUser: ${{ secrets.MACOS_NOTARIZATION_USERNAME }}
codesignPass: ${{ secrets.MACOS_NOTARIZATION_PASSWORD }}

1
.gitignore vendored
View File

@@ -15,6 +15,7 @@
!CMakePresets.json
!LICENSE
!README.md
!/vendor
# Exclude lock files
*.lock.json

3
.gitmodules vendored Normal file
View File

@@ -0,0 +1,3 @@
[submodule "vendor/curl"]
path = vendor/curl
url = https://github.com/curl/curl.git

View File

@@ -34,6 +34,23 @@ if(ENABLE_QT)
AUTORCC ON)
endif()
target_sources(${CMAKE_PROJECT_NAME} PRIVATE src/plugin-main.c)
set(USE_SYSTEM_CURL
OFF
CACHE STRING "Use system cURL")
if(USE_SYSTEM_CURL)
find_package(CURL REQUIRED)
target_link_libraries(${CMAKE_PROJECT_NAME} PRIVATE "${CURL_LIBRARIES}")
target_include_directories(${CMAKE_PROJECT_NAME} SYSTEM PUBLIC "${CURL_INCLUDE_DIRS}")
else()
include(cmake/BuildMyCurl.cmake)
target_link_libraries(${CMAKE_PROJECT_NAME} PRIVATE libcurl)
endif()
include(cmake/BuildWhispercpp.cmake)
target_link_libraries(${CMAKE_PROJECT_NAME} PRIVATE Whispercpp)
target_sources(${CMAKE_PROJECT_NAME} PRIVATE src/plugin-main.c src/transcription-filter.cpp src/transcription-filter.c
src/whisper-processing.cpp)
set_target_properties_plugin(${CMAKE_PROJECT_NAME} PROPERTIES OUTPUT_NAME ${_name})

View File

@@ -41,14 +41,14 @@
},
"platformConfig": {
"macos": {
"bundleId": "com.example.obs-plugintemplate"
"bundleId": "com.royshilkrot.obs-localvocal"
}
},
"name": "obs-plugintemplate",
"version": "1.0.0",
"author": "Your Name Here",
"website": "https://example.com",
"email": "me@example.com",
"name": "obs-localvocal",
"version": "0.0.1",
"author": "Roy Shilkrot",
"website": "https://github.com/royshil/obs-localvocal",
"email": "roy.shil@gmail.com",
"uuids": {
"macosPackage": "00000000-0000-0000-0000-000000000000",
"macosInstaller": "00000000-0000-0000-0000-000000000000",

29
cmake/BuildMyCurl.cmake Normal file
View File

@@ -0,0 +1,29 @@
# Build libcurl from the vendored git submodule (vendor/curl) as a static
# library and expose it as the `libcurl` target for the plugin to link.
set(LIBCURL_SOURCE_DIR ${CMAKE_SOURCE_DIR}/vendor/curl)

find_package(Git QUIET)
if(NOT GIT_FOUND)
  message(FATAL_ERROR "Git is required to pin the vendored curl submodule")
endif()

# Pin the submodule to a known-good release tag. The original captured the
# result but never checked it, silently building whatever was checked out.
execute_process(
  COMMAND ${GIT_EXECUTABLE} checkout curl-8_2_0
  WORKING_DIRECTORY ${LIBCURL_SOURCE_DIR}
  RESULT_VARIABLE GIT_SUBMOD_RESULT)
if(NOT GIT_SUBMOD_RESULT EQUAL 0)
  message(
    FATAL_ERROR
      "Failed to check out curl-8_2_0 in ${LIBCURL_SOURCE_DIR} (exit ${GIT_SUBMOD_RESULT}). "
      "Did you run `git submodule update --init`?")
endif()

# Select the native TLS backend so no OpenSSL install is needed on macOS/Windows.
if(OS_MACOS)
  set(CURL_USE_OPENSSL OFF)
  set(CURL_USE_SECTRANSP ON)
elseif(OS_WINDOWS)
  set(CURL_USE_OPENSSL OFF)
  set(CURL_USE_SCHANNEL ON)
elseif(OS_LINUX)
  add_compile_options(-fPIC)
  set(CURL_USE_OPENSSL ON)
endif()

# Static, library-only build; no ssh support needed.
set(BUILD_CURL_EXE OFF)
set(BUILD_SHARED_LIBS OFF)
set(HTTP_ONLY OFF)
set(CURL_USE_LIBSSH2 OFF)

add_subdirectory(${LIBCURL_SOURCE_DIR} EXCLUDE_FROM_ALL)

if(OS_MACOS)
  # curl's sources trip several -Werror diagnostics under AppleClang.
  target_compile_options(
    libcurl PRIVATE -Wno-error=ambiguous-macro -Wno-error=deprecated-declarations -Wno-error=unreachable-code
                    -Wno-error=unused-parameter -Wno-error=unused-variable)
endif()

# NOTE(review): directory-scoped include_directories leaks into every target
# below; kept for compatibility, but target_include_directories on the
# consumer would be the modern form.
include_directories(SYSTEM ${LIBCURL_SOURCE_DIR}/include)

View File

@@ -0,0 +1,51 @@
# Build whisper.cpp via ExternalProject as a static library and expose it
# through the `Whispercpp` interface target (consumers link Whispercpp).
include(ExternalProject)

# Escape list separators so multiple macOS architectures survive CMAKE_ARGS.
string(REPLACE ";" "$<SEMICOLON>" CMAKE_OSX_ARCHITECTURES_ "${CMAKE_OSX_ARCHITECTURES}")

# Build whisper.cpp optimized unless this is an explicit Debug configuration.
# Quoted, variable-name comparison: the original `if(${CMAKE_BUILD_TYPE}
# STREQUAL Release ...)` is a hard configure error when CMAKE_BUILD_TYPE is
# empty (the default, and always the case under multi-config generators).
if(CMAKE_BUILD_TYPE STREQUAL "Release" OR CMAKE_BUILD_TYPE STREQUAL "RelWithDebInfo")
  set(Whispercpp_BUILD_TYPE Release)
else()
  set(Whispercpp_BUILD_TYPE Debug)
endif()

# On linux add the `-fPIC` flag to the compiler
if(UNIX AND NOT APPLE)
  set(WHISPER_EXTRA_CXX_FLAGS "-fPIC")
endif()

ExternalProject_Add(
  Whispercpp_Build
  DOWNLOAD_EXTRACT_TIMESTAMP true
  GIT_REPOSITORY https://github.com/ggerganov/whisper.cpp.git
  GIT_TAG 7b374c9ac9b9861bb737eec060e4dfa29d229259
  BUILD_COMMAND ${CMAKE_COMMAND} --build <BINARY_DIR> --config ${Whispercpp_BUILD_TYPE}
  BUILD_BYPRODUCTS <INSTALL_DIR>/lib/static/${CMAKE_STATIC_LIBRARY_PREFIX}whisper${CMAKE_STATIC_LIBRARY_SUFFIX}
  CMAKE_GENERATOR ${CMAKE_GENERATOR}
  INSTALL_COMMAND ${CMAKE_COMMAND} --install <BINARY_DIR> --config ${Whispercpp_BUILD_TYPE}
  CMAKE_ARGS -DCMAKE_INSTALL_PREFIX=<INSTALL_DIR>
             -DCMAKE_BUILD_TYPE=${Whispercpp_BUILD_TYPE}
             -DCMAKE_GENERATOR_PLATFORM=${CMAKE_GENERATOR_PLATFORM}
             -DCMAKE_OSX_DEPLOYMENT_TARGET=10.13
             -DCMAKE_OSX_ARCHITECTURES=${CMAKE_OSX_ARCHITECTURES_}
             -DCMAKE_CXX_FLAGS=${WHISPER_EXTRA_CXX_FLAGS}
             -DCMAKE_C_FLAGS=${WHISPER_EXTRA_CXX_FLAGS}
             -DBUILD_SHARED_LIBS=OFF
             -DWHISPER_BUILD_TESTS=OFF
             -DWHISPER_BUILD_EXAMPLES=OFF
             -DWHISPER_OPENBLAS=ON)

ExternalProject_Get_Property(Whispercpp_Build INSTALL_DIR)

# Wrap the installed static library in an imported target carrying its
# include directory, then bundle it behind an INTERFACE target.
add_library(Whispercpp::Whisper STATIC IMPORTED)
set_target_properties(
  Whispercpp::Whisper
  PROPERTIES IMPORTED_LOCATION
             ${INSTALL_DIR}/lib/static/${CMAKE_STATIC_LIBRARY_PREFIX}whisper${CMAKE_STATIC_LIBRARY_SUFFIX})
set_target_properties(Whispercpp::Whisper PROPERTIES INTERFACE_INCLUDE_DIRECTORIES ${INSTALL_DIR}/include)

add_library(Whispercpp INTERFACE)
add_dependencies(Whispercpp Whispercpp_Build)
target_link_libraries(Whispercpp INTERFACE Whispercpp::Whisper)
if(APPLE)
  # whisper.cpp uses Accelerate for BLAS on macOS.
  target_link_libraries(Whispercpp INTERFACE "-framework Accelerate")
endif()

View File

@@ -0,0 +1 @@
transcription_filterAudioFilter=LocalVocal Transcription

Binary file not shown.

View File

@@ -22,10 +22,18 @@ with this program. If not, see <https://www.gnu.org/licenses/>
OBS_DECLARE_MODULE()
OBS_MODULE_USE_DEFAULT_LOCALE(PLUGIN_NAME, "en-US")
MODULE_EXPORT const char *obs_module_description(void)
{
return obs_module_text("LocalVocalPlugin");
}
extern struct obs_source_info transcription_filter_info;
bool obs_module_load(void)
{
obs_log(LOG_INFO, "plugin loaded successfully (version %s)",
PLUGIN_VERSION);
obs_register_source(&transcription_filter_info);
blog(LOG_INFO, "plugin loaded successfully (version %s)",
PLUGIN_VERSION);
return true;
}

View File

@@ -21,6 +21,8 @@ with this program. If not, see <https://www.gnu.org/licenses/>
const char *PLUGIN_NAME = "@CMAKE_PROJECT_NAME@";
const char *PLUGIN_VERSION = "@CMAKE_PROJECT_VERSION@";
extern void blogva(int log_level, const char *format, va_list args);
void obs_log(int log_level, const char *format, ...)
{
size_t length = 4 + strlen(PLUGIN_NAME) + strlen(format);

View File

@@ -31,7 +31,6 @@ extern const char *PLUGIN_NAME;
extern const char *PLUGIN_VERSION;
void obs_log(int log_level, const char *format, ...);
extern void blogva(int log_level, const char *format, va_list args);
#ifdef __cplusplus
}

View File

@@ -0,0 +1,73 @@
#ifndef TRANSCRIPTION_FILTER_DATA_H
#define TRANSCRIPTION_FILTER_DATA_H

#include <obs.h>
#include <util/circlebuf.h>
#include <util/darray.h>
#include <media-io/audio-resampler.h>

#include <whisper.h>

// Standard headers for the members below. The original relied on
// <string>/<mutex>/<condition_variable> arriving transitively through
// <thread>/<memory>, which is not guaranteed by the standard.
#include <thread>
#include <memory>
#include <string>
#include <mutex>
#include <condition_variable>

#define MAX_PREPROC_CHANNELS 2

// buffer size in msec
#define BUFFER_SIZE_MSEC 1010
// at 16Khz, 1010 msec is 16160 frames
#define WHISPER_FRAME_SIZE 16160
// overlap in msec
#define OVERLAP_SIZE_MSEC 340

#define MT_ obs_module_text

// Per-instance state of the transcription filter, shared between the OBS
// audio callback and the whisper worker thread (guarded by the mutexes below).
struct transcription_filter_data {
	obs_source_t *context; // obs input source
	size_t channels;       // number of channels
	uint32_t sample_rate;  // input sample rate
	// How many input frames (in input sample rate) are needed for the next whisper frame
	size_t frames;
	// How many ms/frames are needed to overlap with the next whisper frame
	size_t overlap_frames;
	size_t overlap_ms;
	// How many frames were processed in the last whisper frame (this is dynamic)
	size_t last_num_frames;

	/* PCM buffers */
	float *copy_buffers[MAX_PREPROC_CHANNELS];
	DARRAY(float) copy_output_buffers[MAX_PREPROC_CHANNELS];
	struct circlebuf info_buffer;
	struct circlebuf input_buffers[MAX_PREPROC_CHANNELS];

	/* Resampler */
	audio_resampler_t *resampler;

	/* whisper */
	std::string whisper_model_path = "models/ggml-tiny.en.bin";
	struct whisper_context *whisper_context;
	whisper_full_params whisper_params;

	float filler_p_threshold;

	bool do_silence;
	bool vad_enabled;
	int log_level;
	bool log_words;
	bool active;

	// Use std for thread and mutex
	std::thread whisper_thread;
	std::unique_ptr<std::mutex> whisper_buf_mutex;
	std::unique_ptr<std::mutex> whisper_ctx_mutex;
	// NOTE: "wshiper" typo preserved — other translation units use this name.
	std::unique_ptr<std::condition_variable> wshiper_thread_cv;
};

// Audio packet info
struct transcription_filter_audio_info {
	uint32_t frames;
	uint64_t timestamp;
};

#endif /* TRANSCRIPTION_FILTER_DATA_H */

View File

@@ -0,0 +1,16 @@
#include "transcription-filter.h"

// OBS registration record for the transcription audio filter. Kept in a C
// translation unit so designated initializers compile on every supported
// toolchain; registered via obs_register_source() at module load.
struct obs_source_info transcription_filter_info = {
	.id = "transcription_filter_audio_filter",
	.type = OBS_SOURCE_TYPE_FILTER,
	.output_flags = OBS_SOURCE_AUDIO,
	.get_name = transcription_filter_name,
	.create = transcription_filter_create,
	.destroy = transcription_filter_destroy,
	.get_defaults = transcription_filter_defaults,
	.get_properties = transcription_filter_properties,
	.update = transcription_filter_update,
	.activate = transcription_filter_activate,
	.deactivate = transcription_filter_deactivate,
	.filter_audio = transcription_filter_filter_audio,
};

View File

@@ -0,0 +1,468 @@
#include <obs-module.h>

#include <cstring>
#include <mutex>
#include <new>

#include "plugin-support.h"
#include "transcription-filter.h"
#include "transcription-filter-data.h"
#include "whisper-processing.h"
#include "whisper-language.h"
// Translate a raw channel count into the matching OBS speaker layout.
// Counts without a dedicated layout (0, 7, and anything above 8) map to
// SPEAKERS_UNKNOWN.
inline enum speaker_layout convert_speaker_layout(uint8_t channels)
{
	static const enum speaker_layout layout_by_count[] = {
		SPEAKERS_UNKNOWN, // 0
		SPEAKERS_MONO,    // 1
		SPEAKERS_STEREO,  // 2
		SPEAKERS_2POINT1, // 3
		SPEAKERS_4POINT0, // 4
		SPEAKERS_4POINT1, // 5
		SPEAKERS_5POINT1, // 6
		SPEAKERS_UNKNOWN, // 7 (no dedicated layout)
		SPEAKERS_7POINT1, // 8
	};
	return (channels <= 8) ? layout_by_count[channels] : SPEAKERS_UNKNOWN;
}
// OBS audio callback: passes the audio through unchanged while copying it
// into the whisper worker's per-channel ring buffers for transcription.
struct obs_audio_data *
transcription_filter_filter_audio(void *data, struct obs_audio_data *audio)
{
	if (!audio) {
		return nullptr;
	}
	if (data == nullptr) {
		return audio;
	}

	struct transcription_filter_data *gf =
		static_cast<struct transcription_filter_data *>(data);

	// skip buffering while the filter is deactivated
	if (!gf->active) {
		return audio;
	}

	if (gf->whisper_context == nullptr) {
		// Whisper not initialized, just pass through
		return audio;
	}

	{
		std::lock_guard<std::mutex> lock(
			*gf->whisper_buf_mutex); // scoped lock
		obs_log(gf->log_level,
			"pushing %lu frames to input buffer. current size: %lu (bytes)",
			(size_t)(audio->frames), gf->input_buffers[0].size);

		// push back current audio data to input circlebuf
		for (size_t c = 0; c < gf->channels; c++) {
			circlebuf_push_back(&gf->input_buffers[c],
					    audio->data[c],
					    audio->frames * sizeof(float));
		}
		// push audio packet info (timestamp/frame count) to info circlebuf
		// so the worker can reconstruct segment timestamps
		struct transcription_filter_audio_info info = {0};
		info.frames = audio->frames;       // number of frames in this packet
		info.timestamp = audio->timestamp; // timestamp of this packet
		circlebuf_push_back(&gf->info_buffer, &info, sizeof(info));
	}

	return audio;
}
// Display name of the filter shown in the OBS "Add Filter" menu,
// localized through the module's locale file.
const char *transcription_filter_name(void *unused)
{
	UNUSED_PARAMETER(unused);
	return MT_("transcription_filterAudioFilter");
}
// Tear down a filter instance: free the whisper context, stop the worker
// thread, destroy the resampler, and release all audio buffers.
void transcription_filter_destroy(void *data)
{
	struct transcription_filter_data *gf =
		static_cast<struct transcription_filter_data *>(data);

	obs_log(LOG_INFO, "transcription_filter_destroy");
	{
		// Free the context under its mutex; a null context is
		// presumably the worker's signal to exit its loop.
		std::lock_guard<std::mutex> lock(*gf->whisper_ctx_mutex);
		if (gf->whisper_context != nullptr) {
			whisper_free(gf->whisper_context);
			gf->whisper_context = nullptr;
		}
	}
	// join the thread
	// NOTE(review): wshiper_thread_cv is never notified here — if the
	// worker waits on it without a timeout, this join could block;
	// confirm against whisper_loop.
	if (gf->whisper_thread.joinable()) {
		gf->whisper_thread.join();
	}
	if (gf->resampler) {
		audio_resampler_destroy(gf->resampler);
	}
	{
		std::lock_guard<std::mutex> lockbuf(*gf->whisper_buf_mutex);
		// copy_buffers[0] is the single slab shared by all channels
		bfree(gf->copy_buffers[0]);
		gf->copy_buffers[0] = nullptr;
		for (size_t i = 0; i < gf->channels; i++) {
			circlebuf_free(&gf->input_buffers[i]);
		}
	}
	circlebuf_free(&gf->info_buffer);
	bfree(gf);
}
void transcription_filter_update(void *data, obs_data_t *s)
{
struct transcription_filter_data *gf =
static_cast<struct transcription_filter_data *>(data);
gf->filler_p_threshold =
(float)obs_data_get_double(s, "filler_p_threshold");
gf->log_level = (int)obs_data_get_int(s, "log_level");
gf->do_silence = obs_data_get_bool(s, "do_silence");
gf->vad_enabled = obs_data_get_bool(s, "vad_enabled");
gf->log_words = obs_data_get_bool(s, "log_words");
const char *new_model_path =
obs_data_get_string(s, "whisper_model_path");
if (strcmp(new_model_path, gf->whisper_model_path.c_str()) != 0) {
// model path changed, reload the model
obs_log(LOG_INFO, "model path changed, reloading model");
if (gf->whisper_context != nullptr) {
// acquire the mutex before freeing the context
std::lock_guard<std::mutex> lock(
*gf->whisper_ctx_mutex);
whisper_free(gf->whisper_context);
gf->whisper_context = nullptr;
}
if (gf->whisper_thread.joinable()) {
gf->whisper_thread.join();
}
gf->whisper_model_path = bstrdup(new_model_path);
// check if the model exists, if not, download it
// if (!check_if_model_exists(gf->whisper_model_path)) {
// obs_log(LOG_ERROR, "Whisper model does not exist");
// download_model_with_ui_dialog(
// gf->whisper_model_path, [gf](int download_status) {
// if (download_status == 0) {
// obs_log(LOG_INFO, "Model download complete");
// gf->whisper_context = init_whisper_context(
// gf->whisper_model_path);
// gf->whisper_thread = std::thread(whisper_loop, gf);
// } else {
// obs_log(LOG_ERROR, "Model download failed");
// }
// });
// } else {
// Model exists, just load it
gf->whisper_context =
init_whisper_context(gf->whisper_model_path);
gf->whisper_thread = std::thread(whisper_loop, gf);
// }
}
std::lock_guard<std::mutex> lock(*gf->whisper_ctx_mutex);
gf->whisper_params = whisper_full_default_params(
(whisper_sampling_strategy)obs_data_get_int(
s, "whisper_sampling_method"));
gf->whisper_params.duration_ms = BUFFER_SIZE_MSEC;
gf->whisper_params.language =
obs_data_get_string(s, "whisper_language_select");
gf->whisper_params.translate = false;
gf->whisper_params.initial_prompt =
obs_data_get_string(s, "initial_prompt");
gf->whisper_params.n_threads = (int)obs_data_get_int(s, "n_threads");
gf->whisper_params.n_max_text_ctx =
(int)obs_data_get_int(s, "n_max_text_ctx");
gf->whisper_params.no_context = obs_data_get_bool(s, "no_context");
gf->whisper_params.single_segment =
obs_data_get_bool(s, "single_segment");
gf->whisper_params.print_special =
obs_data_get_bool(s, "print_special");
gf->whisper_params.print_progress =
obs_data_get_bool(s, "print_progress");
gf->whisper_params.print_realtime =
obs_data_get_bool(s, "print_realtime");
gf->whisper_params.print_timestamps =
obs_data_get_bool(s, "print_timestamps");
gf->whisper_params.token_timestamps =
obs_data_get_bool(s, "token_timestamps");
gf->whisper_params.thold_pt = (float)obs_data_get_double(s, "thold_pt");
gf->whisper_params.thold_ptsum =
(float)obs_data_get_double(s, "thold_ptsum");
gf->whisper_params.max_len = (int)obs_data_get_int(s, "max_len");
gf->whisper_params.split_on_word =
obs_data_get_bool(s, "split_on_word");
gf->whisper_params.max_tokens = (int)obs_data_get_int(s, "max_tokens");
gf->whisper_params.speed_up = obs_data_get_bool(s, "speed_up");
gf->whisper_params.suppress_blank =
obs_data_get_bool(s, "suppress_blank");
gf->whisper_params.suppress_non_speech_tokens =
obs_data_get_bool(s, "suppress_non_speech_tokens");
gf->whisper_params.temperature =
(float)obs_data_get_double(s, "temperature");
gf->whisper_params.max_initial_ts =
(float)obs_data_get_double(s, "max_initial_ts");
gf->whisper_params.length_penalty =
(float)obs_data_get_double(s, "length_penalty");
}
// Create a filter instance: allocate state, load the whisper model, set up
// the resampler (input rate -> 16 kHz mono), and start the worker thread.
// Returns nullptr (with all partial state released) if the model fails to load.
void *transcription_filter_create(obs_data_t *settings, obs_source_t *filter)
{
	// The struct holds non-trivial C++ members (std::string, std::thread,
	// std::unique_ptr); the original used the bmalloc'd bytes without ever
	// running constructors — undefined behavior. Construct in place so
	// transcription_filter_destroy's bfree() still matches the allocator.
	void *mem = bmalloc(sizeof(struct transcription_filter_data));
	struct transcription_filter_data *gf =
		new (mem) transcription_filter_data();

	// Get the number of channels for the input source
	gf->channels = audio_output_get_channels(obs_get_audio());
	gf->sample_rate = audio_output_get_sample_rate(obs_get_audio());
	// All per-channel arrays are sized MAX_PREPROC_CHANNELS; the original
	// indexed them with the raw channel count (out of bounds above 2).
	if (gf->channels > MAX_PREPROC_CHANNELS) {
		obs_log(LOG_WARNING, "clamping %d channels to %d",
			(int)gf->channels, MAX_PREPROC_CHANNELS);
		gf->channels = MAX_PREPROC_CHANNELS;
	}
	gf->frames = (size_t)((float)gf->sample_rate /
			      (1000.0f / (float)BUFFER_SIZE_MSEC));
	gf->last_num_frames = 0;

	// The original looped MAX_AUDIO_CHANNELS (8) here — out-of-bounds
	// writes past the 2-entry input_buffers array.
	for (size_t i = 0; i < MAX_PREPROC_CHANNELS; i++) {
		circlebuf_init(&gf->input_buffers[i]);
	}
	circlebuf_init(&gf->info_buffer);

	// allocate copy buffers: one contiguous slab, per-channel pointers
	gf->copy_buffers[0] = static_cast<float *>(
		bzalloc(gf->channels * gf->frames * sizeof(float)));
	for (size_t c = 1; c < gf->channels; c++) { // set the channel pointers
		gf->copy_buffers[c] = gf->copy_buffers[0] + c * gf->frames;
	}

	gf->context = filter;
	gf->whisper_model_path =
		obs_data_get_string(settings, "whisper_model_path");
	gf->whisper_context = init_whisper_context(gf->whisper_model_path);
	if (gf->whisper_context == nullptr) {
		obs_log(LOG_ERROR, "Failed to load whisper model");
		// release everything allocated above (the original leaked it)
		bfree(gf->copy_buffers[0]);
		for (size_t i = 0; i < MAX_PREPROC_CHANNELS; i++) {
			circlebuf_free(&gf->input_buffers[i]);
		}
		circlebuf_free(&gf->info_buffer);
		bfree(gf);
		return nullptr;
	}

	gf->overlap_ms = OVERLAP_SIZE_MSEC;
	gf->overlap_frames = (size_t)((float)gf->sample_rate /
				      (1000.0f / (float)gf->overlap_ms));
	obs_log(LOG_INFO,
		"transcription_filter filter: channels %d, frames %d, sample_rate %d",
		(int)gf->channels, (int)gf->frames, gf->sample_rate);

	// Resample whatever the source delivers down to whisper's 16 kHz mono.
	struct resample_info src, dst;
	src.samples_per_sec = gf->sample_rate;
	src.format = AUDIO_FORMAT_FLOAT_PLANAR;
	src.speakers = convert_speaker_layout((uint8_t)gf->channels);
	dst.samples_per_sec = WHISPER_SAMPLE_RATE;
	dst.format = AUDIO_FORMAT_FLOAT_PLANAR;
	dst.speakers = convert_speaker_layout((uint8_t)1);
	gf->resampler = audio_resampler_create(&dst, &src);

	gf->active = true;
	gf->whisper_buf_mutex = std::unique_ptr<std::mutex>(new std::mutex());
	gf->whisper_ctx_mutex = std::unique_ptr<std::mutex>(new std::mutex());
	gf->wshiper_thread_cv = std::unique_ptr<std::condition_variable>(
		new std::condition_variable());

	// get the settings updated on the filter data struct
	transcription_filter_update(gf, settings);

	// start the thread
	gf->whisper_thread = std::thread(whisper_loop, gf);

	return gf;
}
// OBS notification that the source became active: resume feeding audio
// into the transcription buffers.
void transcription_filter_activate(void *data)
{
	auto *filter_state =
		static_cast<struct transcription_filter_data *>(data);
	obs_log(LOG_INFO, "transcription_filter filter activated");
	filter_state->active = true;
}
// OBS notification that the source became inactive: stop buffering audio
// (the worker thread keeps running; it just receives no new data).
void transcription_filter_deactivate(void *data)
{
	auto *filter_state =
		static_cast<struct transcription_filter_data *>(data);
	obs_log(LOG_INFO, "transcription_filter filter deactivated");
	filter_state->active = false;
}
// Default values for every setting exposed in transcription_filter_properties.
void transcription_filter_defaults(obs_data_t *s)
{
	// Filter-level settings
	obs_data_set_default_double(s, "filler_p_threshold", 0.75);
	obs_data_set_default_bool(s, "do_silence", true);
	obs_data_set_default_bool(s, "vad_enabled", true);
	obs_data_set_default_int(s, "log_level", LOG_DEBUG);
	obs_data_set_default_bool(s, "log_words", true);
	obs_data_set_default_string(s, "whisper_model_path",
				    "models/ggml-tiny.en.bin");
	obs_data_set_default_string(s, "whisper_language_select", "en");

	// Whisper parameters
	obs_data_set_default_int(s, "whisper_sampling_method",
				 WHISPER_SAMPLING_BEAM_SEARCH);
	obs_data_set_default_string(s, "initial_prompt", "");
	obs_data_set_default_int(s, "n_threads", 4);
	obs_data_set_default_int(s, "n_max_text_ctx", 16384);
	obs_data_set_default_bool(s, "no_context", true);
	obs_data_set_default_bool(s, "single_segment", true);
	obs_data_set_default_bool(s, "print_special", false);
	obs_data_set_default_bool(s, "print_progress", false);
	obs_data_set_default_bool(s, "print_realtime", false);
	obs_data_set_default_bool(s, "print_timestamps", false);
	obs_data_set_default_bool(s, "token_timestamps", false);
	obs_data_set_default_double(s, "thold_pt", 0.01);
	obs_data_set_default_double(s, "thold_ptsum", 0.01);
	obs_data_set_default_int(s, "max_len", 0);
	obs_data_set_default_bool(s, "split_on_word", false);
	obs_data_set_default_int(s, "max_tokens", 3);
	obs_data_set_default_bool(s, "speed_up", false);
	obs_data_set_default_bool(s, "suppress_blank", false);
	obs_data_set_default_bool(s, "suppress_non_speech_tokens", true);
	obs_data_set_default_double(s, "temperature", 0.5);
	obs_data_set_default_double(s, "max_initial_ts", 1.0);
	obs_data_set_default_double(s, "length_penalty", -1.0);
}
// Build the OBS properties UI: filter-level controls, model selection, and
// a grouped set of raw whisper.cpp inference parameters.
obs_properties_t *transcription_filter_properties(void *data)
{
	obs_properties_t *ppts = obs_properties_create();

	obs_properties_add_float_slider(ppts, "filler_p_threshold",
					"filler_p_threshold", 0.0f, 1.0f,
					0.05f);
	obs_properties_add_bool(ppts, "do_silence", "do_silence");
	obs_properties_add_bool(ppts, "vad_enabled", "vad_enabled");
	obs_property_t *list = obs_properties_add_list(ppts, "log_level",
						       "log_level",
						       OBS_COMBO_TYPE_LIST,
						       OBS_COMBO_FORMAT_INT);
	obs_property_list_add_int(list, "DEBUG", LOG_DEBUG);
	obs_property_list_add_int(list, "INFO", LOG_INFO);
	obs_property_list_add_int(list, "WARNING", LOG_WARNING);
	obs_properties_add_bool(ppts, "log_words", "log_words");

	// Add a list of available whisper models to download
	obs_property_t *whisper_models_list = obs_properties_add_list(
		ppts, "whisper_model_path", "Whisper Model",
		OBS_COMBO_TYPE_LIST, OBS_COMBO_FORMAT_STRING);
	obs_property_list_add_string(whisper_models_list, "Tiny (Eng) 75Mb",
				     "models/ggml-tiny.en.bin");
	obs_property_list_add_string(whisper_models_list, "Tiny 75Mb",
				     "models/ggml-tiny.bin");
	obs_property_list_add_string(whisper_models_list, "Base (Eng) 142Mb",
				     "models/ggml-base.en.bin");
	obs_property_list_add_string(whisper_models_list, "Base 142Mb",
				     "models/ggml-base.bin");
	obs_property_list_add_string(whisper_models_list, "Small (Eng) 466Mb",
				     "models/ggml-small.en.bin");
	obs_property_list_add_string(whisper_models_list, "Small 466Mb",
				     "models/ggml-small.bin");

	obs_properties_t *whisper_params_group = obs_properties_create();
	obs_properties_add_group(ppts, "whisper_params_group",
				 "Whisper Parameters", OBS_GROUP_NORMAL,
				 whisper_params_group);

	// Add language selector
	obs_property_t *whisper_language_select_list = obs_properties_add_list(
		whisper_params_group, "whisper_language_select", "Language",
		OBS_COMBO_TYPE_LIST, OBS_COMBO_FORMAT_STRING);
	// iterate over all available languages in whisper_available_lang map<string, string>
	for (auto const &pair : whisper_available_lang) {
		obs_property_list_add_string(whisper_language_select_list,
					     pair.second.c_str(),
					     pair.first.c_str());
	}

	obs_property_t *whisper_sampling_method_list = obs_properties_add_list(
		whisper_params_group, "whisper_sampling_method",
		"whisper_sampling_method", OBS_COMBO_TYPE_LIST,
		OBS_COMBO_FORMAT_INT);
	obs_property_list_add_int(whisper_sampling_method_list, "Beam search",
				  WHISPER_SAMPLING_BEAM_SEARCH);
	obs_property_list_add_int(whisper_sampling_method_list, "Greedy",
				  WHISPER_SAMPLING_GREEDY);

	// int n_threads;
	obs_properties_add_int_slider(whisper_params_group, "n_threads",
				      "n_threads", 1, 8, 1);
	// int n_max_text_ctx; // max tokens to use from past text as prompt for the decoder
	obs_properties_add_int_slider(whisper_params_group, "n_max_text_ctx",
				      "n_max_text_ctx", 0, 16384, 100);
	// int offset_ms; // start offset in ms
	// int duration_ms; // audio duration to process in ms
	// bool translate;
	// bool no_context; // do not use past transcription (if any) as initial prompt for the decoder
	obs_properties_add_bool(whisper_params_group, "no_context",
				"no_context");
	// bool single_segment; // force single segment output (useful for streaming)
	obs_properties_add_bool(whisper_params_group, "single_segment",
				"single_segment");
	// bool print_special; // print special tokens (e.g. <SOT>, <EOT>, <BEG>, etc.)
	obs_properties_add_bool(whisper_params_group, "print_special",
				"print_special");
	// bool print_progress; // print progress information
	obs_properties_add_bool(whisper_params_group, "print_progress",
				"print_progress");
	// bool print_realtime; // print results from within whisper.cpp (avoid it, use callback instead)
	obs_properties_add_bool(whisper_params_group, "print_realtime",
				"print_realtime");
	// bool print_timestamps; // print timestamps for each text segment when printing realtime
	obs_properties_add_bool(whisper_params_group, "print_timestamps",
				"print_timestamps");
	// bool token_timestamps; // enable token-level timestamps
	obs_properties_add_bool(whisper_params_group, "token_timestamps",
				"token_timestamps");
	// float thold_pt; // timestamp token probability threshold (~0.01)
	obs_properties_add_float_slider(whisper_params_group, "thold_pt",
					"thold_pt", 0.0f, 1.0f, 0.05f);
	// float thold_ptsum; // timestamp token sum probability threshold (~0.01)
	obs_properties_add_float_slider(whisper_params_group, "thold_ptsum",
					"thold_ptsum", 0.0f, 1.0f, 0.05f);
	// int max_len; // max segment length in characters
	obs_properties_add_int_slider(whisper_params_group, "max_len",
				      "max_len", 0, 100, 1);
	// bool split_on_word; // split on word rather than on token (when used with max_len)
	obs_properties_add_bool(whisper_params_group, "split_on_word",
				"split_on_word");
	// int max_tokens; // max tokens per segment (0 = no limit)
	obs_properties_add_int_slider(whisper_params_group, "max_tokens",
				      "max_tokens", 0, 100, 1);
	// bool speed_up; // speed-up the audio by 2x using Phase Vocoder
	obs_properties_add_bool(whisper_params_group, "speed_up", "speed_up");
	// const char * initial_prompt;
	obs_properties_add_text(whisper_params_group, "initial_prompt",
				"initial_prompt", OBS_TEXT_DEFAULT);
	// bool suppress_blank
	obs_properties_add_bool(whisper_params_group, "suppress_blank",
				"suppress_blank");
	// bool suppress_non_speech_tokens
	obs_properties_add_bool(whisper_params_group,
				"suppress_non_speech_tokens",
				"suppress_non_speech_tokens");
	// float temperature
	obs_properties_add_float_slider(whisper_params_group, "temperature",
					"temperature", 0.0f, 1.0f, 0.05f);
	// float max_initial_ts
	obs_properties_add_float_slider(whisper_params_group, "max_initial_ts",
					"max_initial_ts", 0.0f, 1.0f, 0.05f);
	// float length_penalty
	obs_properties_add_float_slider(whisper_params_group, "length_penalty",
					"length_penalty", -1.0f, 1.0f, 0.1f);

	UNUSED_PARAMETER(data);
	return ppts;
}

View File

@@ -0,0 +1,20 @@
#include <obs-module.h>

#ifdef __cplusplus
extern "C" {
#endif

// Callbacks implementing the LocalVocal transcription audio filter.
// These are wired into the obs_source_info registration record.

// Source lifecycle
void transcription_filter_activate(void *data);
void *transcription_filter_create(obs_data_t *settings, obs_source_t *filter);
void transcription_filter_update(void *data, obs_data_t *s);
void transcription_filter_destroy(void *data);
void transcription_filter_deactivate(void *data);

// UI / metadata
const char *transcription_filter_name(void *unused);
void transcription_filter_defaults(obs_data_t *s);
obs_properties_t *transcription_filter_properties(void *data);

// Audio path: buffers incoming audio for the whisper worker, passes it through
struct obs_audio_data *
transcription_filter_filter_audio(void *data, struct obs_audio_data *audio);

#ifdef __cplusplus
}
#endif

406
src/whisper-language.h Normal file
View File

@@ -0,0 +1,406 @@
#ifndef WHISPER_LANGUAGE_H
#define WHISPER_LANGUAGE_H

#include <map>
#include <string>

// Whisper language code -> English language name, used to populate the
// language selection dropdown in the filter properties.
static const std::map<std::string, std::string> whisper_available_lang = {
	{"en", "english"},
	{"zh", "chinese"},
	{"de", "german"},
	{"es", "spanish"},
	{"ru", "russian"},
	{"ko", "korean"},
	{"fr", "french"},
	{"ja", "japanese"},
	{"pt", "portuguese"},
	{"tr", "turkish"},
	{"pl", "polish"},
	{"ca", "catalan"},
	{"nl", "dutch"},
	{"ar", "arabic"},
	{"sv", "swedish"},
	{"it", "italian"},
	{"id", "indonesian"},
	{"hi", "hindi"},
	{"fi", "finnish"},
	{"vi", "vietnamese"},
	{"he", "hebrew"},
	{"uk", "ukrainian"},
	{"el", "greek"},
	{"ms", "malay"},
	{"cs", "czech"},
	{"ro", "romanian"},
	{"da", "danish"},
	{"hu", "hungarian"},
	{"ta", "tamil"},
	{"no", "norwegian"},
	{"th", "thai"},
	{"ur", "urdu"},
	{"hr", "croatian"},
	{"bg", "bulgarian"},
	{"lt", "lithuanian"},
	{"la", "latin"},
	{"mi", "maori"},
	{"ml", "malayalam"},
	{"cy", "welsh"},
	{"sk", "slovak"},
	{"te", "telugu"},
	{"fa", "persian"},
	{"lv", "latvian"},
	{"bn", "bengali"},
	{"sr", "serbian"},
	{"az", "azerbaijani"},
	{"sl", "slovenian"},
	{"kn", "kannada"},
	{"et", "estonian"},
	{"mk", "macedonian"},
	{"br", "breton"},
	{"eu", "basque"},
	{"is", "icelandic"},
	{"hy", "armenian"},
	{"ne", "nepali"},
	{"mn", "mongolian"},
	{"bs", "bosnian"},
	{"kk", "kazakh"},
	{"sq", "albanian"},
	{"sw", "swahili"},
	{"gl", "galician"},
	{"mr", "marathi"},
	{"pa", "punjabi"},
	{"si", "sinhala"},
	{"km", "khmer"},
	{"sn", "shona"},
	{"yo", "yoruba"},
	{"so", "somali"},
	{"af", "afrikaans"},
	{"oc", "occitan"},
	{"ka", "georgian"},
	{"be", "belarusian"},
	{"tg", "tajik"},
	{"sd", "sindhi"},
	{"gu", "gujarati"},
	{"am", "amharic"},
	{"yi", "yiddish"},
	{"lo", "lao"},
	{"uz", "uzbek"},
	{"fo", "faroese"},
	{"ht", "haitian"},
	{"ps", "pashto"},
	{"tk", "turkmen"},
	{"nn", "nynorsk"},
	{"mt", "maltese"},
	{"sa", "sanskrit"},
	{"lb", "luxembourgish"},
	{"my", "myanmar"},
	{"bo", "tibetan"},
	{"tl", "tagalog"},
	{"mg", "malagasy"},
	{"as", "assamese"},
	{"tt", "tatar"},
	{"haw", "hawaiian"},
	{"ln", "lingala"},
	{"ha", "hausa"},
	{"ba", "bashkir"},
	{"jw", "javanese"},
	{"su", "sundanese"},
};

#endif // WHISPER_LANGUAGE_H

376
src/whisper-processing.cpp Normal file
View File

@@ -0,0 +1,376 @@
#include <whisper.h>
#include <obs-module.h>
#include "plugin-support.h"
#include "transcription-filter-data.h"
#define VAD_THOLD 0.0001f
#define FREQ_THOLD 100.0f
// Format a whisper timestamp (in 10 ms ticks) as "MM:SS.mmm".
// The original printed the centisecond remainder with %03d, so e.g.
// t=150 (1.5 s) rendered as "00:01.050" — scale it to real milliseconds.
std::string to_timestamp(int64_t t)
{
	int64_t sec = t / 100;              // whole seconds (t is centiseconds)
	int64_t msec = (t - sec * 100) * 10; // remainder, scaled to milliseconds
	int64_t min = sec / 60;
	sec = sec - min * 60;

	char buf[32];
	snprintf(buf, sizeof(buf), "%02d:%02d.%03d", (int)min, (int)sec,
		 (int)msec);

	return std::string(buf);
}
// Simple first-order high-pass (RC) filter applied in place.
// cutoff is in Hz; sample_rate is the PCM rate of pcmf32.
void high_pass_filter(float *pcmf32, size_t pcm32f_size, float cutoff,
		      uint32_t sample_rate)
{
	// Guard: the original read pcmf32[0] unconditionally, an
	// out-of-bounds access for an empty buffer.
	if (pcmf32 == nullptr || pcm32f_size == 0) {
		return;
	}

	const float rc = 1.0f / (2.0f * (float)M_PI * cutoff);
	const float dt = 1.0f / (float)sample_rate;
	const float alpha = dt / (rc + dt);

	float y = pcmf32[0];
	for (size_t i = 1; i < pcm32f_size; i++) {
		y = alpha * (y + pcmf32[i] - pcmf32[i - 1]);
		pcmf32[i] = y;
	}
}
// VAD (voice activity detection), return true if speech detected.
// Optionally high-pass filters the buffer in place first (freq_thold > 0),
// then compares mean absolute amplitude against vad_thold.
bool vad_simple(float *pcmf32, size_t pcm32f_size, uint32_t sample_rate,
		float vad_thold, float freq_thold, bool verbose)
{
	const uint64_t n_samples = pcm32f_size;

	// Guard: the original divided by n_samples unconditionally,
	// producing NaN energy for an empty buffer.
	if (n_samples == 0) {
		return false;
	}

	if (freq_thold > 0.0f) {
		high_pass_filter(pcmf32, pcm32f_size, freq_thold, sample_rate);
	}

	float energy_all = 0.0f;
	for (uint64_t i = 0; i < n_samples; i++) {
		energy_all += fabsf(pcmf32[i]);
	}
	energy_all /= (float)n_samples;

	if (verbose) {
		blog(LOG_INFO,
		     "%s: energy_all: %f, vad_thold: %f, freq_thold: %f",
		     __func__, energy_all, vad_thold, freq_thold);
	}

	if (energy_all < vad_thold) {
		return false;
	}

	return true;
}
struct whisper_context *init_whisper_context(const std::string &model_path)
{
struct whisper_context *ctx =
whisper_init_from_file(obs_module_file(model_path.c_str()));
if (ctx == nullptr) {
obs_log(LOG_ERROR, "Failed to load whisper model");
return nullptr;
}
return ctx;
}
// Outcome of a single whisper inference pass over one audio segment.
enum DetectionResult {
	DETECTION_RESULT_UNKNOWN = 0, // inference failed or context missing
	DETECTION_RESULT_SILENCE = 1, // no transcribable speech found
	DETECTION_RESULT_SPEECH = 2,  // speech transcribed
	DETECTION_RESULT_FILLER = 3,
	DETECTION_RESULT_BEEP = 4,
};
int run_whisper_inference(struct transcription_filter_data *gf,
const float *pcm32f_data, size_t pcm32f_size)
{
obs_log(gf->log_level,
"%s: processing %d samples, %.3f sec, %d threads", __func__,
int(pcm32f_size), float(pcm32f_size) / WHISPER_SAMPLE_RATE,
gf->whisper_params.n_threads);
std::lock_guard<std::mutex> lock(*gf->whisper_ctx_mutex);
if (gf->whisper_context == nullptr) {
obs_log(LOG_WARNING, "whisper context is null");
return DETECTION_RESULT_UNKNOWN;
}
// run the inference
int whisper_full_result = -1;
try {
whisper_full_result =
whisper_full(gf->whisper_context, gf->whisper_params,
pcm32f_data, (int)pcm32f_size);
} catch (const std::exception &e) {
obs_log(LOG_ERROR,
"Whisper exception: %s. Filter restart is required",
e.what());
whisper_free(gf->whisper_context);
gf->whisper_context = nullptr;
return DETECTION_RESULT_UNKNOWN;
}
if (whisper_full_result != 0) {
obs_log(LOG_WARNING, "failed to process audio, error %d",
whisper_full_result);
return DETECTION_RESULT_UNKNOWN;
} else {
const int n_segment = 0;
const char *text = whisper_full_get_segment_text(
gf->whisper_context, n_segment);
const int64_t t0 = whisper_full_get_segment_t0(
gf->whisper_context, n_segment);
const int64_t t1 = whisper_full_get_segment_t1(
gf->whisper_context, n_segment);
float sentence_p = 0.0f;
const int n_tokens =
whisper_full_n_tokens(gf->whisper_context, n_segment);
for (int j = 0; j < n_tokens; ++j) {
sentence_p += whisper_full_get_token_p(
gf->whisper_context, n_segment, j);
}
sentence_p /= (float)n_tokens;
// convert text to lowercase
std::string text_lower(text);
std::transform(text_lower.begin(), text_lower.end(),
text_lower.begin(), ::tolower);
// trim whitespace (use lambda)
text_lower.erase(std::find_if(text_lower.rbegin(),
text_lower.rend(),
[](unsigned char ch) {
return !std::isspace(ch);
})
.base(),
text_lower.end());
if (gf->log_words) {
obs_log(LOG_INFO, "[%s --> %s] (%.3f) %s",
to_timestamp(t0).c_str(),
to_timestamp(t1).c_str(), sentence_p,
text_lower.c_str());
}
if (text_lower.empty()) {
return DETECTION_RESULT_SILENCE;
}
}
return DETECTION_RESULT_SPEECH;
}
// Pull one segment of audio out of gf->input_buffers (keeping gf->overlap_frames
// of the previous segment at the front of gf->copy_buffers), resample it to
// 16 kHz, optionally gate it with energy VAD, run whisper inference, and then
// adapt gf->overlap_ms up or down based on how long processing took relative
// to the amount of new audio consumed.
// NOTE(review): assumes this runs only on the whisper thread — the buffer
// mutex is held only while popping; the copy_buffers are used unlocked after.
void process_audio_from_buffer(struct transcription_filter_data *gf)
{
	uint32_t num_new_frames_from_infos = 0;
	uint64_t start_timestamp = 0;

	{
		// scoped lock the buffer mutex
		std::lock_guard<std::mutex> lock(*gf->whisper_buf_mutex);

		// We need (gf->frames - gf->overlap_frames) new frames to run inference,
		// except for the first segment, where we need the whole gf->frames frames
		size_t how_many_frames_needed = gf->frames - gf->overlap_frames;
		if (gf->last_num_frames == 0) {
			how_many_frames_needed = gf->frames;
		}

		// pop infos from the info buffer and mark the beginning timestamp from the first
		// info as the beginning timestamp of the segment
		struct transcription_filter_audio_info info_from_buf = {0};
		while (gf->info_buffer.size >=
		       sizeof(struct transcription_filter_audio_info)) {
			circlebuf_pop_front(
				&gf->info_buffer, &info_from_buf,
				sizeof(struct transcription_filter_audio_info));
			num_new_frames_from_infos += info_from_buf.frames;
			if (start_timestamp == 0) {
				start_timestamp = info_from_buf.timestamp;
			}
			obs_log(gf->log_level,
				"popped %d frames from info buffer, %lu needed",
				num_new_frames_from_infos,
				how_many_frames_needed);
			// Check if we're within the needed segment length
			if (num_new_frames_from_infos >
			    how_many_frames_needed) {
				// too big, push the last info into the buffer's front where it was
				num_new_frames_from_infos -=
					info_from_buf.frames;
				circlebuf_push_front(
					&gf->info_buffer, &info_from_buf,
					sizeof(struct transcription_filter_audio_info));
				break;
			}
		}

		/* Pop from input circlebuf */
		for (size_t c = 0; c < gf->channels; c++) {
			if (gf->last_num_frames > 0) {
				// move overlap frames from the end of the last copy_buffers to the beginning
				memcpy(gf->copy_buffers[c],
				       gf->copy_buffers[c] +
					       gf->last_num_frames -
					       gf->overlap_frames,
				       gf->overlap_frames * sizeof(float));
				// copy new data to the end of copy_buffers[c]
				circlebuf_pop_front(&gf->input_buffers[c],
						    gf->copy_buffers[c] +
							    gf->overlap_frames,
						    num_new_frames_from_infos *
							    sizeof(float));
			} else {
				// Very first time, just copy data to copy_buffers[c]
				circlebuf_pop_front(&gf->input_buffers[c],
						    gf->copy_buffers[c],
						    num_new_frames_from_infos *
							    sizeof(float));
			}
		}

		obs_log(gf->log_level,
			"popped %u frames from input buffer. input_buffer[0] size is %lu",
			num_new_frames_from_infos, gf->input_buffers[0].size);

		// total frames in copy_buffers: new frames plus the carried-over overlap
		if (gf->last_num_frames > 0) {
			gf->last_num_frames =
				num_new_frames_from_infos + gf->overlap_frames;
		} else {
			gf->last_num_frames = num_new_frames_from_infos;
		}
	}

	obs_log(gf->log_level,
		"processing %d frames (%d ms), start timestamp %llu ",
		(int)gf->last_num_frames,
		(int)(gf->last_num_frames * 1000 / gf->sample_rate),
		start_timestamp);

	// time the audio processing
	auto start = std::chrono::high_resolution_clock::now();

	// resample to 16kHz
	float *output[MAX_PREPROC_CHANNELS];
	uint32_t out_frames;
	uint64_t ts_offset;
	audio_resampler_resample(gf->resampler, (uint8_t **)output, &out_frames,
				 &ts_offset, (const uint8_t **)gf->copy_buffers,
				 (uint32_t)gf->last_num_frames);

	obs_log(gf->log_level, "%d channels, %d frames, %f ms",
		(int)gf->channels, (int)out_frames,
		(float)out_frames / WHISPER_SAMPLE_RATE * 1000.0f);

	// VAD: skip inference when the segment's energy looks like silence
	bool skipped_inference = false;
	if (gf->vad_enabled) {
		skipped_inference = !::vad_simple(output[0], out_frames,
						  WHISPER_SAMPLE_RATE,
						  VAD_THOLD, FREQ_THOLD,
						  gf->log_level != LOG_DEBUG);
	}

	// copy output buffer before potentially modifying it
	for (size_t c = 0; c < gf->channels; c++) {
		da_copy_array(gf->copy_output_buffers[c], gf->copy_buffers[c],
			      gf->last_num_frames);
	}

	if (!skipped_inference) {
		// run inference
		const int inference_result =
			run_whisper_inference(gf, output[0], out_frames);
		UNUSED_PARAMETER(inference_result);
		// TODO: output inference result to a text source
	} else {
		if (gf->log_words) {
			obs_log(LOG_INFO, "skipping inference");
		}
	}

	// end of timer
	auto end = std::chrono::high_resolution_clock::now();
	auto duration = std::chrono::duration_cast<std::chrono::milliseconds>(
				end - start)
				.count();

	const uint32_t new_frames_from_infos_ms =
		num_new_frames_from_infos * 1000 /
		gf->sample_rate; // number of frames in this packet

	obs_log(gf->log_level, "audio processing of %u ms new data took %d ms",
		new_frames_from_infos_ms, (int)duration);

	// Adaptive overlap: shrink it when we can't keep up in real time,
	// grow it (for better segment continuity) when we have headroom.
	if (duration > new_frames_from_infos_ms) {
		// try to decrease overlap down to minimum of 100 ms
		gf->overlap_ms =
			std::max((uint64_t)gf->overlap_ms - 10, (uint64_t)100);
		gf->overlap_frames = gf->overlap_ms * gf->sample_rate / 1000;
		obs_log(gf->log_level,
			"audio processing took too long (%d ms), reducing overlap to %lu ms",
			(int)duration, gf->overlap_ms);
	} else if (!skipped_inference) {
		// try to increase overlap up to 75% of the segment
		gf->overlap_ms = std::min(
			(uint64_t)gf->overlap_ms + 10,
			(uint64_t)((float)new_frames_from_infos_ms * 0.75f));
		gf->overlap_frames = gf->overlap_ms * gf->sample_rate / 1000;
		obs_log(gf->log_level,
			"audio processing took %d ms, increasing overlap to %lu ms",
			(int)duration, gf->overlap_ms);
	}
}
// Whisper worker-thread entry point. Repeatedly drains full segments from the
// input buffer and processes them, sleeping briefly between passes; exits when
// the whisper context has been torn down (set to null elsewhere).
void whisper_loop(void *data)
{
	auto *gf = static_cast<struct transcription_filter_data *>(data);
	const size_t segment_size = gf->frames * sizeof(float);

	obs_log(LOG_INFO, "starting whisper thread");

	for (;;) {
		// Stop condition: context destroyed by the filter.
		{
			std::lock_guard<std::mutex> ctx_guard(
				*gf->whisper_ctx_mutex);
			if (gf->whisper_context == nullptr) {
				obs_log(LOG_WARNING,
					"Whisper context is null, exiting thread");
				break;
			}
		}

		// Drain the input buffer one full segment at a time.
		for (;;) {
			size_t available = 0;
			{
				std::lock_guard<std::mutex> buf_guard(
					*gf->whisper_buf_mutex);
				available = gf->input_buffers[0].size;
			}
			if (available < segment_size) {
				break;
			}
			obs_log(gf->log_level,
				"found %lu bytes, %lu frames in input buffer, need >= %lu, processing",
				available,
				(size_t)(available / sizeof(float)),
				segment_size);
			// Process the audio. This will also remove the processed
			// data from the input buffer. Mutex is locked inside
			// process_audio_from_buffer.
			process_audio_from_buffer(gf);
		}

		std::this_thread::sleep_for(std::chrono::milliseconds(10));
	}

	obs_log(LOG_INFO, "exiting whisper thread");
}

7
src/whisper-processing.h Normal file
View File

@@ -0,0 +1,7 @@
#ifndef WHISPER_PROCESSING_H
#define WHISPER_PROCESSING_H

// Header must be self-contained: std::string is used in a declaration below.
#include <string>

// Opaque whisper.cpp context; fully defined in whisper.h. Only a pointer is
// used here, so a forward declaration suffices.
struct whisper_context;

// Worker-thread entry point; `data` is a transcription_filter_data pointer.
void whisper_loop(void *data);

// Load a whisper model from the plugin's data directory.
// Returns nullptr on failure; caller owns the returned context.
struct whisper_context *init_whisper_context(const std::string &model_path);

#endif // WHISPER_PROCESSING_H

1
vendor/curl vendored Submodule

Submodule vendor/curl added at 439ff2052e