mirror of
https://github.com/royshil/obs-localvocal.git
synced 2026-01-09 12:28:05 -05:00
* refactor: Add transcription-filter-properties.cpp for managing filter properties * refactor: Add translation_monitor to transcription filter - Add translation_monitor to the transcription filter data structure - Initialize and stop the translation_monitor in the transcription_filter_update function - Update the send_caption_to_source function to use the translation_monitor for sending translated captions - Clear the translation_monitor when disabling buffered output in the transcription_filter_update function * refactor: Simplify UI and improve error handling in transcription filter
624 lines
24 KiB
C++
624 lines
24 KiB
C++
#include <obs-module.h>
|
|
#include <obs-frontend-api.h>
|
|
|
|
#include <algorithm>
|
|
#include <fstream>
|
|
#include <sstream>
|
|
#include <iomanip>
|
|
#include <bitset>
|
|
#include <regex>
|
|
#ifdef _WIN32
|
|
#define NOMINMAX
|
|
#include <Windows.h>
|
|
#endif
|
|
|
|
#include <QString>
|
|
|
|
#include "plugin-support.h"
|
|
#include "transcription-filter.h"
|
|
#include "transcription-filter-callbacks.h"
|
|
#include "transcription-filter-data.h"
|
|
#include "transcription-filter-utils.h"
|
|
#include "transcription-utils.h"
|
|
#include "model-utils/model-downloader.h"
|
|
#include "whisper-utils/whisper-processing.h"
|
|
#include "whisper-utils/whisper-language.h"
|
|
#include "whisper-utils/whisper-model-utils.h"
|
|
#include "whisper-utils/whisper-utils.h"
|
|
#include "translation/language_codes.h"
|
|
#include "translation/translation-utils.h"
|
|
#include "translation/translation.h"
|
|
#include "translation/translation-includes.h"
|
|
#include "ui/filter-replace-dialog.h"
|
|
|
|
// Connects the filter's media callbacks to the signal handler of
// `parent_source` and marks the subscription as established on `gf`.
void set_source_signals(transcription_filter_data *gf, obs_source_t *parent_source)
{
	signal_handler_t *handler = obs_source_get_signal_handler(parent_source);

	// Every subscription shares the same handler and the same user data (gf).
	const auto subscribe = [handler, gf](const char *signal_name, auto callback) {
		signal_handler_connect(handler, signal_name, callback, gf);
	};

	subscribe("media_play", media_play_callback);
	subscribe("media_started", media_started_callback);
	subscribe("media_pause", media_pause_callback);
	subscribe("media_restart", media_restart_callback);
	subscribe("media_stopped", media_stopped_callback);

	gf->source_signals_set = true;
}
|
|
|
|
// Disconnects the filter's media callbacks from the signal handler of
// `parent_source` and clears the "signals set" flag on `gf`.
void disconnect_source_signals(transcription_filter_data *gf, obs_source_t *parent_source)
{
	signal_handler_t *handler = obs_source_get_signal_handler(parent_source);

	// Mirror image of the subscription: same signal names, callbacks and user data.
	const auto unsubscribe = [handler, gf](const char *signal_name, auto callback) {
		signal_handler_disconnect(handler, signal_name, callback, gf);
	};

	unsubscribe("media_play", media_play_callback);
	unsubscribe("media_started", media_started_callback);
	unsubscribe("media_pause", media_pause_callback);
	unsubscribe("media_restart", media_restart_callback);
	unsubscribe("media_stopped", media_stopped_callback);

	gf->source_signals_set = false;
}
|
|
|
|
struct obs_audio_data *transcription_filter_filter_audio(void *data, struct obs_audio_data *audio)
|
|
{
|
|
if (!audio) {
|
|
return nullptr;
|
|
}
|
|
|
|
if (data == nullptr) {
|
|
return audio;
|
|
}
|
|
|
|
struct transcription_filter_data *gf =
|
|
static_cast<struct transcription_filter_data *>(data);
|
|
|
|
// Lazy initialization of source signals
|
|
if (!gf->source_signals_set) {
|
|
// obs_filter_get_parent only works in the filter function
|
|
obs_source_t *parent_source = obs_filter_get_parent(gf->context);
|
|
if (parent_source != nullptr) {
|
|
set_source_signals(gf, parent_source);
|
|
}
|
|
}
|
|
|
|
if (!gf->active) {
|
|
return audio;
|
|
}
|
|
|
|
if (gf->whisper_context == nullptr) {
|
|
// Whisper not initialized, just pass through
|
|
return audio;
|
|
}
|
|
|
|
// Check if process while muted is not enabled (e.g. the user wants to avoid processing audio
|
|
// when the source is muted)
|
|
if (!gf->process_while_muted) {
|
|
// Check if the parent source is muted
|
|
obs_source_t *parent_source = obs_filter_get_parent(gf->context);
|
|
if (parent_source != nullptr && obs_source_muted(parent_source)) {
|
|
// Source is muted, do not process audio
|
|
return audio;
|
|
}
|
|
}
|
|
|
|
{
|
|
std::lock_guard<std::mutex> lock(gf->whisper_buf_mutex); // scoped lock
|
|
// push back current audio data to input circlebuf
|
|
for (size_t c = 0; c < gf->channels; c++) {
|
|
circlebuf_push_back(&gf->input_buffers[c], audio->data[c],
|
|
audio->frames * sizeof(float));
|
|
}
|
|
// push audio packet info (timestamp/frame count) to info circlebuf
|
|
struct transcription_filter_audio_info info = {0};
|
|
info.frames = audio->frames; // number of frames in this packet
|
|
// check if the timestamp is a false "negative" value for uint64_t
|
|
if (audio->timestamp > (std::numeric_limits<uint64_t>::max() - 100000000)) {
|
|
// set the timestamp to the current time
|
|
info.timestamp_offset_ns = 0;
|
|
} else {
|
|
info.timestamp_offset_ns = audio->timestamp; // timestamp of this packet
|
|
}
|
|
circlebuf_push_back(&gf->info_buffer, &info, sizeof(info));
|
|
}
|
|
|
|
return audio;
|
|
}
|
|
|
|
const char *transcription_filter_name(void *unused)
|
|
{
|
|
UNUSED_PARAMETER(unused);
|
|
return MT_("transcription_filterAudioFilter");
|
|
}
|
|
|
|
void transcription_filter_remove(void *data, obs_source_t *source)
|
|
{
|
|
struct transcription_filter_data *gf =
|
|
static_cast<struct transcription_filter_data *>(data);
|
|
|
|
obs_log(gf->log_level, "filter remove");
|
|
|
|
disconnect_source_signals(gf, source);
|
|
}
|
|
|
|
void transcription_filter_destroy(void *data)
|
|
{
|
|
struct transcription_filter_data *gf =
|
|
static_cast<struct transcription_filter_data *>(data);
|
|
|
|
signal_handler_t *sh_filter = obs_source_get_signal_handler(gf->context);
|
|
signal_handler_disconnect(sh_filter, "enable", enable_callback, gf);
|
|
|
|
obs_log(gf->log_level, "filter destroy");
|
|
shutdown_whisper_thread(gf);
|
|
|
|
if (gf->resampler_to_whisper) {
|
|
audio_resampler_destroy(gf->resampler_to_whisper);
|
|
}
|
|
|
|
{
|
|
std::lock_guard<std::mutex> lockbuf(gf->whisper_buf_mutex);
|
|
bfree(gf->copy_buffers[0]);
|
|
gf->copy_buffers[0] = nullptr;
|
|
for (size_t i = 0; i < gf->channels; i++) {
|
|
circlebuf_free(&gf->input_buffers[i]);
|
|
}
|
|
}
|
|
circlebuf_free(&gf->info_buffer);
|
|
|
|
if (gf->captions_monitor.isEnabled()) {
|
|
gf->captions_monitor.stopThread();
|
|
}
|
|
if (gf->translation_monitor.isEnabled()) {
|
|
gf->translation_monitor.stopThread();
|
|
}
|
|
|
|
bfree(gf);
|
|
}
|
|
|
|
void transcription_filter_update(void *data, obs_data_t *s)
|
|
{
|
|
struct transcription_filter_data *gf =
|
|
static_cast<struct transcription_filter_data *>(data);
|
|
obs_log(gf->log_level, "LocalVocal filter update");
|
|
|
|
gf->log_level = (int)obs_data_get_int(s, "log_level");
|
|
gf->vad_enabled = obs_data_get_bool(s, "vad_enabled");
|
|
gf->log_words = obs_data_get_bool(s, "log_words");
|
|
gf->caption_to_stream = obs_data_get_bool(s, "caption_to_stream");
|
|
gf->save_to_file = obs_data_get_bool(s, "file_output_enable");
|
|
gf->save_srt = obs_data_get_bool(s, "subtitle_save_srt");
|
|
gf->truncate_output_file = obs_data_get_bool(s, "truncate_output_file");
|
|
gf->save_only_while_recording = obs_data_get_bool(s, "only_while_recording");
|
|
gf->rename_file_to_match_recording = obs_data_get_bool(s, "rename_file_to_match_recording");
|
|
// Get the current timestamp using the system clock
|
|
gf->start_timestamp_ms = now_ms();
|
|
gf->sentence_number = 1;
|
|
gf->process_while_muted = obs_data_get_bool(s, "process_while_muted");
|
|
gf->min_sub_duration = (int)obs_data_get_int(s, "min_sub_duration");
|
|
gf->last_sub_render_time = now_ms();
|
|
bool new_buffered_output = obs_data_get_bool(s, "buffered_output");
|
|
int new_buffer_num_lines = (int)obs_data_get_int(s, "buffer_num_lines");
|
|
int new_buffer_num_chars_per_line = (int)obs_data_get_int(s, "buffer_num_chars_per_line");
|
|
TokenBufferSegmentation new_buffer_output_type =
|
|
(TokenBufferSegmentation)obs_data_get_int(s, "buffer_output_type");
|
|
const char *filter_words_replace = obs_data_get_string(s, "filter_words_replace");
|
|
if (filter_words_replace != nullptr && strlen(filter_words_replace) > 0) {
|
|
obs_log(gf->log_level, "filter_words_replace: %s", filter_words_replace);
|
|
// deserialize the filter words replace
|
|
gf->filter_words_replace = deserialize_filter_words_replace(filter_words_replace);
|
|
} else {
|
|
// clear the filter words replace
|
|
gf->filter_words_replace.clear();
|
|
}
|
|
|
|
if (gf->save_to_file) {
|
|
gf->output_file_path = "";
|
|
// set the output file path
|
|
const char *output_file_path = obs_data_get_string(s, "subtitle_output_filename");
|
|
if (output_file_path != nullptr && strlen(output_file_path) > 0) {
|
|
gf->output_file_path = output_file_path;
|
|
} else {
|
|
obs_log(gf->log_level, "output file path is empty, but selected to save");
|
|
}
|
|
}
|
|
|
|
if (new_buffered_output) {
|
|
obs_log(gf->log_level, "buffered_output enable");
|
|
if (!gf->buffered_output || !gf->captions_monitor.isEnabled()) {
|
|
obs_log(gf->log_level, "buffered_output currently disabled, enabling");
|
|
gf->buffered_output = true;
|
|
gf->captions_monitor.initialize(
|
|
gf,
|
|
[gf](const std::string &text) {
|
|
if (gf->buffered_output) {
|
|
send_caption_to_source(gf->text_source_name, text,
|
|
gf);
|
|
}
|
|
},
|
|
[gf](const std::string &) {}, new_buffer_num_lines,
|
|
new_buffer_num_chars_per_line, std::chrono::seconds(3),
|
|
new_buffer_output_type);
|
|
gf->translation_monitor.initialize(
|
|
gf,
|
|
[gf](const std::string &translated_text) {
|
|
if (gf->buffered_output &&
|
|
gf->translation_output != "none") {
|
|
send_caption_to_source(gf->translation_output,
|
|
translated_text, gf);
|
|
}
|
|
},
|
|
[gf](const std::string &) {}, new_buffer_num_lines,
|
|
new_buffer_num_chars_per_line, std::chrono::seconds(3),
|
|
new_buffer_output_type);
|
|
} else {
|
|
if (new_buffer_num_lines != gf->buffered_output_num_lines ||
|
|
new_buffer_num_chars_per_line != gf->buffered_output_num_chars ||
|
|
new_buffer_output_type != gf->buffered_output_output_type) {
|
|
obs_log(gf->log_level,
|
|
"buffered_output parameters changed, updating");
|
|
gf->captions_monitor.clear();
|
|
gf->captions_monitor.setNumSentences(new_buffer_num_lines);
|
|
gf->captions_monitor.setNumPerSentence(
|
|
new_buffer_num_chars_per_line);
|
|
gf->captions_monitor.setSegmentation(new_buffer_output_type);
|
|
gf->translation_monitor.clear();
|
|
gf->translation_monitor.setNumSentences(new_buffer_num_lines);
|
|
gf->translation_monitor.setNumPerSentence(
|
|
new_buffer_num_chars_per_line);
|
|
gf->translation_monitor.setSegmentation(new_buffer_output_type);
|
|
}
|
|
}
|
|
gf->buffered_output_num_lines = new_buffer_num_lines;
|
|
gf->buffered_output_num_chars = new_buffer_num_chars_per_line;
|
|
gf->buffered_output_output_type = new_buffer_output_type;
|
|
} else {
|
|
obs_log(gf->log_level, "buffered_output disable");
|
|
if (gf->buffered_output) {
|
|
obs_log(gf->log_level, "buffered_output currently enabled, disabling");
|
|
if (gf->captions_monitor.isEnabled()) {
|
|
gf->captions_monitor.clear();
|
|
gf->captions_monitor.stopThread();
|
|
gf->translation_monitor.clear();
|
|
gf->translation_monitor.stopThread();
|
|
}
|
|
gf->buffered_output = false;
|
|
}
|
|
}
|
|
|
|
bool new_translate = obs_data_get_bool(s, "translate");
|
|
gf->source_lang = obs_data_get_string(s, "translate_source_language");
|
|
gf->target_lang = obs_data_get_string(s, "translate_target_language");
|
|
gf->translation_ctx.add_context = obs_data_get_bool(s, "translate_add_context");
|
|
gf->translation_ctx.input_tokenization_style =
|
|
(InputTokenizationStyle)obs_data_get_int(s, "translate_input_tokenization_style");
|
|
gf->translation_output = obs_data_get_string(s, "translate_output");
|
|
std::string new_translate_model_index = obs_data_get_string(s, "translate_model");
|
|
std::string new_translation_model_path_external =
|
|
obs_data_get_string(s, "translation_model_path_external");
|
|
|
|
if (new_translate) {
|
|
if (new_translate != gf->translate ||
|
|
new_translate_model_index != gf->translation_model_index ||
|
|
new_translation_model_path_external != gf->translation_model_path_external) {
|
|
// translation settings changed
|
|
gf->translation_model_index = new_translate_model_index;
|
|
gf->translation_model_path_external = new_translation_model_path_external;
|
|
if (gf->translation_model_index != "whisper-based-translation") {
|
|
start_translation(gf);
|
|
} else {
|
|
// whisper-based translation
|
|
obs_log(gf->log_level, "Starting whisper-based translation...");
|
|
gf->translate = false;
|
|
}
|
|
}
|
|
} else {
|
|
gf->translate = false;
|
|
}
|
|
|
|
// translation options
|
|
if (gf->translate) {
|
|
if (gf->translation_ctx.options) {
|
|
gf->translation_ctx.options->sampling_temperature =
|
|
(float)obs_data_get_double(s, "translation_sampling_temperature");
|
|
gf->translation_ctx.options->repetition_penalty =
|
|
(float)obs_data_get_double(s, "translation_repetition_penalty");
|
|
gf->translation_ctx.options->beam_size =
|
|
(int)obs_data_get_int(s, "translation_beam_size");
|
|
gf->translation_ctx.options->max_decoding_length =
|
|
(int)obs_data_get_int(s, "translation_max_decoding_length");
|
|
gf->translation_ctx.options->no_repeat_ngram_size =
|
|
(int)obs_data_get_int(s, "translation_no_repeat_ngram_size");
|
|
gf->translation_ctx.options->max_input_length =
|
|
(int)obs_data_get_int(s, "translation_max_input_length");
|
|
}
|
|
}
|
|
|
|
obs_log(gf->log_level, "update text source");
|
|
// update the text source
|
|
const char *new_text_source_name = obs_data_get_string(s, "subtitle_sources");
|
|
|
|
if (new_text_source_name == nullptr || strcmp(new_text_source_name, "none") == 0 ||
|
|
strcmp(new_text_source_name, "(null)") == 0 || strlen(new_text_source_name) == 0) {
|
|
// new selected text source is not valid, release the old one
|
|
gf->text_source_name.clear();
|
|
} else {
|
|
gf->text_source_name = new_text_source_name;
|
|
}
|
|
|
|
obs_log(gf->log_level, "update whisper params");
|
|
{
|
|
std::lock_guard<std::mutex> lock(gf->whisper_ctx_mutex);
|
|
|
|
gf->sentence_psum_accept_thresh =
|
|
(float)obs_data_get_double(s, "sentence_psum_accept_thresh");
|
|
|
|
gf->whisper_params = whisper_full_default_params(
|
|
(whisper_sampling_strategy)obs_data_get_int(s, "whisper_sampling_method"));
|
|
gf->whisper_params.duration_ms = (int)obs_data_get_int(s, "buffer_size_msec");
|
|
if (!new_translate || gf->translation_model_index != "whisper-based-translation") {
|
|
const char *whisper_language_select =
|
|
obs_data_get_string(s, "whisper_language_select");
|
|
gf->whisper_params.language = (whisper_language_select != nullptr &&
|
|
strlen(whisper_language_select) > 0)
|
|
? whisper_language_select
|
|
: "auto";
|
|
} else {
|
|
// take the language from gf->target_lang
|
|
if (language_codes_2_reverse.count(gf->target_lang) > 0) {
|
|
gf->whisper_params.language =
|
|
language_codes_2_reverse[gf->target_lang].c_str();
|
|
} else {
|
|
gf->whisper_params.language = "auto";
|
|
}
|
|
}
|
|
gf->whisper_params.initial_prompt =
|
|
obs_data_get_string(s, "initial_prompt") != nullptr
|
|
? obs_data_get_string(s, "initial_prompt")
|
|
: "";
|
|
gf->whisper_params.n_threads = (int)obs_data_get_int(s, "n_threads");
|
|
gf->whisper_params.n_max_text_ctx = (int)obs_data_get_int(s, "n_max_text_ctx");
|
|
gf->whisper_params.translate = obs_data_get_bool(s, "whisper_translate");
|
|
gf->whisper_params.no_context = obs_data_get_bool(s, "no_context");
|
|
gf->whisper_params.single_segment = obs_data_get_bool(s, "single_segment");
|
|
gf->whisper_params.print_special = obs_data_get_bool(s, "print_special");
|
|
gf->whisper_params.print_progress = obs_data_get_bool(s, "print_progress");
|
|
gf->whisper_params.print_realtime = obs_data_get_bool(s, "print_realtime");
|
|
gf->whisper_params.print_timestamps = obs_data_get_bool(s, "print_timestamps");
|
|
gf->whisper_params.token_timestamps = obs_data_get_bool(s, "token_timestamps");
|
|
gf->whisper_params.thold_pt = (float)obs_data_get_double(s, "thold_pt");
|
|
gf->whisper_params.thold_ptsum = (float)obs_data_get_double(s, "thold_ptsum");
|
|
gf->whisper_params.max_len = (int)obs_data_get_int(s, "max_len");
|
|
gf->whisper_params.split_on_word = obs_data_get_bool(s, "split_on_word");
|
|
gf->whisper_params.max_tokens = (int)obs_data_get_int(s, "max_tokens");
|
|
gf->whisper_params.speed_up = obs_data_get_bool(s, "speed_up");
|
|
gf->whisper_params.suppress_blank = obs_data_get_bool(s, "suppress_blank");
|
|
gf->whisper_params.suppress_non_speech_tokens =
|
|
obs_data_get_bool(s, "suppress_non_speech_tokens");
|
|
gf->whisper_params.temperature = (float)obs_data_get_double(s, "temperature");
|
|
gf->whisper_params.max_initial_ts = (float)obs_data_get_double(s, "max_initial_ts");
|
|
gf->whisper_params.length_penalty = (float)obs_data_get_double(s, "length_penalty");
|
|
|
|
if (gf->vad_enabled && gf->vad) {
|
|
const float vad_threshold = (float)obs_data_get_double(s, "vad_threshold");
|
|
gf->vad->set_threshold(vad_threshold);
|
|
}
|
|
}
|
|
|
|
if (gf->context != nullptr && obs_source_enabled(gf->context)) {
|
|
if (gf->initial_creation) {
|
|
obs_log(LOG_INFO, "Initial filter creation and source enabled");
|
|
|
|
// source was enabled on creation
|
|
update_whisper_model(gf);
|
|
gf->active = true;
|
|
gf->initial_creation = false;
|
|
} else {
|
|
// check if the whisper model selection has changed
|
|
const std::string new_model_path =
|
|
obs_data_get_string(s, "whisper_model_path") != nullptr
|
|
? obs_data_get_string(s, "whisper_model_path")
|
|
: "Whisper Tiny English (74Mb)";
|
|
if (gf->whisper_model_path != new_model_path) {
|
|
obs_log(LOG_INFO, "New model selected: %s", new_model_path.c_str());
|
|
update_whisper_model(gf);
|
|
}
|
|
}
|
|
}
|
|
}
|
|
|
|
// Creates a filter instance for `filter` configured from `settings`.
//
// Allocates and constructs the filter state, sizes the audio work buffers from
// the OBS audio output, creates the resampler towards WHISPER_SAMPLE_RATE,
// picks (or creates) the subtitle text source, hooks the filter's "enable"
// signal, applies the settings through transcription_filter_update() and
// registers the frontend recording-state callback.
//
// Returns the filter state, or nullptr on failure. On failure everything that
// was acquired up to that point is released again.
void *transcription_filter_create(obs_data_t *settings, obs_source_t *filter)
{
	obs_log(LOG_INFO, "LocalVocal filter create");

	void *data = bmalloc(sizeof(struct transcription_filter_data));
	struct transcription_filter_data *gf = new (data) transcription_filter_data();

	// Make the teardown below independent of what the constructor initializes.
	gf->resampler_to_whisper = nullptr;
	gf->copy_buffers[0] = nullptr;

	// Get the number of channels for the input source
	gf->channels = audio_output_get_channels(obs_get_audio());
	gf->sample_rate = audio_output_get_sample_rate(obs_get_audio());
	gf->frames = (size_t)((float)gf->sample_rate / (1000.0f / MAX_MS_WORK_BUFFER));
	gf->last_num_frames = 0;
	gf->min_sub_duration = (int)obs_data_get_int(settings, "min_sub_duration");
	gf->last_sub_render_time = now_ms();
	gf->log_level = (int)obs_data_get_int(settings, "log_level");
	gf->save_srt = obs_data_get_bool(settings, "subtitle_save_srt");
	gf->truncate_output_file = obs_data_get_bool(settings, "truncate_output_file");
	gf->save_only_while_recording = obs_data_get_bool(settings, "only_while_recording");
	gf->rename_file_to_match_recording =
		obs_data_get_bool(settings, "rename_file_to_match_recording");
	gf->process_while_muted = obs_data_get_bool(settings, "process_while_muted");
	gf->buffered_output = obs_data_get_bool(settings, "buffered_output");

	for (size_t i = 0; i < gf->channels; i++) {
		circlebuf_init(&gf->input_buffers[i]);
	}
	circlebuf_init(&gf->info_buffer);
	circlebuf_init(&gf->whisper_buffer);

	// Releases everything acquired so far and yields the nullptr that signals
	// a failed creation. Without this, each early return leaked the constructed
	// object together with its buffers and resampler.
	const auto abort_creation = [gf]() -> void * {
		if (gf->resampler_to_whisper) {
			audio_resampler_destroy(gf->resampler_to_whisper);
			gf->resampler_to_whisper = nullptr;
		}
		bfree(gf->copy_buffers[0]);
		gf->copy_buffers[0] = nullptr;
		for (size_t i = 0; i < gf->channels; i++) {
			circlebuf_free(&gf->input_buffers[i]);
		}
		circlebuf_free(&gf->info_buffer);
		circlebuf_free(&gf->whisper_buffer);
		// Placement-new above requires an explicit destructor call.
		gf->~transcription_filter_data();
		bfree(gf);
		return nullptr;
	};

	// allocate copy buffers: one zero-filled allocation shared by all channels
	// (bzalloc already zeroes the memory, so no extra memset is needed)
	gf->copy_buffers[0] =
		static_cast<float *>(bzalloc(gf->channels * gf->frames * sizeof(float)));
	if (gf->copy_buffers[0] == nullptr) {
		obs_log(LOG_ERROR, "Failed to allocate copy buffer");
		return abort_creation();
	}
	for (size_t c = 1; c < gf->channels; c++) { // set the channel pointers
		gf->copy_buffers[c] = gf->copy_buffers[0] + c * gf->frames;
	}

	gf->context = filter;

	obs_log(gf->log_level, "channels %d, frames %d, sample_rate %d", (int)gf->channels,
		(int)gf->frames, gf->sample_rate);

	obs_log(gf->log_level, "setup audio resampler");
	struct resample_info src, dst;
	src.samples_per_sec = gf->sample_rate;
	src.format = AUDIO_FORMAT_FLOAT_PLANAR;
	src.speakers = convert_speaker_layout((uint8_t)gf->channels);

	dst.samples_per_sec = WHISPER_SAMPLE_RATE;
	dst.format = AUDIO_FORMAT_FLOAT_PLANAR;
	dst.speakers = convert_speaker_layout((uint8_t)1);

	gf->resampler_to_whisper = audio_resampler_create(&dst, &src);
	if (!gf->resampler_to_whisper) {
		obs_log(LOG_ERROR, "Failed to create resampler");
		return abort_creation();
	}

	obs_log(gf->log_level, "clear text source data");
	const char *subtitle_sources = obs_data_get_string(settings, "subtitle_sources");
	if (subtitle_sources == nullptr || strlen(subtitle_sources) == 0 ||
	    strcmp(subtitle_sources, "none") == 0 || strcmp(subtitle_sources, "(null)") == 0) {
		// No usable selection: fall back to the plugin's own text source and
		// persist that choice in the settings.
		obs_log(gf->log_level, "Create text source");
		create_obs_text_source_if_needed();
		gf->text_source_name = "LocalVocal Subtitles";
		obs_data_set_string(settings, "subtitle_sources", "LocalVocal Subtitles");
	} else {
		// set the text source name
		gf->text_source_name = subtitle_sources;
	}
	obs_log(gf->log_level, "clear paths and whisper context");
	gf->whisper_model_file_currently_loaded = "";
	gf->output_file_path = std::string("");
	gf->whisper_model_path = std::string(""); // The update function will set the model path
	gf->whisper_context = nullptr;

	signal_handler_t *sh_filter = obs_source_get_signal_handler(gf->context);
	if (sh_filter == nullptr) {
		obs_log(LOG_ERROR, "Failed to get signal handler");
		return abort_creation();
	}

	signal_handler_connect(sh_filter, "enable", enable_callback, gf);

	obs_log(gf->log_level, "run update");
	// get the settings updated on the filter data struct
	transcription_filter_update(gf, settings);

	// handle the event OBS_FRONTEND_EVENT_RECORDING_STARTING to reset the srt sentence number
	// to match the subtitles with the recording
	obs_frontend_add_event_callback(recording_state_callback, gf);

	obs_log(gf->log_level, "filter created.");
	return gf;
}
|
|
|
|
void transcription_filter_activate(void *data)
|
|
{
|
|
struct transcription_filter_data *gf =
|
|
static_cast<struct transcription_filter_data *>(data);
|
|
obs_log(gf->log_level, "filter activated");
|
|
gf->active = true;
|
|
}
|
|
|
|
void transcription_filter_deactivate(void *data)
|
|
{
|
|
struct transcription_filter_data *gf =
|
|
static_cast<struct transcription_filter_data *>(data);
|
|
obs_log(gf->log_level, "filter deactivated");
|
|
gf->active = false;
|
|
}
|
|
|
|
void transcription_filter_show(void *data)
|
|
{
|
|
struct transcription_filter_data *gf =
|
|
static_cast<struct transcription_filter_data *>(data);
|
|
obs_log(gf->log_level, "filter show");
|
|
}
|
|
|
|
void transcription_filter_hide(void *data)
|
|
{
|
|
struct transcription_filter_data *gf =
|
|
static_cast<struct transcription_filter_data *>(data);
|
|
obs_log(gf->log_level, "filter hide");
|
|
}
|
|
|
|
// Registers the default value of every setting on `s`.
void transcription_filter_defaults(obs_data_t *s)
{
	obs_log(LOG_DEBUG, "filter defaults");

	// Buffered output
	obs_data_set_default_bool(s, "buffered_output", false);
	obs_data_set_default_int(s, "buffer_num_lines", 2);
	obs_data_set_default_int(s, "buffer_num_chars_per_line", 30);
	obs_data_set_default_int(s, "buffer_output_type",
				 static_cast<int>(TokenBufferSegmentation::SEGMENTATION_TOKEN));

	// Voice activity detection, logging and captions
	obs_data_set_default_bool(s, "vad_enabled", true);
	obs_data_set_default_double(s, "vad_threshold", 0.65);
	obs_data_set_default_int(s, "log_level", LOG_DEBUG);
	obs_data_set_default_bool(s, "log_words", false);
	obs_data_set_default_bool(s, "caption_to_stream", false);

	// Model, language and subtitle output
	obs_data_set_default_string(s, "whisper_model_path", "Whisper Tiny English (74Mb)");
	obs_data_set_default_string(s, "whisper_language_select", "en");
	obs_data_set_default_string(s, "subtitle_sources", "none");
	obs_data_set_default_bool(s, "process_while_muted", false);
	obs_data_set_default_bool(s, "subtitle_save_srt", false);
	obs_data_set_default_bool(s, "truncate_output_file", false);
	obs_data_set_default_bool(s, "only_while_recording", false);
	obs_data_set_default_bool(s, "rename_file_to_match_recording", true);
	obs_data_set_default_int(s, "min_sub_duration", 3000);
	obs_data_set_default_bool(s, "advanced_settings", false);

	// Translation
	obs_data_set_default_bool(s, "translate", false);
	obs_data_set_default_string(s, "translate_target_language", "__es__");
	obs_data_set_default_string(s, "translate_source_language", "__en__");
	obs_data_set_default_bool(s, "translate_add_context", true);
	obs_data_set_default_string(s, "translate_model", "whisper-based-translation");
	obs_data_set_default_string(s, "translation_model_path_external", "");
	obs_data_set_default_int(s, "translate_input_tokenization_style", INPUT_TOKENIZAION_M2M100);
	obs_data_set_default_double(s, "sentence_psum_accept_thresh", 0.4);

	// Translation decoding options
	obs_data_set_default_double(s, "translation_sampling_temperature", 0.1);
	obs_data_set_default_double(s, "translation_repetition_penalty", 2.0);
	obs_data_set_default_int(s, "translation_beam_size", 1);
	obs_data_set_default_int(s, "translation_max_decoding_length", 65);
	obs_data_set_default_int(s, "translation_no_repeat_ngram_size", 1);
	obs_data_set_default_int(s, "translation_max_input_length", 65);

	// Whisper: sampling, prompt and context
	obs_data_set_default_int(s, "whisper_sampling_method", WHISPER_SAMPLING_BEAM_SEARCH);
	obs_data_set_default_string(s, "initial_prompt", "");
	obs_data_set_default_int(s, "n_threads", 4);
	obs_data_set_default_int(s, "n_max_text_ctx", 16384);
	obs_data_set_default_bool(s, "whisper_translate", false);
	obs_data_set_default_bool(s, "no_context", true);
	obs_data_set_default_bool(s, "single_segment", true);

	// Whisper: printing and timestamps
	obs_data_set_default_bool(s, "print_special", false);
	obs_data_set_default_bool(s, "print_progress", false);
	obs_data_set_default_bool(s, "print_realtime", false);
	obs_data_set_default_bool(s, "print_timestamps", false);
	obs_data_set_default_bool(s, "token_timestamps", false);
	obs_data_set_default_bool(s, "dtw_token_timestamps", false);

	// Whisper: thresholds, segmentation and decoding
	obs_data_set_default_double(s, "thold_pt", 0.01);
	obs_data_set_default_double(s, "thold_ptsum", 0.01);
	obs_data_set_default_int(s, "max_len", 0);
	obs_data_set_default_bool(s, "split_on_word", true);
	obs_data_set_default_int(s, "max_tokens", 0);
	obs_data_set_default_bool(s, "speed_up", false);
	obs_data_set_default_bool(s, "suppress_blank", false);
	obs_data_set_default_bool(s, "suppress_non_speech_tokens", true);
	obs_data_set_default_double(s, "temperature", 0.1);
	obs_data_set_default_double(s, "max_initial_ts", 1.0);
	obs_data_set_default_double(s, "length_penalty", -1.0);
}
|