Mirror of https://github.com/royshil/obs-localvocal.git (synced 2026-01-08 20:08:08 -05:00)

Commit: refactor: Remove unused utils.h and utils.cpp files
@@ -88,6 +88,7 @@ target_sources(
   src/transcription-filter.cpp
   src/transcription-filter.c
   src/transcription-filter-callbacks.cpp
+  src/transcription-filter-utils.cpp
   src/transcription-utils.cpp
   src/model-utils/model-downloader.cpp
   src/model-utils/model-downloader-ui.cpp
@@ -100,8 +101,7 @@ target_sources(
   src/whisper-utils/token-buffer-thread.cpp
   src/translation/language_codes.cpp
   src/translation/translation.cpp
-  src/translation/translation-utils.cpp
-  src/utils.cpp)
+  src/translation/translation-utils.cpp)
 
 set_target_properties_plugin(${CMAKE_PROJECT_NAME} PROPERTIES OUTPUT_NAME ${_name})
 
@@ -122,8 +122,7 @@ if(ENABLE_TESTS)
   src/whisper-utils/silero-vad-onnx.cpp
   src/whisper-utils/token-buffer-thread.cpp
   src/translation/language_codes.cpp
-  src/translation/translation.cpp
-  src/utils.cpp)
+  src/translation/translation.cpp)
 
 find_libav(${CMAKE_PROJECT_NAME}-tests)
@@ -13,6 +13,7 @@
 #include <nlohmann/json.hpp>
 
 #include "transcription-filter-data.h"
+#include "transcription-filter-utils.h"
 #include "transcription-filter.h"
 #include "transcription-utils.h"
 #include "whisper-utils/whisper-utils.h"
@@ -155,11 +156,12 @@ create_context(int sample_rate, int channels, const std::string &whisper_model_p
 	gf->whisper_params = whisper_full_default_params(whisper_sampling_method);
 	gf->whisper_params.duration_ms = 3000;
 	gf->whisper_params.language = "en";
+	gf->whisper_params.detect_language = false;
 	gf->whisper_params.initial_prompt = "";
 	gf->whisper_params.n_threads = 4;
 	gf->whisper_params.n_max_text_ctx = 16384;
 	gf->whisper_params.translate = false;
-	gf->whisper_params.no_context = true;
+	gf->whisper_params.no_context = false;
 	gf->whisper_params.single_segment = true;
 	gf->whisper_params.print_special = false;
 	gf->whisper_params.print_progress = false;
@@ -174,7 +176,7 @@ create_context(int sample_rate, int channels, const std::string &whisper_model_p
 	gf->whisper_params.speed_up = false;
 	gf->whisper_params.suppress_blank = true;
 	gf->whisper_params.suppress_non_speech_tokens = true;
-	gf->whisper_params.temperature = 0.1;
+	gf->whisper_params.temperature = 0.0;
 	gf->whisper_params.max_initial_ts = 1.0;
 	gf->whisper_params.length_penalty = -1;
 	gf->active = true;
@@ -201,7 +203,7 @@ void audio_chunk_callback(struct transcription_filter_data *gf, const float *pcm
 	// 	numeral = "0" + numeral;
 	// }
 
-	// save the audio to a .wav file
+	// // save the audio to a .wav file
 	// std::string filename = "audio_chunk_" + numeral + vad_state_str + ".wav";
 	// obs_log(gf->log_level, "Saving %lu frames to %s", frames, filename.c_str());
 	// write_audio_wav_file(filename.c_str(), pcm32f_data, frames);
@@ -388,6 +390,16 @@ int wmain(int argc, wchar_t *argv[])
 		gf->enable_audio_chunks_callback =
 			config["enable_audio_chunks_callback"];
 	}
+	if (config.contains("temperature")) {
+		obs_log(LOG_INFO, "Setting temperture to %f",
+			config["temperature"].get<float>());
+		gf->whisper_params.temperature = config["temperature"].get<float>();
+	}
+	if (config.contains("no_context")) {
+		obs_log(LOG_INFO, "Setting no_context to %s",
+			config["no_context"] ? "true" : "false");
+		gf->whisper_params.no_context = config["no_context"];
+	}
 	// set log level
 	if (logLevelStr == "debug") {
 		gf->log_level = LOG_DEBUG;
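
The wmain hunk above applies optional overrides with nlohmann::json's contains()/get<T>() pattern. A minimal standalone sketch of the same pattern (the keys and defaults here are illustrative, not the harness's full config schema):

#include <nlohmann/json.hpp>
#include <cstdio>

int main()
{
	// Stand-in for the parsed config file; only some keys are present.
	nlohmann::json config = {{"temperature", 0.2f}, {"no_context", true}};

	float temperature = 0.0f; // defaults, as in the hunk above
	bool no_context = false;

	// Apply a key only when the config actually contains it.
	if (config.contains("temperature")) {
		temperature = config["temperature"].get<float>();
	}
	if (config.contains("no_context")) {
		no_context = config["no_context"];
	}

	std::printf("temperature=%f no_context=%s\n", temperature,
		    no_context ? "true" : "false");
	return 0;
}
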
@@ -20,16 +20,19 @@
 
 #define SEND_TIMED_METADATA_URL "http://localhost:8080/timed-metadata"
 
-void send_caption_to_source(const std::string &target_source_name, const std::string &str_copy,
+void send_caption_to_source(const std::string &target_source_name, const std::string &caption,
 			    struct transcription_filter_data *gf)
 {
+	if (target_source_name.empty()) {
+		return;
+	}
 	auto target = obs_get_source_by_name(target_source_name.c_str());
 	if (!target) {
 		obs_log(gf->log_level, "text_source target is null");
 		return;
 	}
 	auto text_settings = obs_source_get_settings(target);
-	obs_data_set_string(text_settings, "text", str_copy.c_str());
+	obs_data_set_string(text_settings, "text", caption.c_str());
 	obs_source_update(target, text_settings);
 	obs_source_release(target);
 }
@@ -228,3 +231,34 @@ void set_text_callback(struct transcription_filter_data *gf,
 		}
 	}
 };
+
+void recording_state_callback(enum obs_frontend_event event, void *data)
+{
+	struct transcription_filter_data *gf_ =
+		static_cast<struct transcription_filter_data *>(data);
+	if (event == OBS_FRONTEND_EVENT_RECORDING_STARTING) {
+		if (gf_->save_srt && gf_->save_only_while_recording) {
+			obs_log(gf_->log_level, "Recording started. Resetting srt file.");
+			// truncate file if it exists
+			std::ofstream output_file(gf_->output_file_path,
+						  std::ios::out | std::ios::trunc);
+			output_file.close();
+			gf_->sentence_number = 1;
+			gf_->start_timestamp_ms = now_ms();
+		}
+	} else if (event == OBS_FRONTEND_EVENT_RECORDING_STOPPED) {
+		if (gf_->save_srt && gf_->save_only_while_recording &&
+		    gf_->rename_file_to_match_recording) {
+			obs_log(gf_->log_level, "Recording stopped. Rename srt file.");
+			// rename file to match the recording file name with .srt extension
+			// use obs_frontend_get_last_recording to get the last recording file name
+			std::string recording_file_name = obs_frontend_get_last_recording();
+			// remove the extension
+			recording_file_name = recording_file_name.substr(
+				0, recording_file_name.find_last_of("."));
+			std::string srt_file_name = recording_file_name + ".srt";
+			// rename the file
+			std::rename(gf_->output_file_path.c_str(), srt_file_name.c_str());
+		}
+	}
+}
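
The rename branch above swaps the finished recording's extension for .srt. A self-contained sketch of that path manipulation, with hypothetical file names (the plugin itself uses gf_->output_file_path and obs_frontend_get_last_recording()):

#include <cstdio>
#include <string>

// Replace the extension of `recording_path` with `.srt` and rename `src` to it.
// Mirrors the substr/find_last_of logic in the callback above.
static bool rename_to_srt(const std::string &src, std::string recording_path)
{
	const size_t dot = recording_path.find_last_of('.');
	if (dot != std::string::npos) {
		recording_path.erase(dot); // drop the old extension
	}
	const std::string srt_path = recording_path + ".srt";
	// std::rename returns 0 on success.
	return std::rename(src.c_str(), srt_path.c_str()) == 0;
}

int main()
{
	// Hypothetical names, just to exercise the function.
	if (!rename_to_srt("captions.srt", "/videos/2024-01-01 12-00-00.mkv")) {
		std::perror("rename_to_srt");
	}
	return 0;
}
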
@@ -15,4 +15,6 @@ void audio_chunk_callback(struct transcription_filter_data *gf, const float *pcm
 void set_text_callback(struct transcription_filter_data *gf,
 		       const DetectionResultWithText &resultIn);
 
+void recording_state_callback(enum obs_frontend_event event, void *data);
+
 #endif /* TRANSCRIPTION_FILTER_CALLBACKS_H */
@@ -80,6 +80,7 @@ struct transcription_filter_data {
 	bool fix_utf8 = true;
 	bool enable_audio_chunks_callback = false;
 	bool send_timed_metadata = false;
+	bool source_signals_set = false;
 
 	// Last transcription result
 	std::string last_text;
src/transcription-filter-utils.cpp (new file, 55 lines)
@@ -0,0 +1,55 @@
#include "transcription-filter-utils.h"
|
||||
|
||||
#include <obs-module.h>
|
||||
#include <obs.h>
|
||||
#include <obs-frontend-api.h>
|
||||
|
||||
void create_obs_text_source()
|
||||
{
|
||||
// create a new OBS text source called "LocalVocal Subtitles"
|
||||
obs_source_t *scene_as_source = obs_frontend_get_current_scene();
|
||||
obs_scene_t *scene = obs_scene_from_source(scene_as_source);
|
||||
#ifdef _WIN32
|
||||
obs_source_t *source =
|
||||
obs_source_create("text_gdiplus_v2", "LocalVocal Subtitles", nullptr, nullptr);
|
||||
#else
|
||||
obs_source_t *source =
|
||||
obs_source_create("text_ft2_source_v2", "LocalVocal Subtitles", nullptr, nullptr);
|
||||
#endif
|
||||
if (source) {
|
||||
// add source to the current scene
|
||||
obs_scene_add(scene, source);
|
||||
// set source settings
|
||||
obs_data_t *source_settings = obs_source_get_settings(source);
|
||||
obs_data_set_bool(source_settings, "word_wrap", true);
|
||||
obs_data_set_int(source_settings, "custom_width", 1760);
|
||||
obs_data_t *font_data = obs_data_create();
|
||||
obs_data_set_string(font_data, "face", "Arial");
|
||||
obs_data_set_string(font_data, "style", "Regular");
|
||||
obs_data_set_int(font_data, "size", 72);
|
||||
obs_data_set_int(font_data, "flags", 0);
|
||||
obs_data_set_obj(source_settings, "font", font_data);
|
||||
obs_data_release(font_data);
|
||||
obs_source_update(source, source_settings);
|
||||
obs_data_release(source_settings);
|
||||
|
||||
// set transform settings
|
||||
obs_transform_info transform_info;
|
||||
transform_info.pos.x = 962.0;
|
||||
transform_info.pos.y = 959.0;
|
||||
transform_info.bounds.x = 1769.0;
|
||||
transform_info.bounds.y = 145.0;
|
||||
transform_info.bounds_type = obs_bounds_type::OBS_BOUNDS_SCALE_INNER;
|
||||
transform_info.bounds_alignment = OBS_ALIGN_CENTER;
|
||||
transform_info.alignment = OBS_ALIGN_CENTER;
|
||||
transform_info.scale.x = 1.0;
|
||||
transform_info.scale.y = 1.0;
|
||||
transform_info.rot = 0.0;
|
||||
obs_sceneitem_t *source_sceneitem = obs_scene_sceneitem_from_source(scene, source);
|
||||
obs_sceneitem_set_info(source_sceneitem, &transform_info);
|
||||
obs_sceneitem_release(source_sceneitem);
|
||||
|
||||
obs_source_release(source);
|
||||
}
|
||||
obs_source_release(scene_as_source);
|
||||
}
|
||||
src/transcription-filter-utils.h (new file, 33 lines)
@@ -0,0 +1,33 @@
+#ifndef TRANSCRIPTION_FILTER_UTILS_H
+#define TRANSCRIPTION_FILTER_UTILS_H
+
+#include <media-io/audio-io.h>
+
+// Convert channels number to a speaker layout
+inline enum speaker_layout convert_speaker_layout(uint8_t channels)
+{
+	switch (channels) {
+	case 0:
+		return SPEAKERS_UNKNOWN;
+	case 1:
+		return SPEAKERS_MONO;
+	case 2:
+		return SPEAKERS_STEREO;
+	case 3:
+		return SPEAKERS_2POINT1;
+	case 4:
+		return SPEAKERS_4POINT0;
+	case 5:
+		return SPEAKERS_4POINT1;
+	case 6:
+		return SPEAKERS_5POINT1;
+	case 8:
+		return SPEAKERS_7POINT1;
+	default:
+		return SPEAKERS_UNKNOWN;
+	}
+}
+
+void create_obs_text_source();
+
+#endif // TRANSCRIPTION_FILTER_UTILS_H
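
A quick usage sketch for the relocated helper, assuming an OBS source tree on the include path. The expected values simply restate the switch above; note that a channel count of 7 has no case of its own and falls through to the default:

#include "transcription-filter-utils.h"

#include <cassert>

int main()
{
	// Spot-check the mapping from channel count to OBS speaker layout.
	assert(convert_speaker_layout(1) == SPEAKERS_MONO);
	assert(convert_speaker_layout(2) == SPEAKERS_STEREO);
	assert(convert_speaker_layout(6) == SPEAKERS_5POINT1);
	// 7 is not handled explicitly, so it maps to unknown.
	assert(convert_speaker_layout(7) == SPEAKERS_UNKNOWN);
	return 0;
}
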
@@ -18,6 +18,7 @@
 #include "transcription-filter.h"
 #include "transcription-filter-callbacks.h"
 #include "transcription-filter-data.h"
+#include "transcription-filter-utils.h"
 #include "transcription-utils.h"
 #include "model-utils/model-downloader.h"
 #include "whisper-utils/whisper-processing.h"
@@ -28,7 +29,6 @@
 #include "translation/translation-utils.h"
 #include "translation/translation.h"
 #include "translation/translation-includes.h"
-#include "utils.h"
 
 bool add_sources_to_list(void *list_property, obs_source_t *source)
 {
@@ -44,6 +44,71 @@ bool add_sources_to_list(void *list_property, obs_source_t *source)
 	return true;
 }
 
+void set_source_signals(transcription_filter_data *gf, obs_source_t *parent_source)
+{
+	obs_log(LOG_INFO, "parent source name: %s", obs_source_get_name(parent_source));
+	signal_handler_t *sh = obs_source_get_signal_handler(parent_source);
+	signal_handler_connect(
+		sh, "media_play",
+		[](void *data_, calldata_t *cd) {
+			obs_log(LOG_INFO, "media_play");
+			transcription_filter_data *gf_ =
+				static_cast<struct transcription_filter_data *>(data_);
+			gf_->active = true;
+		},
+		gf);
+	signal_handler_connect(
+		sh, "media_started",
+		[](void *data_, calldata_t *cd) {
+			obs_log(LOG_INFO, "media_started");
+			transcription_filter_data *gf_ =
+				static_cast<struct transcription_filter_data *>(data_);
+			gf_->active = true;
+		},
+		gf);
+	signal_handler_connect(
+		sh, "media_pause",
+		[](void *data_, calldata_t *cd) {
+			obs_log(LOG_INFO, "media_pause");
+			transcription_filter_data *gf_ =
+				static_cast<struct transcription_filter_data *>(data_);
+			gf_->active = false;
+		},
+		gf);
+	signal_handler_connect(
+		sh, "media_restart",
+		[](void *data_, calldata_t *cd) {
+			obs_log(LOG_INFO, "media_restart");
+			transcription_filter_data *gf_ =
+				static_cast<struct transcription_filter_data *>(data_);
+			gf_->active = true;
+			gf_->captions_monitor.clear();
+			send_caption_to_source(gf_->text_source_name, "", gf_);
+		},
+		gf);
+	signal_handler_connect(
+		sh, "media_stopped",
+		[](void *data_, calldata_t *cd) {
+			obs_log(LOG_INFO, "media_stopped");
+			transcription_filter_data *gf_ =
+				static_cast<struct transcription_filter_data *>(data_);
+			gf_->active = false;
+			gf_->captions_monitor.clear();
+			send_caption_to_source(gf_->text_source_name, "", gf_);
+			// flush the buffer
+			{
+				std::lock_guard<std::mutex> lock(gf_->whisper_buf_mutex);
+				for (size_t c = 0; c < gf_->channels; c++) {
+					circlebuf_free(&gf_->input_buffers[c]);
+				}
+				circlebuf_free(&gf_->info_buffer);
+				circlebuf_free(&gf_->whisper_buffer);
+			}
+		},
+		gf);
+	gf->source_signals_set = true;
+}
+
 struct obs_audio_data *transcription_filter_filter_audio(void *data, struct obs_audio_data *audio)
 {
 	if (!audio) {
@@ -56,14 +121,16 @@ struct obs_audio_data *transcription_filter_filter_audio(void *data, struct obs_
 	struct transcription_filter_data *gf =
 		static_cast<struct transcription_filter_data *>(data);
 
-	if (!gf->active) {
-		return audio;
+	// Lazy initialization of source signals
+	if (!gf->source_signals_set) {
+		// obs_filter_get_parent only works in the filter function
+		obs_source_t *parent_source = obs_filter_get_parent(gf->context);
+		if (parent_source != nullptr) {
+			set_source_signals(gf, parent_source);
+		}
 	}
 
-	// Check if the parent source is muted
-	obs_source_t *parent_source = obs_filter_get_parent(gf->context);
-	if (gf->process_while_muted == false && obs_source_muted(parent_source)) {
-		// Source is muted, do not process audio
+	if (!gf->active) {
 		return audio;
 	}
 
@@ -72,6 +139,17 @@
 		return audio;
 	}
 
+	// Check if process while muted is not enabled (e.g. the user wants to avoid processing audio
+	// when the source is muted)
+	if (!gf->process_while_muted) {
+		// Check if the parent source is muted
+		obs_source_t *parent_source = obs_filter_get_parent(gf->context);
+		if (parent_source != nullptr && obs_source_muted(parent_source)) {
+			// Source is muted, do not process audio
+			return audio;
+		}
+	}
+
 	{
 		std::lock_guard<std::mutex> lock(gf->whisper_buf_mutex); // scoped lock
 		// push back current audio data to input circlebuf
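
The two filter_audio hunks above hinge on a run-once flag, since obs_filter_get_parent is only reliable inside the filter callback itself. The same flag-guarded lazy initialization, sketched without any OBS dependencies (all names illustrative):

#include <cstdio>

struct filter_state {
	bool signals_set = false; // plays the role of gf->source_signals_set
};

// Stand-in for the per-frame filter callback: the expensive hookup runs
// at most once, on the first frame where the parent is available.
static void on_audio_frame(filter_state &st, bool parent_available)
{
	if (!st.signals_set && parent_available) {
		std::puts("connecting parent signals (once)");
		st.signals_set = true;
	}
	// ... normal per-frame processing continues here ...
}

int main()
{
	filter_state st;
	on_audio_frame(st, false); // parent not resolvable yet: retried later
	on_audio_frame(st, true);  // first successful hookup
	on_audio_frame(st, true);  // no-op afterwards
	return 0;
}
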
@@ -122,12 +200,11 @@ void transcription_filter_destroy(void *data)
 
 void transcription_filter_update(void *data, obs_data_t *s)
 {
-	obs_log(LOG_INFO, "LocalVocal filter update");
 	struct transcription_filter_data *gf =
 		static_cast<struct transcription_filter_data *>(data);
 
 	gf->log_level = (int)obs_data_get_int(s, "log_level");
-
+	obs_log(gf->log_level, "filter update");
 	gf->vad_enabled = obs_data_get_bool(s, "vad_enabled");
 	gf->log_words = obs_data_get_bool(s, "log_words");
 	gf->caption_to_stream = obs_data_get_bool(s, "caption_to_stream");
@@ -142,7 +219,34 @@ void transcription_filter_update(void *data, obs_data_t *s)
 	gf->process_while_muted = obs_data_get_bool(s, "process_while_muted");
 	gf->min_sub_duration = (int)obs_data_get_int(s, "min_sub_duration");
 	gf->last_sub_render_time = 0;
-	gf->buffered_output = obs_data_get_bool(s, "buffered_output");
+	bool new_buffered_output = obs_data_get_bool(s, "buffered_output");
+
+	if (new_buffered_output) {
+		obs_log(LOG_INFO, "buffered_output enable");
+		if (!gf->buffered_output || !gf->captions_monitor.isEnabled()) {
+			obs_log(LOG_INFO, "buffered_output currently disabled, enabling");
+			gf->buffered_output = true;
+			gf->captions_monitor.initialize(
+				gf,
+				[gf](const std::string &text) {
+					if (gf->buffered_output) {
+						send_caption_to_source(gf->text_source_name, text,
+								       gf);
+					}
+				},
+				2, 30, std::chrono::seconds(10));
+		}
+	} else {
+		obs_log(LOG_INFO, "buffered_output disable");
+		if (gf->buffered_output) {
+			obs_log(LOG_INFO, "buffered_output currently enabled, disabling");
+			if (gf->captions_monitor.isEnabled()) {
+				gf->captions_monitor.clear();
+				gf->captions_monitor.stopThread();
+			}
+			gf->buffered_output = false;
+		}
+	}
 
 	bool new_translate = obs_data_get_bool(s, "translate");
 	gf->source_lang = obs_data_get_string(s, "translate_source_language");
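
The buffered_output block above acts only on transitions, so a settings update that does not change the flag never tears down a running monitor thread. The same edge-triggered toggle in a standalone sketch (names illustrative):

#include <cstdio>

struct toggled_worker {
	bool enabled = false;

	void start() { std::puts("worker started"); } // stands in for initialize()
	void stop() { std::puts("worker stopped"); }  // stands in for clear()+stopThread()

	// Apply a new setting idempotently: act only on transitions.
	void apply(bool new_enabled)
	{
		if (new_enabled && !enabled) {
			enabled = true;
			start();
		} else if (!new_enabled && enabled) {
			stop();
			enabled = false;
		}
	}
};

int main()
{
	toggled_worker w;
	w.apply(true);  // starts
	w.apply(true);  // no-op: already running
	w.apply(false); // stops
	w.apply(false); // no-op
	return 0;
}
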
@@ -195,7 +299,6 @@ void transcription_filter_update(void *data, obs_data_t *s)
 	obs_log(gf->log_level, "update text source");
 	// update the text source
 	const char *new_text_source_name = obs_data_get_string(s, "subtitle_sources");
-	obs_weak_source_t *old_weak_text_source = NULL;
 
 	if (new_text_source_name == nullptr || strcmp(new_text_source_name, "none") == 0 ||
 	    strcmp(new_text_source_name, "(null)") == 0 ||
@@ -212,16 +315,7 @@ void transcription_filter_update(void *data, obs_data_t *s)
 			}
 		}
 	} else {
-		// new selected text source is valid, check if it's different from the old one
-		if (gf->text_source_name != new_text_source_name) {
-			// new text source is different from the old one, release the old one
+		gf->text_source_name = new_text_source_name;
-		}
 	}
-
-	if (old_weak_text_source) {
-		obs_log(gf->log_level, "releasing old text source");
-		obs_weak_source_release(old_weak_text_source);
-		gf->text_source_name = new_text_source_name;
-	}
 
 	obs_log(gf->log_level, "update whisper model");
@@ -333,53 +427,7 @@ void *transcription_filter_create(obs_data_t *settings, obs_source_t *filter)
 			obs_source_release(source);
 		} else {
 			// create a new OBS text source called "LocalVocal Subtitles"
-			obs_source_t *scene_as_source = obs_frontend_get_current_scene();
-			obs_scene_t *scene = obs_scene_from_source(scene_as_source);
-#ifdef _WIN32
-			source = obs_source_create("text_gdiplus_v2", "LocalVocal Subtitles",
-						   nullptr, nullptr);
-#else
-			source = obs_source_create("text_ft2_source_v2", "LocalVocal Subtitles",
						   nullptr, nullptr);
-#endif
-			if (source) {
-				// add source to the current scene
-				obs_scene_add(scene, source);
-				// set source settings
-				obs_data_t *source_settings = obs_source_get_settings(source);
-				obs_data_set_bool(source_settings, "word_wrap", true);
-				obs_data_set_int(source_settings, "custom_width", 1760);
-				obs_data_t *font_data = obs_data_create();
-				obs_data_set_string(font_data, "face", "Arial");
-				obs_data_set_string(font_data, "style", "Regular");
-				obs_data_set_int(font_data, "size", 72);
-				obs_data_set_int(font_data, "flags", 0);
-				obs_data_set_obj(source_settings, "font", font_data);
-				obs_data_release(font_data);
-				obs_source_update(source, source_settings);
-				obs_data_release(source_settings);
-
-				// set transform settings
-				obs_transform_info transform_info;
-				transform_info.pos.x = 962.0;
-				transform_info.pos.y = 959.0;
-				transform_info.bounds.x = 1769.0;
-				transform_info.bounds.y = 145.0;
-				transform_info.bounds_type =
-					obs_bounds_type::OBS_BOUNDS_SCALE_INNER;
-				transform_info.bounds_alignment = OBS_ALIGN_CENTER;
-				transform_info.alignment = OBS_ALIGN_CENTER;
-				transform_info.scale.x = 1.0;
-				transform_info.scale.y = 1.0;
-				transform_info.rot = 0.0;
-				obs_sceneitem_t *source_sceneitem =
-					obs_scene_sceneitem_from_source(scene, source);
-				obs_sceneitem_set_info(source_sceneitem, &transform_info);
-				obs_sceneitem_release(source_sceneitem);
-
-				obs_source_release(source);
-			}
-			obs_source_release(scene_as_source);
+			create_obs_text_source();
 		}
 		gf->text_source_name = "LocalVocal Subtitles";
 		obs_data_set_string(settings, "subtitle_sources", "LocalVocal Subtitles");
@@ -393,15 +441,6 @@ void *transcription_filter_create(obs_data_t *settings, obs_source_t *filter)
 	gf->whisper_model_path = std::string(""); // The update function will set the model path
 	gf->whisper_context = nullptr;
 
-	gf->captions_monitor.initialize(
-		gf,
-		[gf](const std::string &text) {
-			if (gf->buffered_output) {
-				send_caption_to_source(gf->text_source_name, text, gf);
-			}
-		},
-		2, 30, std::chrono::seconds(10));
-
 	obs_log(gf->log_level, "run update");
 	// get the settings updated on the filter data struct
 	transcription_filter_update(gf, settings);
@@ -410,45 +449,7 @@ void *transcription_filter_create(obs_data_t *settings, obs_source_t *filter)
 
 	// handle the event OBS_FRONTEND_EVENT_RECORDING_STARTING to reset the srt sentence number
 	// to match the subtitles with the recording
-	obs_frontend_add_event_callback(
-		[](enum obs_frontend_event event, void *private_data) {
-			if (event == OBS_FRONTEND_EVENT_RECORDING_STARTING) {
-				struct transcription_filter_data *gf_ =
-					static_cast<struct transcription_filter_data *>(
-						private_data);
-				if (gf_->save_srt && gf_->save_only_while_recording) {
-					obs_log(gf_->log_level,
-						"Recording started. Resetting srt file.");
-					// truncate file if it exists
-					std::ofstream output_file(gf_->output_file_path,
-								  std::ios::out | std::ios::trunc);
-					output_file.close();
-					gf_->sentence_number = 1;
-					gf_->start_timestamp_ms = now_ms();
-				}
-			} else if (event == OBS_FRONTEND_EVENT_RECORDING_STOPPED) {
-				struct transcription_filter_data *gf_ =
-					static_cast<struct transcription_filter_data *>(
-						private_data);
-				if (gf_->save_srt && gf_->save_only_while_recording &&
-				    gf_->rename_file_to_match_recording) {
-					obs_log(gf_->log_level,
-						"Recording stopped. Rename srt file.");
-					// rename file to match the recording file name with .srt extension
-					// use obs_frontend_get_last_recording to get the last recording file name
-					std::string recording_file_name =
-						obs_frontend_get_last_recording();
-					// remove the extension
-					recording_file_name = recording_file_name.substr(
-						0, recording_file_name.find_last_of("."));
-					std::string srt_file_name = recording_file_name + ".srt";
-					// rename the file
-					std::rename(gf_->output_file_path.c_str(),
-						    srt_file_name.c_str());
-				}
-			}
-		},
-		gf);
+	obs_frontend_add_event_callback(recording_state_callback, gf);
 
 	obs_log(gf->log_level, "filter created.");
 	return gf;
@@ -541,7 +542,7 @@ void transcription_filter_defaults(obs_data_t *s)
 	obs_data_set_default_double(s, "thold_ptsum", 0.01);
 	obs_data_set_default_int(s, "max_len", 0);
 	obs_data_set_default_bool(s, "split_on_word", true);
-	obs_data_set_default_int(s, "max_tokens", 32);
+	obs_data_set_default_int(s, "max_tokens", 0);
 	obs_data_set_default_bool(s, "speed_up", false);
 	obs_data_set_default_bool(s, "suppress_blank", false);
 	obs_data_set_default_bool(s, "suppress_non_speech_tokens", true);
@@ -776,19 +777,6 @@ obs_properties_t *transcription_filter_properties(void *data)
 
 	obs_property_t *buffered_output_prop =
 		obs_properties_add_bool(ppts, "buffered_output", MT_("buffered_output"));
-	// add on-change handler for buffered_output
-	obs_property_set_modified_callback(buffered_output_prop, [](obs_properties_t *props,
-								     obs_property_t *property,
-								     obs_data_t *settings) {
-		UNUSED_PARAMETER(property);
-		UNUSED_PARAMETER(props);
-		// if buffered output is enabled set the overlap to max else set it to default
-		obs_data_set_int(settings, "overlap_size_msec",
-				 obs_data_get_bool(settings, "buffered_output")
-					 ? MAX_OVERLAP_SIZE_MSEC
-					 : DEFAULT_OVERLAP_SIZE_MSEC);
-		return true;
-	});
 
 	obs_properties_add_bool(ppts, "log_words", MT_("log_words"));
 	obs_properties_add_bool(ppts, "caption_to_stream", MT_("caption_to_stream"));
@@ -117,3 +117,23 @@ std::vector<std::string> split(const std::string &string, char delimiter)
 	}
 	return tokens;
 }
+
+std::vector<std::string> split_words(const std::string &str_copy)
+{
+	std::vector<std::string> words;
+	std::string word;
+	for (char c : str_copy) {
+		if (std::isspace(c)) {
+			if (!word.empty()) {
+				words.push_back(word);
+				word.clear();
+			}
+		} else {
+			word += c;
+		}
+	}
+	if (!word.empty()) {
+		words.push_back(word);
+	}
+	return words;
+}
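
split_words now lives next to the other string helpers; the deleted src/utils.cpp further below carried an identical definition. A small usage check, assuming the sketch is linked against transcription-utils.cpp:

#include <cstdio>
#include <string>
#include <vector>

// Declared in transcription-utils.h (see the header hunk below).
std::vector<std::string> split_words(const std::string &str_copy);

int main()
{
	// Leading, trailing, and repeated whitespace all collapse away.
	for (const std::string &w : split_words("  hello   whisper world \n")) {
		std::printf("[%s]\n", w.c_str()); // [hello] [whisper] [world]
	}
	return 0;
}
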
@@ -4,36 +4,17 @@
 #include <string>
 #include <vector>
 #include <chrono>
-#include <media-io/audio-io.h>
 
 // Fix UTF8 string for Windows
 std::string fix_utf8(const std::string &str);
 
-inline enum speaker_layout convert_speaker_layout(uint8_t channels)
-{
-	switch (channels) {
-	case 0:
-		return SPEAKERS_UNKNOWN;
-	case 1:
-		return SPEAKERS_MONO;
-	case 2:
-		return SPEAKERS_STEREO;
-	case 3:
-		return SPEAKERS_2POINT1;
-	case 4:
-		return SPEAKERS_4POINT0;
-	case 5:
-		return SPEAKERS_4POINT1;
-	case 6:
-		return SPEAKERS_5POINT1;
-	case 8:
-		return SPEAKERS_7POINT1;
-	default:
-		return SPEAKERS_UNKNOWN;
-	}
-}
-
+// Remove leading and trailing non-alphabetic characters
+std::string remove_leading_trailing_nonalpha(const std::string &str);
+
+// Split a string by a delimiter
+std::vector<std::string> split(const std::string &string, char delimiter);
+
 // Get the current timestamp in milliseconds since epoch
 inline uint64_t now_ms()
 {
 	return std::chrono::duration_cast<std::chrono::milliseconds>(
@@ -41,4 +22,7 @@ inline uint64_t now_ms()
 		.count();
 }
 
+// Split a string into words based on spaces
+std::vector<std::string> split_words(const std::string &str_copy);
+
 #endif // TRANSCRIPTION_UTILS_H
src/utils.cpp (deleted)
@@ -1,21 +0,0 @@
-#include "utils.h"
-
-std::vector<std::string> split_words(const std::string &str_copy)
-{
-	std::vector<std::string> words;
-	std::string word;
-	for (char c : str_copy) {
-		if (std::isspace(c)) {
-			if (!word.empty()) {
-				words.push_back(word);
-				word.clear();
-			}
-		} else {
-			word += c;
-		}
-	}
-	if (!word.empty()) {
-		words.push_back(word);
-	}
-	return words;
-}

src/utils.h (deleted)
@@ -1,9 +0,0 @@
-#ifndef UTILS_H
-#define UTILS_H
-
-#include <string>
-#include <vector>
-
-std::vector<std::string> split_words(const std::string &str_copy);
-
-#endif // UTILS_H
@@ -23,7 +23,9 @@ TokenBufferThread::~TokenBufferThread()
 		stop = true;
 	}
 	condVar.notify_all();
-	workerThread.join();
+	if (workerThread.joinable()) {
+		workerThread.join();
+	}
 }
 
 void TokenBufferThread::initialize(struct transcription_filter_data *gf_,
@@ -38,10 +40,20 @@ void TokenBufferThread::initialize(struct transcription_filter_data *gf_,
 	this->numPerSentence = numPerSentence_;
 	this->segmentation = segmentation_;
 	this->maxTime = maxTime_;
-	this->initialized = true;
+	this->stop = false;
 	this->workerThread = std::thread(&TokenBufferThread::monitor, this);
 }
 
+void TokenBufferThread::stopThread()
+{
+	std::lock_guard<std::mutex> lock(queueMutex);
+	stop = true;
+	condVar.notify_all();
+	if (workerThread.joinable()) {
+		workerThread.join();
+	}
+}
+
 void TokenBufferThread::log_token_vector(const std::vector<std::string> &tokens)
 {
 	std::string output;
@@ -81,21 +93,22 @@ void TokenBufferThread::addSentence(const std::string &sentence)
 		}
 	}
 
+void TokenBufferThread::clear()
+{
+	obs_log(LOG_INFO, "TokenBufferThread::clear");
+	std::lock_guard<std::mutex> lock(queueMutex);
+	inputQueue.clear();
+	presentationQueue.clear();
+	this->callback("");
+}
+
 void TokenBufferThread::monitor()
 {
 	obs_log(LOG_INFO, "TokenBufferThread::monitor");
 
 	this->callback("");
 
-	while (this->initialized && !this->stop) {
-		if (this->stop) {
-			break;
-		}
-
-		if (this->gf->whisper_context == nullptr) {
-			continue;
-		}
-
+	while (!this->stop) {
 		// condition presentation queue
 		if (presentationQueue.size() == this->numSentences * this->numPerSentence) {
 			// pop a whole sentence from the presentation queue front

@@ -36,6 +36,10 @@ public:
 			   TokenBufferSegmentation segmentation_ = SEGMENTATION_TOKEN);
 
 	void addSentence(const std::string &sentence);
+	void clear();
+	void stopThread();
+
+	bool isEnabled() const { return !stop; }
 
 private:
 	void monitor();
@@ -48,8 +52,7 @@ private:
 	std::condition_variable condVar;
 	std::function<void(std::string)> callback;
 	std::chrono::seconds maxTime;
-	bool stop;
-	bool initialized = false;
+	bool stop = true;
 	bool newDataAvailable = false;
 	size_t numSentences;
 	size_t numPerSentence;
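
The TokenBufferThread changes above converge on one shutdown recipe: initialize stop to true, flip it under the lock, notify, and join only when joinable (so a thread that was never started, or already joined, is harmless). A self-contained worker showing the same lifecycle, simplified to drop the token queues:

#include <chrono>
#include <condition_variable>
#include <cstdio>
#include <mutex>
#include <thread>

class Worker {
public:
	~Worker() { stopThread(); }

	void start()
	{
		stop_ = false;
		thread_ = std::thread(&Worker::monitor, this);
	}

	void stopThread()
	{
		{
			std::lock_guard<std::mutex> lock(mutex_);
			stop_ = true;
		}
		cv_.notify_all();
		// joinable() guards the case where start() was never called,
		// or the thread was already joined by an earlier stopThread().
		if (thread_.joinable()) {
			thread_.join();
		}
	}

private:
	void monitor()
	{
		std::unique_lock<std::mutex> lock(mutex_);
		while (!stop_) {
			// wait_for stands in for real queue work
			cv_.wait_for(lock, std::chrono::milliseconds(100));
		}
		std::puts("worker exiting");
	}

	std::thread thread_;
	std::mutex mutex_;
	std::condition_variable cv_;
	bool stop_ = true; // true until start(), like the new `bool stop = true;`
};

int main()
{
	Worker w;
	w.start();
	std::this_thread::sleep_for(std::chrono::milliseconds(250));
	w.stopThread(); // the destructor would also handle it
	return 0;
}

Note that the sketch releases the mutex before join(), so a worker that still needs the lock on its way out cannot deadlock the caller.
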
@@ -102,9 +102,5 @@ void update_whisper_model(struct transcription_filter_data *gf, obs_data_t *s)
 		gf->enable_token_ts_dtw = obs_data_get_bool(s, "dtw_token_timestamps");
 		shutdown_whisper_thread(gf);
 		start_whisper_thread_with_path(gf, gf->whisper_model_path, silero_vad_model_file);
-	} else {
-		// dtw_token_timestamps did not change
-		obs_log(gf->log_level, "dtw_token_timestamps did not change: %d == %d",
-			gf->enable_token_ts_dtw, new_dtw_timestamps);
 	}
 }
@@ -283,18 +283,22 @@ void run_inference_and_callbacks(transcription_filter_data *gf, uint64_t start_o
 					uint64_t end_offset_ms, int vad_state)
 {
 	// get the data from the entire whisper buffer
+	// add 50ms of silence to the beginning and end of the buffer
 	const size_t pcm32f_size = gf->whisper_buffer.size / sizeof(float);
+	const size_t pcm32f_size_with_silence = pcm32f_size + 2 * WHISPER_SAMPLE_RATE / 100;
 	// allocate a new buffer and copy the data to it
-	float *pcm32f_data = (float *)bzalloc(pcm32f_size * sizeof(float));
-	circlebuf_pop_back(&gf->whisper_buffer, pcm32f_data, pcm32f_size * sizeof(float));
+	float *pcm32f_data = (float *)bzalloc(pcm32f_size_with_silence * sizeof(float));
+	circlebuf_pop_back(&gf->whisper_buffer, pcm32f_data + WHISPER_SAMPLE_RATE / 100,
+			   pcm32f_size * sizeof(float));
 
-	struct DetectionResultWithText inference_result =
-		run_whisper_inference(gf, pcm32f_data, pcm32f_size, start_offset_ms, end_offset_ms);
+	struct DetectionResultWithText inference_result = run_whisper_inference(
+		gf, pcm32f_data, pcm32f_size_with_silence, start_offset_ms, end_offset_ms);
 	// output inference result to a text source
 	set_text_callback(gf, inference_result);
 
 	if (gf->enable_audio_chunks_callback) {
-		audio_chunk_callback(gf, pcm32f_data, pcm32f_size, vad_state, inference_result);
+		audio_chunk_callback(gf, pcm32f_data, pcm32f_size_with_silence, vad_state,
+				     inference_result);
 	}
 
 	// free the buffer
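
The hunk above pops the audio into the middle of a zero-filled (bzalloc) buffer, leaving WHISPER_SAMPLE_RATE / 100 samples of silence on each side. The same padding with std::vector, as a standalone sketch:

#include <algorithm>
#include <cstdio>
#include <vector>

// Pad a mono PCM buffer with `pad` zero samples on each side, as the
// whisper-processing hunk does via bzalloc + an offset circlebuf_pop_back.
// (In the plugin, pad = WHISPER_SAMPLE_RATE / 100.)
static std::vector<float> pad_with_silence(const std::vector<float> &pcm, size_t pad)
{
	std::vector<float> out(pcm.size() + 2 * pad, 0.0f); // zero-filled, like bzalloc
	std::copy(pcm.begin(), pcm.end(), out.begin() + pad);
	return out;
}

int main()
{
	const std::vector<float> chunk = {0.1f, -0.2f, 0.3f};
	const std::vector<float> padded = pad_with_silence(chunk, 2);
	for (float s : padded) {
		std::printf("%.1f ", s); // 0.0 0.0 0.1 -0.2 0.3 0.0 0.0
	}
	std::printf("\n");
	return 0;
}
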