refactor: Remove unused utils.h and utils.cpp files

Roy Shilkrot
2024-06-03 00:34:38 -04:00
parent 2630bc52ff
commit 7bccfd96d6
16 changed files with 330 additions and 216 deletions

View File

@@ -88,6 +88,7 @@ target_sources(
src/transcription-filter.cpp
src/transcription-filter.c
src/transcription-filter-callbacks.cpp
+ src/transcription-filter-utils.cpp
src/transcription-utils.cpp
src/model-utils/model-downloader.cpp
src/model-utils/model-downloader-ui.cpp
@@ -100,8 +101,7 @@ target_sources(
src/whisper-utils/token-buffer-thread.cpp
src/translation/language_codes.cpp
src/translation/translation.cpp
- src/translation/translation-utils.cpp
- src/utils.cpp)
+ src/translation/translation-utils.cpp)
set_target_properties_plugin(${CMAKE_PROJECT_NAME} PROPERTIES OUTPUT_NAME ${_name})
@@ -122,8 +122,7 @@ if(ENABLE_TESTS)
src/whisper-utils/silero-vad-onnx.cpp
src/whisper-utils/token-buffer-thread.cpp
src/translation/language_codes.cpp
- src/translation/translation.cpp
- src/utils.cpp)
+ src/translation/translation.cpp)
find_libav(${CMAKE_PROJECT_NAME}-tests)

View File

@@ -13,6 +13,7 @@
#include <nlohmann/json.hpp>
#include "transcription-filter-data.h"
#include "transcription-filter-utils.h"
#include "transcription-filter.h"
#include "transcription-utils.h"
#include "whisper-utils/whisper-utils.h"
@@ -155,11 +156,12 @@ create_context(int sample_rate, int channels, const std::string &whisper_model_p
gf->whisper_params = whisper_full_default_params(whisper_sampling_method);
gf->whisper_params.duration_ms = 3000;
gf->whisper_params.language = "en";
gf->whisper_params.detect_language = false;
gf->whisper_params.initial_prompt = "";
gf->whisper_params.n_threads = 4;
gf->whisper_params.n_max_text_ctx = 16384;
gf->whisper_params.translate = false;
- gf->whisper_params.no_context = true;
+ gf->whisper_params.no_context = false;
gf->whisper_params.single_segment = true;
gf->whisper_params.print_special = false;
gf->whisper_params.print_progress = false;
@@ -174,7 +176,7 @@ create_context(int sample_rate, int channels, const std::string &whisper_model_p
gf->whisper_params.speed_up = false;
gf->whisper_params.suppress_blank = true;
gf->whisper_params.suppress_non_speech_tokens = true;
- gf->whisper_params.temperature = 0.1;
+ gf->whisper_params.temperature = 0.0;
gf->whisper_params.max_initial_ts = 1.0;
gf->whisper_params.length_penalty = -1;
gf->active = true;
@@ -201,7 +203,7 @@ void audio_chunk_callback(struct transcription_filter_data *gf, const float *pcm
// numeral = "0" + numeral;
// }
- // save the audio to a .wav file
+ // // save the audio to a .wav file
// std::string filename = "audio_chunk_" + numeral + vad_state_str + ".wav";
// obs_log(gf->log_level, "Saving %lu frames to %s", frames, filename.c_str());
// write_audio_wav_file(filename.c_str(), pcm32f_data, frames);
@@ -388,6 +390,16 @@ int wmain(int argc, wchar_t *argv[])
gf->enable_audio_chunks_callback =
config["enable_audio_chunks_callback"];
}
if (config.contains("temperature")) {
obs_log(LOG_INFO, "Setting temperture to %f",
config["temperature"].get<float>());
gf->whisper_params.temperature = config["temperature"].get<float>();
}
if (config.contains("no_context")) {
obs_log(LOG_INFO, "Setting no_context to %s",
config["no_context"] ? "true" : "false");
gf->whisper_params.no_context = config["no_context"];
}
// set log level
if (logLevelStr == "debug") {
gf->log_level = LOG_DEBUG;

View File

@@ -20,16 +20,19 @@
#define SEND_TIMED_METADATA_URL "http://localhost:8080/timed-metadata"
- void send_caption_to_source(const std::string &target_source_name, const std::string &str_copy,
+ void send_caption_to_source(const std::string &target_source_name, const std::string &caption,
struct transcription_filter_data *gf)
{
+ if (target_source_name.empty()) {
+ return;
+ }
auto target = obs_get_source_by_name(target_source_name.c_str());
if (!target) {
obs_log(gf->log_level, "text_source target is null");
return;
}
auto text_settings = obs_source_get_settings(target);
obs_data_set_string(text_settings, "text", str_copy.c_str());
obs_data_set_string(text_settings, "text", caption.c_str());
obs_source_update(target, text_settings);
obs_source_release(target);
}
@@ -228,3 +231,34 @@ void set_text_callback(struct transcription_filter_data *gf,
}
}
};
+ void recording_state_callback(enum obs_frontend_event event, void *data)
+ {
+ struct transcription_filter_data *gf_ =
+ static_cast<struct transcription_filter_data *>(data);
+ if (event == OBS_FRONTEND_EVENT_RECORDING_STARTING) {
+ if (gf_->save_srt && gf_->save_only_while_recording) {
+ obs_log(gf_->log_level, "Recording started. Resetting srt file.");
+ // truncate file if it exists
+ std::ofstream output_file(gf_->output_file_path,
+ std::ios::out | std::ios::trunc);
+ output_file.close();
+ gf_->sentence_number = 1;
+ gf_->start_timestamp_ms = now_ms();
+ }
+ } else if (event == OBS_FRONTEND_EVENT_RECORDING_STOPPED) {
+ if (gf_->save_srt && gf_->save_only_while_recording &&
+ gf_->rename_file_to_match_recording) {
+ obs_log(gf_->log_level, "Recording stopped. Rename srt file.");
+ // rename file to match the recording file name with .srt extension
+ // use obs_frontend_get_last_recording to get the last recording file name
+ std::string recording_file_name = obs_frontend_get_last_recording();
+ // remove the extension
+ recording_file_name = recording_file_name.substr(
+ 0, recording_file_name.find_last_of("."));
+ std::string srt_file_name = recording_file_name + ".srt";
+ // rename the file
+ std::rename(gf_->output_file_path.c_str(), srt_file_name.c_str());
+ }
+ }
+ }

View File

@@ -15,4 +15,6 @@ void audio_chunk_callback(struct transcription_filter_data *gf, const float *pcm
void set_text_callback(struct transcription_filter_data *gf,
const DetectionResultWithText &resultIn);
+ void recording_state_callback(enum obs_frontend_event event, void *data);
#endif /* TRANSCRIPTION_FILTER_CALLBACKS_H */

View File

@@ -80,6 +80,7 @@ struct transcription_filter_data {
bool fix_utf8 = true;
bool enable_audio_chunks_callback = false;
bool send_timed_metadata = false;
+ bool source_signals_set = false;
// Last transcription result
std::string last_text;

View File

@@ -0,0 +1,55 @@
#include "transcription-filter-utils.h"
#include <obs-module.h>
#include <obs.h>
#include <obs-frontend-api.h>
void create_obs_text_source()
{
// create a new OBS text source called "LocalVocal Subtitles"
obs_source_t *scene_as_source = obs_frontend_get_current_scene();
obs_scene_t *scene = obs_scene_from_source(scene_as_source);
#ifdef _WIN32
obs_source_t *source =
obs_source_create("text_gdiplus_v2", "LocalVocal Subtitles", nullptr, nullptr);
#else
obs_source_t *source =
obs_source_create("text_ft2_source_v2", "LocalVocal Subtitles", nullptr, nullptr);
#endif
if (source) {
// add source to the current scene
obs_scene_add(scene, source);
// set source settings
obs_data_t *source_settings = obs_source_get_settings(source);
obs_data_set_bool(source_settings, "word_wrap", true);
obs_data_set_int(source_settings, "custom_width", 1760);
obs_data_t *font_data = obs_data_create();
obs_data_set_string(font_data, "face", "Arial");
obs_data_set_string(font_data, "style", "Regular");
obs_data_set_int(font_data, "size", 72);
obs_data_set_int(font_data, "flags", 0);
obs_data_set_obj(source_settings, "font", font_data);
obs_data_release(font_data);
obs_source_update(source, source_settings);
obs_data_release(source_settings);
// set transform settings
obs_transform_info transform_info;
transform_info.pos.x = 962.0;
transform_info.pos.y = 959.0;
transform_info.bounds.x = 1769.0;
transform_info.bounds.y = 145.0;
transform_info.bounds_type = obs_bounds_type::OBS_BOUNDS_SCALE_INNER;
transform_info.bounds_alignment = OBS_ALIGN_CENTER;
transform_info.alignment = OBS_ALIGN_CENTER;
transform_info.scale.x = 1.0;
transform_info.scale.y = 1.0;
transform_info.rot = 0.0;
obs_sceneitem_t *source_sceneitem = obs_scene_sceneitem_from_source(scene, source);
obs_sceneitem_set_info(source_sceneitem, &transform_info);
obs_sceneitem_release(source_sceneitem);
obs_source_release(source);
}
obs_source_release(scene_as_source);
}

View File

@@ -0,0 +1,33 @@
#ifndef TRANSCRIPTION_FILTER_UTILS_H
#define TRANSCRIPTION_FILTER_UTILS_H
#include <media-io/audio-io.h>
// Convert channels number to a speaker layout
inline enum speaker_layout convert_speaker_layout(uint8_t channels)
{
switch (channels) {
case 0:
return SPEAKERS_UNKNOWN;
case 1:
return SPEAKERS_MONO;
case 2:
return SPEAKERS_STEREO;
case 3:
return SPEAKERS_2POINT1;
case 4:
return SPEAKERS_4POINT0;
case 5:
return SPEAKERS_4POINT1;
case 6:
return SPEAKERS_5POINT1;
case 8:
return SPEAKERS_7POINT1;
default:
return SPEAKERS_UNKNOWN;
}
}
void create_obs_text_source();
#endif // TRANSCRIPTION_FILTER_UTILS_H

View File

@@ -18,6 +18,7 @@
#include "transcription-filter.h"
#include "transcription-filter-callbacks.h"
#include "transcription-filter-data.h"
#include "transcription-filter-utils.h"
#include "transcription-utils.h"
#include "model-utils/model-downloader.h"
#include "whisper-utils/whisper-processing.h"
@@ -28,7 +29,6 @@
#include "translation/translation-utils.h"
#include "translation/translation.h"
#include "translation/translation-includes.h"
#include "utils.h"
bool add_sources_to_list(void *list_property, obs_source_t *source)
{
@@ -44,6 +44,71 @@ bool add_sources_to_list(void *list_property, obs_source_t *source)
return true;
}
+ void set_source_signals(transcription_filter_data *gf, obs_source_t *parent_source)
+ {
+ obs_log(LOG_INFO, "parent source name: %s", obs_source_get_name(parent_source));
+ signal_handler_t *sh = obs_source_get_signal_handler(parent_source);
+ signal_handler_connect(
+ sh, "media_play",
+ [](void *data_, calldata_t *cd) {
+ obs_log(LOG_INFO, "media_play");
+ transcription_filter_data *gf_ =
+ static_cast<struct transcription_filter_data *>(data_);
+ gf_->active = true;
+ },
+ gf);
+ signal_handler_connect(
+ sh, "media_started",
+ [](void *data_, calldata_t *cd) {
+ obs_log(LOG_INFO, "media_started");
+ transcription_filter_data *gf_ =
+ static_cast<struct transcription_filter_data *>(data_);
+ gf_->active = true;
+ },
+ gf);
+ signal_handler_connect(
+ sh, "media_pause",
+ [](void *data_, calldata_t *cd) {
+ obs_log(LOG_INFO, "media_pause");
+ transcription_filter_data *gf_ =
+ static_cast<struct transcription_filter_data *>(data_);
+ gf_->active = false;
+ },
+ gf);
+ signal_handler_connect(
+ sh, "media_restart",
+ [](void *data_, calldata_t *cd) {
+ obs_log(LOG_INFO, "media_restart");
+ transcription_filter_data *gf_ =
+ static_cast<struct transcription_filter_data *>(data_);
+ gf_->active = true;
+ gf_->captions_monitor.clear();
+ send_caption_to_source(gf_->text_source_name, "", gf_);
+ },
+ gf);
+ signal_handler_connect(
+ sh, "media_stopped",
+ [](void *data_, calldata_t *cd) {
+ obs_log(LOG_INFO, "media_stopped");
+ transcription_filter_data *gf_ =
+ static_cast<struct transcription_filter_data *>(data_);
+ gf_->active = false;
+ gf_->captions_monitor.clear();
+ send_caption_to_source(gf_->text_source_name, "", gf_);
+ // flush the buffer
+ {
+ std::lock_guard<std::mutex> lock(gf_->whisper_buf_mutex);
+ for (size_t c = 0; c < gf_->channels; c++) {
+ circlebuf_free(&gf_->input_buffers[c]);
+ }
+ circlebuf_free(&gf_->info_buffer);
+ circlebuf_free(&gf_->whisper_buffer);
+ }
+ },
+ gf);
+ gf->source_signals_set = true;
+ }
struct obs_audio_data *transcription_filter_filter_audio(void *data, struct obs_audio_data *audio)
{
if (!audio) {
@@ -56,14 +121,16 @@ struct obs_audio_data *transcription_filter_filter_audio(void *data, struct obs_
struct transcription_filter_data *gf =
static_cast<struct transcription_filter_data *>(data);
- if (!gf->active) {
- return audio;
+ // Lazy initialization of source signals
+ if (!gf->source_signals_set) {
+ // obs_filter_get_parent only works in the filter function
+ obs_source_t *parent_source = obs_filter_get_parent(gf->context);
+ if (parent_source != nullptr) {
+ set_source_signals(gf, parent_source);
+ }
+ }
- // Check if the parent source is muted
- obs_source_t *parent_source = obs_filter_get_parent(gf->context);
- if (gf->process_while_muted == false && obs_source_muted(parent_source)) {
- // Source is muted, do not process audio
+ if (!gf->active) {
+ return audio;
}
@@ -72,6 +139,17 @@ struct obs_audio_data *transcription_filter_filter_audio(void *data, struct obs_
return audio;
}
+ // Check if process while muted is not enabled (e.g. the user wants to avoid processing audio
+ // when the source is muted)
+ if (!gf->process_while_muted) {
+ // Check if the parent source is muted
+ obs_source_t *parent_source = obs_filter_get_parent(gf->context);
+ if (parent_source != nullptr && obs_source_muted(parent_source)) {
+ // Source is muted, do not process audio
+ return audio;
+ }
+ }
{
std::lock_guard<std::mutex> lock(gf->whisper_buf_mutex); // scoped lock
// push back current audio data to input circlebuf
@@ -122,12 +200,11 @@ void transcription_filter_destroy(void *data)
void transcription_filter_update(void *data, obs_data_t *s)
{
obs_log(LOG_INFO, "LocalVocal filter update");
struct transcription_filter_data *gf =
static_cast<struct transcription_filter_data *>(data);
gf->log_level = (int)obs_data_get_int(s, "log_level");
obs_log(gf->log_level, "filter update");
gf->vad_enabled = obs_data_get_bool(s, "vad_enabled");
gf->log_words = obs_data_get_bool(s, "log_words");
gf->caption_to_stream = obs_data_get_bool(s, "caption_to_stream");
@@ -142,7 +219,34 @@ void transcription_filter_update(void *data, obs_data_t *s)
gf->process_while_muted = obs_data_get_bool(s, "process_while_muted");
gf->min_sub_duration = (int)obs_data_get_int(s, "min_sub_duration");
gf->last_sub_render_time = 0;
gf->buffered_output = obs_data_get_bool(s, "buffered_output");
bool new_buffered_output = obs_data_get_bool(s, "buffered_output");
if (new_buffered_output) {
obs_log(LOG_INFO, "buffered_output enable");
if (!gf->buffered_output || !gf->captions_monitor.isEnabled()) {
obs_log(LOG_INFO, "buffered_output currently disabled, enabling");
gf->buffered_output = true;
gf->captions_monitor.initialize(
gf,
[gf](const std::string &text) {
if (gf->buffered_output) {
send_caption_to_source(gf->text_source_name, text,
gf);
}
},
2, 30, std::chrono::seconds(10));
}
} else {
obs_log(LOG_INFO, "buffered_output disable");
if (gf->buffered_output) {
obs_log(LOG_INFO, "buffered_output currently enabled, disabling");
if (gf->captions_monitor.isEnabled()) {
gf->captions_monitor.clear();
gf->captions_monitor.stopThread();
}
gf->buffered_output = false;
}
}
bool new_translate = obs_data_get_bool(s, "translate");
gf->source_lang = obs_data_get_string(s, "translate_source_language");
@@ -195,7 +299,6 @@ void transcription_filter_update(void *data, obs_data_t *s)
obs_log(gf->log_level, "update text source");
// update the text source
const char *new_text_source_name = obs_data_get_string(s, "subtitle_sources");
- obs_weak_source_t *old_weak_text_source = NULL;
if (new_text_source_name == nullptr || strcmp(new_text_source_name, "none") == 0 ||
strcmp(new_text_source_name, "(null)") == 0 ||
@@ -212,16 +315,7 @@ void transcription_filter_update(void *data, obs_data_t *s)
}
}
} else {
- // new selected text source is valid, check if it's different from the old one
- if (gf->text_source_name != new_text_source_name) {
- // new text source is different from the old one, release the old one
- gf->text_source_name = new_text_source_name;
- }
- }
- if (old_weak_text_source) {
- obs_log(gf->log_level, "releasing old text source");
- obs_weak_source_release(old_weak_text_source);
+ gf->text_source_name = new_text_source_name;
}
obs_log(gf->log_level, "update whisper model");
@@ -333,53 +427,7 @@ void *transcription_filter_create(obs_data_t *settings, obs_source_t *filter)
obs_source_release(source);
} else {
// create a new OBS text source called "LocalVocal Subtitles"
- obs_source_t *scene_as_source = obs_frontend_get_current_scene();
- obs_scene_t *scene = obs_scene_from_source(scene_as_source);
- #ifdef _WIN32
- source = obs_source_create("text_gdiplus_v2", "LocalVocal Subtitles",
- nullptr, nullptr);
- #else
- source = obs_source_create("text_ft2_source_v2", "LocalVocal Subtitles",
- nullptr, nullptr);
- #endif
- if (source) {
- // add source to the current scene
- obs_scene_add(scene, source);
- // set source settings
- obs_data_t *source_settings = obs_source_get_settings(source);
- obs_data_set_bool(source_settings, "word_wrap", true);
- obs_data_set_int(source_settings, "custom_width", 1760);
- obs_data_t *font_data = obs_data_create();
- obs_data_set_string(font_data, "face", "Arial");
- obs_data_set_string(font_data, "style", "Regular");
- obs_data_set_int(font_data, "size", 72);
- obs_data_set_int(font_data, "flags", 0);
- obs_data_set_obj(source_settings, "font", font_data);
- obs_data_release(font_data);
- obs_source_update(source, source_settings);
- obs_data_release(source_settings);
- // set transform settings
- obs_transform_info transform_info;
- transform_info.pos.x = 962.0;
- transform_info.pos.y = 959.0;
- transform_info.bounds.x = 1769.0;
- transform_info.bounds.y = 145.0;
- transform_info.bounds_type =
- obs_bounds_type::OBS_BOUNDS_SCALE_INNER;
- transform_info.bounds_alignment = OBS_ALIGN_CENTER;
- transform_info.alignment = OBS_ALIGN_CENTER;
- transform_info.scale.x = 1.0;
- transform_info.scale.y = 1.0;
- transform_info.rot = 0.0;
- obs_sceneitem_t *source_sceneitem =
- obs_scene_sceneitem_from_source(scene, source);
- obs_sceneitem_set_info(source_sceneitem, &transform_info);
- obs_sceneitem_release(source_sceneitem);
- obs_source_release(source);
- }
- obs_source_release(scene_as_source);
+ create_obs_text_source();
}
gf->text_source_name = "LocalVocal Subtitles";
obs_data_set_string(settings, "subtitle_sources", "LocalVocal Subtitles");
@@ -393,15 +441,6 @@ void *transcription_filter_create(obs_data_t *settings, obs_source_t *filter)
gf->whisper_model_path = std::string(""); // The update function will set the model path
gf->whisper_context = nullptr;
- gf->captions_monitor.initialize(
- gf,
- [gf](const std::string &text) {
- if (gf->buffered_output) {
- send_caption_to_source(gf->text_source_name, text, gf);
- }
- },
- 2, 30, std::chrono::seconds(10));
obs_log(gf->log_level, "run update");
// get the settings updated on the filter data struct
transcription_filter_update(gf, settings);
@@ -410,45 +449,7 @@ void *transcription_filter_create(obs_data_t *settings, obs_source_t *filter)
// handle the event OBS_FRONTEND_EVENT_RECORDING_STARTING to reset the srt sentence number
// to match the subtitles with the recording
- obs_frontend_add_event_callback(
- [](enum obs_frontend_event event, void *private_data) {
- if (event == OBS_FRONTEND_EVENT_RECORDING_STARTING) {
- struct transcription_filter_data *gf_ =
- static_cast<struct transcription_filter_data *>(
- private_data);
- if (gf_->save_srt && gf_->save_only_while_recording) {
- obs_log(gf_->log_level,
- "Recording started. Resetting srt file.");
- // truncate file if it exists
- std::ofstream output_file(gf_->output_file_path,
- std::ios::out | std::ios::trunc);
- output_file.close();
- gf_->sentence_number = 1;
- gf_->start_timestamp_ms = now_ms();
- }
- } else if (event == OBS_FRONTEND_EVENT_RECORDING_STOPPED) {
- struct transcription_filter_data *gf_ =
- static_cast<struct transcription_filter_data *>(
- private_data);
- if (gf_->save_srt && gf_->save_only_while_recording &&
- gf_->rename_file_to_match_recording) {
- obs_log(gf_->log_level,
- "Recording stopped. Rename srt file.");
- // rename file to match the recording file name with .srt extension
- // use obs_frontend_get_last_recording to get the last recording file name
- std::string recording_file_name =
- obs_frontend_get_last_recording();
- // remove the extension
- recording_file_name = recording_file_name.substr(
- 0, recording_file_name.find_last_of("."));
- std::string srt_file_name = recording_file_name + ".srt";
- // rename the file
- std::rename(gf_->output_file_path.c_str(),
- srt_file_name.c_str());
- }
- }
- },
- gf);
+ obs_frontend_add_event_callback(recording_state_callback, gf);
obs_log(gf->log_level, "filter created.");
return gf;
@@ -541,7 +542,7 @@ void transcription_filter_defaults(obs_data_t *s)
obs_data_set_default_double(s, "thold_ptsum", 0.01);
obs_data_set_default_int(s, "max_len", 0);
obs_data_set_default_bool(s, "split_on_word", true);
obs_data_set_default_int(s, "max_tokens", 32);
obs_data_set_default_int(s, "max_tokens", 0);
obs_data_set_default_bool(s, "speed_up", false);
obs_data_set_default_bool(s, "suppress_blank", false);
obs_data_set_default_bool(s, "suppress_non_speech_tokens", true);
@@ -776,19 +777,6 @@ obs_properties_t *transcription_filter_properties(void *data)
obs_property_t *buffered_output_prop =
obs_properties_add_bool(ppts, "buffered_output", MT_("buffered_output"));
- // add on-change handler for buffered_output
- obs_property_set_modified_callback(buffered_output_prop, [](obs_properties_t *props,
- obs_property_t *property,
- obs_data_t *settings) {
- UNUSED_PARAMETER(property);
- UNUSED_PARAMETER(props);
- // if buffered output is enabled set the overlap to max else set it to default
- obs_data_set_int(settings, "overlap_size_msec",
- obs_data_get_bool(settings, "buffered_output")
- ? MAX_OVERLAP_SIZE_MSEC
- : DEFAULT_OVERLAP_SIZE_MSEC);
- return true;
- });
obs_properties_add_bool(ppts, "log_words", MT_("log_words"));
obs_properties_add_bool(ppts, "caption_to_stream", MT_("caption_to_stream"));

View File

@@ -117,3 +117,23 @@ std::vector<std::string> split(const std::string &string, char delimiter)
}
return tokens;
}
+ std::vector<std::string> split_words(const std::string &str_copy)
+ {
+ std::vector<std::string> words;
+ std::string word;
+ for (char c : str_copy) {
+ if (std::isspace(c)) {
+ if (!word.empty()) {
+ words.push_back(word);
+ word.clear();
+ }
+ } else {
+ word += c;
+ }
+ }
+ if (!word.empty()) {
+ words.push_back(word);
+ }
+ return words;
+ }

View File

@@ -4,36 +4,17 @@
#include <string>
#include <vector>
#include <chrono>
- #include <media-io/audio-io.h>
// Fix UTF8 string for Windows
std::string fix_utf8(const std::string &str);
// Remove leading and trailing non-alphabetic characters
std::string remove_leading_trailing_nonalpha(const std::string &str);
// Split a string by a delimiter
std::vector<std::string> split(const std::string &string, char delimiter);
- inline enum speaker_layout convert_speaker_layout(uint8_t channels)
- {
- switch (channels) {
- case 0:
- return SPEAKERS_UNKNOWN;
- case 1:
- return SPEAKERS_MONO;
- case 2:
- return SPEAKERS_STEREO;
- case 3:
- return SPEAKERS_2POINT1;
- case 4:
- return SPEAKERS_4POINT0;
- case 5:
- return SPEAKERS_4POINT1;
- case 6:
- return SPEAKERS_5POINT1;
- case 8:
- return SPEAKERS_7POINT1;
- default:
- return SPEAKERS_UNKNOWN;
- }
- }
// Get the current timestamp in milliseconds since epoch
inline uint64_t now_ms()
{
return std::chrono::duration_cast<std::chrono::milliseconds>(
@@ -41,4 +22,7 @@ inline uint64_t now_ms()
.count();
}
+ // Split a string into words based on spaces
+ std::vector<std::string> split_words(const std::string &str_copy);
#endif // TRANSCRIPTION_UTILS_H

View File

@@ -1,21 +0,0 @@
#include "utils.h"
std::vector<std::string> split_words(const std::string &str_copy)
{
std::vector<std::string> words;
std::string word;
for (char c : str_copy) {
if (std::isspace(c)) {
if (!word.empty()) {
words.push_back(word);
word.clear();
}
} else {
word += c;
}
}
if (!word.empty()) {
words.push_back(word);
}
return words;
}

View File

@@ -1,9 +0,0 @@
#ifndef UTILS_H
#define UTILS_H
#include <string>
#include <vector>
std::vector<std::string> split_words(const std::string &str_copy);
#endif // UTILS_H

View File

@@ -23,7 +23,9 @@ TokenBufferThread::~TokenBufferThread()
stop = true;
}
condVar.notify_all();
- workerThread.join();
+ if (workerThread.joinable()) {
+ workerThread.join();
+ }
}
void TokenBufferThread::initialize(struct transcription_filter_data *gf_,
@@ -38,10 +40,20 @@ void TokenBufferThread::initialize(struct transcription_filter_data *gf_,
this->numPerSentence = numPerSentence_;
this->segmentation = segmentation_;
this->maxTime = maxTime_;
- this->initialized = true;
+ this->stop = false;
this->workerThread = std::thread(&TokenBufferThread::monitor, this);
}
+ void TokenBufferThread::stopThread()
+ {
+ std::lock_guard<std::mutex> lock(queueMutex);
+ stop = true;
+ condVar.notify_all();
+ if (workerThread.joinable()) {
+ workerThread.join();
+ }
+ }
void TokenBufferThread::log_token_vector(const std::vector<std::string> &tokens)
{
std::string output;
@@ -81,21 +93,22 @@ void TokenBufferThread::addSentence(const std::string &sentence)
}
}
void TokenBufferThread::clear()
{
obs_log(LOG_INFO, "TokenBufferThread::clear");
std::lock_guard<std::mutex> lock(queueMutex);
inputQueue.clear();
presentationQueue.clear();
this->callback("");
}
void TokenBufferThread::monitor()
{
obs_log(LOG_INFO, "TokenBufferThread::monitor");
this->callback("");
while (this->initialized && !this->stop) {
if (this->stop) {
break;
}
if (this->gf->whisper_context == nullptr) {
continue;
}
while (!this->stop) {
// condition presentation queue
if (presentationQueue.size() == this->numSentences * this->numPerSentence) {
// pop a whole sentence from the presentation queue front

View File

@@ -36,6 +36,10 @@ public:
TokenBufferSegmentation segmentation_ = SEGMENTATION_TOKEN);
void addSentence(const std::string &sentence);
void clear();
+ void stopThread();
+ bool isEnabled() const { return !stop; }
private:
void monitor();
@@ -48,8 +52,7 @@ private:
std::condition_variable condVar;
std::function<void(std::string)> callback;
std::chrono::seconds maxTime;
- bool stop;
- bool initialized = false;
+ bool stop = true;
bool newDataAvailable = false;
size_t numSentences;
size_t numPerSentence;

View File

@@ -102,9 +102,5 @@ void update_whisper_model(struct transcription_filter_data *gf, obs_data_t *s)
gf->enable_token_ts_dtw = obs_data_get_bool(s, "dtw_token_timestamps");
shutdown_whisper_thread(gf);
start_whisper_thread_with_path(gf, gf->whisper_model_path, silero_vad_model_file);
- } else {
- // dtw_token_timestamps did not change
- obs_log(gf->log_level, "dtw_token_timestamps did not change: %d == %d",
- gf->enable_token_ts_dtw, new_dtw_timestamps);
- }
}

View File

@@ -283,18 +283,22 @@ void run_inference_and_callbacks(transcription_filter_data *gf, uint64_t start_o
uint64_t end_offset_ms, int vad_state)
{
// get the data from the entire whisper buffer
+ // add 50ms of silence to the beginning and end of the buffer
const size_t pcm32f_size = gf->whisper_buffer.size / sizeof(float);
+ const size_t pcm32f_size_with_silence = pcm32f_size + 2 * WHISPER_SAMPLE_RATE / 100;
// allocate a new buffer and copy the data to it
- float *pcm32f_data = (float *)bzalloc(pcm32f_size * sizeof(float));
- circlebuf_pop_back(&gf->whisper_buffer, pcm32f_data, pcm32f_size * sizeof(float));
+ float *pcm32f_data = (float *)bzalloc(pcm32f_size_with_silence * sizeof(float));
+ circlebuf_pop_back(&gf->whisper_buffer, pcm32f_data + WHISPER_SAMPLE_RATE / 100,
+ pcm32f_size * sizeof(float));
- struct DetectionResultWithText inference_result =
- run_whisper_inference(gf, pcm32f_data, pcm32f_size, start_offset_ms, end_offset_ms);
+ struct DetectionResultWithText inference_result = run_whisper_inference(
+ gf, pcm32f_data, pcm32f_size_with_silence, start_offset_ms, end_offset_ms);
// output inference result to a text source
set_text_callback(gf, inference_result);
if (gf->enable_audio_chunks_callback) {
- audio_chunk_callback(gf, pcm32f_data, pcm32f_size, vad_state, inference_result);
+ audio_chunk_callback(gf, pcm32f_data, pcm32f_size_with_silence, vad_state,
+ inference_result);
}
// free the buffer