From 7bccfd96d6f6728b9052b5a428a6fcaefa6f634a Mon Sep 17 00:00:00 2001
From: Roy Shilkrot
Date: Mon, 3 Jun 2024 00:34:38 -0400
Subject: [PATCH] refactor: Remove unused utils.h and utils.cpp files

Fold split_words() into transcription-utils and move convert_speaker_layout()
plus the "LocalVocal Subtitles" text-source creation into a new
transcription-filter-utils module. Also extract the recording-state frontend
callback, tie the buffered-output caption thread lifecycle to its setting,
react to parent media-source signals, and pad whisper inference input with
leading/trailing silence.

---
 CMakeLists.txt                            |   7 +-
 src/tests/localvocal-offline-test.cpp     |  18 +-
 src/transcription-filter-callbacks.cpp    |  38 +++-
 src/transcription-filter-callbacks.h      |   2 +
 src/transcription-filter-data.h           |   1 +
 src/transcription-filter-utils.cpp        |  55 +++++
 src/transcription-filter-utils.h          |  33 +++
 src/transcription-filter.cpp              | 248 ++++++++++------------
 src/transcription-utils.cpp               |  20 ++
 src/transcription-utils.h                 |  34 +--
 src/utils.cpp                             |  21 --
 src/utils.h                               |   9 -
 src/whisper-utils/token-buffer-thread.cpp |  35 ++-
 src/whisper-utils/token-buffer-thread.h   |   7 +-
 src/whisper-utils/whisper-model-utils.cpp |   4 -
 src/whisper-utils/whisper-processing.cpp  |  14 +-
 16 files changed, 330 insertions(+), 216 deletions(-)
 create mode 100644 src/transcription-filter-utils.cpp
 create mode 100644 src/transcription-filter-utils.h
 delete mode 100644 src/utils.cpp
 delete mode 100644 src/utils.h

diff --git a/CMakeLists.txt b/CMakeLists.txt
index 063ec64..8b58b07 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -88,6 +88,7 @@ target_sources(
   src/transcription-filter.cpp
   src/transcription-filter.c
   src/transcription-filter-callbacks.cpp
+  src/transcription-filter-utils.cpp
   src/transcription-utils.cpp
   src/model-utils/model-downloader.cpp
   src/model-utils/model-downloader-ui.cpp
@@ -100,8 +101,7 @@ target_sources(
   src/whisper-utils/token-buffer-thread.cpp
   src/translation/language_codes.cpp
   src/translation/translation.cpp
-  src/translation/translation-utils.cpp
-  src/utils.cpp)
+  src/translation/translation-utils.cpp)

 set_target_properties_plugin(${CMAKE_PROJECT_NAME} PROPERTIES OUTPUT_NAME ${_name})

@@ -122,8 +122,7 @@ if(ENABLE_TESTS)
     src/whisper-utils/silero-vad-onnx.cpp
     src/whisper-utils/token-buffer-thread.cpp
     src/translation/language_codes.cpp
-    src/translation/translation.cpp
-    src/utils.cpp)
+    src/translation/translation.cpp)

   find_libav(${CMAKE_PROJECT_NAME}-tests)

diff --git a/src/tests/localvocal-offline-test.cpp b/src/tests/localvocal-offline-test.cpp
index f36c12f..7bfd777 100644
--- a/src/tests/localvocal-offline-test.cpp
+++ b/src/tests/localvocal-offline-test.cpp
@@ -13,6 +13,7 @@
 #include
 #include "transcription-filter-data.h"
+#include "transcription-filter-utils.h"
 #include "transcription-filter.h"
 #include "transcription-utils.h"
 #include "whisper-utils/whisper-utils.h"
@@ -155,11 +156,12 @@ create_context(int sample_rate, int channels, const std::string &whisper_model_p
 	gf->whisper_params = whisper_full_default_params(whisper_sampling_method);
 	gf->whisper_params.duration_ms = 3000;
 	gf->whisper_params.language = "en";
+	gf->whisper_params.detect_language = false;
 	gf->whisper_params.initial_prompt = "";
 	gf->whisper_params.n_threads = 4;
 	gf->whisper_params.n_max_text_ctx = 16384;
 	gf->whisper_params.translate = false;
-	gf->whisper_params.no_context = true;
+	gf->whisper_params.no_context = false;
 	gf->whisper_params.single_segment = true;
 	gf->whisper_params.print_special = false;
 	gf->whisper_params.print_progress = false;
@@ -174,7 +176,7 @@
 	gf->whisper_params.speed_up = false;
 	gf->whisper_params.suppress_blank = true;
 	gf->whisper_params.suppress_non_speech_tokens = true;
-	gf->whisper_params.temperature = 0.1;
+	gf->whisper_params.temperature = 0.0;
 	gf->whisper_params.max_initial_ts = 1.0;
 	gf->whisper_params.length_penalty = -1;
 	gf->active = true;
@@ -201,7 +203,7 @@ void audio_chunk_callback(struct transcription_filter_data *gf, const float *pcm
 	// 	numeral = "0" + numeral;
 	// }

-	// save the audio to a .wav file
+	// // save the audio to a .wav file
 	// std::string filename = "audio_chunk_" + numeral + vad_state_str + ".wav";
 	// obs_log(gf->log_level, "Saving %lu frames to %s", frames, filename.c_str());
 	// write_audio_wav_file(filename.c_str(), pcm32f_data, frames);
@@ -388,6 +390,16 @@ int wmain(int argc, wchar_t *argv[])
 		gf->enable_audio_chunks_callback = config["enable_audio_chunks_callback"];
 	}
+	if (config.contains("temperature")) {
+		obs_log(LOG_INFO, "Setting temperature to %f",
+			config["temperature"].get<float>());
+		gf->whisper_params.temperature = config["temperature"].get<float>();
+	}
+	if (config.contains("no_context")) {
+		obs_log(LOG_INFO, "Setting no_context to %s",
+			config["no_context"] ? "true" : "false");
+		gf->whisper_params.no_context = config["no_context"];
+	}
 	// set log level
 	if (logLevelStr == "debug") {
 		gf->log_level = LOG_DEBUG;
diff --git a/src/transcription-filter-callbacks.cpp b/src/transcription-filter-callbacks.cpp
index 8fddafd..f386e16 100644
--- a/src/transcription-filter-callbacks.cpp
+++ b/src/transcription-filter-callbacks.cpp
@@ -20,16 +20,19 @@

 #define SEND_TIMED_METADATA_URL "http://localhost:8080/timed-metadata"

-void send_caption_to_source(const std::string &target_source_name, const std::string &str_copy,
+void send_caption_to_source(const std::string &target_source_name, const std::string &caption,
 			    struct transcription_filter_data *gf)
 {
+	if (target_source_name.empty()) {
+		return;
+	}
 	auto target = obs_get_source_by_name(target_source_name.c_str());
 	if (!target) {
 		obs_log(gf->log_level, "text_source target is null");
 		return;
 	}
 	auto text_settings = obs_source_get_settings(target);
-	obs_data_set_string(text_settings, "text", str_copy.c_str());
+	obs_data_set_string(text_settings, "text", caption.c_str());
 	obs_source_update(target, text_settings);
 	obs_source_release(target);
 }
@@ -228,3 +231,34 @@ void set_text_callback(struct transcription_filter_data *gf,
 		}
 	}
 };
+
+void recording_state_callback(enum obs_frontend_event event, void *data)
+{
+	struct transcription_filter_data *gf_ =
+		static_cast<struct transcription_filter_data *>(data);
+	if (event == OBS_FRONTEND_EVENT_RECORDING_STARTING) {
+		if (gf_->save_srt && gf_->save_only_while_recording) {
+			obs_log(gf_->log_level, "Recording started. Resetting srt file.");
+			// truncate file if it exists
+			std::ofstream output_file(gf_->output_file_path,
+						  std::ios::out | std::ios::trunc);
+			output_file.close();
+			gf_->sentence_number = 1;
+			gf_->start_timestamp_ms = now_ms();
+		}
+	} else if (event == OBS_FRONTEND_EVENT_RECORDING_STOPPED) {
+		if (gf_->save_srt && gf_->save_only_while_recording &&
+		    gf_->rename_file_to_match_recording) {
+			obs_log(gf_->log_level, "Recording stopped. Rename srt file.");
+			// rename file to match the recording file name with .srt extension
+			// use obs_frontend_get_last_recording to get the last recording file name
+			std::string recording_file_name = obs_frontend_get_last_recording();
+			// remove the extension
+			recording_file_name = recording_file_name.substr(
+				0, recording_file_name.find_last_of("."));
+			std::string srt_file_name = recording_file_name + ".srt";
+			// rename the file
+			std::rename(gf_->output_file_path.c_str(), srt_file_name.c_str());
+		}
+	}
+}
diff --git a/src/transcription-filter-callbacks.h b/src/transcription-filter-callbacks.h
index 656b140..481af9f 100644
--- a/src/transcription-filter-callbacks.h
+++ b/src/transcription-filter-callbacks.h
@@ -15,4 +15,6 @@ void audio_chunk_callback(struct transcription_filter_data *gf, const float *pcm
 void set_text_callback(struct transcription_filter_data *gf,
 		       const DetectionResultWithText &resultIn);

+void recording_state_callback(enum obs_frontend_event event, void *data);
+
 #endif /* TRANSCRIPTION_FILTER_CALLBACKS_H */
diff --git a/src/transcription-filter-data.h b/src/transcription-filter-data.h
index 572e596..3ce5ae3 100644
--- a/src/transcription-filter-data.h
+++ b/src/transcription-filter-data.h
@@ -80,6 +80,7 @@ struct transcription_filter_data {
 	bool fix_utf8 = true;
 	bool enable_audio_chunks_callback = false;
 	bool send_timed_metadata = false;
+	bool source_signals_set = false;

 	// Last transcription result
 	std::string last_text;
diff --git a/src/transcription-filter-utils.cpp b/src/transcription-filter-utils.cpp
new file mode 100644
index 0000000..72f313c
--- /dev/null
+++ b/src/transcription-filter-utils.cpp
@@ -0,0 +1,55 @@
+#include "transcription-filter-utils.h"
+
+#include
+#include
+#include
+
+void create_obs_text_source()
+{
+	// create a new OBS text source called "LocalVocal Subtitles"
+	obs_source_t *scene_as_source = obs_frontend_get_current_scene();
+	obs_scene_t *scene = obs_scene_from_source(scene_as_source);
+#ifdef _WIN32
+	obs_source_t *source =
+		obs_source_create("text_gdiplus_v2", "LocalVocal Subtitles", nullptr, nullptr);
+#else
+	obs_source_t *source =
+		obs_source_create("text_ft2_source_v2", "LocalVocal Subtitles", nullptr, nullptr);
+#endif
+	if (source) {
+		// add source to the current scene
+		obs_scene_add(scene, source);
+		// set source settings
+		obs_data_t *source_settings = obs_source_get_settings(source);
+		obs_data_set_bool(source_settings, "word_wrap", true);
+		obs_data_set_int(source_settings, "custom_width", 1760);
+		obs_data_t *font_data = obs_data_create();
+		obs_data_set_string(font_data, "face", "Arial");
+		obs_data_set_string(font_data, "style", "Regular");
+		obs_data_set_int(font_data, "size", 72);
+		obs_data_set_int(font_data, "flags", 0);
+		obs_data_set_obj(source_settings, "font", font_data);
+		obs_data_release(font_data);
+		obs_source_update(source, source_settings);
+		obs_data_release(source_settings);
+
+		// set transform settings
+		obs_transform_info transform_info;
+		transform_info.pos.x = 962.0;
+		transform_info.pos.y = 959.0;
+		transform_info.bounds.x = 1769.0;
+		transform_info.bounds.y = 145.0;
+		transform_info.bounds_type = obs_bounds_type::OBS_BOUNDS_SCALE_INNER;
+		transform_info.bounds_alignment = OBS_ALIGN_CENTER;
+		transform_info.alignment = OBS_ALIGN_CENTER;
+		transform_info.scale.x = 1.0;
+		transform_info.scale.y = 1.0;
+		transform_info.rot = 0.0;
+		obs_sceneitem_t *source_sceneitem = obs_scene_sceneitem_from_source(scene, source);
+		obs_sceneitem_set_info(source_sceneitem, &transform_info);
+		obs_sceneitem_release(source_sceneitem);
+
+		obs_source_release(source);
+	}
+	obs_source_release(scene_as_source);
+}
diff --git a/src/transcription-filter-utils.h b/src/transcription-filter-utils.h
new file mode 100644
index 0000000..9f24d55
--- /dev/null
+++ b/src/transcription-filter-utils.h
@@ -0,0 +1,33 @@
+#ifndef TRANSCRIPTION_FILTER_UTILS_H
+#define TRANSCRIPTION_FILTER_UTILS_H
+
+#include
+
+// Convert channels number to a speaker layout
+inline enum speaker_layout convert_speaker_layout(uint8_t channels)
+{
+	switch (channels) {
+	case 0:
+		return SPEAKERS_UNKNOWN;
+	case 1:
+		return SPEAKERS_MONO;
+	case 2:
+		return SPEAKERS_STEREO;
+	case 3:
+		return SPEAKERS_2POINT1;
+	case 4:
+		return SPEAKERS_4POINT0;
+	case 5:
+		return SPEAKERS_4POINT1;
+	case 6:
+		return SPEAKERS_5POINT1;
+	case 8:
+		return SPEAKERS_7POINT1;
+	default:
+		return SPEAKERS_UNKNOWN;
+	}
+}
+
+void create_obs_text_source();
+
+#endif // TRANSCRIPTION_FILTER_UTILS_H
diff --git a/src/transcription-filter.cpp b/src/transcription-filter.cpp
index 5e3a5dc..319573a 100644
--- a/src/transcription-filter.cpp
+++ b/src/transcription-filter.cpp
@@ -18,6 +18,7 @@
 #include "transcription-filter.h"
 #include "transcription-filter-callbacks.h"
 #include "transcription-filter-data.h"
+#include "transcription-filter-utils.h"
 #include "transcription-utils.h"
 #include "model-utils/model-downloader.h"
 #include "whisper-utils/whisper-processing.h"
@@ -28,7 +29,6 @@
 #include "translation/translation-utils.h"
 #include "translation/translation.h"
 #include "translation/translation-includes.h"
-#include "utils.h"

 bool add_sources_to_list(void *list_property, obs_source_t *source)
 {
@@ -44,6 +44,71 @@ bool add_sources_to_list(void *list_property, obs_source_t *source)
 	return true;
 }

+void set_source_signals(transcription_filter_data *gf, obs_source_t *parent_source)
+{
+	obs_log(LOG_INFO, "parent source name: %s", obs_source_get_name(parent_source));
+	signal_handler_t *sh = obs_source_get_signal_handler(parent_source);
+	signal_handler_connect(
+		sh, "media_play",
+		[](void *data_, calldata_t *cd) {
+			obs_log(LOG_INFO, "media_play");
+			transcription_filter_data *gf_ =
+				static_cast<transcription_filter_data *>(data_);
+			gf_->active = true;
+		},
+		gf);
+	signal_handler_connect(
+		sh, "media_started",
+		[](void *data_, calldata_t *cd) {
+			obs_log(LOG_INFO, "media_started");
+			transcription_filter_data *gf_ =
+				static_cast<transcription_filter_data *>(data_);
+			gf_->active = true;
+		},
+		gf);
+	signal_handler_connect(
+		sh, "media_pause",
+		[](void *data_, calldata_t *cd) {
+			obs_log(LOG_INFO, "media_pause");
+			transcription_filter_data *gf_ =
+				static_cast<transcription_filter_data *>(data_);
+			gf_->active = false;
+		},
+		gf);
+	signal_handler_connect(
+		sh, "media_restart",
+		[](void *data_, calldata_t *cd) {
+			obs_log(LOG_INFO, "media_restart");
+			transcription_filter_data *gf_ =
+				static_cast<transcription_filter_data *>(data_);
+			gf_->active = true;
+			gf_->captions_monitor.clear();
+			send_caption_to_source(gf_->text_source_name, "", gf_);
+		},
+		gf);
+	signal_handler_connect(
+		sh, "media_stopped",
+		[](void *data_, calldata_t *cd) {
+			obs_log(LOG_INFO, "media_stopped");
+			transcription_filter_data *gf_ =
+				static_cast<transcription_filter_data *>(data_);
+			gf_->active = false;
+			gf_->captions_monitor.clear();
+			send_caption_to_source(gf_->text_source_name, "", gf_);
+			// flush the buffer
+			{
+				std::lock_guard<std::mutex> lock(gf_->whisper_buf_mutex);
+				for (size_t c = 0; c < gf_->channels; c++) {
+					circlebuf_free(&gf_->input_buffers[c]);
+				}
+				circlebuf_free(&gf_->info_buffer);
+				circlebuf_free(&gf_->whisper_buffer);
+			}
+		},
+		gf);
+	gf->source_signals_set = true;
+}
+
 struct obs_audio_data *transcription_filter_filter_audio(void *data, struct obs_audio_data *audio)
 {
 	if (!audio) {
@@ -56,14 +121,16 @@ struct obs_audio_data *transcription_filter_filter_audio(void *data, struct obs_
 	struct transcription_filter_data *gf =
 		static_cast<struct transcription_filter_data *>(data);

-	if (!gf->active) {
-		return audio;
+	// Lazy initialization of source signals
+	if (!gf->source_signals_set) {
+		// obs_filter_get_parent only works in the filter function
+		obs_source_t *parent_source = obs_filter_get_parent(gf->context);
+		if (parent_source != nullptr) {
+			set_source_signals(gf, parent_source);
+		}
 	}

-	// Check if the parent source is muted
-	obs_source_t *parent_source = obs_filter_get_parent(gf->context);
-	if (gf->process_while_muted == false && obs_source_muted(parent_source)) {
-		// Source is muted, do not process audio
+	if (!gf->active) {
 		return audio;
 	}

@@ -72,6 +139,17 @@
 		return audio;
 	}

+	// Check if process while muted is not enabled (e.g. the user wants to avoid processing audio
+	// when the source is muted)
+	if (!gf->process_while_muted) {
+		// Check if the parent source is muted
+		obs_source_t *parent_source = obs_filter_get_parent(gf->context);
+		if (parent_source != nullptr && obs_source_muted(parent_source)) {
+			// Source is muted, do not process audio
+			return audio;
+		}
+	}
+
 	{
 		std::lock_guard<std::mutex> lock(gf->whisper_buf_mutex); // scoped lock
 		// push back current audio data to input circlebuf
@@ -122,12 +200,11 @@ void transcription_filter_destroy(void *data)

 void transcription_filter_update(void *data, obs_data_t *s)
 {
+	obs_log(LOG_INFO, "LocalVocal filter update");
 	struct transcription_filter_data *gf =
 		static_cast<struct transcription_filter_data *>(data);

 	gf->log_level = (int)obs_data_get_int(s, "log_level");
-	obs_log(gf->log_level, "filter update");
-
 	gf->vad_enabled = obs_data_get_bool(s, "vad_enabled");
 	gf->log_words = obs_data_get_bool(s, "log_words");
 	gf->caption_to_stream = obs_data_get_bool(s, "caption_to_stream");
@@ -142,7 +219,34 @@ void transcription_filter_update(void *data, obs_data_t *s)
 	gf->process_while_muted = obs_data_get_bool(s, "process_while_muted");
 	gf->min_sub_duration = (int)obs_data_get_int(s, "min_sub_duration");
 	gf->last_sub_render_time = 0;
-	gf->buffered_output = obs_data_get_bool(s, "buffered_output");
+	bool new_buffered_output = obs_data_get_bool(s, "buffered_output");
+
+	if (new_buffered_output) {
+		obs_log(LOG_INFO, "buffered_output enable");
+		if (!gf->buffered_output || !gf->captions_monitor.isEnabled()) {
+			obs_log(LOG_INFO, "buffered_output currently disabled, enabling");
+			gf->buffered_output = true;
+			gf->captions_monitor.initialize(
+				gf,
+				[gf](const std::string &text) {
+					if (gf->buffered_output) {
+						send_caption_to_source(gf->text_source_name, text,
+								       gf);
+					}
+				},
+				2, 30, std::chrono::seconds(10));
+		}
+	} else {
+		obs_log(LOG_INFO, "buffered_output disable");
+		if (gf->buffered_output) {
+			obs_log(LOG_INFO, "buffered_output currently enabled, disabling");
+			if (gf->captions_monitor.isEnabled()) {
+				gf->captions_monitor.clear();
+				gf->captions_monitor.stopThread();
+			}
+			gf->buffered_output = false;
+		}
+	}

 	bool new_translate = obs_data_get_bool(s, "translate");
 	gf->source_lang = obs_data_get_string(s, "translate_source_language");
@@ -195,7 +299,6 @@
 	obs_log(gf->log_level, "update text source");
 	// update the text source
 	const char *new_text_source_name = obs_data_get_string(s, "subtitle_sources");
-	obs_weak_source_t *old_weak_text_source = NULL;

 	if (new_text_source_name == nullptr || strcmp(new_text_source_name, "none") == 0 ||
 	    strcmp(new_text_source_name, "(null)") == 0 ||
@@ -212,16 +315,7 @@
 			}
 		}
 	} else {
-		// new selected text source is valid, check if it's different from the old one
-		if (gf->text_source_name != new_text_source_name) {
-			// new text source is different from the old one, release the old one
-			gf->text_source_name = new_text_source_name;
-		}
-	}
-
-	if (old_weak_text_source) {
-		obs_log(gf->log_level, "releasing old text source");
-		obs_weak_source_release(old_weak_text_source);
+		gf->text_source_name = new_text_source_name;
 	}

 	obs_log(gf->log_level, "update whisper model");
@@ -333,53 +427,7 @@ void *transcription_filter_create(obs_data_t *settings, obs_source_t *filter)
 		obs_source_release(source);
 	} else {
 		// create a new OBS text source called "LocalVocal Subtitles"
-		obs_source_t *scene_as_source = obs_frontend_get_current_scene();
-		obs_scene_t *scene = obs_scene_from_source(scene_as_source);
-#ifdef _WIN32
-		source = obs_source_create("text_gdiplus_v2", "LocalVocal Subtitles",
-					   nullptr, nullptr);
-#else
-		source = obs_source_create("text_ft2_source_v2", "LocalVocal Subtitles",
-					   nullptr, nullptr);
-#endif
-		if (source) {
-			// add source to the current scene
-			obs_scene_add(scene, source);
-			// set source settings
-			obs_data_t *source_settings = obs_source_get_settings(source);
-			obs_data_set_bool(source_settings, "word_wrap", true);
-			obs_data_set_int(source_settings, "custom_width", 1760);
-			obs_data_t *font_data = obs_data_create();
-			obs_data_set_string(font_data, "face", "Arial");
-			obs_data_set_string(font_data, "style", "Regular");
-			obs_data_set_int(font_data, "size", 72);
-			obs_data_set_int(font_data, "flags", 0);
-			obs_data_set_obj(source_settings, "font", font_data);
-			obs_data_release(font_data);
-			obs_source_update(source, source_settings);
-			obs_data_release(source_settings);
-
-			// set transform settings
-			obs_transform_info transform_info;
-			transform_info.pos.x = 962.0;
-			transform_info.pos.y = 959.0;
-			transform_info.bounds.x = 1769.0;
-			transform_info.bounds.y = 145.0;
-			transform_info.bounds_type =
-				obs_bounds_type::OBS_BOUNDS_SCALE_INNER;
-			transform_info.bounds_alignment = OBS_ALIGN_CENTER;
-			transform_info.alignment = OBS_ALIGN_CENTER;
-			transform_info.scale.x = 1.0;
-			transform_info.scale.y = 1.0;
-			transform_info.rot = 0.0;
-			obs_sceneitem_t *source_sceneitem =
-				obs_scene_sceneitem_from_source(scene, source);
-			obs_sceneitem_set_info(source_sceneitem, &transform_info);
-			obs_sceneitem_release(source_sceneitem);
-
-			obs_source_release(source);
-		}
-		obs_source_release(scene_as_source);
+		create_obs_text_source();
 	}
 	gf->text_source_name = "LocalVocal Subtitles";
 	obs_data_set_string(settings, "subtitle_sources", "LocalVocal Subtitles");
@@ -393,15 +441,6 @@ void *transcription_filter_create(obs_data_t *settings, obs_source_t *filter)
 	gf->whisper_model_path = std::string(""); // The update function will set the model path
 	gf->whisper_context = nullptr;

-	gf->captions_monitor.initialize(
-		gf,
-		[gf](const std::string &text) {
-			if (gf->buffered_output) {
-				send_caption_to_source(gf->text_source_name, text, gf);
-			}
-		},
-		2, 30, std::chrono::seconds(10));
-
 	obs_log(gf->log_level, "run update");
 	// get the settings updated on the filter data struct
 	transcription_filter_update(gf, settings);
@@ -410,45 +449,7 @@
 	// handle the event OBS_FRONTEND_EVENT_RECORDING_STARTING to reset the srt sentence number
 	// to match the subtitles with the recording
-	obs_frontend_add_event_callback(
-		[](enum obs_frontend_event event, void *private_data) {
-			if (event == OBS_FRONTEND_EVENT_RECORDING_STARTING) {
-				struct transcription_filter_data *gf_ =
-					static_cast<struct transcription_filter_data *>(
-						private_data);
-				if (gf_->save_srt && gf_->save_only_while_recording) {
-					obs_log(gf_->log_level,
-						"Recording started. Resetting srt file.");
-					// truncate file if it exists
-					std::ofstream output_file(gf_->output_file_path,
-								  std::ios::out | std::ios::trunc);
-					output_file.close();
-					gf_->sentence_number = 1;
-					gf_->start_timestamp_ms = now_ms();
-				}
-			} else if (event == OBS_FRONTEND_EVENT_RECORDING_STOPPED) {
-				struct transcription_filter_data *gf_ =
-					static_cast<struct transcription_filter_data *>(
-						private_data);
-				if (gf_->save_srt && gf_->save_only_while_recording &&
-				    gf_->rename_file_to_match_recording) {
-					obs_log(gf_->log_level,
-						"Recording stopped. Rename srt file.");
-					// rename file to match the recording file name with .srt extension
-					// use obs_frontend_get_last_recording to get the last recording file name
-					std::string recording_file_name =
-						obs_frontend_get_last_recording();
-					// remove the extension
-					recording_file_name = recording_file_name.substr(
-						0, recording_file_name.find_last_of("."));
-					std::string srt_file_name = recording_file_name + ".srt";
-					// rename the file
-					std::rename(gf_->output_file_path.c_str(),
-						    srt_file_name.c_str());
-				}
-			}
-		},
-		gf);
+	obs_frontend_add_event_callback(recording_state_callback, gf);

 	obs_log(gf->log_level, "filter created.");
 	return gf;
@@ -541,7 +542,7 @@ void transcription_filter_defaults(obs_data_t *s)
 	obs_data_set_default_double(s, "thold_ptsum", 0.01);
 	obs_data_set_default_int(s, "max_len", 0);
 	obs_data_set_default_bool(s, "split_on_word", true);
-	obs_data_set_default_int(s, "max_tokens", 32);
+	obs_data_set_default_int(s, "max_tokens", 0);
 	obs_data_set_default_bool(s, "speed_up", false);
 	obs_data_set_default_bool(s, "suppress_blank", false);
 	obs_data_set_default_bool(s, "suppress_non_speech_tokens", true);
@@ -776,19 +777,6 @@ obs_properties_t *transcription_filter_properties(void *data)
 	obs_property_t *buffered_output_prop =
 		obs_properties_add_bool(ppts, "buffered_output", MT_("buffered_output"));

-	// add on-change handler for buffered_output
-	obs_property_set_modified_callback(buffered_output_prop, [](obs_properties_t *props,
-								     obs_property_t *property,
-								     obs_data_t *settings) {
-		UNUSED_PARAMETER(property);
-		UNUSED_PARAMETER(props);
-		// if buffered output is enabled set the overlap to max else set it to default
-		obs_data_set_int(settings, "overlap_size_msec",
-				 obs_data_get_bool(settings, "buffered_output")
-					 ? MAX_OVERLAP_SIZE_MSEC
-					 : DEFAULT_OVERLAP_SIZE_MSEC);
-		return true;
-	});

 	obs_properties_add_bool(ppts, "log_words", MT_("log_words"));
 	obs_properties_add_bool(ppts, "caption_to_stream", MT_("caption_to_stream"));
diff --git a/src/transcription-utils.cpp b/src/transcription-utils.cpp
index ca9e0f1..415b47b 100644
--- a/src/transcription-utils.cpp
+++ b/src/transcription-utils.cpp
@@ -117,3 +117,23 @@ std::vector<std::string> split(const std::string &string, char delimiter)
 	}
 	return tokens;
 }
+
+std::vector<std::string> split_words(const std::string &str_copy)
+{
+	std::vector<std::string> words;
+	std::string word;
+	for (char c : str_copy) {
+		if (std::isspace(c)) {
+			if (!word.empty()) {
+				words.push_back(word);
+				word.clear();
+			}
+		} else {
+			word += c;
+		}
+	}
+	if (!word.empty()) {
+		words.push_back(word);
+	}
+	return words;
+}
diff --git a/src/transcription-utils.h b/src/transcription-utils.h
index e5eb274..4e7f39c 100644
--- a/src/transcription-utils.h
+++ b/src/transcription-utils.h
@@ -4,36 +4,17 @@
 #include
 #include
 #include
-#include

+// Fix UTF8 string for Windows
 std::string fix_utf8(const std::string &str);
+
+// Remove leading and trailing non-alphabetic characters
 std::string remove_leading_trailing_nonalpha(const std::string &str);
+
+// Split a string by a delimiter
 std::vector<std::string> split(const std::string &string, char delimiter);

-inline enum speaker_layout convert_speaker_layout(uint8_t channels)
-{
-	switch (channels) {
-	case 0:
-		return SPEAKERS_UNKNOWN;
-	case 1:
-		return SPEAKERS_MONO;
-	case 2:
-		return SPEAKERS_STEREO;
-	case 3:
-		return SPEAKERS_2POINT1;
-	case 4:
-		return SPEAKERS_4POINT0;
-	case 5:
-		return SPEAKERS_4POINT1;
-	case 6:
-		return SPEAKERS_5POINT1;
-	case 8:
-		return SPEAKERS_7POINT1;
-	default:
-		return SPEAKERS_UNKNOWN;
-	}
-}
-
+// Get the current timestamp in milliseconds since epoch
 inline uint64_t now_ms()
 {
 	return std::chrono::duration_cast<std::chrono::milliseconds>(
@@ -41,4 +22,7 @@ inline uint64_t now_ms()
 		.count();
 }

+// Split a string into words based on spaces
+std::vector<std::string> split_words(const std::string &str_copy);
+
 #endif // TRANSCRIPTION_UTILS_H
diff --git a/src/utils.cpp b/src/utils.cpp
deleted file mode 100644
index 6639ae7..0000000
--- a/src/utils.cpp
+++ /dev/null
@@ -1,21 +0,0 @@
-#include "utils.h"
-
-std::vector<std::string> split_words(const std::string &str_copy)
-{
-	std::vector<std::string> words;
-	std::string word;
-	for (char c : str_copy) {
-		if (std::isspace(c)) {
-			if (!word.empty()) {
-				words.push_back(word);
-				word.clear();
-			}
-		} else {
-			word += c;
-		}
-	}
-	if (!word.empty()) {
-		words.push_back(word);
-	}
-	return words;
-}
diff --git a/src/utils.h b/src/utils.h
deleted file mode 100644
index 9348417..0000000
--- a/src/utils.h
+++ /dev/null
@@ -1,9 +0,0 @@
-#ifndef UTILS_H
-#define UTILS_H
-
-#include
-#include
-
-std::vector<std::string> split_words(const std::string &str_copy);
-
-#endif // UTILS_H
diff --git a/src/whisper-utils/token-buffer-thread.cpp b/src/whisper-utils/token-buffer-thread.cpp
index a6b1110..aa4db2e 100644
--- a/src/whisper-utils/token-buffer-thread.cpp
+++ b/src/whisper-utils/token-buffer-thread.cpp
@@ -23,7 +23,9 @@ TokenBufferThread::~TokenBufferThread()
 		stop = true;
 	}
 	condVar.notify_all();
-	workerThread.join();
+	if (workerThread.joinable()) {
+		workerThread.join();
+	}
 }

 void TokenBufferThread::initialize(struct transcription_filter_data *gf_,
@@ -38,10 +40,20 @@ void TokenBufferThread::initialize(struct transcription_filter_data *gf_,
 	this->numPerSentence = numPerSentence_;
 	this->segmentation = segmentation_;
 	this->maxTime = maxTime_;
-	this->initialized = true;
+	this->stop = false;
 	this->workerThread = std::thread(&TokenBufferThread::monitor, this);
 }

+void TokenBufferThread::stopThread()
+{
+	std::lock_guard<std::mutex> lock(queueMutex);
+	stop = true;
+	condVar.notify_all();
+	if (workerThread.joinable()) {
+		workerThread.join();
+	}
+}
+
 void TokenBufferThread::log_token_vector(const std::vector<std::string> &tokens)
 {
 	std::string output;
@@ -81,21 +93,22 @@ void TokenBufferThread::addSentence(const std::string &sentence)
 	}
 }

+void TokenBufferThread::clear()
+{
+	obs_log(LOG_INFO, "TokenBufferThread::clear");
+	std::lock_guard<std::mutex> lock(queueMutex);
+	inputQueue.clear();
+	presentationQueue.clear();
+	this->callback("");
+}
+
 void TokenBufferThread::monitor()
 {
 	obs_log(LOG_INFO, "TokenBufferThread::monitor");
 	this->callback("");

-	while (this->initialized && !this->stop) {
-		if (this->stop) {
-			break;
-		}
-
-		if (this->gf->whisper_context == nullptr) {
-			continue;
-		}
-
+	while (!this->stop) {
 		// condition presentation queue
 		if (presentationQueue.size() == this->numSentences * this->numPerSentence) {
 			// pop a whole sentence from the presentation queue front
diff --git a/src/whisper-utils/token-buffer-thread.h b/src/whisper-utils/token-buffer-thread.h
index 223d5b0..ce02491 100644
--- a/src/whisper-utils/token-buffer-thread.h
+++ b/src/whisper-utils/token-buffer-thread.h
@@ -36,6 +36,10 @@ public:
 			TokenBufferSegmentation segmentation_ = SEGMENTATION_TOKEN);

 	void addSentence(const std::string &sentence);
+	void clear();
+	void stopThread();
+
+	bool isEnabled() const { return !stop; }

 private:
 	void monitor();
@@ -48,8 +52,7 @@ private:
 	std::condition_variable condVar;
 	std::function<void(const std::string &)> callback;
 	std::chrono::seconds maxTime;
-	bool stop;
-	bool initialized = false;
+	bool stop = true;
 	bool newDataAvailable = false;
 	size_t numSentences;
 	size_t numPerSentence;
diff --git a/src/whisper-utils/whisper-model-utils.cpp b/src/whisper-utils/whisper-model-utils.cpp
index 35213d3..c9620c8 100644
--- a/src/whisper-utils/whisper-model-utils.cpp
+++ b/src/whisper-utils/whisper-model-utils.cpp
@@ -102,9 +102,5 @@ void update_whisper_model(struct transcription_filter_data *gf, obs_data_t *s)
 		gf->enable_token_ts_dtw = obs_data_get_bool(s, "dtw_token_timestamps");
 		shutdown_whisper_thread(gf);
 		start_whisper_thread_with_path(gf, gf->whisper_model_path, silero_vad_model_file);
-	} else {
-		// dtw_token_timestamps did not change
-		obs_log(gf->log_level, "dtw_token_timestamps did not change: %d == %d",
-			gf->enable_token_ts_dtw, new_dtw_timestamps);
 	}
 }
diff --git a/src/whisper-utils/whisper-processing.cpp b/src/whisper-utils/whisper-processing.cpp
index a7a42e7..c3d06ae 100644
--- a/src/whisper-utils/whisper-processing.cpp
+++ b/src/whisper-utils/whisper-processing.cpp
@@ -283,18 +283,22 @@ void run_inference_and_callbacks(transcription_filter_data *gf, uint64_t start_o
 				 uint64_t end_offset_ms, int vad_state)
 {
 	// get the data from the entire whisper buffer
+	// add 10ms of silence (WHISPER_SAMPLE_RATE / 100 samples) to the beginning and end of the buffer
 	const size_t pcm32f_size = gf->whisper_buffer.size / sizeof(float);
+	const size_t pcm32f_size_with_silence = pcm32f_size + 2 * WHISPER_SAMPLE_RATE / 100;
 	// allocate a new buffer and copy the data to it
-	float *pcm32f_data = (float *)bzalloc(pcm32f_size * sizeof(float));
-	circlebuf_pop_back(&gf->whisper_buffer, pcm32f_data, pcm32f_size * sizeof(float));
+	float *pcm32f_data = (float *)bzalloc(pcm32f_size_with_silence * sizeof(float));
+	circlebuf_pop_back(&gf->whisper_buffer, pcm32f_data + WHISPER_SAMPLE_RATE / 100,
+			   pcm32f_size * sizeof(float));

-	struct DetectionResultWithText inference_result =
-		run_whisper_inference(gf, pcm32f_data, pcm32f_size, start_offset_ms, end_offset_ms);
+	struct DetectionResultWithText inference_result = run_whisper_inference(
+		gf, pcm32f_data, pcm32f_size_with_silence, start_offset_ms, end_offset_ms);
 	// output inference result to a text source
 	set_text_callback(gf, inference_result);

 	if (gf->enable_audio_chunks_callback) {
-		audio_chunk_callback(gf, pcm32f_data, pcm32f_size, vad_state, inference_result);
+		audio_chunk_callback(gf, pcm32f_data, pcm32f_size_with_silence, vad_state,
+				     inference_result);
 	}

 	// free the buffer