srt saving

This commit is contained in:
Roy Shilkrot
2023-10-07 13:46:58 -04:00
parent 3d7d77b041
commit 9299e7592e
6 changed files with 161 additions and 45 deletions

View File

@@ -57,6 +57,12 @@ The plugin was built and tested on Mac OSX (Intel & Apple silicon), Windows and
Start by cloning this repo to a directory of your choice.
Remember to sync and fetch the submodules before building, e.g.
```sh
$ git submodule sync --recursive
$ git submodule update --init --recursive
```
### Mac OSX
Using the CI pipeline scripts, locally you would just call the zsh script. By default this builds a universal binary for both Intel and Apple Silicon. To build for a specific architecture please see `.github/scripts/.build.zsh` for the `-arch` options.

View File

@@ -32,7 +32,7 @@ set(CPACK_SOURCE_IGNORE_FILES
# cmake-format: sortable
".*~$"
\\.git/
\\.github/
# \\.github/
\\.gitignore
build_.*
cmake/\\.CMakeBuildNumber

View File

@@ -36,4 +36,7 @@ suppress_blank="Suppress blank"
suppress_non_speech_tokens="Suppress non-speech tokens"
temperature="Temperature"
max_initial_ts="Max initial timestamps"
length_penalty="Length penalty"
length_penalty="Length penalty"
save_srt="Save in SRT format (no file truncation)"
only_while_recording="Write output only while recording"
process_while_muted="Process speech while source is muted"

View File

@@ -19,8 +19,21 @@
#define MT_ obs_module_text
enum DetectionResult {
DETECTION_RESULT_UNKNOWN = 0,
DETECTION_RESULT_SILENCE = 1,
DETECTION_RESULT_SPEECH = 2,
};
struct DetectionResultWithText {
DetectionResult result;
std::string text;
uint64_t start_timestamp_ms;
uint64_t end_timestamp_ms;
};
struct transcription_filter_data {
obs_source_t *context; // obs input source
obs_source_t *context; // obs filter source (this filter)
size_t channels; // number of channels
uint32_t sample_rate; // input sample rate
// How many input frames (in input sample rate) are needed for the next whisper frame
@@ -32,6 +45,10 @@ struct transcription_filter_data {
size_t last_num_frames;
// Milliseconds per processing step (e.g. rest of the whisper buffer may be filled with silence)
size_t step_size_msec;
// Beginning (start) timestamp in ms since epoch
uint64_t start_timestamp_ms;
// Sentence counter for srt
size_t sentence_number;
/* PCM buffers */
float *copy_buffers[MAX_PREPROC_CHANNELS];
@@ -54,13 +71,16 @@ struct transcription_filter_data {
bool log_words;
bool caption_to_stream;
bool active = false;
bool save_srt = false;
bool save_only_while_recording = false;
bool process_while_muted = false;
// Text source to output the subtitles
obs_weak_source_t *text_source = nullptr;
char *text_source_name = nullptr;
std::mutex *text_source_mutex = nullptr;
// Callback to set the text in the output text source (subtitles)
std::function<void(const std::string &str)> setTextCallback;
std::function<void(const DetectionResultWithText &result)> setTextCallback;
// Output file path to write the subtitles
std::string output_file_path = "";
std::string whisper_model_file_currently_loaded = "";
@@ -79,6 +99,6 @@ struct transcription_filter_audio_info {
uint64_t timestamp;
};
void set_text_callback(struct transcription_filter_data *gf, const std::string &str);
void set_text_callback(struct transcription_filter_data *gf, const DetectionResultWithText &str);
#endif /* TRANSCRIPTION_FILTER_DATA_H */

View File

@@ -41,6 +41,13 @@ inline enum speaker_layout convert_speaker_layout(uint8_t channels)
}
}
inline uint64_t now_ms()
{
return std::chrono::duration_cast<std::chrono::milliseconds>(
std::chrono::system_clock::now().time_since_epoch())
.count();
}
bool add_sources_to_list(void *list_property, obs_source_t *source)
{
auto source_id = obs_source_get_id(source);
@@ -71,6 +78,13 @@ struct obs_audio_data *transcription_filter_filter_audio(void *data, struct obs_
return audio;
}
// Check if the parent source is muted
obs_source_t *parent_source = obs_filter_get_parent(gf->context);
if (gf->process_while_muted == false && obs_source_muted(parent_source)) {
// Source is muted, do not process audio
return audio;
}
if (gf->whisper_context == nullptr) {
// Whisper not initialized, just pass through
return audio;
@@ -179,13 +193,13 @@ void acquire_weak_text_source_ref(struct transcription_filter_data *gf)
}
}
void set_text_callback(struct transcription_filter_data *gf, const std::string &str)
void set_text_callback(struct transcription_filter_data *gf, const DetectionResultWithText &result)
{
#ifdef _WIN32
// Russian UTF8 charset on Windows output has a bug, instead of 0xd? it outputs
// 0xf?, so we need to replace it. This doesn't affect any other charset, which
// outputs the correct UTF8 output. (Except maybe for Greek?)
std::string str_copy = str;
std::string str_copy = result.text;
for (size_t i = 0; i < str_copy.size(); ++i) {
// if the char MSBs starts with 0xf replace the MSBs with 0xd
if ((str_copy.c_str()[i] & 0xf0) == 0xf0) {
@@ -193,21 +207,61 @@ void set_text_callback(struct transcription_filter_data *gf, const std::string &
}
}
#else
std::string str_copy = str;
std::string str_copy = result.text;
#endif
if (gf->caption_to_stream) {
obs_output_t *streaming_output = obs_frontend_get_streaming_output();
if (streaming_output) {
obs_output_output_caption_text1(streaming_output, str.c_str());
obs_output_output_caption_text1(streaming_output, result.text.c_str());
obs_output_release(streaming_output);
}
}
if (gf->output_file_path != "" && !gf->text_source_name) {
// Write to file, do not append
std::ofstream output_file(gf->output_file_path, std::ios::out | std::ios::trunc);
output_file << str;
output_file.close();
// Check if we should save the sentence
if (gf->save_only_while_recording && !obs_frontend_recording_active()) {
// We are not recording, do not save the sentence to file
return;
}
if (!gf->save_srt) {
// Write raw sentence to file, do not append
std::ofstream output_file(gf->output_file_path,
std::ios::out | std::ios::trunc);
output_file << result.text << std::endl;
output_file.close();
} else {
obs_log(gf->log_level, "Saving sentence to file %s, sentence #%d",
gf->output_file_path.c_str(), gf->sentence_number);
// Append sentence to file in .srt format
std::ofstream output_file(gf->output_file_path,
std::ios::out | std::ios::app);
output_file << gf->sentence_number << std::endl;
// use the start and end timestamps to calculate the start and end time in srt format
auto format_ts_for_srt = [&output_file](uint64_t ts) {
uint64_t time_s = ts / 1000;
uint64_t time_m = time_s / 60;
uint64_t time_h = time_m / 60;
uint64_t time_ms_rem = ts % 1000;
uint64_t time_s_rem = time_s % 60;
uint64_t time_m_rem = time_m % 60;
uint64_t time_h_rem = time_h % 60;
output_file << std::setfill('0') << std::setw(2) << time_h_rem
<< ":" << std::setfill('0') << std::setw(2)
<< time_m_rem << ":" << std::setfill('0')
<< std::setw(2) << time_s_rem << ","
<< std::setfill('0') << std::setw(3) << time_ms_rem;
};
format_ts_for_srt(result.start_timestamp_ms);
output_file << " --> ";
format_ts_for_srt(result.end_timestamp_ms);
output_file << std::endl;
output_file << result.text << std::endl;
output_file << std::endl;
output_file.close();
gf->sentence_number++;
}
} else {
if (!gf->text_source_mutex) {
obs_log(LOG_ERROR, "text_source_mutex is null");
@@ -292,6 +346,12 @@ void transcription_filter_update(void *data, obs_data_t *s)
bool step_by_step_processing = obs_data_get_bool(s, "step_by_step_processing");
gf->step_size_msec = step_by_step_processing ? (int)obs_data_get_int(s, "step_size_msec")
: BUFFER_SIZE_MSEC;
gf->save_srt = obs_data_get_bool(s, "subtitle_save_srt");
gf->save_only_while_recording = obs_data_get_bool(s, "only_while_recording");
// Get the current timestamp using the system clock
gf->start_timestamp_ms = now_ms();
gf->sentence_number = 1;
gf->process_while_muted = obs_data_get_bool(s, "process_while_muted");
obs_log(gf->log_level, "transcription_filter: update text source");
// update the text source
@@ -468,6 +528,9 @@ void *transcription_filter_create(obs_data_t *settings, obs_source_t *filter)
? (int)obs_data_get_int(settings, "step_size_msec")
: BUFFER_SIZE_MSEC;
gf->log_level = (int)obs_data_get_int(settings, "log_level");
gf->save_srt = obs_data_get_bool(settings, "subtitle_save_srt");
gf->save_only_while_recording = obs_data_get_bool(settings, "only_while_recording");
gf->process_while_muted = obs_data_get_bool(settings, "process_while_muted");
for (size_t i = 0; i < MAX_AUDIO_CHANNELS; i++) {
circlebuf_init(&gf->input_buffers[i]);
@@ -525,6 +588,28 @@ void *transcription_filter_create(obs_data_t *settings, obs_source_t *filter)
gf->active = true;
// handle the event OBS_FRONTEND_EVENT_RECORDING_STARTING to reset the srt sentence number
// to match the subtitles with the recording
obs_frontend_add_event_callback(
[](enum obs_frontend_event event, void *private_data) {
if (event == OBS_FRONTEND_EVENT_RECORDING_STARTING) {
struct transcription_filter_data *gf =
static_cast<struct transcription_filter_data *>(
private_data);
if (gf->save_srt && gf->save_only_while_recording) {
obs_log(gf->log_level,
"Recording started. Resetting srt file.");
// truncate file if it exists
std::ofstream output_file(gf->output_file_path,
std::ios::out | std::ios::trunc);
output_file.close();
gf->sentence_number = 1;
gf->start_timestamp_ms = now_ms();
}
}
},
gf);
obs_log(gf->log_level, "transcription_filter: filter created.");
return gf;
}
@@ -557,6 +642,9 @@ void transcription_filter_defaults(obs_data_t *s)
obs_data_set_default_string(s, "whisper_language_select", "en");
obs_data_set_default_string(s, "subtitle_sources", "none");
obs_data_set_default_bool(s, "step_by_step_processing", false);
obs_data_set_default_bool(s, "process_while_muted", false);
obs_data_set_default_bool(s, "subtitle_save_srt", false);
obs_data_set_default_bool(s, "only_while_recording", false);
obs_data_set_default_int(s, "step_size_msec", 1000);
// Whisper parameters
@@ -617,6 +705,7 @@ obs_properties_t *transcription_filter_properties(void *data)
return true;
});
obs_properties_add_bool(ppts, "process_while_muted", MT_("process_while_muted"));
obs_property_t *subs_output =
obs_properties_add_list(ppts, "subtitle_sources", MT_("subtitle_sources"),
OBS_COMBO_TYPE_LIST, OBS_COMBO_FORMAT_STRING);
@@ -628,21 +717,21 @@ obs_properties_t *transcription_filter_properties(void *data)
obs_properties_add_path(ppts, "subtitle_output_filename", MT_("output_filename"),
OBS_PATH_FILE_SAVE, "Text (*.txt)", NULL);
obs_properties_add_bool(ppts, "subtitle_save_srt", MT_("save_srt"));
obs_properties_add_bool(ppts, "only_while_recording", MT_("only_while_recording"));
obs_property_set_modified_callback(subs_output, [](obs_properties_t *props,
obs_property_t *property,
obs_data_t *settings) {
UNUSED_PARAMETER(property);
// Show or hide the output filename selection input
const char *new_output = obs_data_get_string(settings, "subtitle_sources");
if (strcmp(new_output, "text_file") == 0) {
// Show the output filename selection input
obs_property_set_visible(
obs_properties_get(props, "subtitle_output_filename"), true);
} else {
// Hide the output filename selection input
obs_property_set_visible(
obs_properties_get(props, "subtitle_output_filename"), false);
}
const bool show_hide = (strcmp(new_output, "text_file") == 0);
obs_property_set_visible(obs_properties_get(props, "subtitle_output_filename"),
show_hide);
obs_property_set_visible(obs_properties_get(props, "subtitle_save_srt"), show_hide);
obs_property_set_visible(obs_properties_get(props, "only_while_recording"),
show_hide);
return true;
});

View File

@@ -15,8 +15,8 @@
// Taken from https://github.com/ggerganov/whisper.cpp/blob/master/examples/stream/stream.cpp
std::string to_timestamp(int64_t t)
{
int64_t sec = t / 100;
int64_t msec = t - sec * 100;
int64_t sec = t / 1000;
int64_t msec = t - sec * 1000;
int64_t min = sec / 60;
sec = sec - min * 60;
@@ -82,17 +82,6 @@ struct whisper_context *init_whisper_context(const std::string &model_path)
return ctx;
}
enum DetectionResult {
DETECTION_RESULT_UNKNOWN = 0,
DETECTION_RESULT_SILENCE = 1,
DETECTION_RESULT_SPEECH = 2,
};
struct DetectionResultWithText {
DetectionResult result;
std::string text;
};
struct DetectionResultWithText run_whisper_inference(struct transcription_filter_data *gf,
const float *pcm32f_data, size_t pcm32f_size)
{
@@ -103,9 +92,18 @@ struct DetectionResultWithText run_whisper_inference(struct transcription_filter
std::lock_guard<std::mutex> lock(*gf->whisper_ctx_mutex);
if (gf->whisper_context == nullptr) {
obs_log(LOG_WARNING, "whisper context is null");
return {DETECTION_RESULT_UNKNOWN, ""};
return {DETECTION_RESULT_UNKNOWN, "", 0, 0};
}
// set duration in ms
const uint64_t duration_ms = (uint64_t)(pcm32f_size * 1000 / WHISPER_SAMPLE_RATE);
// Get the duration in ms since the beginning of the stream (gf->start_timestamp_ms)
const uint64_t offset_ms =
(uint64_t)(std::chrono::duration_cast<std::chrono::milliseconds>(
std::chrono::system_clock::now().time_since_epoch())
.count() -
gf->start_timestamp_ms);
// run the inference
int whisper_full_result = -1;
try {
@@ -115,17 +113,17 @@ struct DetectionResultWithText run_whisper_inference(struct transcription_filter
obs_log(LOG_ERROR, "Whisper exception: %s. Filter restart is required", e.what());
whisper_free(gf->whisper_context);
gf->whisper_context = nullptr;
return {DETECTION_RESULT_UNKNOWN, ""};
return {DETECTION_RESULT_UNKNOWN, "", 0, 0};
}
if (whisper_full_result != 0) {
obs_log(LOG_WARNING, "failed to process audio, error %d", whisper_full_result);
return {DETECTION_RESULT_UNKNOWN, ""};
return {DETECTION_RESULT_UNKNOWN, "", 0, 0};
} else {
const int n_segment = 0;
const char *text = whisper_full_get_segment_text(gf->whisper_context, n_segment);
const int64_t t0 = whisper_full_get_segment_t0(gf->whisper_context, n_segment);
const int64_t t1 = whisper_full_get_segment_t1(gf->whisper_context, n_segment);
const int64_t t0 = offset_ms;
const int64_t t1 = offset_ms + duration_ms;
float sentence_p = 0.0f;
const int n_tokens = whisper_full_n_tokens(gf->whisper_context, n_segment);
@@ -149,10 +147,10 @@ struct DetectionResultWithText run_whisper_inference(struct transcription_filter
}
if (text_lower.empty() || text_lower == ".") {
return {DETECTION_RESULT_SILENCE, ""};
return {DETECTION_RESULT_SILENCE, "", 0, 0};
}
return {DETECTION_RESULT_SPEECH, text_lower};
return {DETECTION_RESULT_SPEECH, text_lower, offset_ms, offset_ms + duration_ms};
}
}
@@ -254,16 +252,16 @@ void process_audio_from_buffer(struct transcription_filter_data *gf)
if (inference_result.result == DETECTION_RESULT_SPEECH) {
// output inference result to a text source
set_text_callback(gf, inference_result.text);
set_text_callback(gf, inference_result);
} else if (inference_result.result == DETECTION_RESULT_SILENCE) {
// output inference result to a text source
set_text_callback(gf, "[silence]");
set_text_callback(gf, {inference_result.result, "[silence]", 0, 0});
}
} else {
if (gf->log_words) {
obs_log(LOG_INFO, "skipping inference");
}
set_text_callback(gf, "");
set_text_callback(gf, {DETECTION_RESULT_UNKNOWN, "[skip]", 0, 0});
}
// end of timer