srt saving

This commit is contained in:
Roy Shilkrot
2023-10-07 13:46:58 -04:00
parent 3d7d77b041
commit 9299e7592e
6 changed files with 161 additions and 45 deletions

View File

@@ -57,6 +57,12 @@ The plugin was built and tested on Mac OSX (Intel & Apple silicon), Windows and
Start by cloning this repo to a directory of your choice.
Remember to sync and fetch the submodules before building, e.g.
```sh
$ git submodule sync --recursive
$ git submodule update --init --recursive
```
### Mac OSX
Using the CI pipeline scripts, locally you would just call the zsh script. By default this builds a universal binary for both Intel and Apple Silicon. To build for a specific architecture please see `.github/scripts/.build.zsh` for the `-arch` options.

View File

@@ -32,7 +32,7 @@ set(CPACK_SOURCE_IGNORE_FILES
# cmake-format: sortable
".*~$"
\\.git/
\\.github/
# \\.github/
\\.gitignore
build_.*
cmake/\\.CMakeBuildNumber

View File

@@ -36,4 +36,7 @@ suppress_blank="Suppress blank"
suppress_non_speech_tokens="Suppress non-speech tokens"
temperature="Temperature"
max_initial_ts="Max initial timestamps"
length_penalty="Length penalty"
length_penalty="Length penalty"
save_srt="Save in SRT format (no file truncation)"
only_while_recording="Write output only while recording"
process_while_muted="Process speech while source is muted"

View File

@@ -19,8 +19,21 @@
#define MT_ obs_module_text
enum DetectionResult {
DETECTION_RESULT_UNKNOWN = 0,
DETECTION_RESULT_SILENCE = 1,
DETECTION_RESULT_SPEECH = 2,
};
struct DetectionResultWithText {
DetectionResult result;
std::string text;
uint64_t start_timestamp_ms;
uint64_t end_timestamp_ms;
};
struct transcription_filter_data {
obs_source_t *context; // obs input source
obs_source_t *context; // obs filter source (this filter)
size_t channels; // number of channels
uint32_t sample_rate; // input sample rate
// How many input frames (in input sample rate) are needed for the next whisper frame
@@ -32,6 +45,10 @@ struct transcription_filter_data {
size_t last_num_frames;
// Milliseconds per processing step (e.g. rest of the whisper buffer may be filled with silence)
size_t step_size_msec;
// Beginning (start) timestamp in ms since epoch
uint64_t start_timestamp_ms;
// Sentence counter for srt
size_t sentence_number;
/* PCM buffers */
float *copy_buffers[MAX_PREPROC_CHANNELS];
@@ -54,13 +71,16 @@ struct transcription_filter_data {
bool log_words;
bool caption_to_stream;
bool active = false;
bool save_srt = false;
bool save_only_while_recording = false;
bool process_while_muted = false;
// Text source to output the subtitles
obs_weak_source_t *text_source = nullptr;
char *text_source_name = nullptr;
std::mutex *text_source_mutex = nullptr;
// Callback to set the text in the output text source (subtitles)
std::function<void(const std::string &str)> setTextCallback;
std::function<void(const DetectionResultWithText &result)> setTextCallback;
// Output file path to write the subtitles
std::string output_file_path = "";
std::string whisper_model_file_currently_loaded = "";
@@ -79,6 +99,6 @@ struct transcription_filter_audio_info {
uint64_t timestamp;
};
void set_text_callback(struct transcription_filter_data *gf, const std::string &str);
void set_text_callback(struct transcription_filter_data *gf, const DetectionResultWithText &str);
#endif /* TRANSCRIPTION_FILTER_DATA_H */

View File

@@ -41,6 +41,13 @@ inline enum speaker_layout convert_speaker_layout(uint8_t channels)
}
}
inline uint64_t now_ms()
{
return std::chrono::duration_cast<std::chrono::milliseconds>(
std::chrono::system_clock::now().time_since_epoch())
.count();
}
bool add_sources_to_list(void *list_property, obs_source_t *source)
{
auto source_id = obs_source_get_id(source);
@@ -71,6 +78,13 @@ struct obs_audio_data *transcription_filter_filter_audio(void *data, struct obs_
return audio;
}
// Check if the parent source is muted
obs_source_t *parent_source = obs_filter_get_parent(gf->context);
if (gf->process_while_muted == false && obs_source_muted(parent_source)) {
// Source is muted, do not process audio
return audio;
}
if (gf->whisper_context == nullptr) {
// Whisper not initialized, just pass through
return audio;
@@ -179,13 +193,13 @@ void acquire_weak_text_source_ref(struct transcription_filter_data *gf)
}
}
void set_text_callback(struct transcription_filter_data *gf, const std::string &str)
void set_text_callback(struct transcription_filter_data *gf, const DetectionResultWithText &result)
{
#ifdef _WIN32
// Russian UTF8 charset on Windows output has a bug, instead of 0xd? it outputs
// 0xf?, so we need to replace it. This doesn't affect any other charset, which
// outputs the correct UTF8 output. (Except maybe for Greek?)
std::string str_copy = str;
std::string str_copy = result.text;
for (size_t i = 0; i < str_copy.size(); ++i) {
// if the char MSBs starts with 0xf replace the MSBs with 0xd
if ((str_copy.c_str()[i] & 0xf0) == 0xf0) {
@@ -193,21 +207,61 @@ void set_text_callback(struct transcription_filter_data *gf, const std::string &
}
}
#else
std::string str_copy = str;
std::string str_copy = result.text;
#endif
if (gf->caption_to_stream) {
obs_output_t *streaming_output = obs_frontend_get_streaming_output();
if (streaming_output) {
obs_output_output_caption_text1(streaming_output, str.c_str());
obs_output_output_caption_text1(streaming_output, result.text.c_str());
obs_output_release(streaming_output);
}
}
if (gf->output_file_path != "" && !gf->text_source_name) {
// Write to file, do not append
std::ofstream output_file(gf->output_file_path, std::ios::out | std::ios::trunc);
output_file << str;
output_file.close();
// Check if we should save the sentence
if (gf->save_only_while_recording && !obs_frontend_recording_active()) {
// We are not recording, do not save the sentence to file
return;
}
if (!gf->save_srt) {
// Write raw sentence to file, do not append
std::ofstream output_file(gf->output_file_path,
std::ios::out | std::ios::trunc);
output_file << result.text << std::endl;
output_file.close();
} else {
obs_log(gf->log_level, "Saving sentence to file %s, sentence #%d",
gf->output_file_path.c_str(), gf->sentence_number);
// Append sentence to file in .srt format
std::ofstream output_file(gf->output_file_path,
std::ios::out | std::ios::app);
output_file << gf->sentence_number << std::endl;
// use the start and end timestamps to calculate the start and end time in srt format
auto format_ts_for_srt = [&output_file](uint64_t ts) {
uint64_t time_s = ts / 1000;
uint64_t time_m = time_s / 60;
uint64_t time_h = time_m / 60;
uint64_t time_ms_rem = ts % 1000;
uint64_t time_s_rem = time_s % 60;
uint64_t time_m_rem = time_m % 60;
uint64_t time_h_rem = time_h % 60;
output_file << std::setfill('0') << std::setw(2) << time_h_rem
<< ":" << std::setfill('0') << std::setw(2)
<< time_m_rem << ":" << std::setfill('0')
<< std::setw(2) << time_s_rem << ","
<< std::setfill('0') << std::setw(3) << time_ms_rem;
};
format_ts_for_srt(result.start_timestamp_ms);
output_file << " --> ";
format_ts_for_srt(result.end_timestamp_ms);
output_file << std::endl;
output_file << result.text << std::endl;
output_file << std::endl;
output_file.close();
gf->sentence_number++;
}
} else {
if (!gf->text_source_mutex) {
obs_log(LOG_ERROR, "text_source_mutex is null");
@@ -292,6 +346,12 @@ void transcription_filter_update(void *data, obs_data_t *s)
bool step_by_step_processing = obs_data_get_bool(s, "step_by_step_processing");
gf->step_size_msec = step_by_step_processing ? (int)obs_data_get_int(s, "step_size_msec")
: BUFFER_SIZE_MSEC;
gf->save_srt = obs_data_get_bool(s, "subtitle_save_srt");
gf->save_only_while_recording = obs_data_get_bool(s, "only_while_recording");
// Get the current timestamp using the system clock
gf->start_timestamp_ms = now_ms();
gf->sentence_number = 1;
gf->process_while_muted = obs_data_get_bool(s, "process_while_muted");
obs_log(gf->log_level, "transcription_filter: update text source");
// update the text source
@@ -468,6 +528,9 @@ void *transcription_filter_create(obs_data_t *settings, obs_source_t *filter)
? (int)obs_data_get_int(settings, "step_size_msec")
: BUFFER_SIZE_MSEC;
gf->log_level = (int)obs_data_get_int(settings, "log_level");
gf->save_srt = obs_data_get_bool(settings, "subtitle_save_srt");
gf->save_only_while_recording = obs_data_get_bool(settings, "only_while_recording");
gf->process_while_muted = obs_data_get_bool(settings, "process_while_muted");
for (size_t i = 0; i < MAX_AUDIO_CHANNELS; i++) {
circlebuf_init(&gf->input_buffers[i]);
@@ -525,6 +588,28 @@ void *transcription_filter_create(obs_data_t *settings, obs_source_t *filter)
gf->active = true;
// handle the event OBS_FRONTEND_EVENT_RECORDING_STARTING to reset the srt sentence number
// to match the subtitles with the recording
obs_frontend_add_event_callback(
[](enum obs_frontend_event event, void *private_data) {
if (event == OBS_FRONTEND_EVENT_RECORDING_STARTING) {
struct transcription_filter_data *gf =
static_cast<struct transcription_filter_data *>(
private_data);
if (gf->save_srt && gf->save_only_while_recording) {
obs_log(gf->log_level,
"Recording started. Resetting srt file.");
// truncate file if it exists
std::ofstream output_file(gf->output_file_path,
std::ios::out | std::ios::trunc);
output_file.close();
gf->sentence_number = 1;
gf->start_timestamp_ms = now_ms();
}
}
},
gf);
obs_log(gf->log_level, "transcription_filter: filter created.");
return gf;
}
@@ -557,6 +642,9 @@ void transcription_filter_defaults(obs_data_t *s)
obs_data_set_default_string(s, "whisper_language_select", "en");
obs_data_set_default_string(s, "subtitle_sources", "none");
obs_data_set_default_bool(s, "step_by_step_processing", false);
obs_data_set_default_bool(s, "process_while_muted", false);
obs_data_set_default_bool(s, "subtitle_save_srt", false);
obs_data_set_default_bool(s, "only_while_recording", false);
obs_data_set_default_int(s, "step_size_msec", 1000);
// Whisper parameters
@@ -617,6 +705,7 @@ obs_properties_t *transcription_filter_properties(void *data)
return true;
});
obs_properties_add_bool(ppts, "process_while_muted", MT_("process_while_muted"));
obs_property_t *subs_output =
obs_properties_add_list(ppts, "subtitle_sources", MT_("subtitle_sources"),
OBS_COMBO_TYPE_LIST, OBS_COMBO_FORMAT_STRING);
@@ -628,21 +717,21 @@ obs_properties_t *transcription_filter_properties(void *data)
obs_properties_add_path(ppts, "subtitle_output_filename", MT_("output_filename"),
OBS_PATH_FILE_SAVE, "Text (*.txt)", NULL);
obs_properties_add_bool(ppts, "subtitle_save_srt", MT_("save_srt"));
obs_properties_add_bool(ppts, "only_while_recording", MT_("only_while_recording"));
obs_property_set_modified_callback(subs_output, [](obs_properties_t *props,
obs_property_t *property,
obs_data_t *settings) {
UNUSED_PARAMETER(property);
// Show or hide the output filename selection input
const char *new_output = obs_data_get_string(settings, "subtitle_sources");
if (strcmp(new_output, "text_file") == 0) {
// Show the output filename selection input
obs_property_set_visible(
obs_properties_get(props, "subtitle_output_filename"), true);
} else {
// Hide the output filename selection input
obs_property_set_visible(
obs_properties_get(props, "subtitle_output_filename"), false);
}
const bool show_hide = (strcmp(new_output, "text_file") == 0);
obs_property_set_visible(obs_properties_get(props, "subtitle_output_filename"),
show_hide);
obs_property_set_visible(obs_properties_get(props, "subtitle_save_srt"), show_hide);
obs_property_set_visible(obs_properties_get(props, "only_while_recording"),
show_hide);
return true;
});

View File

@@ -15,8 +15,8 @@
// Taken from https://github.com/ggerganov/whisper.cpp/blob/master/examples/stream/stream.cpp
std::string to_timestamp(int64_t t)
{
int64_t sec = t / 100;
int64_t msec = t - sec * 100;
int64_t sec = t / 1000;
int64_t msec = t - sec * 1000;
int64_t min = sec / 60;
sec = sec - min * 60;
@@ -82,17 +82,6 @@ struct whisper_context *init_whisper_context(const std::string &model_path)
return ctx;
}
enum DetectionResult {
DETECTION_RESULT_UNKNOWN = 0,
DETECTION_RESULT_SILENCE = 1,
DETECTION_RESULT_SPEECH = 2,
};
struct DetectionResultWithText {
DetectionResult result;
std::string text;
};
struct DetectionResultWithText run_whisper_inference(struct transcription_filter_data *gf,
const float *pcm32f_data, size_t pcm32f_size)
{
@@ -103,9 +92,18 @@ struct DetectionResultWithText run_whisper_inference(struct transcription_filter
std::lock_guard<std::mutex> lock(*gf->whisper_ctx_mutex);
if (gf->whisper_context == nullptr) {
obs_log(LOG_WARNING, "whisper context is null");
return {DETECTION_RESULT_UNKNOWN, ""};
return {DETECTION_RESULT_UNKNOWN, "", 0, 0};
}
// set duration in ms
const uint64_t duration_ms = (uint64_t)(pcm32f_size * 1000 / WHISPER_SAMPLE_RATE);
// Get the duration in ms since the beginning of the stream (gf->start_timestamp_ms)
const uint64_t offset_ms =
(uint64_t)(std::chrono::duration_cast<std::chrono::milliseconds>(
std::chrono::system_clock::now().time_since_epoch())
.count() -
gf->start_timestamp_ms);
// run the inference
int whisper_full_result = -1;
try {
@@ -115,17 +113,17 @@ struct DetectionResultWithText run_whisper_inference(struct transcription_filter
obs_log(LOG_ERROR, "Whisper exception: %s. Filter restart is required", e.what());
whisper_free(gf->whisper_context);
gf->whisper_context = nullptr;
return {DETECTION_RESULT_UNKNOWN, ""};
return {DETECTION_RESULT_UNKNOWN, "", 0, 0};
}
if (whisper_full_result != 0) {
obs_log(LOG_WARNING, "failed to process audio, error %d", whisper_full_result);
return {DETECTION_RESULT_UNKNOWN, ""};
return {DETECTION_RESULT_UNKNOWN, "", 0, 0};
} else {
const int n_segment = 0;
const char *text = whisper_full_get_segment_text(gf->whisper_context, n_segment);
const int64_t t0 = whisper_full_get_segment_t0(gf->whisper_context, n_segment);
const int64_t t1 = whisper_full_get_segment_t1(gf->whisper_context, n_segment);
const int64_t t0 = offset_ms;
const int64_t t1 = offset_ms + duration_ms;
float sentence_p = 0.0f;
const int n_tokens = whisper_full_n_tokens(gf->whisper_context, n_segment);
@@ -149,10 +147,10 @@ struct DetectionResultWithText run_whisper_inference(struct transcription_filter
}
if (text_lower.empty() || text_lower == ".") {
return {DETECTION_RESULT_SILENCE, ""};
return {DETECTION_RESULT_SILENCE, "", 0, 0};
}
return {DETECTION_RESULT_SPEECH, text_lower};
return {DETECTION_RESULT_SPEECH, text_lower, offset_ms, offset_ms + duration_ms};
}
}
@@ -254,16 +252,16 @@ void process_audio_from_buffer(struct transcription_filter_data *gf)
if (inference_result.result == DETECTION_RESULT_SPEECH) {
// output inference result to a text source
set_text_callback(gf, inference_result.text);
set_text_callback(gf, inference_result);
} else if (inference_result.result == DETECTION_RESULT_SILENCE) {
// output inference result to a text source
set_text_callback(gf, "[silence]");
set_text_callback(gf, {inference_result.result, "[silence]", 0, 0});
}
} else {
if (gf->log_words) {
obs_log(LOG_INFO, "skipping inference");
}
set_text_callback(gf, "");
set_text_callback(gf, {DETECTION_RESULT_UNKNOWN, "[skip]", 0, 0});
}
// end of timer