mirror of
https://github.com/royshil/obs-localvocal.git
synced 2026-01-09 12:28:05 -05:00
srt saving
This commit is contained in:
@@ -57,6 +57,12 @@ The plugin was built and tested on Mac OSX (Intel & Apple silicon), Windows and
|
||||
|
||||
Start by cloning this repo to a directory of your choice.
|
||||
|
||||
Remember to sync and fetch the submodules before building, e.g.
|
||||
```sh
|
||||
$ git submodule sync --recursive
|
||||
$ git submodule update --init --recursive
|
||||
```
|
||||
|
||||
### Mac OSX
|
||||
|
||||
Using the CI pipeline scripts, locally you would just call the zsh script. By default this builds a universal binary for both Intel and Apple Silicon. To build for a specific architecture please see `.github/scripts/.build.zsh` for the `-arch` options.
|
||||
|
||||
@@ -32,7 +32,7 @@ set(CPACK_SOURCE_IGNORE_FILES
|
||||
# cmake-format: sortable
|
||||
".*~$"
|
||||
\\.git/
|
||||
\\.github/
|
||||
# \\.github/
|
||||
\\.gitignore
|
||||
build_.*
|
||||
cmake/\\.CMakeBuildNumber
|
||||
|
||||
@@ -36,4 +36,7 @@ suppress_blank="Suppress blank"
|
||||
suppress_non_speech_tokens="Suppress non-speech tokens"
|
||||
temperature="Temperature"
|
||||
max_initial_ts="Max initial timestamps"
|
||||
length_penalty="Length penalty"
|
||||
length_penalty="Length penalty"
|
||||
save_srt="Save in SRT format (no file truncation)"
|
||||
only_while_recording="Write output only while recording"
|
||||
process_while_muted="Process speech while source is muted"
|
||||
|
||||
@@ -19,8 +19,21 @@
|
||||
|
||||
#define MT_ obs_module_text
|
||||
|
||||
// Outcome of running whisper inference on one audio chunk.
enum DetectionResult {
	DETECTION_RESULT_UNKNOWN = 0, // inference failed or was skipped
	DETECTION_RESULT_SILENCE = 1, // no speech found in the chunk
	DETECTION_RESULT_SPEECH = 2,  // speech detected and transcribed
};
|
||||
|
||||
struct DetectionResultWithText {
|
||||
DetectionResult result;
|
||||
std::string text;
|
||||
uint64_t start_timestamp_ms;
|
||||
uint64_t end_timestamp_ms;
|
||||
};
|
||||
|
||||
struct transcription_filter_data {
|
||||
obs_source_t *context; // obs input source
|
||||
obs_source_t *context; // obs filter source (this filter)
|
||||
size_t channels; // number of channels
|
||||
uint32_t sample_rate; // input sample rate
|
||||
// How many input frames (in input sample rate) are needed for the next whisper frame
|
||||
@@ -32,6 +45,10 @@ struct transcription_filter_data {
|
||||
size_t last_num_frames;
|
||||
// Milliseconds per processing step (e.g. rest of the whisper buffer may be filled with silence)
|
||||
size_t step_size_msec;
|
||||
// Stream start timestamp in ms since the epoch
|
||||
uint64_t start_timestamp_ms;
|
||||
// Sentence counter for srt
|
||||
size_t sentence_number;
|
||||
|
||||
/* PCM buffers */
|
||||
float *copy_buffers[MAX_PREPROC_CHANNELS];
|
||||
@@ -54,13 +71,16 @@ struct transcription_filter_data {
|
||||
bool log_words;
|
||||
bool caption_to_stream;
|
||||
bool active = false;
|
||||
bool save_srt = false;
|
||||
bool save_only_while_recording = false;
|
||||
bool process_while_muted = false;
|
||||
|
||||
// Text source to output the subtitles
|
||||
obs_weak_source_t *text_source = nullptr;
|
||||
char *text_source_name = nullptr;
|
||||
std::mutex *text_source_mutex = nullptr;
|
||||
// Callback to set the text in the output text source (subtitles)
|
||||
std::function<void(const std::string &str)> setTextCallback;
|
||||
std::function<void(const DetectionResultWithText &result)> setTextCallback;
|
||||
// Output file path to write the subtitles
|
||||
std::string output_file_path = "";
|
||||
std::string whisper_model_file_currently_loaded = "";
|
||||
@@ -79,6 +99,6 @@ struct transcription_filter_audio_info {
|
||||
uint64_t timestamp;
|
||||
};
|
||||
|
||||
void set_text_callback(struct transcription_filter_data *gf, const std::string &str);
|
||||
void set_text_callback(struct transcription_filter_data *gf, const DetectionResultWithText &str);
|
||||
|
||||
#endif /* TRANSCRIPTION_FILTER_DATA_H */
|
||||
|
||||
@@ -41,6 +41,13 @@ inline enum speaker_layout convert_speaker_layout(uint8_t channels)
|
||||
}
|
||||
}
|
||||
|
||||
// Current wall-clock time as milliseconds since the Unix epoch.
inline uint64_t now_ms()
{
	const auto since_epoch = std::chrono::system_clock::now().time_since_epoch();
	const auto as_ms = std::chrono::duration_cast<std::chrono::milliseconds>(since_epoch);
	return (uint64_t)as_ms.count();
}
|
||||
|
||||
bool add_sources_to_list(void *list_property, obs_source_t *source)
|
||||
{
|
||||
auto source_id = obs_source_get_id(source);
|
||||
@@ -71,6 +78,13 @@ struct obs_audio_data *transcription_filter_filter_audio(void *data, struct obs_
|
||||
return audio;
|
||||
}
|
||||
|
||||
// Check if the parent source is muted
|
||||
obs_source_t *parent_source = obs_filter_get_parent(gf->context);
|
||||
if (gf->process_while_muted == false && obs_source_muted(parent_source)) {
|
||||
// Source is muted, do not process audio
|
||||
return audio;
|
||||
}
|
||||
|
||||
if (gf->whisper_context == nullptr) {
|
||||
// Whisper not initialized, just pass through
|
||||
return audio;
|
||||
@@ -179,13 +193,13 @@ void acquire_weak_text_source_ref(struct transcription_filter_data *gf)
|
||||
}
|
||||
}
|
||||
|
||||
void set_text_callback(struct transcription_filter_data *gf, const std::string &str)
|
||||
void set_text_callback(struct transcription_filter_data *gf, const DetectionResultWithText &result)
|
||||
{
|
||||
#ifdef _WIN32
|
||||
// Russian UTF8 charset on Windows output has a bug, instead of 0xd? it outputs
|
||||
// 0xf?, so we need to replace it. This doesn't affect any other charset, which
|
||||
// outputs the correct UTF8 output. (Except maybe for Greek?)
|
||||
std::string str_copy = str;
|
||||
std::string str_copy = result.text;
|
||||
for (size_t i = 0; i < str_copy.size(); ++i) {
|
||||
// if the char MSBs starts with 0xf replace the MSBs with 0xd
|
||||
if ((str_copy.c_str()[i] & 0xf0) == 0xf0) {
|
||||
@@ -193,21 +207,61 @@ void set_text_callback(struct transcription_filter_data *gf, const std::string &
|
||||
}
|
||||
}
|
||||
#else
|
||||
std::string str_copy = str;
|
||||
std::string str_copy = result.text;
|
||||
#endif
|
||||
|
||||
if (gf->caption_to_stream) {
|
||||
obs_output_t *streaming_output = obs_frontend_get_streaming_output();
|
||||
if (streaming_output) {
|
||||
obs_output_output_caption_text1(streaming_output, str.c_str());
|
||||
obs_output_output_caption_text1(streaming_output, result.text.c_str());
|
||||
obs_output_release(streaming_output);
|
||||
}
|
||||
}
|
||||
|
||||
if (gf->output_file_path != "" && !gf->text_source_name) {
|
||||
// Write to file, do not append
|
||||
std::ofstream output_file(gf->output_file_path, std::ios::out | std::ios::trunc);
|
||||
output_file << str;
|
||||
output_file.close();
|
||||
// Check if we should save the sentence
|
||||
if (gf->save_only_while_recording && !obs_frontend_recording_active()) {
|
||||
// We are not recording, do not save the sentence to file
|
||||
return;
|
||||
}
|
||||
if (!gf->save_srt) {
|
||||
// Write raw sentence to file, do not append
|
||||
std::ofstream output_file(gf->output_file_path,
|
||||
std::ios::out | std::ios::trunc);
|
||||
output_file << result.text << std::endl;
|
||||
output_file.close();
|
||||
} else {
|
||||
obs_log(gf->log_level, "Saving sentence to file %s, sentence #%d",
|
||||
gf->output_file_path.c_str(), gf->sentence_number);
|
||||
// Append sentence to file in .srt format
|
||||
std::ofstream output_file(gf->output_file_path,
|
||||
std::ios::out | std::ios::app);
|
||||
output_file << gf->sentence_number << std::endl;
|
||||
// use the start and end timestamps to calculate the start and end time in srt format
|
||||
auto format_ts_for_srt = [&output_file](uint64_t ts) {
|
||||
uint64_t time_s = ts / 1000;
|
||||
uint64_t time_m = time_s / 60;
|
||||
uint64_t time_h = time_m / 60;
|
||||
uint64_t time_ms_rem = ts % 1000;
|
||||
uint64_t time_s_rem = time_s % 60;
|
||||
uint64_t time_m_rem = time_m % 60;
|
||||
uint64_t time_h_rem = time_h % 60;
|
||||
output_file << std::setfill('0') << std::setw(2) << time_h_rem
|
||||
<< ":" << std::setfill('0') << std::setw(2)
|
||||
<< time_m_rem << ":" << std::setfill('0')
|
||||
<< std::setw(2) << time_s_rem << ","
|
||||
<< std::setfill('0') << std::setw(3) << time_ms_rem;
|
||||
};
|
||||
format_ts_for_srt(result.start_timestamp_ms);
|
||||
output_file << " --> ";
|
||||
format_ts_for_srt(result.end_timestamp_ms);
|
||||
output_file << std::endl;
|
||||
|
||||
output_file << result.text << std::endl;
|
||||
output_file << std::endl;
|
||||
output_file.close();
|
||||
gf->sentence_number++;
|
||||
}
|
||||
} else {
|
||||
if (!gf->text_source_mutex) {
|
||||
obs_log(LOG_ERROR, "text_source_mutex is null");
|
||||
@@ -292,6 +346,12 @@ void transcription_filter_update(void *data, obs_data_t *s)
|
||||
bool step_by_step_processing = obs_data_get_bool(s, "step_by_step_processing");
|
||||
gf->step_size_msec = step_by_step_processing ? (int)obs_data_get_int(s, "step_size_msec")
|
||||
: BUFFER_SIZE_MSEC;
|
||||
gf->save_srt = obs_data_get_bool(s, "subtitle_save_srt");
|
||||
gf->save_only_while_recording = obs_data_get_bool(s, "only_while_recording");
|
||||
// Get the current timestamp using the system clock
|
||||
gf->start_timestamp_ms = now_ms();
|
||||
gf->sentence_number = 1;
|
||||
gf->process_while_muted = obs_data_get_bool(s, "process_while_muted");
|
||||
|
||||
obs_log(gf->log_level, "transcription_filter: update text source");
|
||||
// update the text source
|
||||
@@ -468,6 +528,9 @@ void *transcription_filter_create(obs_data_t *settings, obs_source_t *filter)
|
||||
? (int)obs_data_get_int(settings, "step_size_msec")
|
||||
: BUFFER_SIZE_MSEC;
|
||||
gf->log_level = (int)obs_data_get_int(settings, "log_level");
|
||||
gf->save_srt = obs_data_get_bool(settings, "subtitle_save_srt");
|
||||
gf->save_only_while_recording = obs_data_get_bool(settings, "only_while_recording");
|
||||
gf->process_while_muted = obs_data_get_bool(settings, "process_while_muted");
|
||||
|
||||
for (size_t i = 0; i < MAX_AUDIO_CHANNELS; i++) {
|
||||
circlebuf_init(&gf->input_buffers[i]);
|
||||
@@ -525,6 +588,28 @@ void *transcription_filter_create(obs_data_t *settings, obs_source_t *filter)
|
||||
|
||||
gf->active = true;
|
||||
|
||||
// handle the event OBS_FRONTEND_EVENT_RECORDING_STARTING to reset the srt sentence number
|
||||
// to match the subtitles with the recording
|
||||
obs_frontend_add_event_callback(
|
||||
[](enum obs_frontend_event event, void *private_data) {
|
||||
if (event == OBS_FRONTEND_EVENT_RECORDING_STARTING) {
|
||||
struct transcription_filter_data *gf =
|
||||
static_cast<struct transcription_filter_data *>(
|
||||
private_data);
|
||||
if (gf->save_srt && gf->save_only_while_recording) {
|
||||
obs_log(gf->log_level,
|
||||
"Recording started. Resetting srt file.");
|
||||
// truncate file if it exists
|
||||
std::ofstream output_file(gf->output_file_path,
|
||||
std::ios::out | std::ios::trunc);
|
||||
output_file.close();
|
||||
gf->sentence_number = 1;
|
||||
gf->start_timestamp_ms = now_ms();
|
||||
}
|
||||
}
|
||||
},
|
||||
gf);
|
||||
|
||||
obs_log(gf->log_level, "transcription_filter: filter created.");
|
||||
return gf;
|
||||
}
|
||||
@@ -557,6 +642,9 @@ void transcription_filter_defaults(obs_data_t *s)
|
||||
obs_data_set_default_string(s, "whisper_language_select", "en");
|
||||
obs_data_set_default_string(s, "subtitle_sources", "none");
|
||||
obs_data_set_default_bool(s, "step_by_step_processing", false);
|
||||
obs_data_set_default_bool(s, "process_while_muted", false);
|
||||
obs_data_set_default_bool(s, "subtitle_save_srt", false);
|
||||
obs_data_set_default_bool(s, "only_while_recording", false);
|
||||
obs_data_set_default_int(s, "step_size_msec", 1000);
|
||||
|
||||
// Whisper parameters
|
||||
@@ -617,6 +705,7 @@ obs_properties_t *transcription_filter_properties(void *data)
|
||||
return true;
|
||||
});
|
||||
|
||||
obs_properties_add_bool(ppts, "process_while_muted", MT_("process_while_muted"));
|
||||
obs_property_t *subs_output =
|
||||
obs_properties_add_list(ppts, "subtitle_sources", MT_("subtitle_sources"),
|
||||
OBS_COMBO_TYPE_LIST, OBS_COMBO_FORMAT_STRING);
|
||||
@@ -628,21 +717,21 @@ obs_properties_t *transcription_filter_properties(void *data)
|
||||
|
||||
obs_properties_add_path(ppts, "subtitle_output_filename", MT_("output_filename"),
|
||||
OBS_PATH_FILE_SAVE, "Text (*.txt)", NULL);
|
||||
obs_properties_add_bool(ppts, "subtitle_save_srt", MT_("save_srt"));
|
||||
obs_properties_add_bool(ppts, "only_while_recording", MT_("only_while_recording"));
|
||||
|
||||
obs_property_set_modified_callback(subs_output, [](obs_properties_t *props,
|
||||
obs_property_t *property,
|
||||
obs_data_t *settings) {
|
||||
UNUSED_PARAMETER(property);
|
||||
// Show or hide the output filename selection input
|
||||
const char *new_output = obs_data_get_string(settings, "subtitle_sources");
|
||||
if (strcmp(new_output, "text_file") == 0) {
|
||||
// Show the output filename selection input
|
||||
obs_property_set_visible(
|
||||
obs_properties_get(props, "subtitle_output_filename"), true);
|
||||
} else {
|
||||
// Hide the output filename selection input
|
||||
obs_property_set_visible(
|
||||
obs_properties_get(props, "subtitle_output_filename"), false);
|
||||
}
|
||||
const bool show_hide = (strcmp(new_output, "text_file") == 0);
|
||||
obs_property_set_visible(obs_properties_get(props, "subtitle_output_filename"),
|
||||
show_hide);
|
||||
obs_property_set_visible(obs_properties_get(props, "subtitle_save_srt"), show_hide);
|
||||
obs_property_set_visible(obs_properties_get(props, "only_while_recording"),
|
||||
show_hide);
|
||||
return true;
|
||||
});
|
||||
|
||||
|
||||
@@ -15,8 +15,8 @@
|
||||
// Taken from https://github.com/ggerganov/whisper.cpp/blob/master/examples/stream/stream.cpp
|
||||
std::string to_timestamp(int64_t t)
|
||||
{
|
||||
int64_t sec = t / 100;
|
||||
int64_t msec = t - sec * 100;
|
||||
int64_t sec = t / 1000;
|
||||
int64_t msec = t - sec * 1000;
|
||||
int64_t min = sec / 60;
|
||||
sec = sec - min * 60;
|
||||
|
||||
@@ -82,17 +82,6 @@ struct whisper_context *init_whisper_context(const std::string &model_path)
|
||||
return ctx;
|
||||
}
|
||||
|
||||
// Result categories for a whisper inference pass over one audio segment.
enum DetectionResult {
	DETECTION_RESULT_UNKNOWN = 0,
	DETECTION_RESULT_SILENCE = 1,
	DETECTION_RESULT_SPEECH = 2,
};
|
||||
|
||||
struct DetectionResultWithText {
|
||||
DetectionResult result;
|
||||
std::string text;
|
||||
};
|
||||
|
||||
struct DetectionResultWithText run_whisper_inference(struct transcription_filter_data *gf,
|
||||
const float *pcm32f_data, size_t pcm32f_size)
|
||||
{
|
||||
@@ -103,9 +92,18 @@ struct DetectionResultWithText run_whisper_inference(struct transcription_filter
|
||||
std::lock_guard<std::mutex> lock(*gf->whisper_ctx_mutex);
|
||||
if (gf->whisper_context == nullptr) {
|
||||
obs_log(LOG_WARNING, "whisper context is null");
|
||||
return {DETECTION_RESULT_UNKNOWN, ""};
|
||||
return {DETECTION_RESULT_UNKNOWN, "", 0, 0};
|
||||
}
|
||||
|
||||
// set duration in ms
|
||||
const uint64_t duration_ms = (uint64_t)(pcm32f_size * 1000 / WHISPER_SAMPLE_RATE);
|
||||
// Get the duration in ms since the beginning of the stream (gf->start_timestamp_ms)
|
||||
const uint64_t offset_ms =
|
||||
(uint64_t)(std::chrono::duration_cast<std::chrono::milliseconds>(
|
||||
std::chrono::system_clock::now().time_since_epoch())
|
||||
.count() -
|
||||
gf->start_timestamp_ms);
|
||||
|
||||
// run the inference
|
||||
int whisper_full_result = -1;
|
||||
try {
|
||||
@@ -115,17 +113,17 @@ struct DetectionResultWithText run_whisper_inference(struct transcription_filter
|
||||
obs_log(LOG_ERROR, "Whisper exception: %s. Filter restart is required", e.what());
|
||||
whisper_free(gf->whisper_context);
|
||||
gf->whisper_context = nullptr;
|
||||
return {DETECTION_RESULT_UNKNOWN, ""};
|
||||
return {DETECTION_RESULT_UNKNOWN, "", 0, 0};
|
||||
}
|
||||
|
||||
if (whisper_full_result != 0) {
|
||||
obs_log(LOG_WARNING, "failed to process audio, error %d", whisper_full_result);
|
||||
return {DETECTION_RESULT_UNKNOWN, ""};
|
||||
return {DETECTION_RESULT_UNKNOWN, "", 0, 0};
|
||||
} else {
|
||||
const int n_segment = 0;
|
||||
const char *text = whisper_full_get_segment_text(gf->whisper_context, n_segment);
|
||||
const int64_t t0 = whisper_full_get_segment_t0(gf->whisper_context, n_segment);
|
||||
const int64_t t1 = whisper_full_get_segment_t1(gf->whisper_context, n_segment);
|
||||
const int64_t t0 = offset_ms;
|
||||
const int64_t t1 = offset_ms + duration_ms;
|
||||
|
||||
float sentence_p = 0.0f;
|
||||
const int n_tokens = whisper_full_n_tokens(gf->whisper_context, n_segment);
|
||||
@@ -149,10 +147,10 @@ struct DetectionResultWithText run_whisper_inference(struct transcription_filter
|
||||
}
|
||||
|
||||
if (text_lower.empty() || text_lower == ".") {
|
||||
return {DETECTION_RESULT_SILENCE, ""};
|
||||
return {DETECTION_RESULT_SILENCE, "", 0, 0};
|
||||
}
|
||||
|
||||
return {DETECTION_RESULT_SPEECH, text_lower};
|
||||
return {DETECTION_RESULT_SPEECH, text_lower, offset_ms, offset_ms + duration_ms};
|
||||
}
|
||||
}
|
||||
|
||||
@@ -254,16 +252,16 @@ void process_audio_from_buffer(struct transcription_filter_data *gf)
|
||||
|
||||
if (inference_result.result == DETECTION_RESULT_SPEECH) {
|
||||
// output inference result to a text source
|
||||
set_text_callback(gf, inference_result.text);
|
||||
set_text_callback(gf, inference_result);
|
||||
} else if (inference_result.result == DETECTION_RESULT_SILENCE) {
|
||||
// output inference result to a text source
|
||||
set_text_callback(gf, "[silence]");
|
||||
set_text_callback(gf, {inference_result.result, "[silence]", 0, 0});
|
||||
}
|
||||
} else {
|
||||
if (gf->log_words) {
|
||||
obs_log(LOG_INFO, "skipping inference");
|
||||
}
|
||||
set_text_callback(gf, "");
|
||||
set_text_callback(gf, {DETECTION_RESULT_UNKNOWN, "[skip]", 0, 0});
|
||||
}
|
||||
|
||||
// end of timer
|
||||
|
||||
Reference in New Issue
Block a user