Mirror of https://github.com/royshil/obs-localvocal.git (synced 2026-01-08 20:08:08 -05:00)
refactor: Enable partial transcription with a latency of 1000ms (#141)
* refactor: Enable partial transcription with a latency of 1000ms
* refactor: Update CMakePresets.json and buildspec.json
  - Remove the "QT_VERSION" variable from CMakePresets.json for all platforms
  - Update the "version" of "obs-studio" and "prebuilt" dependencies in buildspec.json
  - Update the "version" of "qt6" dependency in buildspec.json
  - Update the "version" of the project to "0.3.3" in buildspec.json
  - Update the "version" of the project to "0.3.3" in CMakePresets.json
  - Remove unused code in whisper-processing.cpp
* refactor: Add -Wno-error=deprecated-declarations option to compilerconfig.cmake
* refactor: Update language codes in translation module
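At its core, the change adds a second, time-gated inference path: while VAD still reports ongoing speech, the accumulated audio is transcribed early and published as a partial result. A condensed sketch of that gate (simplified from the vad_based_segmentation diff below, not a verbatim excerpt):

```cpp
// Simplified from the vad_based_segmentation() diff below; illustration only.
// While speech is ongoing, fire an extra "partial" inference pass whenever
// more than partial_latency ms (1000 by default) of audio has accumulated
// since the last partial segment.
const uint64_t current_length_ms =
	current_vad_state.end_ts_offset_ms -
	(current_vad_state.last_partial_segment_end_ts > 0
		 ? current_vad_state.last_partial_segment_end_ts
		 : current_vad_state.start_ts_offest_ms);

if (gf->partial_transcription && current_length_ms > (uint64_t)gf->partial_latency) {
	current_vad_state.last_partial_segment_end_ts = current_vad_state.end_ts_offset_ms;
	// partial passes peek at the audio buffer instead of popping it, so the
	// eventual full-segment pass still sees the complete audio
	run_inference_and_callbacks(gf, current_vad_state.start_ts_offest_ms,
				    current_vad_state.end_ts_offset_ms, VAD_STATE_PARTIAL);
}
```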
.gitignore
vendored
1
.gitignore
vendored
@@ -16,7 +16,6 @@
 !LICENSE
 !README.md
 !/vendor
-!patch_libobs.diff
 
 # Exclude lock files
 *.lock.json
CMakePresets.json
@@ -26,9 +26,8 @@
       "rhs": "Darwin"
     },
     "generator": "Xcode",
-    "warnings": { "dev": true, "deprecated": true },
+    "warnings": {"dev": true, "deprecated": true},
     "cacheVariables": {
-      "QT_VERSION": "6",
       "CMAKE_OSX_DEPLOYMENT_TARGET": "11.0",
       "CODESIGN_IDENTITY": "$penv{CODESIGN_IDENT}",
       "CODESIGN_TEAM": "$penv{CODESIGN_TEAM}"
@@ -57,9 +56,8 @@
     },
     "generator": "Visual Studio 17 2022",
     "architecture": "x64",
-    "warnings": { "dev": true, "deprecated": true },
+    "warnings": {"dev": true, "deprecated": true},
     "cacheVariables": {
-      "QT_VERSION": "6",
       "CMAKE_SYSTEM_VERSION": "10.0.18363.657"
     }
   },
@@ -84,9 +82,8 @@
       "rhs": "Linux"
     },
     "generator": "Ninja",
-    "warnings": { "dev": true, "deprecated": true },
+    "warnings": {"dev": true, "deprecated": true},
     "cacheVariables": {
-      "QT_VERSION": "6",
       "CMAKE_BUILD_TYPE": "RelWithDebInfo"
     }
   },
@@ -112,9 +109,8 @@
       "rhs": "Linux"
     },
     "generator": "Ninja",
-    "warnings": { "dev": true, "deprecated": true },
+    "warnings": {"dev": true, "deprecated": true},
     "cacheVariables": {
-      "QT_VERSION": "6",
       "CMAKE_BUILD_TYPE": "RelWithDebInfo"
     }
   },
README.md
@@ -12,12 +12,13 @@
 
 ## Introduction
 
-LocalVocal live-streaming AI assistant plugin allows you to transcribe, locally on your machine, audio speech into text and perform various language processing functions on the text using AI / LLMs (Large Language Models). ✅ No GPU required, ✅ no cloud costs, ✅ no network and ✅ no downtime! Privacy first - all data stays on your machine.
+LocalVocal lets you transcribe, locally on your machine, speech into text and simultaneously translate to any language. ✅ No GPU required, ✅ no cloud costs, ✅ no network and ✅ no downtime! Privacy first - all data stays on your machine.
 
-If this free plugin has been valuable to you consider adding a ⭐ to this GH repo, rating it [on OBS](https://obsproject.com/forum/resources/localvocal-live-stream-ai-assistant.1769/), subscribing to [my YouTube channel](https://www.youtube.com/@royshilk) where I post updates, and supporting my work on [GitHub](https://github.com/sponsors/royshil) or [Patreon](https://www.patreon.com/RoyShilkrot) 🙏
+If this free plugin has been valuable consider adding a ⭐ to this GH repo, rating it [on OBS](https://obsproject.com/forum/resources/localvocal-live-stream-ai-assistant.1769/), subscribing to [my YouTube channel](https://www.youtube.com/@royshilk) where I post updates, and supporting my work on [GitHub](https://github.com/sponsors/royshil), [Patreon](https://www.patreon.com/RoyShilkrot) or [OpenCollective](https://opencollective.com/occ-ai) 🙏
 
-Internally the plugin is running a neural network ([OpenAI Whisper](https://github.com/openai/whisper)) locally to predict in real time the speech and provide captions.
+Internally the plugin is running [OpenAI's Whisper](https://github.com/openai/whisper) to process real-time the speech and predict a transcription.
 It's using the [Whisper.cpp](https://github.com/ggerganov/whisper.cpp) project from [ggerganov](https://github.com/ggerganov) to run the Whisper network efficiently on CPUs and GPUs.
+Translation is done with [CTranslate2](https://github.com/OpenNMT/CTranslate2).
 
 ## Usage
 
@@ -45,9 +46,10 @@ Current Features:
 - Sync'ed captions with OBS recording timestamps
 - Send captions on a RTMP stream to e.g. YouTube, Twitch
 - Bring your own Whisper model (any GGML)
-- Translate captions in real time to major languages (both Whisper built-in translation as well as NMT models with [CTranslate2](https://github.com/OpenNMT/CTranslate2))
+- Translate captions in real time to major languages (both Whisper built-in translation as well as NMT models)
 - CUDA, OpenCL, Apple Arm64, AVX & SSE acceleration support
 - Filter out or replace any part of the produced captions
+- Partial transcriptions for a streaming-captions experience
 
 Roadmap:
 - More robust built-in translation options
@@ -57,22 +59,22 @@ Roadmap:
 Check out our other plugins:
 - [Background Removal](https://github.com/occ-ai/obs-backgroundremoval) removes background from webcam without a green screen.
 - [Detect](https://github.com/occ-ai/obs-detect) will detect and track >80 types of objects in real-time inside OBS
-- [CleanStream](https://github.com/occ-ai/obs-cleanstream) for real-time filler word (uh,um) and profanity removal from live audio stream
+- [CleanStream](https://github.com/occ-ai/obs-cleanstream) for real-time filler word (uh,um) and profanity removal from a live audio stream
 - [URL/API Source](https://github.com/occ-ai/obs-urlsource) that allows fetching live data from an API and displaying it in OBS.
 - [Polyglot](https://github.com/occ-ai/obs-polyglot) translation AI plugin for real-time, local translation to hundreds of languages
 - [Squawk](https://github.com/occ-ai/obs-squawk) adds lifelike local text-to-speech capabilities built-in OBS
 
 ## Download
 Check out the [latest releases](https://github.com/occ-ai/obs-localvocal/releases) for downloads and install instructions.
 
 ### Models
-The plugin ships with the Tiny.en model, and will autonomoously download other bigger Whisper models through a dropdown.
-However there's an option to select an external model file if you have it on disk.
+The plugin ships with the Tiny.en model, and will autonomously download other Whisper models through a dropdown.
+There's also an option to select an external GGML Whisper model file if you have it on disk.
 
-Get more models from https://ggml.ggerganov.com/ and follow [the instructions on whisper.cpp](https://github.com/ggerganov/whisper.cpp/tree/master/models) to create your own models or download others such as distilled models.
+Get more models from https://ggml.ggerganov.com/ and [HuggingFace](https://huggingface.co/ggerganov/whisper.cpp/tree/main), follow [the instructions on whisper.cpp](https://github.com/ggerganov/whisper.cpp/tree/master/models) to create your own models or download others such as distilled models.
 
 ## Building
 
 The plugin was built and tested on Mac OSX (Intel & Apple silicon), Windows (with and without Nvidia CUDA) and Linux.
 
 Start by cloning this repo to a directory of your choice.
 
@@ -172,7 +174,7 @@ The build should exist in the `./release` folder off the root. You can manually
 
 LocalVocal will now build with CUDA support automatically through a prebuilt binary of Whisper.cpp from https://github.com/occ-ai/occ-ai-dep-whispercpp. The CMake scripts will download all necessary files.
 
-To build with cuda add `CPU_OR_CUDA` as an environment variable (with `cpu`, `12.2.0` or `11.8.0`) and build regularly
+To build with cuda add `CPU_OR_CUDA` as an environment variable (with `cpu`, `clblast`, `12.2.0` or `11.8.0`) and build regularly
 
 ```powershell
 > $env:CPU_OR_CUDA="12.2.0"
buildspec.json
@@ -1,33 +1,33 @@
 {
   "dependencies": {
     "obs-studio": {
-      "version": "30.0.2",
+      "version": "30.1.2",
       "baseUrl": "https://github.com/obsproject/obs-studio/archive/refs/tags",
       "label": "OBS sources",
       "hashes": {
-        "macos": "be12c3ad0a85713750d8325e4b1db75086223402d7080d0e3c2833d7c5e83c27",
-        "windows-x64": "970058c49322cfa9cd6d620abb393fed89743ba7e74bd9dbb6ebe0ea8141d9c7"
+        "macos": "490bae1c392b3b344b0270afd8cb887da4bc50bd92c0c426e96713c1ccb9701a",
+        "windows-x64": "c2dd03fa7fd01fad5beafce8f7156da11f9ed9a588373fd40b44a06f4c03b867"
       }
     },
     "prebuilt": {
-      "version": "2023-11-03",
+      "version": "2024-03-19",
       "baseUrl": "https://github.com/obsproject/obs-deps/releases/download",
       "label": "Pre-Built obs-deps",
       "hashes": {
-        "macos": "90c2fc069847ec2768dcc867c1c63b112c615ed845a907dc44acab7a97181974",
-        "windows-x64": "d0825a6fb65822c993a3059edfba70d72d2e632ef74893588cf12b1f0d329ce6"
+        "macos": "2e9bfb55a5e0e4c1086fa1fda4cf268debfead473089df2aaea80e1c7a3ca7ff",
+        "windows-x64": "6e86068371526a967e805f6f9903f9407adb683c21820db5f07da8f30d11e998"
      }
     },
     "qt6": {
-      "version": "2023-11-03",
+      "version": "2024-03-19",
       "baseUrl": "https://github.com/obsproject/obs-deps/releases/download",
       "label": "Pre-Built Qt6",
       "hashes": {
-        "macos": "ba4a7152848da0053f63427a2a2cb0a199af3992997c0db08564df6f48c9db98",
-        "windows-x64": "bc57dedf76b47119a6dce0435a2f21b35b08c8f2948b1cb34a157320f77732d1"
+        "macos": "694f1e639c017e3b1f456f735330dc5afae287cbea85757101af1368de3142c8",
+        "windows-x64": "72d1df34a0ef7413a681d5fcc88cae81da60adc03dcd23ef17862ab170bcc0dd"
       },
       "debugSymbols": {
-        "windows-x64": "fd8ecd1d8cd2ef049d9f4d7fb5c134f784836d6020758094855dfa98bd025036"
+        "windows-x64": "fbddd1f659c360f2291911ac5709b67b6f8182e6bca519d24712e4f6fd3cc865"
       }
     }
   },
@@ -38,7 +38,7 @@
   },
   "name": "obs-localvocal",
   "displayName": "OBS Localvocal",
-  "version": "0.3.2",
+  "version": "0.3.3",
   "author": "Roy Shilkrot",
   "website": "https://github.com/occ-ai/obs-localvocal",
   "email": "roy.shil@gmail.com",
@@ -80,7 +80,8 @@ elseif(WIN32)
   FetchContent_Declare(
     whispercpp_fetch
     URL ${WHISPER_CPP_URL}
-    URL_HASH SHA256=${WHISPER_CPP_HASH})
+    URL_HASH SHA256=${WHISPER_CPP_HASH}
+    DOWNLOAD_EXTRACT_TIMESTAMP TRUE)
   FetchContent_MakeAvailable(whispercpp_fetch)
 
   add_library(Whispercpp::Whisper SHARED IMPORTED)
@@ -104,8 +105,20 @@ elseif(WIN32)
 
   # glob all dlls in the bin directory and install them
   file(GLOB WHISPER_DLLS ${whispercpp_fetch_SOURCE_DIR}/bin/*.dll)
-  install(FILES ${WHISPER_DLLS} DESTINATION "obs-plugins/64bit")
+  foreach(FILE ${WHISPER_DLLS})
+    file(RELATIVE_PATH REL_FILE ${whispercpp_fetch_SOURCE_DIR}/bin ${FILE})
+    set(DEST_DIR "${CMAKE_SOURCE_DIR}/release/${CMAKE_BUILD_TYPE}/obs-plugins/64bit")
+    set(DEST_FILE "${DEST_DIR}/${REL_FILE}")
+
+    if(NOT EXISTS ${DEST_DIR})
+      file(MAKE_DIRECTORY ${DEST_DIR})
+    endif()
+
+    if(NOT EXISTS ${DEST_FILE} OR ${FILE} IS_NEWER_THAN ${DEST_FILE})
+      message(STATUS "Copying ${FILE} to ${DEST_FILE}")
+      file(COPY ${FILE} DESTINATION ${DEST_DIR})
+    endif()
+  endforeach()
 else()
   set(Whispercpp_Build_GIT_TAG "v1.6.2")
   set(WHISPER_EXTRA_CXX_FLAGS "-fPIC")
compilerconfig.cmake
@@ -55,4 +55,4 @@ else()
 endif()
 
 add_compile_definitions($<$<CONFIG:DEBUG>:DEBUG> $<$<CONFIG:DEBUG>:_DEBUG> SIMDE_ENABLE_OPENMP)
-add_compile_options(-Wno-error=newline-eof)
+add_compile_options(-Wno-error=newline-eof -Wno-error=deprecated-declarations -Wno-deprecated-declarations)
@@ -83,3 +83,5 @@ log_group="Logging"
 advanced_group="Advanced Configuration"
 buffered_output_parameters="Buffered Output Configuration"
 file_output_info="Note: Translation output will be saved to a file in the same directory with the target language added to the name, e.g. 'output_es.srt'."
+partial_transcription="Enable Partial Transcription"
+partial_transcription_info="Partial transcription will increase processing load on your machine to transcribe content in real-time, which may impact performance."
patch_libobs.diff (deleted)
@@ -1,20 +0,0 @@
-diff --git a/libobs/CMakeLists.txt b/libobs/CMakeLists.txt
-index d2e2671..5a9242a 100644
---- a/libobs/CMakeLists.txt
-+++ b/libobs/CMakeLists.txt
-@@ -263,6 +263,7 @@ set(public_headers
-  graphics/vec3.h
-  graphics/vec4.h
-  media-io/audio-io.h
-+  media-io/audio-resampler.h
-  media-io/frame-rate.h
-  media-io/media-io-defs.h
-  media-io/video-io.h
-@@ -287,6 +288,7 @@ set(public_headers
-  util/base.h
-  util/bmem.h
-  util/c99defs.h
-+  util/circlebuf.h
-  util/darray.h
-  util/profiler.h
-  util/sse-intrin.h
@@ -188,7 +188,8 @@ void set_text_callback(struct transcription_filter_data *gf,
 			       const DetectionResultWithText &resultIn)
 {
 	DetectionResultWithText result = resultIn;
-	if (!result.text.empty() && result.result == DETECTION_RESULT_SPEECH) {
+	if (!result.text.empty() && (result.result == DETECTION_RESULT_SPEECH ||
+				     result.result == DETECTION_RESULT_PARTIAL)) {
 		gf->last_sub_render_time = now_ms();
 		gf->cleared_last_sub = false;
 	}
@@ -231,7 +232,10 @@ void set_text_callback(struct transcription_filter_data *gf,
 			str_copy = translated_sentence;
 		} else {
 			if (gf->buffered_output) {
-				gf->translation_monitor.addSentence(translated_sentence);
+				if (result.result == DETECTION_RESULT_SPEECH) {
+					// buffered output - add the sentence to the monitor
+					gf->translation_monitor.addSentence(translated_sentence);
+				}
 			} else {
 				// non-buffered output - send the sentence to the selected source
 				send_caption_to_source(gf->translation_output, translated_sentence,
@@ -241,17 +245,20 @@ void set_text_callback(struct transcription_filter_data *gf,
 	}
 
 	if (gf->buffered_output) {
-		gf->captions_monitor.addSentence(str_copy);
+		if (result.result == DETECTION_RESULT_SPEECH) {
+			gf->captions_monitor.addSentence(str_copy);
+		}
 	} else {
 		// non-buffered output - send the sentence to the selected source
 		send_caption_to_source(gf->text_source_name, str_copy, gf);
 	}
 
-	if (gf->caption_to_stream) {
+	if (gf->caption_to_stream && result.result == DETECTION_RESULT_SPEECH) {
 		send_caption_to_stream(result, str_copy, gf);
 	}
 
-	if (gf->save_to_file && gf->output_file_path != "") {
+	if (gf->save_to_file && gf->output_file_path != "" &&
+	    result.result == DETECTION_RESULT_SPEECH) {
 		send_sentence_to_file(gf, result, str_copy, translated_sentence);
 	}
 };
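Taken together, these set_text_callback changes implement a routing policy: partial results only refresh the live text source, while final (DETECTION_RESULT_SPEECH) results additionally feed the buffered-output monitors, the stream captions, and the output file. In outline (a paraphrase of the diff above, not additional plugin code):

```cpp
// Paraphrase of the routing above; illustration only.
const bool is_final = (result.result == DETECTION_RESULT_SPEECH);

if (gf->buffered_output) {
	if (is_final)
		gf->captions_monitor.addSentence(str_copy); // buffer finals only
} else {
	send_caption_to_source(gf->text_source_name, str_copy, gf); // partials too
}
if (gf->caption_to_stream && is_final)
	send_caption_to_stream(result, str_copy, gf); // never stream partials
if (gf->save_to_file && gf->output_file_path != "" && is_final)
	send_sentence_to_file(gf, result, str_copy, translated_sentence);
```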
@@ -291,8 +298,10 @@ void reset_caption_state(transcription_filter_data *gf_)
 {
 	if (gf_->captions_monitor.isEnabled()) {
 		gf_->captions_monitor.clear();
+		gf_->translation_monitor.clear();
 	}
 	send_caption_to_source(gf_->text_source_name, "", gf_);
+	send_caption_to_source(gf_->translation_output, "", gf_);
 	// flush the buffer
 	{
 		std::lock_guard<std::mutex> lock(gf_->whisper_buf_mutex);
@@ -326,6 +335,7 @@ void media_started_callback(void *data_, calldata_t *cd)
 	gf_->active = true;
 	reset_caption_state(gf_);
 }
 
 void media_pause_callback(void *data_, calldata_t *cd)
 {
 	UNUSED_PARAMETER(cd);
@@ -333,6 +343,7 @@ void media_pause_callback(void *data_, calldata_t *cd)
 	obs_log(gf_->log_level, "media_pause");
 	gf_->active = false;
 }
 
 void media_restart_callback(void *data_, calldata_t *cd)
 {
 	UNUSED_PARAMETER(cd);
@@ -341,6 +352,7 @@ void media_restart_callback(void *data_, calldata_t *cd)
 	gf_->active = true;
 	reset_caption_state(gf_);
 }
 
 void media_stopped_callback(void *data_, calldata_t *cd)
 {
 	UNUSED_PARAMETER(cd);
@@ -81,6 +81,8 @@ struct transcription_filter_data {
 	bool enable_audio_chunks_callback = false;
 	bool source_signals_set = false;
 	bool initial_creation = true;
+	bool partial_transcription = false;
+	int partial_latency = 1000;
 
 	// Last transcription result
 	std::string last_text;
@@ -46,8 +46,9 @@ bool advanced_settings_callback(obs_properties_t *props, obs_property_t *property,
 	UNUSED_PARAMETER(property);
 	// If advanced settings is enabled, show the advanced settings group
 	const bool show_hide = obs_data_get_int(settings, "advanced_settings_mode") == 1;
-	for (const std::string &prop_name : {"whisper_params_group", "buffered_output_group",
-					     "log_group", "advanced_group", "file_output_enable"}) {
+	for (const std::string &prop_name :
+	     {"whisper_params_group", "buffered_output_group", "log_group", "advanced_group",
+	      "file_output_enable", "partial_group"}) {
 		obs_property_set_visible(obs_properties_get(props, prop_name.c_str()), show_hide);
 	}
 	translation_options_callback(props, NULL, settings);
@@ -457,6 +458,22 @@ void add_general_group_properties(obs_properties_t *ppts)
 	}
 }
 
+void add_partial_group_properties(obs_properties_t *ppts)
+{
+	// add a group for partial transcription
+	obs_properties_t *partial_group = obs_properties_create();
+	obs_properties_add_group(ppts, "partial_group", MT_("partial_transcription"),
+				 OBS_GROUP_CHECKABLE, partial_group);
+
+	// add text info
+	obs_properties_add_text(partial_group, "partial_info", MT_("partial_transcription_info"),
+				OBS_TEXT_INFO);
+
+	// add slider for partial latency
+	obs_properties_add_int_slider(partial_group, "partial_latency", MT_("partial_latency"), 500,
+				      3000, 50);
+}
+
 obs_properties_t *transcription_filter_properties(void *data)
 {
 	struct transcription_filter_data *gf =
@@ -480,6 +497,7 @@ obs_properties_t *transcription_filter_properties(void *data)
 	add_buffered_output_group_properties(ppts);
 	add_advanced_group_properties(ppts, gf);
 	add_logging_group_properties(ppts);
+	add_partial_group_properties(ppts);
 	add_whisper_params_group_properties(ppts);
 
 	// Add an informative text about the plugin
@@ -105,13 +105,8 @@ struct obs_audio_data *transcription_filter_filter_audio(void *data, struct obs_audio_data *audio)
 
 		// push audio packet info (timestamp/frame count) to info circlebuf
 		struct transcription_filter_audio_info info = {0};
 		info.frames = audio->frames; // number of frames in this packet
-		// check if the timestamp is a false "negative" value for uint64_t
-		if (audio->timestamp > (std::numeric_limits<uint64_t>::max() - 100000000)) {
-			// set the timestamp to the current time
-			info.timestamp_offset_ns = 0;
-		} else {
-			info.timestamp_offset_ns = audio->timestamp; // timestamp of this packet
-		}
+		// calculate timestamp offset from the start of the stream
+		info.timestamp_offset_ns = now_ns() - gf->start_timestamp_ms * 1000000;
 		circlebuf_push_back(&gf->info_buffer, &info, sizeof(info));
 	}
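Note the units in the new offset computation: `now_ns()` returns nanoseconds since epoch while `gf->start_timestamp_ms` is in milliseconds, hence the `* 1000000`. A quick check with illustrative numbers:

```cpp
// Illustrative values only, to verify the unit conversion above.
// stream started at start_timestamp_ms = 1'000'000 (ms since epoch)
// now_ns() returns 1'000'500 * 1'000'000 ns, i.e. 500 ms later
// offset = 1'000'500'000'000 - 1'000'000 * 1'000'000 = 500'000'000 ns = 500 ms
info.timestamp_offset_ns = now_ns() - gf->start_timestamp_ms * 1000000;
```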
@@ -190,6 +185,8 @@ void transcription_filter_update(void *data, obs_data_t *s)
 	gf->process_while_muted = obs_data_get_bool(s, "process_while_muted");
 	gf->min_sub_duration = (int)obs_data_get_int(s, "min_sub_duration");
 	gf->last_sub_render_time = now_ms();
+	gf->partial_transcription = obs_data_get_bool(s, "partial_group");
+	gf->partial_latency = (int)obs_data_get_int(s, "partial_latency");
 	bool new_buffered_output = obs_data_get_bool(s, "buffered_output");
 	int new_buffer_num_lines = (int)obs_data_get_int(s, "buffer_num_lines");
 	int new_buffer_num_chars_per_line = (int)obs_data_get_int(s, "buffer_num_chars_per_line");
@@ -584,6 +581,8 @@ void transcription_filter_defaults(obs_data_t *s)
 	obs_data_set_default_string(s, "translation_model_path_external", "");
 	obs_data_set_default_int(s, "translate_input_tokenization_style", INPUT_TOKENIZAION_M2M100);
 	obs_data_set_default_double(s, "sentence_psum_accept_thresh", 0.4);
+	obs_data_set_default_bool(s, "partial_group", false);
+	obs_data_set_default_int(s, "partial_latency", 1100);
 
 	// translation options
 	obs_data_set_default_double(s, "translation_sampling_temperature", 0.1);
@@ -24,6 +24,14 @@ inline uint64_t now_ms()
 		.count();
 }
 
+// Get the current timestamp in nano seconds since epoch
+inline uint64_t now_ns()
+{
+	return std::chrono::duration_cast<std::chrono::nanoseconds>(
+		       std::chrono::system_clock::now().time_since_epoch())
+		.count();
+}
+
 // Split a string into words based on spaces
 std::vector<std::string> split_words(const std::string &str_copy);
@@ -63,7 +63,8 @@ void update_whisper_model(struct transcription_filter_data *gf)
 		obs_log(gf->log_level, "model path changed from %s to %s",
 			gf->whisper_model_path.c_str(), new_model_path.c_str());
 
-		gf->whisper_model_loaded_new = true;
+		// check if this is loading the initial model or a switch
+		gf->whisper_model_loaded_new = !gf->whisper_model_path.empty();
 	}
 
 	// check if the new model is external file
@@ -24,6 +24,7 @@ struct vad_state {
 	bool vad_on;
 	uint64_t start_ts_offest_ms;
 	uint64_t end_ts_offset_ms;
+	uint64_t last_partial_segment_end_ts;
 };
 
 struct whisper_context *init_whisper_context(const std::string &model_path_in,
@@ -126,18 +127,25 @@ struct whisper_context *init_whisper_context(const std::string &model_path_in,
 
 struct DetectionResultWithText run_whisper_inference(struct transcription_filter_data *gf,
 						     const float *pcm32f_data_,
-						     size_t pcm32f_num_samples,
-						     uint64_t start_offset_ms = 0,
-						     uint64_t end_offset_ms = 0)
+						     size_t pcm32f_num_samples, uint64_t t0 = 0,
+						     uint64_t t1 = 0,
+						     int vad_state = VAD_STATE_WAS_OFF)
 {
 	if (gf == nullptr) {
 		obs_log(LOG_ERROR, "run_whisper_inference: gf is null");
-		return {DETECTION_RESULT_UNKNOWN, "", start_offset_ms, end_offset_ms, {}, ""};
+		return {DETECTION_RESULT_UNKNOWN, "", t0, t1, {}, ""};
 	}
 
 	if (pcm32f_data_ == nullptr || pcm32f_num_samples == 0) {
 		obs_log(LOG_ERROR, "run_whisper_inference: pcm32f_data is null or size is 0");
-		return {DETECTION_RESULT_UNKNOWN, "", start_offset_ms, end_offset_ms, {}, ""};
+		return {DETECTION_RESULT_UNKNOWN, "", t0, t1, {}, ""};
 	}
 
+	// if the time difference between t0 and t1 is less than 50 ms - skip
+	if (t1 - t0 < 50) {
+		obs_log(gf->log_level,
+			"Time difference between t0 and t1 is less than 50 ms, skipping");
+		return {DETECTION_RESULT_UNKNOWN, "", t0, t1, {}, ""};
+	}
+
 	obs_log(gf->log_level, "%s: processing %d samples, %.3f sec, %d threads", __func__,
@@ -163,8 +171,6 @@ struct DetectionResultWithText run_whisper_inference(struct transcription_filter_data *gf,
 
 	// duration in ms
 	const uint64_t duration_ms = (uint64_t)(pcm32f_size * 1000 / WHISPER_SAMPLE_RATE);
-	const uint64_t t0 = start_offset_ms;
-	const uint64_t t1 = end_offset_ms;
 
 	std::lock_guard<std::mutex> lock(gf->whisper_ctx_mutex);
 	if (gf->whisper_context == nullptr) {
@@ -202,86 +208,85 @@ struct DetectionResultWithText run_whisper_inference(struct transcription_filter_data *gf,
 	if (whisper_full_result != 0) {
 		obs_log(LOG_WARNING, "failed to process audio, error %d", whisper_full_result);
 		return {DETECTION_RESULT_UNKNOWN, "", t0, t1, {}, ""};
-	} else {
-		float sentence_p = 0.0f;
-		std::string text = "";
-		std::string tokenIds = "";
-		std::vector<whisper_token_data> tokens;
-		for (int n_segment = 0; n_segment < whisper_full_n_segments(gf->whisper_context);
-		     ++n_segment) {
-			const int n_tokens = whisper_full_n_tokens(gf->whisper_context, n_segment);
-			for (int j = 0; j < n_tokens; ++j) {
-				// get token
-				whisper_token_data token = whisper_full_get_token_data(
-					gf->whisper_context, n_segment, j);
-				const char *token_str =
-					whisper_token_to_str(gf->whisper_context, token.id);
-				bool keep = true;
-				// if the token starts with '[' and ends with ']', don't keep it
-				if (token_str[0] == '[' &&
-				    token_str[strlen(token_str) - 1] == ']') {
-					keep = false;
-				}
-				// if this is a special token, don't keep it
-				if (token.id >= 50256) {
-					keep = false;
-				}
-				// if the second to last token is .id == 13 ('.'), don't keep it
-				if (j == n_tokens - 2 && token.id == 13) {
-					keep = false;
-				}
-				// token ids https://huggingface.co/openai/whisper-large-v3/raw/main/tokenizer.json
-				if (token.id > 50365 && token.id <= 51865) {
-					const float time = ((float)token.id - 50365.0f) * 0.02f;
-					const float duration_s = (float)duration_ms / 1000.0f;
-					const float ratio = std::max(time, duration_s) /
-							    std::min(time, duration_s);
-					obs_log(gf->log_level,
-						"Time token found %d -> %.3f. Duration: %.3f. Ratio: %.3f.",
-						token.id, time, duration_s, ratio);
-					if (ratio > 3.0f) {
-						// ratio is too high, skip this detection
-						obs_log(gf->log_level,
-							"Time token ratio too high, skipping");
-						return {DETECTION_RESULT_SILENCE,
-							"",
-							t0,
-							t1,
-							{},
-							language};
-					}
-					keep = false;
-				}
-
-				if (keep) {
-					sentence_p += token.p;
-					text += token_str;
-					tokens.push_back(token);
-				}
-				obs_log(gf->log_level, "S %d, Token %d: %d\t%s\tp: %.3f [keep: %d]",
-					n_segment, j, token.id, token_str, token.p, keep);
-			}
-		}
-		sentence_p /= (float)tokens.size();
-		if (sentence_p < gf->sentence_psum_accept_thresh) {
-			obs_log(gf->log_level, "Sentence psum %.3f below threshold %.3f, skipping",
-				sentence_p, gf->sentence_psum_accept_thresh);
-			return {DETECTION_RESULT_SILENCE, "", t0, t1, {}, language};
-		}
-
-		obs_log(gf->log_level, "Decoded sentence: '%s'", text.c_str());
-
-		if (gf->log_words) {
-			obs_log(LOG_INFO, "[%s --> %s] (%.3f) %s", to_timestamp(t0).c_str(),
-				to_timestamp(t1).c_str(), sentence_p, text.c_str());
-		}
-
-		if (text.empty() || text == "." || text == " " || text == "\n") {
-			return {DETECTION_RESULT_SILENCE, "", t0, t1, {}, language};
-		}
-
-		return {DETECTION_RESULT_SPEECH, text, t0, t1, tokens, language};
-	}
+	}
+
+	float sentence_p = 0.0f;
+	std::string text = "";
+	std::string tokenIds = "";
+	std::vector<whisper_token_data> tokens;
+	for (int n_segment = 0; n_segment < whisper_full_n_segments(gf->whisper_context);
+	     ++n_segment) {
+		const int n_tokens = whisper_full_n_tokens(gf->whisper_context, n_segment);
+		for (int j = 0; j < n_tokens; ++j) {
+			// get token
+			whisper_token_data token =
+				whisper_full_get_token_data(gf->whisper_context, n_segment, j);
+			const char *token_str = whisper_token_to_str(gf->whisper_context, token.id);
+			bool keep = true;
+			// if the token starts with '[' and ends with ']', don't keep it
+			if (token_str[0] == '[' && token_str[strlen(token_str) - 1] == ']') {
+				keep = false;
+			}
+			// if this is a special token, don't keep it
+			if (token.id >= 50256) {
+				keep = false;
+			}
+			// if the second to last token is .id == 13 ('.'), don't keep it
+			if (j == n_tokens - 2 && token.id == 13) {
+				keep = false;
+			}
+			// token ids https://huggingface.co/openai/whisper-large-v3/raw/main/tokenizer.json
+			if (token.id > 50365 && token.id <= 51865) {
+				const float time = ((float)token.id - 50365.0f) * 0.02f;
+				const float duration_s = (float)duration_ms / 1000.0f;
+				const float ratio =
+					std::max(time, duration_s) / std::min(time, duration_s);
+				obs_log(gf->log_level,
+					"Time token found %d -> %.3f. Duration: %.3f. Ratio: %.3f.",
+					token.id, time, duration_s, ratio);
+				if (ratio > 3.0f) {
+					// ratio is too high, skip this detection
+					obs_log(gf->log_level,
+						"Time token ratio too high, skipping");
+					return {DETECTION_RESULT_SILENCE, "", t0, t1, {}, language};
+				}
+				keep = false;
+			}
+
+			if (keep) {
+				sentence_p += token.p;
+				text += token_str;
+				tokens.push_back(token);
+			}
+			obs_log(gf->log_level, "S %d, Token %d: %d\t%s\tp: %.3f [keep: %d]",
+				n_segment, j, token.id, token_str, token.p, keep);
+		}
+	}
+	sentence_p /= (float)tokens.size();
+	if (sentence_p < gf->sentence_psum_accept_thresh) {
+		obs_log(gf->log_level, "Sentence psum %.3f below threshold %.3f, skipping",
+			sentence_p, gf->sentence_psum_accept_thresh);
+		return {DETECTION_RESULT_SILENCE, "", t0, t1, {}, language};
+	}
+
+	obs_log(gf->log_level, "Decoded sentence: '%s'", text.c_str());
+
+	if (gf->log_words) {
+		obs_log(LOG_INFO, "[%s --> %s]%s(%.3f) %s", to_timestamp(t0).c_str(),
+			to_timestamp(t1).c_str(), vad_state == VAD_STATE_PARTIAL ? "P" : " ",
+			sentence_p, text.c_str());
+	}
+
+	if (text.empty() || text == "." || text == " " || text == "\n") {
+		return {DETECTION_RESULT_SILENCE, "", t0, t1, {}, language};
+	}
+
+	return {vad_state == VAD_STATE_PARTIAL ? DETECTION_RESULT_PARTIAL : DETECTION_RESULT_SPEECH,
+		text,
+		t0,
+		t1,
+		tokens,
+		language};
 }
 
 void run_inference_and_callbacks(transcription_filter_data *gf, uint64_t start_offset_ms,
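The time-token arithmetic above is worth a worked example. Whisper encodes timestamp tokens in the id range 50365..51865, each step worth 20 ms, so `time = (id - 50365) * 0.02` seconds. If the decoded time is wildly out of proportion to the amount of audio actually fed in, the result is treated as a hallucination and suppressed:

```cpp
// Worked example of the ratio heuristic (illustrative numbers, not a real run).
const int token_id = 51865;                    // largest timestamp token
const float time = (token_id - 50365) * 0.02f; // -> 30.0 s decoded position
const float duration_s = 2.0f;                 // but only 2 s of audio was fed in
const float ratio = std::max(time, duration_s) / std::min(time, duration_s); // 15.0
// ratio > 3.0f, so the segment is returned as DETECTION_RESULT_SILENCE
// instead of being shown as a caption.
```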
@@ -293,15 +298,22 @@ void run_inference_and_callbacks(transcription_filter_data *gf, uint64_t start_offset_ms,
 	const size_t pcm32f_size_with_silence = pcm32f_size + 2 * WHISPER_SAMPLE_RATE / 100;
 	// allocate a new buffer and copy the data to it
 	float *pcm32f_data = (float *)bzalloc(pcm32f_size_with_silence * sizeof(float));
-	circlebuf_pop_back(&gf->whisper_buffer, pcm32f_data + WHISPER_SAMPLE_RATE / 100,
-			   pcm32f_size * sizeof(float));
+	if (vad_state == VAD_STATE_PARTIAL) {
+		// peek instead of pop, since this is a partial run that keeps the data in the buffer
+		circlebuf_peek_back(&gf->whisper_buffer, pcm32f_data + WHISPER_SAMPLE_RATE / 100,
+				    pcm32f_size * sizeof(float));
+	} else {
+		circlebuf_pop_back(&gf->whisper_buffer, pcm32f_data + WHISPER_SAMPLE_RATE / 100,
+				   pcm32f_size * sizeof(float));
+	}
 
-	struct DetectionResultWithText inference_result = run_whisper_inference(
-		gf, pcm32f_data, pcm32f_size_with_silence, start_offset_ms, end_offset_ms);
+	struct DetectionResultWithText inference_result =
+		run_whisper_inference(gf, pcm32f_data, pcm32f_size_with_silence, start_offset_ms,
+				      end_offset_ms, vad_state);
 	// output inference result to a text source
 	set_text_callback(gf, inference_result);
 
-	if (gf->enable_audio_chunks_callback) {
+	if (gf->enable_audio_chunks_callback && vad_state != VAD_STATE_PARTIAL) {
 		audio_chunk_callback(gf, pcm32f_data, pcm32f_size_with_silence, vad_state,
 				     inference_result);
 	}
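The peek-versus-pop split above is what keeps partial mode non-destructive: a partial pass copies the buffered audio out with circlebuf_peek_back and leaves it in place, so when VAD eventually closes the segment the final pass still pops the complete audio. Without it, each partial run would consume the head of the segment and the final transcription would only see the tail.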
@@ -355,6 +367,15 @@ vad_state vad_based_segmentation(transcription_filter_data *gf, vad_state last_vad_state)
 	}
 	end_timestamp_offset_ns = info_from_buf.timestamp_offset_ns;
 
+	if (start_timestamp_offset_ns > end_timestamp_offset_ns) {
+		// this may happen when the incoming media has a timestamp reset
+		// in this case, we should figure out the start timestamp from the end timestamp
+		// and the number of frames
+		start_timestamp_offset_ns =
+			end_timestamp_offset_ns -
+			num_frames_from_infos * 1000000000 / gf->sample_rate;
+	}
+
 	/* Pop from input circlebuf */
 	for (size_t c = 0; c < gf->channels; c++) {
 		// Push the new data to copy_buffers[c]
@@ -388,7 +409,8 @@ vad_state vad_based_segmentation(transcription_filter_data *gf, vad_state last_vad_state)
 	const uint64_t start_ts_offset_ms = start_timestamp_offset_ns / 1000000;
 	const uint64_t end_ts_offset_ms = end_timestamp_offset_ns / 1000000;
 
-	vad_state current_vad_state = {false, start_ts_offset_ms, end_ts_offset_ms};
+	vad_state current_vad_state = {false, start_ts_offset_ms, end_ts_offset_ms,
+				       last_vad_state.last_partial_segment_end_ts};
 
 	std::vector<timestamp_t> stamps = gf->vad->get_speech_timestamps();
 	if (stamps.size() == 0) {
@@ -399,6 +421,7 @@ vad_state vad_based_segmentation(transcription_filter_data *gf, vad_state last_vad_state)
 			run_inference_and_callbacks(gf, last_vad_state.start_ts_offest_ms,
 						    last_vad_state.end_ts_offset_ms,
 						    VAD_STATE_WAS_ON);
+			current_vad_state.last_partial_segment_end_ts = 0;
 		}
 
 		if (gf->enable_audio_chunks_callback) {
@@ -410,69 +433,96 @@ vad_state vad_based_segmentation(transcription_filter_data *gf, vad_state last_vad_state)
 						      current_vad_state.end_ts_offset_ms,
 						      {}});
 		}
-	} else {
-		// process vad segments
-		for (size_t i = 0; i < stamps.size(); i++) {
-			int start_frame = stamps[i].start;
-			if (i > 0) {
-				// if this is not the first segment, start from the end of the previous segment
-				start_frame = stamps[i - 1].end;
-			} else {
-				// take at least 100ms of audio before the first speech segment, if available
-				start_frame = std::max(0, start_frame - WHISPER_SAMPLE_RATE / 10);
-			}
-
-			int end_frame = stamps[i].end;
-			if (i == stamps.size() - 1 && stamps[i].end < (int)resampled_16khz_frames) {
-				// take at least 100ms of audio after the last speech segment, if available
-				end_frame = std::min(end_frame + WHISPER_SAMPLE_RATE / 10,
-						     (int)resampled_16khz_frames);
-			}
-
-			const int number_of_frames = end_frame - start_frame;
-			// push the data into gf-whisper_buffer
-			circlebuf_push_back(&gf->whisper_buffer, resampled_16khz[0] + start_frame,
-					    number_of_frames * sizeof(float));
-			obs_log(gf->log_level,
-				"VAD segment %d. pushed %d to %d (%d frames / %lu ms). current size: %lu bytes / %lu frames / %lu ms",
-				i, start_frame, end_frame, number_of_frames,
-				number_of_frames * 1000 / WHISPER_SAMPLE_RATE,
-				gf->whisper_buffer.size, gf->whisper_buffer.size / sizeof(float),
-				gf->whisper_buffer.size / sizeof(float) * 1000 /
-					WHISPER_SAMPLE_RATE);
-
-			// if the segment is in the middle of the buffer, send it to inference
-			if (stamps[i].end < (int)resampled_16khz_frames) {
-				// new "ending" segment (not up to the end of the buffer)
-				obs_log(gf->log_level, "VAD segment end -> send to inference");
-				// find the end timestamp of the segment
-				const uint64_t segment_end_ts =
-					start_ts_offset_ms + end_frame * 1000 / WHISPER_SAMPLE_RATE;
-				run_inference_and_callbacks(gf, last_vad_state.start_ts_offest_ms,
-							    segment_end_ts,
-							    last_vad_state.vad_on
-								    ? VAD_STATE_WAS_ON
-								    : VAD_STATE_WAS_OFF);
-				current_vad_state.vad_on = false;
-				current_vad_state.start_ts_offest_ms =
-					current_vad_state.end_ts_offset_ms;
-				current_vad_state.end_ts_offset_ms = 0;
-			} else {
-				current_vad_state.vad_on = true;
-				if (last_vad_state.vad_on) {
-					current_vad_state.start_ts_offest_ms =
-						last_vad_state.start_ts_offest_ms;
-				} else {
-					current_vad_state.start_ts_offest_ms =
-						start_ts_offset_ms +
-						start_frame * 1000 / WHISPER_SAMPLE_RATE;
-				}
-				obs_log(gf->log_level,
-					"end not reached. vad state: start ts: %llu, end ts: %llu",
-					current_vad_state.start_ts_offest_ms,
-					current_vad_state.end_ts_offset_ms);
-			}
-		}
-	}
+
+		return current_vad_state;
+	}
+
+	// process vad segments
+	for (size_t i = 0; i < stamps.size(); i++) {
+		int start_frame = stamps[i].start;
+		if (i > 0) {
+			// if this is not the first segment, start from the end of the previous segment
+			start_frame = stamps[i - 1].end;
+		} else {
+			// take at least 100ms of audio before the first speech segment, if available
+			start_frame = std::max(0, start_frame - WHISPER_SAMPLE_RATE / 10);
+		}
+
+		int end_frame = stamps[i].end;
+		if (i == stamps.size() - 1 && stamps[i].end < (int)resampled_16khz_frames) {
+			// take at least 100ms of audio after the last speech segment, if available
+			end_frame = std::min(end_frame + WHISPER_SAMPLE_RATE / 10,
+					     (int)resampled_16khz_frames);
+		}
+
+		const int number_of_frames = end_frame - start_frame;
+
+		// push the data into gf-whisper_buffer
+		circlebuf_push_back(&gf->whisper_buffer, resampled_16khz[0] + start_frame,
+				    number_of_frames * sizeof(float));
+
+		obs_log(gf->log_level,
+			"VAD segment %d. pushed %d to %d (%d frames / %lu ms). current size: %lu bytes / %lu frames / %lu ms",
+			i, start_frame, end_frame, number_of_frames,
+			number_of_frames * 1000 / WHISPER_SAMPLE_RATE, gf->whisper_buffer.size,
+			gf->whisper_buffer.size / sizeof(float),
+			gf->whisper_buffer.size / sizeof(float) * 1000 / WHISPER_SAMPLE_RATE);
+
+		// segment "end" is in the middle of the buffer, send it to inference
+		if (stamps[i].end < (int)resampled_16khz_frames) {
+			// new "ending" segment (not up to the end of the buffer)
+			obs_log(gf->log_level, "VAD segment end -> send to inference");
+			// find the end timestamp of the segment
+			const uint64_t segment_end_ts =
+				start_ts_offset_ms + end_frame * 1000 / WHISPER_SAMPLE_RATE;
+			run_inference_and_callbacks(
+				gf, last_vad_state.start_ts_offest_ms, segment_end_ts,
+				last_vad_state.vad_on ? VAD_STATE_WAS_ON : VAD_STATE_WAS_OFF);
+			current_vad_state.vad_on = false;
+			current_vad_state.start_ts_offest_ms = current_vad_state.end_ts_offset_ms;
+			current_vad_state.end_ts_offset_ms = 0;
+			current_vad_state.last_partial_segment_end_ts = 0;
+			last_vad_state = current_vad_state;
+			continue;
+		}
+
+		// end not reached - speech is ongoing
+		current_vad_state.vad_on = true;
+		if (last_vad_state.vad_on) {
+			current_vad_state.start_ts_offest_ms = last_vad_state.start_ts_offest_ms;
+		} else {
+			current_vad_state.start_ts_offest_ms =
+				start_ts_offset_ms + start_frame * 1000 / WHISPER_SAMPLE_RATE;
+		}
+		obs_log(gf->log_level, "end not reached. vad state: start ts: %llu, end ts: %llu",
+			current_vad_state.start_ts_offest_ms, current_vad_state.end_ts_offset_ms);
+
+		last_vad_state = current_vad_state;
+
+		// if partial transcription is enabled, check if we should send a partial segment
+		if (!gf->partial_transcription) {
+			continue;
+		}
+
+		// current length of audio in buffer
+		const uint64_t current_length_ms =
+			(current_vad_state.end_ts_offset_ms > 0
+				 ? current_vad_state.end_ts_offset_ms
+				 : current_vad_state.start_ts_offest_ms) -
+			(current_vad_state.last_partial_segment_end_ts > 0
+				 ? current_vad_state.last_partial_segment_end_ts
+				 : current_vad_state.start_ts_offest_ms);
+		obs_log(gf->log_level, "current buffer length after last partial (%lu): %lu ms",
+			current_vad_state.last_partial_segment_end_ts, current_length_ms);
+
+		if (current_length_ms > (uint64_t)gf->partial_latency) {
+			current_vad_state.last_partial_segment_end_ts =
+				current_vad_state.end_ts_offset_ms;
+			// send partial segment to inference
+			obs_log(gf->log_level, "Partial segment -> send to inference");
+			run_inference_and_callbacks(gf, current_vad_state.start_ts_offest_ms,
+						    current_vad_state.end_ts_offset_ms,
+						    VAD_STATE_PARTIAL);
+		}
+	}
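To make the gate concrete: with `partial_latency` at 1000 ms, suppose speech starts at offset 5000 ms. Once the segment end passes 6000 ms the accumulated length exceeds the threshold, a partial pass fires and `last_partial_segment_end_ts` is set to that end offset; the next partial fires only after another ~1000 ms of ongoing speech. Partial captions therefore refresh roughly once per second while the final result still arrives when VAD closes the segment.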
@@ -491,7 +541,7 @@ void whisper_loop(void *data)
 
 	obs_log(gf->log_level, "Starting whisper thread");
 
-	vad_state current_vad_state = {false, 0, 0};
+	vad_state current_vad_state = {false, 0, 0, 0};
 	// 500 ms worth of audio is needed for VAD segmentation
 	uint32_t min_num_bytes_for_vad = (gf->sample_rate / 2) * sizeof(float);
@@ -17,6 +17,7 @@ enum DetectionResult {
 	DETECTION_RESULT_SPEECH = 2,
 	DETECTION_RESULT_SUPPRESSED = 3,
 	DETECTION_RESULT_NO_INFERENCE = 4,
+	DETECTION_RESULT_PARTIAL = 5,
 };
 
 struct DetectionResultWithText {
@@ -28,7 +29,7 @@ struct DetectionResultWithText {
 	std::string language;
 };
 
-enum VadState { VAD_STATE_WAS_ON = 0, VAD_STATE_WAS_OFF, VAD_STATE_IS_OFF };
+enum VadState { VAD_STATE_WAS_ON = 0, VAD_STATE_WAS_OFF, VAD_STATE_IS_OFF, VAD_STATE_PARTIAL };
 
 void whisper_loop(void *data);
 struct whisper_context *init_whisper_context(const std::string &model_path,