mirror of
https://github.com/royshil/obs-localvocal.git
synced 2026-01-10 04:48:02 -05:00
Offline test improvements (#150)
* Look at the front of the whisper buffer instead of the back
This should rarely make a difference in practice, but it feels
semantically more correct
* Initialize `resampled_buffer` for offline tests
* Read relevant audio bytes
There are two issues here:
1. `linesize` may contain padding (this didn't happen in my tests)
2. Only `linesize[0]` is guaranteed to be set for audio; from 2b5f000d3f:/libavutil/frame.h#l405:
> For audio, only linesize[0] may be set. For planar audio, each
> channel plane must be the same size.
* Log running time in addition to local time
* Run whisper test "as fast as possible"
This behaves roughly like libobs: each chunk of audio is
inspected individually by VAD/whisper until processing by either
one takes longer than the window length, in which case audio
continues to stream in
* Only ever send a single chunk of audio
* Add additional files to tests copy command
* Use condition variable to signal input thread if available
* Only wait in whisper thread if input buffers are empty
This commit is contained in:
@@ -108,7 +108,8 @@ read_audio_file(const char *filename, std::function<void(int, int)> initializati
|
||||
for (int j = 0; j < codecContext->channels; j++) {
|
||||
buffer[j].insert(buffer[j].end(), frame->data[j],
|
||||
frame->data[j] +
|
||||
frame->linesize[0]);
|
||||
frame->nb_samples *
|
||||
sizeof(float));
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -20,21 +20,23 @@ $obsDlls = @(
|
||||
".\release\Release\obs-plugins\64bit\onnxruntime_providers_shared.dll",
|
||||
".\release\Release\obs-plugins\64bit\onnxruntime.dll",
|
||||
".\release\Release\obs-plugins\64bit\whisper.dll",
|
||||
".deps\obs-deps-2023-11-03-x64\bin\avcodec-60.dll",
|
||||
".deps\obs-deps-2023-11-03-x64\bin\avdevice-60.dll",
|
||||
".deps\obs-deps-2023-11-03-x64\bin\avfilter-9.dll",
|
||||
".deps\obs-deps-2023-11-03-x64\bin\avformat-60.dll",
|
||||
".deps\obs-deps-2023-11-03-x64\bin\avutil-58.dll",
|
||||
".deps\obs-deps-2023-11-03-x64\bin\libx264-164.dll",
|
||||
".deps\obs-deps-2023-11-03-x64\bin\swresample-4.dll",
|
||||
".deps\obs-deps-2023-11-03-x64\bin\swscale-7.dll",
|
||||
".deps\obs-deps-2023-11-03-x64\bin\zlib.dll"
|
||||
".deps\obs-studio-30.0.2\build_x64\rundir\Debug\bin\64bit\obs-frontend-api.dll",
|
||||
".deps\obs-studio-30.0.2\build_x64\rundir\Debug\bin\64bit\obs.dll",
|
||||
".deps\obs-studio-30.0.2\build_x64\rundir\Debug\bin\64bit\w32-pthreads.dll"
|
||||
".\release\Release\obs-plugins\64bit\ggml.dll",
|
||||
".deps\obs-deps-2024-03-19-x64\bin\avcodec-60.dll",
|
||||
".deps\obs-deps-2024-03-19-x64\bin\avdevice-60.dll",
|
||||
".deps\obs-deps-2024-03-19-x64\bin\avfilter-9.dll",
|
||||
".deps\obs-deps-2024-03-19-x64\bin\avformat-60.dll",
|
||||
".deps\obs-deps-2024-03-19-x64\bin\avutil-58.dll",
|
||||
".deps\obs-deps-2024-03-19-x64\bin\libx264-164.dll",
|
||||
".deps\obs-deps-2024-03-19-x64\bin\swresample-4.dll",
|
||||
".deps\obs-deps-2024-03-19-x64\bin\swscale-7.dll",
|
||||
".deps\obs-deps-2024-03-19-x64\bin\zlib.dll"
|
||||
".deps\obs-deps-2024-03-19-x64\bin\librist.dll"
|
||||
".deps\obs-deps-2024-03-19-x64\bin\srt.dll"
|
||||
".deps\obs-studio-30.1.2\build_x64\rundir\Debug\bin\64bit\obs-frontend-api.dll",
|
||||
".deps\obs-studio-30.1.2\build_x64\rundir\Debug\bin\64bit\obs.dll",
|
||||
".deps\obs-studio-30.1.2\build_x64\rundir\Debug\bin\64bit\w32-pthreads.dll"
|
||||
)
|
||||
|
||||
$obsDlls | ForEach-Object {
|
||||
Copy-Item -Force -Path $_ -Destination $testToolPath
|
||||
}
|
||||
|
||||
|
||||
@@ -31,6 +31,7 @@
|
||||
|
||||
void obs_log(int log_level, const char *format, ...)
|
||||
{
|
||||
static auto start = std::chrono::system_clock::now();
|
||||
if (log_level == LOG_DEBUG) {
|
||||
return;
|
||||
}
|
||||
@@ -43,9 +44,14 @@ void obs_log(int log_level, const char *format, ...)
|
||||
std::time_t now_time_t = std::chrono::system_clock::to_time_t(now);
|
||||
std::tm now_tm = *std::localtime(&now_time_t);
|
||||
|
||||
auto diff = now - start;
|
||||
|
||||
// print timestamp
|
||||
printf("[%02d:%02d:%02d.%03d] ", now_tm.tm_hour, now_tm.tm_min, now_tm.tm_sec,
|
||||
(int)(epoch.count() % 1000));
|
||||
printf("[%02d:%02d:%02d.%03d] [%02d:%02lld.%03lld] ", now_tm.tm_hour, now_tm.tm_min,
|
||||
now_tm.tm_sec, (int)(epoch.count() % 1000),
|
||||
std::chrono::duration_cast<std::chrono::minutes>(diff).count(),
|
||||
std::chrono::duration_cast<std::chrono::seconds>(diff).count() % 60,
|
||||
std::chrono::duration_cast<std::chrono::milliseconds>(diff).count() % 1000);
|
||||
|
||||
// print log level
|
||||
switch (log_level) {
|
||||
@@ -95,12 +101,14 @@ create_context(int sample_rate, int channels, const std::string &whisper_model_p
|
||||
gf->process_while_muted = false;
|
||||
gf->buffered_output = false;
|
||||
gf->fix_utf8 = true;
|
||||
gf->input_cv.emplace();
|
||||
|
||||
for (size_t i = 0; i < gf->channels; i++) {
|
||||
circlebuf_init(&gf->input_buffers[i]);
|
||||
}
|
||||
circlebuf_init(&gf->info_buffer);
|
||||
circlebuf_init(&gf->whisper_buffer);
|
||||
circlebuf_init(&gf->resampled_buffer);
|
||||
|
||||
// allocate copy buffers
|
||||
gf->copy_buffers[0] =
|
||||
@@ -307,6 +315,7 @@ void release_context(transcription_filter_data *gf)
|
||||
}
|
||||
circlebuf_free(&gf->info_buffer);
|
||||
circlebuf_free(&gf->whisper_buffer);
|
||||
circlebuf_free(&gf->resampled_buffer);
|
||||
|
||||
delete gf;
|
||||
}
|
||||
@@ -420,19 +429,23 @@ int wmain(int argc, wchar_t *argv[])
|
||||
std::remove("segments.json");
|
||||
}
|
||||
|
||||
const auto window_size_in_ms = std::chrono::milliseconds(25);
|
||||
|
||||
// fill up the whisper buffer
|
||||
{
|
||||
gf->start_timestamp_ms = now_ms();
|
||||
|
||||
obs_log(LOG_INFO, "Sending samples to whisper buffer");
|
||||
// 25 ms worth of frames
|
||||
int frames = gf->sample_rate * 25 / 1000;
|
||||
int frames = gf->sample_rate * window_size_in_ms.count() / 1000;
|
||||
const int frame_size_bytes = sizeof(float);
|
||||
int frames_size_bytes = frames * frame_size_bytes;
|
||||
int frames_count = 0;
|
||||
int64_t start_time = std::chrono::duration_cast<std::chrono::nanoseconds>(
|
||||
std::chrono::system_clock::now().time_since_epoch())
|
||||
.count();
|
||||
auto start_time_time = std::chrono::system_clock::now();
|
||||
uint64_t window_number = 0;
|
||||
while (true) {
|
||||
// check if there are enough frames left in the audio buffer
|
||||
if ((frames_count + frames) > (audio[0].size() / frame_size_bytes)) {
|
||||
@@ -441,31 +454,48 @@ int wmain(int argc, wchar_t *argv[])
|
||||
frames_size_bytes = frames * frame_size_bytes;
|
||||
}
|
||||
{
|
||||
std::lock_guard<std::mutex> lock(gf->whisper_buf_mutex);
|
||||
{
|
||||
auto max_wait = start_time_time +
|
||||
(window_number * window_size_in_ms);
|
||||
std::unique_lock<std::mutex> lock(gf->whisper_buf_mutex);
|
||||
for (;;) {
|
||||
// sleep up to window size in case whisper is processing, so the buffer builds up similar to OBS
|
||||
auto now = std::chrono::system_clock::now();
|
||||
if (false && now > max_wait)
|
||||
break;
|
||||
|
||||
// push back current audio data to input circlebuf
|
||||
for (size_t c = 0; c < gf->channels; c++) {
|
||||
circlebuf_push_back(&gf->input_buffers[c],
|
||||
audio[c].data() +
|
||||
frames_count * frame_size_bytes,
|
||||
frames_size_bytes);
|
||||
gf->input_cv->wait_for(
|
||||
lock, std::chrono::milliseconds(10), [&] {
|
||||
return gf->input_buffers->size == 0;
|
||||
});
|
||||
if (gf->input_buffers->size == 0)
|
||||
break;
|
||||
}
|
||||
// push back current audio data to input circlebuf
|
||||
for (size_t c = 0; c < gf->channels; c++) {
|
||||
circlebuf_push_back(
|
||||
&gf->input_buffers[c],
|
||||
audio[c].data() +
|
||||
frames_count * frame_size_bytes,
|
||||
frames_size_bytes);
|
||||
}
|
||||
// push audio packet info (timestamp/frame count) to info circlebuf
|
||||
struct transcription_filter_audio_info info = {0};
|
||||
info.frames = frames; // number of frames in this packet
|
||||
// make a timestamp from the current position in the audio buffer
|
||||
info.timestamp_offset_ns =
|
||||
start_time + (int64_t)(((float)frames_count /
|
||||
(float)gf->sample_rate) *
|
||||
1e9);
|
||||
circlebuf_push_back(&gf->info_buffer, &info, sizeof(info));
|
||||
}
|
||||
// push audio packet info (timestamp/frame count) to info circlebuf
|
||||
struct transcription_filter_audio_info info = {0};
|
||||
info.frames = frames; // number of frames in this packet
|
||||
// make a timestamp from the current position in the audio buffer
|
||||
info.timestamp_offset_ns =
|
||||
start_time +
|
||||
(int64_t)(((float)frames_count / (float)gf->sample_rate) *
|
||||
1e9);
|
||||
circlebuf_push_back(&gf->info_buffer, &info, sizeof(info));
|
||||
gf->wshiper_thread_cv.notify_one();
|
||||
}
|
||||
frames_count += frames;
|
||||
window_number += 1;
|
||||
if (frames_count >= audio[0].size() / frame_size_bytes) {
|
||||
break;
|
||||
}
|
||||
// sleep for 25 ms
|
||||
std::this_thread::sleep_for(std::chrono::milliseconds(25));
|
||||
}
|
||||
// push a second of silence to the input circlebuf
|
||||
frames = 2 * gf->sample_rate;
|
||||
|
||||
@@ -104,6 +104,7 @@ struct transcription_filter_data {
|
||||
std::mutex whisper_buf_mutex;
|
||||
std::mutex whisper_ctx_mutex;
|
||||
std::condition_variable wshiper_thread_cv;
|
||||
std::optional<std::condition_variable> input_cv;
|
||||
|
||||
// translation context
|
||||
struct translation_context translation_ctx;
|
||||
|
||||
@@ -305,11 +305,11 @@ void run_inference_and_callbacks(transcription_filter_data *gf, uint64_t start_o
|
||||
float *pcm32f_data = (float *)bzalloc(pcm32f_size_with_silence * sizeof(float));
|
||||
if (vad_state == VAD_STATE_PARTIAL) {
|
||||
// peek instead of pop, since this is a partial run that keeps the data in the buffer
|
||||
circlebuf_peek_back(&gf->whisper_buffer, pcm32f_data + WHISPER_SAMPLE_RATE / 100,
|
||||
pcm32f_size * sizeof(float));
|
||||
circlebuf_peek_front(&gf->whisper_buffer, pcm32f_data + WHISPER_SAMPLE_RATE / 100,
|
||||
pcm32f_size * sizeof(float));
|
||||
} else {
|
||||
circlebuf_pop_back(&gf->whisper_buffer, pcm32f_data + WHISPER_SAMPLE_RATE / 100,
|
||||
pcm32f_size * sizeof(float));
|
||||
circlebuf_pop_front(&gf->whisper_buffer, pcm32f_data + WHISPER_SAMPLE_RATE / 100,
|
||||
pcm32f_size * sizeof(float));
|
||||
}
|
||||
|
||||
struct DetectionResultWithText inference_result =
|
||||
@@ -599,11 +599,16 @@ void whisper_loop(void *data)
|
||||
}
|
||||
}
|
||||
|
||||
if (gf->input_cv.has_value())
|
||||
gf->input_cv->notify_one();
|
||||
|
||||
// Sleep using the condition variable wshiper_thread_cv
|
||||
// This will wake up the thread if there is new data in the input buffer
|
||||
// or if the whisper context is null
|
||||
std::unique_lock<std::mutex> lock(gf->whisper_ctx_mutex);
|
||||
gf->wshiper_thread_cv.wait_for(lock, std::chrono::milliseconds(50));
|
||||
if (gf->input_buffers->size == 0) {
|
||||
gf->wshiper_thread_cv.wait_for(lock, std::chrono::milliseconds(50));
|
||||
}
|
||||
}
|
||||
|
||||
obs_log(gf->log_level, "Exiting whisper thread");
|
||||
|
||||
Reference in New Issue
Block a user