mirror of
https://github.com/royshil/obs-localvocal.git
synced 2026-04-28 03:00:05 -04:00
* refactor: Add whisper_buffer to transcription_filter_data struct * refactor: Add sentence_psum_accept_thresh to transcription_filter_data struct * refactor: Update buffer size and overlap size in whisper-processing.cpp * refactor: Update buffer size and overlap size in whisper-processing.cpp * refactor: Add audio-file-utils.cpp for audio file handling * refactor: Update buffer size and overlap size in whisper-processing.cpp * refactor: Add external model option to translation settings * refactor: Add support for input tokenization style in translation settings * refactor: Update buffer size and overlap size in whisper-processing.cpp
273 lines
7.3 KiB
C++
273 lines
7.3 KiB
C++
|
|
#include "audio-file-utils.h"
|
|
#include "plugin-support.h"
|
|
|
|
#include <obs-module.h>
|
|
|
|
#include <vector>
|
|
#include <functional>
|
|
|
|
#if defined(_WIN32) || defined(__APPLE__)
|
|
|
|
extern "C" {
|
|
#include <libavformat/avformat.h>
|
|
#include <libavcodec/avcodec.h>
|
|
#include <libavutil/frame.h>
|
|
#include <libavutil/mem.h>
|
|
#include <libavutil/opt.h>
|
|
#include <libswresample/swresample.h>
|
|
#include <libavutil/log.h>
|
|
}
|
|
|
|
std::vector<std::vector<uint8_t>>
|
|
read_audio_file(const char *filename, std::function<void(int, int)> initialization_callback)
|
|
{
|
|
av_log_set_level(AV_LOG_QUIET);
|
|
|
|
obs_log(LOG_INFO, "Reading audio file %s", filename);
|
|
|
|
AVFormatContext *formatContext = nullptr;
|
|
int ret = avformat_open_input(&formatContext, filename, nullptr, nullptr);
|
|
if (ret != 0) {
|
|
char errbuf[AV_ERROR_MAX_STRING_SIZE];
|
|
av_make_error_string(errbuf, AV_ERROR_MAX_STRING_SIZE, ret);
|
|
obs_log(LOG_ERROR, "Error opening file: %s", errbuf);
|
|
return {};
|
|
}
|
|
|
|
if (avformat_find_stream_info(formatContext, nullptr) < 0) {
|
|
obs_log(LOG_ERROR, "Error finding stream information");
|
|
return {};
|
|
}
|
|
|
|
int audioStreamIndex = -1;
|
|
for (unsigned int i = 0; i < formatContext->nb_streams; i++) {
|
|
if (formatContext->streams[i]->codecpar->codec_type == AVMEDIA_TYPE_AUDIO) {
|
|
audioStreamIndex = i;
|
|
break;
|
|
}
|
|
}
|
|
|
|
if (audioStreamIndex == -1) {
|
|
obs_log(LOG_ERROR, "No audio stream found");
|
|
return {};
|
|
}
|
|
|
|
// print information about the file
|
|
av_dump_format(formatContext, 0, filename, 0);
|
|
|
|
// if the sample format is not float, return
|
|
if (formatContext->streams[audioStreamIndex]->codecpar->format != AV_SAMPLE_FMT_FLTP) {
|
|
obs_log(LOG_ERROR,
|
|
"Sample format is not float (it is %s). Encode the audio file with float planar sample format."
|
|
" For example, use the command 'ffmpeg -i input.mp3 -f f32le -acodec pcm_f32le output.f32le'",
|
|
"convert the audio file to float format.",
|
|
av_get_sample_fmt_name(
|
|
(AVSampleFormat)formatContext->streams[audioStreamIndex]
|
|
->codecpar->format));
|
|
return {};
|
|
}
|
|
|
|
initialization_callback(formatContext->streams[audioStreamIndex]->codecpar->sample_rate,
|
|
formatContext->streams[audioStreamIndex]->codecpar->channels);
|
|
|
|
AVCodecParameters *codecParams = formatContext->streams[audioStreamIndex]->codecpar;
|
|
const AVCodec *codec = avcodec_find_decoder(codecParams->codec_id);
|
|
if (!codec) {
|
|
obs_log(LOG_ERROR, "Decoder not found");
|
|
return {};
|
|
}
|
|
|
|
AVCodecContext *codecContext = avcodec_alloc_context3(codec);
|
|
if (!codecContext) {
|
|
obs_log(LOG_ERROR, "Failed to allocate codec context");
|
|
return {};
|
|
}
|
|
|
|
if (avcodec_parameters_to_context(codecContext, codecParams) < 0) {
|
|
obs_log(LOG_ERROR, "Failed to copy codec parameters to codec context");
|
|
return {};
|
|
}
|
|
|
|
if (avcodec_open2(codecContext, codec, nullptr) < 0) {
|
|
obs_log(LOG_ERROR, "Failed to open codec");
|
|
return {};
|
|
}
|
|
|
|
AVFrame *frame = av_frame_alloc();
|
|
AVPacket packet;
|
|
|
|
std::vector<std::vector<uint8_t>> buffer(
|
|
formatContext->streams[audioStreamIndex]->codecpar->channels);
|
|
|
|
while (av_read_frame(formatContext, &packet) >= 0) {
|
|
if (packet.stream_index == audioStreamIndex) {
|
|
if (avcodec_send_packet(codecContext, &packet) == 0) {
|
|
while (avcodec_receive_frame(codecContext, frame) == 0) {
|
|
// push data to the buffer
|
|
for (int j = 0; j < codecContext->channels; j++) {
|
|
buffer[j].insert(buffer[j].end(), frame->data[j],
|
|
frame->data[j] +
|
|
frame->linesize[0]);
|
|
}
|
|
}
|
|
}
|
|
}
|
|
av_packet_unref(&packet);
|
|
}
|
|
|
|
av_frame_free(&frame);
|
|
avcodec_free_context(&codecContext);
|
|
avformat_close_input(&formatContext);
|
|
|
|
return buffer;
|
|
}
|
|
|
|
void write_audio_wav_file(const std::string &filename, const float *pcm32f_data,
|
|
const size_t frames)
|
|
{
|
|
av_log_set_level(AV_LOG_QUIET);
|
|
|
|
AVFormatContext *formatContext = nullptr;
|
|
AVCodecContext *codecContext = nullptr;
|
|
AVStream *stream = nullptr;
|
|
AVFrame *frame = nullptr;
|
|
AVPacket packet;
|
|
int ret = 0;
|
|
|
|
avformat_alloc_output_context2(&formatContext, nullptr, nullptr, filename.c_str());
|
|
if (!formatContext) {
|
|
obs_log(LOG_ERROR, "Failed to allocate output context");
|
|
return;
|
|
}
|
|
|
|
const AVCodec *codec = avcodec_find_encoder(AV_CODEC_ID_PCM_F32LE);
|
|
if (!codec) {
|
|
obs_log(LOG_ERROR, "Failed to find encoder");
|
|
return;
|
|
}
|
|
|
|
stream = avformat_new_stream(formatContext, codec);
|
|
if (!stream) {
|
|
obs_log(LOG_ERROR, "Failed to create new stream");
|
|
return;
|
|
}
|
|
|
|
codecContext = avcodec_alloc_context3(codec);
|
|
if (!codecContext) {
|
|
obs_log(LOG_ERROR, "Failed to allocate codec context");
|
|
return;
|
|
}
|
|
|
|
codecContext->sample_fmt = AV_SAMPLE_FMT_FLTP;
|
|
codecContext->sample_rate = 16000;
|
|
codecContext->channels = 1;
|
|
codecContext->channel_layout = AV_CH_LAYOUT_MONO;
|
|
codecContext->bit_rate = 64000;
|
|
codecContext->strict_std_compliance = FF_COMPLIANCE_EXPERIMENTAL;
|
|
|
|
if (avcodec_open2(codecContext, codec, nullptr) < 0) {
|
|
obs_log(LOG_ERROR, "Failed to open codec");
|
|
return;
|
|
}
|
|
|
|
if (avcodec_parameters_from_context(stream->codecpar, codecContext) < 0) {
|
|
obs_log(LOG_ERROR, "Failed to copy codec parameters to stream");
|
|
return;
|
|
}
|
|
|
|
if (avio_open(&formatContext->pb, filename.c_str(), AVIO_FLAG_WRITE) < 0) {
|
|
obs_log(LOG_ERROR, "Failed to open file");
|
|
return;
|
|
}
|
|
|
|
if (avformat_write_header(formatContext, nullptr) < 0) {
|
|
obs_log(LOG_ERROR, "Failed to write header");
|
|
return;
|
|
}
|
|
|
|
const int frame_size = 1024;
|
|
const int frame_size_in_bytes = frame_size * sizeof(float);
|
|
frame = av_frame_alloc();
|
|
frame->nb_samples = frame_size;
|
|
frame->format = codecContext->sample_fmt;
|
|
frame->ch_layout = codecContext->ch_layout;
|
|
|
|
ret = av_frame_get_buffer(frame, 0);
|
|
if (ret < 0) {
|
|
char errbuf[AV_ERROR_MAX_STRING_SIZE];
|
|
av_make_error_string(errbuf, AV_ERROR_MAX_STRING_SIZE, ret);
|
|
obs_log(LOG_ERROR, "Failed to allocate frame buffer: %s", errbuf);
|
|
return;
|
|
}
|
|
|
|
for (size_t i = 0; i < frames; i += frame_size) {
|
|
av_init_packet(&packet);
|
|
packet.data = nullptr;
|
|
packet.size = 0;
|
|
|
|
for (int k = 0; k < codecContext->channels; k++) {
|
|
if (i + frame_size < frames) {
|
|
memcpy(frame->data[k], pcm32f_data + i, frame_size_in_bytes);
|
|
} else {
|
|
// zero pad the last frame
|
|
memset(frame->data[k], 0, frame_size_in_bytes);
|
|
memcpy(frame->data[k], pcm32f_data + i,
|
|
(frames - i) * sizeof(float));
|
|
}
|
|
}
|
|
|
|
ret = avcodec_send_frame(codecContext, frame);
|
|
if (ret < 0) {
|
|
obs_log(LOG_ERROR, "Failed to send frame");
|
|
break;
|
|
}
|
|
|
|
ret = avcodec_receive_packet(codecContext, &packet);
|
|
if (ret < 0) {
|
|
obs_log(LOG_ERROR, "Failed to receive packet");
|
|
break;
|
|
}
|
|
|
|
av_packet_rescale_ts(&packet, codecContext->time_base, stream->time_base);
|
|
packet.stream_index = stream->index;
|
|
|
|
ret = av_interleaved_write_frame(formatContext, &packet);
|
|
if (ret < 0) {
|
|
obs_log(LOG_ERROR, "Failed to write frame");
|
|
break;
|
|
}
|
|
|
|
av_packet_unref(&packet);
|
|
}
|
|
|
|
if (ret >= 0) {
|
|
av_write_trailer(formatContext);
|
|
}
|
|
|
|
av_frame_free(&frame);
|
|
avcodec_free_context(&codecContext);
|
|
avformat_free_context(formatContext);
|
|
|
|
if (ret < 0) {
|
|
obs_log(LOG_ERROR, "Failed to write audio file %s", filename.c_str());
|
|
}
|
|
}
|
|
|
|
#else
|
|
|
|
std::vector<std::vector<uint8_t>>
|
|
read_audio_file(const char *filename, std::function<void(int, int)> initialization_callback)
|
|
{
|
|
obs_log(LOG_ERROR, "Reading audio files is not supported on this platform");
|
|
return {};
|
|
}
|
|
|
|
void write_audio_wav_file(const std::string &filename, const float *pcm32f_data,
|
|
const size_t frames)
|
|
{
|
|
obs_log(LOG_ERROR, "Writing audio files is not supported on this platform");
|
|
}
|
|
|
|
#endif
|