Files
obs-localvocal/src/tests/audio-file-utils.cpp
Roy Shilkrot 5227a437b6 VAD based segmentation (#97)
* refactor: Add whisper_buffer to transcription_filter_data struct

* refactor: Add sentence_psum_accept_thresh to transcription_filter_data struct

* refactor: Update buffer size and overlap size in whisper-processing.cpp

* refactor: Update buffer size and overlap size in whisper-processing.cpp

* refactor: Add audio-file-utils.cpp for audio file handling

* refactor: Update buffer size and overlap size in whisper-processing.cpp

* refactor: Add external model option to translation settings

* refactor: Add support for input tokenization style in translation settings

* refactor: Update buffer size and overlap size in whisper-processing.cpp
2024-05-16 15:07:00 -04:00

273 lines
7.3 KiB
C++

#include "audio-file-utils.h"
#include "plugin-support.h"
#include <obs-module.h>
#include <vector>
#include <functional>
#if defined(_WIN32) || defined(__APPLE__)
extern "C" {
#include <libavformat/avformat.h>
#include <libavcodec/avcodec.h>
#include <libavutil/frame.h>
#include <libavutil/mem.h>
#include <libavutil/opt.h>
#include <libswresample/swresample.h>
#include <libavutil/log.h>
}
std::vector<std::vector<uint8_t>>
read_audio_file(const char *filename, std::function<void(int, int)> initialization_callback)
{
av_log_set_level(AV_LOG_QUIET);
obs_log(LOG_INFO, "Reading audio file %s", filename);
AVFormatContext *formatContext = nullptr;
int ret = avformat_open_input(&formatContext, filename, nullptr, nullptr);
if (ret != 0) {
char errbuf[AV_ERROR_MAX_STRING_SIZE];
av_make_error_string(errbuf, AV_ERROR_MAX_STRING_SIZE, ret);
obs_log(LOG_ERROR, "Error opening file: %s", errbuf);
return {};
}
if (avformat_find_stream_info(formatContext, nullptr) < 0) {
obs_log(LOG_ERROR, "Error finding stream information");
return {};
}
int audioStreamIndex = -1;
for (unsigned int i = 0; i < formatContext->nb_streams; i++) {
if (formatContext->streams[i]->codecpar->codec_type == AVMEDIA_TYPE_AUDIO) {
audioStreamIndex = i;
break;
}
}
if (audioStreamIndex == -1) {
obs_log(LOG_ERROR, "No audio stream found");
return {};
}
// print information about the file
av_dump_format(formatContext, 0, filename, 0);
// if the sample format is not float, return
if (formatContext->streams[audioStreamIndex]->codecpar->format != AV_SAMPLE_FMT_FLTP) {
obs_log(LOG_ERROR,
"Sample format is not float (it is %s). Encode the audio file with float planar sample format."
" For example, use the command 'ffmpeg -i input.mp3 -f f32le -acodec pcm_f32le output.f32le'",
"convert the audio file to float format.",
av_get_sample_fmt_name(
(AVSampleFormat)formatContext->streams[audioStreamIndex]
->codecpar->format));
return {};
}
initialization_callback(formatContext->streams[audioStreamIndex]->codecpar->sample_rate,
formatContext->streams[audioStreamIndex]->codecpar->channels);
AVCodecParameters *codecParams = formatContext->streams[audioStreamIndex]->codecpar;
const AVCodec *codec = avcodec_find_decoder(codecParams->codec_id);
if (!codec) {
obs_log(LOG_ERROR, "Decoder not found");
return {};
}
AVCodecContext *codecContext = avcodec_alloc_context3(codec);
if (!codecContext) {
obs_log(LOG_ERROR, "Failed to allocate codec context");
return {};
}
if (avcodec_parameters_to_context(codecContext, codecParams) < 0) {
obs_log(LOG_ERROR, "Failed to copy codec parameters to codec context");
return {};
}
if (avcodec_open2(codecContext, codec, nullptr) < 0) {
obs_log(LOG_ERROR, "Failed to open codec");
return {};
}
AVFrame *frame = av_frame_alloc();
AVPacket packet;
std::vector<std::vector<uint8_t>> buffer(
formatContext->streams[audioStreamIndex]->codecpar->channels);
while (av_read_frame(formatContext, &packet) >= 0) {
if (packet.stream_index == audioStreamIndex) {
if (avcodec_send_packet(codecContext, &packet) == 0) {
while (avcodec_receive_frame(codecContext, frame) == 0) {
// push data to the buffer
for (int j = 0; j < codecContext->channels; j++) {
buffer[j].insert(buffer[j].end(), frame->data[j],
frame->data[j] +
frame->linesize[0]);
}
}
}
}
av_packet_unref(&packet);
}
av_frame_free(&frame);
avcodec_free_context(&codecContext);
avformat_close_input(&formatContext);
return buffer;
}
void write_audio_wav_file(const std::string &filename, const float *pcm32f_data,
const size_t frames)
{
av_log_set_level(AV_LOG_QUIET);
AVFormatContext *formatContext = nullptr;
AVCodecContext *codecContext = nullptr;
AVStream *stream = nullptr;
AVFrame *frame = nullptr;
AVPacket packet;
int ret = 0;
avformat_alloc_output_context2(&formatContext, nullptr, nullptr, filename.c_str());
if (!formatContext) {
obs_log(LOG_ERROR, "Failed to allocate output context");
return;
}
const AVCodec *codec = avcodec_find_encoder(AV_CODEC_ID_PCM_F32LE);
if (!codec) {
obs_log(LOG_ERROR, "Failed to find encoder");
return;
}
stream = avformat_new_stream(formatContext, codec);
if (!stream) {
obs_log(LOG_ERROR, "Failed to create new stream");
return;
}
codecContext = avcodec_alloc_context3(codec);
if (!codecContext) {
obs_log(LOG_ERROR, "Failed to allocate codec context");
return;
}
codecContext->sample_fmt = AV_SAMPLE_FMT_FLTP;
codecContext->sample_rate = 16000;
codecContext->channels = 1;
codecContext->channel_layout = AV_CH_LAYOUT_MONO;
codecContext->bit_rate = 64000;
codecContext->strict_std_compliance = FF_COMPLIANCE_EXPERIMENTAL;
if (avcodec_open2(codecContext, codec, nullptr) < 0) {
obs_log(LOG_ERROR, "Failed to open codec");
return;
}
if (avcodec_parameters_from_context(stream->codecpar, codecContext) < 0) {
obs_log(LOG_ERROR, "Failed to copy codec parameters to stream");
return;
}
if (avio_open(&formatContext->pb, filename.c_str(), AVIO_FLAG_WRITE) < 0) {
obs_log(LOG_ERROR, "Failed to open file");
return;
}
if (avformat_write_header(formatContext, nullptr) < 0) {
obs_log(LOG_ERROR, "Failed to write header");
return;
}
const int frame_size = 1024;
const int frame_size_in_bytes = frame_size * sizeof(float);
frame = av_frame_alloc();
frame->nb_samples = frame_size;
frame->format = codecContext->sample_fmt;
frame->ch_layout = codecContext->ch_layout;
ret = av_frame_get_buffer(frame, 0);
if (ret < 0) {
char errbuf[AV_ERROR_MAX_STRING_SIZE];
av_make_error_string(errbuf, AV_ERROR_MAX_STRING_SIZE, ret);
obs_log(LOG_ERROR, "Failed to allocate frame buffer: %s", errbuf);
return;
}
for (size_t i = 0; i < frames; i += frame_size) {
av_init_packet(&packet);
packet.data = nullptr;
packet.size = 0;
for (int k = 0; k < codecContext->channels; k++) {
if (i + frame_size < frames) {
memcpy(frame->data[k], pcm32f_data + i, frame_size_in_bytes);
} else {
// zero pad the last frame
memset(frame->data[k], 0, frame_size_in_bytes);
memcpy(frame->data[k], pcm32f_data + i,
(frames - i) * sizeof(float));
}
}
ret = avcodec_send_frame(codecContext, frame);
if (ret < 0) {
obs_log(LOG_ERROR, "Failed to send frame");
break;
}
ret = avcodec_receive_packet(codecContext, &packet);
if (ret < 0) {
obs_log(LOG_ERROR, "Failed to receive packet");
break;
}
av_packet_rescale_ts(&packet, codecContext->time_base, stream->time_base);
packet.stream_index = stream->index;
ret = av_interleaved_write_frame(formatContext, &packet);
if (ret < 0) {
obs_log(LOG_ERROR, "Failed to write frame");
break;
}
av_packet_unref(&packet);
}
if (ret >= 0) {
av_write_trailer(formatContext);
}
av_frame_free(&frame);
avcodec_free_context(&codecContext);
avformat_free_context(formatContext);
if (ret < 0) {
obs_log(LOG_ERROR, "Failed to write audio file %s", filename.c_str());
}
}
#else
/**
 * Fallback for builds where neither _WIN32 nor __APPLE__ is defined.
 *
 * Both arguments are ignored: the function only logs that reading is
 * unsupported and hands back an empty result.
 *
 * @return Always an empty vector (no channels).
 */
std::vector<std::vector<uint8_t>>
read_audio_file(const char *filename, std::function<void(int, int)> initialization_callback)
{
    (void)filename;
    (void)initialization_callback;

    std::vector<std::vector<uint8_t>> no_channels;
    obs_log(LOG_ERROR, "Reading audio files is not supported on this platform");
    return no_channels;
}
/**
 * Fallback for builds where neither _WIN32 nor __APPLE__ is defined.
 *
 * All arguments are ignored and no file is created; the function only logs
 * that writing is unsupported.
 */
void write_audio_wav_file(const std::string &filename, const float *pcm32f_data,
                          const size_t frames)
{
    (void)filename;
    (void)pcm32f_data;
    (void)frames;

    obs_log(LOG_ERROR, "Writing audio files is not supported on this platform");
}
#endif