Add WebVTT-in-video-stream support (#196)

* Fix `whisper_buffer` and `resampled_buffer` data race

`media_unpause` was causing `whisper_buffer` to be freed while
`vad_based_segmentation`/`hybrid_vad_segmentation` need that buffer
to remain unmodified for the duration of those calls

* Slightly improve handling for weird subtitle output filenames

* Squashed 'deps/c-webvtt-in-video-stream/' content from commit 5579ca6

git-subtree-dir: deps/c-webvtt-in-video-stream
git-subtree-split: 5579ca6dc9dcf94e3c14631c6c01b2ee4dfcf005

* Add WIP webvtt sei functionality

* Add webvtt recording/streaming settings

* Make latency_to_video_in_msecs and send_frequency_hz configurable

* Make webvtt languages configurable

* Add translation and main language separately

* Add rust CI integration
This commit is contained in:
Ruwen Hahn
2025-02-03 15:06:52 +01:00
committed by GitHub
parent fe9a52157e
commit b3be219915
26 changed files with 2460 additions and 9 deletions

View File

@@ -119,6 +119,16 @@ jobs:
restore-keys: |
${{ runner.os }}-ccache-${{ matrix.architecture }}-
- uses: actions-rust-lang/setup-rust-toolchain@v1
if: matrix.architecture == 'arm64'
with:
target: aarch64-apple-darwin
- uses: actions-rust-lang/setup-rust-toolchain@v1
if: matrix.architecture == 'x86_64'
with:
target: x86_64-apple-darwin
- name: Set Up Codesigning 🔑
uses: ./.github/actions/setup-macos-codesigning
if: fromJSON(needs.check-event.outputs.codesign)
@@ -197,6 +207,8 @@ jobs:
echo "pluginName=${product_name}" >> $GITHUB_OUTPUT
echo "pluginVersion=${product_version}" >> $GITHUB_OUTPUT
- uses: actions-rust-lang/setup-rust-toolchain@v1
- uses: actions/cache@v4
id: ccache-cache
with:
@@ -271,6 +283,8 @@ jobs:
"pluginName=${ProductName}" >> $env:GITHUB_OUTPUT
"pluginVersion=${ProductVersion}" >> $env:GITHUB_OUTPUT
- uses: actions-rust-lang/setup-rust-toolchain@v1
- name: Build Plugin 🧱
uses: ./.github/actions/build-plugin
with:

View File

@@ -25,3 +25,17 @@ jobs:
uses: ./.github/actions/run-cmake-format
with:
failCondition: error
# Formatting job for the vendored Rust crate: runs the rustfmt action against
# the manifest of deps/c-webvtt-in-video-stream.
cargo-fmt:
runs-on: ubuntu-22.04
steps:
- uses: actions/checkout@v4
with:
fetch-depth: 0
# Install a Rust toolchain that includes the rustfmt component.
- uses: actions-rust-lang/setup-rust-toolchain@v1
with:
components: rustfmt
- name: rustfmt
uses: actions-rust-lang/rustfmt@v1
with:
# The crate manifest is not at the repository root, so point the action at it.
manifest-path: deps/c-webvtt-in-video-stream/Cargo.toml

View File

@@ -106,6 +106,14 @@ if(DEFINED ENV{LOCALVOCAL_EXTRA_VERBOSE})
target_compile_definitions(${CMAKE_PROJECT_NAME} PRIVATE LOCALVOCAL_EXTRA_VERBOSE)
endif()
# Optional WebVTT embedding (enabled by default). When on, the Rust static
# library declared in cmake/BuildWebVTT.cmake is linked into the plugin.
option(ENABLE_WEBVTT "Enable WebVTT embedding" ON)
if(ENABLE_WEBVTT)
include(cmake/BuildWebVTT.cmake)
target_link_libraries(${CMAKE_PROJECT_NAME} PRIVATE c_webvtt_in_video_stream)
# INTERFACE definition: targets linking c_webvtt_in_video_stream (i.e. the
# plugin above) are compiled with ENABLE_WEBVTT defined.
target_compile_definitions(c_webvtt_in_video_stream INTERFACE ENABLE_WEBVTT)
endif()
target_sources(
${CMAKE_PROJECT_NAME}
PRIVATE src/plugin-main.c

23
cmake/BuildWebVTT.cmake Normal file
View File

@@ -0,0 +1,23 @@
# Builds the vendored c-webvtt-in-video-stream Rust crate as a static library
# through Corrosion and exposes it as the CMake target `c_webvtt_in_video_stream`.
include(FetchContent)

# NOTE(review): presumably tells Corrosion to install a missing rustup target
# on demand - confirm against the Corrosion documentation.
set(Rust_RUSTUP_INSTALL_MISSING_TARGET true)

if(OS_MACOS)
  # Pin the cargo target to the architecture requested through MACOS_ARCH so the
  # static library matches the plugin even when the host architecture differs.
  # Any other value (or an unset variable) keeps Corrosion's default target.
  if("$ENV{MACOS_ARCH}" STREQUAL "x86_64")
    set(Rust_CARGO_TARGET "x86_64-apple-darwin")
  elseif("$ENV{MACOS_ARCH}" STREQUAL "arm64")
    set(Rust_CARGO_TARGET "aarch64-apple-darwin")
  endif()
endif()

FetchContent_Declare(
  Corrosion
  GIT_REPOSITORY https://github.com/corrosion-rs/corrosion.git
  GIT_TAG v0.5 # Optionally specify a commit hash, version tag or branch here
)
FetchContent_MakeAvailable(Corrosion)

# Import targets defined in a package or workspace manifest `Cargo.toml` file
corrosion_import_crate(MANIFEST_PATH "${CMAKE_SOURCE_DIR}/deps/c-webvtt-in-video-stream/Cargo.toml" CRATE_TYPES
                       "staticlib" PROFILE release)

# The crate's build script writes the generated C header (webvtt-in-sei.h) into
# the crate's `target/` directory, so that directory is the include path.
set_target_properties(c_webvtt_in_video_stream PROPERTIES INTERFACE_INCLUDE_DIRECTORIES
                      "${CMAKE_SOURCE_DIR}/deps/c-webvtt-in-video-stream/target/")

View File

@@ -4,6 +4,12 @@ vad_threshold="VAD Threshold"
log_level="Internal Log Level"
log_words="Log Output to Console"
caption_to_stream="Stream Captions"
webvtt_group="WebVTT"
webvtt_caption_to_stream="Add WebVTT captions to stream"
webvtt_caption_to_recording="Add WebVTT captions to recording"
webvtt_latency_to_video_in_msecs="Latency to video (milliseconds)"
webvtt_send_frequency_hz="Send frequency (Hz)"
webvtt_language_description="Language $1"
subtitle_sources="Output Destination"
none_no_output="None / No output"
file_output_enable="Save to File"

View File

@@ -0,0 +1 @@
/target

View File

@@ -0,0 +1,7 @@
{
"editor.formatOnSave": true,
"evenBetterToml.formatter.reorderKeys": true,
"evenBetterToml.formatter.reorderArrays": true,
"evenBetterToml.formatter.trailingNewline": true,
"rust-analyzer.check.command": "clippy"
}

581
deps/c-webvtt-in-video-stream/Cargo.lock generated vendored Normal file
View File

@@ -0,0 +1,581 @@
# This file is automatically @generated by Cargo.
# It is not intended for manual editing.
version = 3
[[package]]
name = "anstream"
version = "0.6.18"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "8acc5369981196006228e28809f761875c0327210a891e941f4c683b3a99529b"
dependencies = [
"anstyle",
"anstyle-parse",
"anstyle-query",
"anstyle-wincon",
"colorchoice",
"is_terminal_polyfill",
"utf8parse",
]
[[package]]
name = "anstyle"
version = "1.0.10"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "55cc3b69f167a1ef2e161439aa98aed94e6028e5f9a59be9a6ffb47aef1651f9"
[[package]]
name = "anstyle-parse"
version = "0.2.6"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "3b2d16507662817a6a20a9ea92df6652ee4f94f914589377d69f3b21bc5798a9"
dependencies = [
"utf8parse",
]
[[package]]
name = "anstyle-query"
version = "1.1.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "79947af37f4177cfead1110013d678905c37501914fba0efea834c3fe9a8d60c"
dependencies = [
"windows-sys",
]
[[package]]
name = "anstyle-wincon"
version = "3.0.6"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "2109dbce0e72be3ec00bed26e6a7479ca384ad226efdd66db8fa2e3a38c83125"
dependencies = [
"anstyle",
"windows-sys",
]
[[package]]
name = "bitflags"
version = "2.6.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "b048fb63fd8b5923fc5aa7b340d8e156aec7ec02f0c78fa8a6ddc2613f6f71de"
[[package]]
name = "bitstream-io"
version = "1.10.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "e445576659fd04a57b44cbd00aa37aaa815ebefa0aa3cb677a6b5e63d883074f"
[[package]]
name = "byteorder"
version = "1.5.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "1fd0f2584146f6f2ef48085050886acf353beff7305ebd1ae69500e27c67f64b"
[[package]]
name = "c-webvtt-in-video-stream"
version = "0.1.0"
dependencies = [
"cbindgen",
"h264-reader",
"strum_macros",
"video-bytestream-tools",
"webvtt-in-video-stream",
]
[[package]]
name = "cbindgen"
version = "0.27.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "3fce8dd7fcfcbf3a0a87d8f515194b49d6135acab73e18bd380d1d93bb1a15eb"
dependencies = [
"clap",
"heck 0.4.1",
"indexmap",
"log",
"proc-macro2",
"quote",
"serde",
"serde_json",
"syn",
"tempfile",
"toml",
]
[[package]]
name = "cfg-if"
version = "1.0.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "baf1de4339761588bc0619e3cbc0120ee582ebb74b53b4efbf79117bd2da40fd"
[[package]]
name = "clap"
version = "4.5.23"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "3135e7ec2ef7b10c6ed8950f0f792ed96ee093fa088608f1c76e569722700c84"
dependencies = [
"clap_builder",
]
[[package]]
name = "clap_builder"
version = "4.5.23"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "30582fc632330df2bd26877bde0c1f4470d57c582bbc070376afcd04d8cb4838"
dependencies = [
"anstream",
"anstyle",
"clap_lex",
"strsim",
]
[[package]]
name = "clap_lex"
version = "0.7.4"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "f46ad14479a25103f283c0f10005961cf086d8dc42205bb44c46ac563475dca6"
[[package]]
name = "colorchoice"
version = "1.0.3"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "5b63caa9aa9397e2d9480a9b13673856c78d8ac123288526c37d7839f2a86990"
[[package]]
name = "equivalent"
version = "1.0.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "5443807d6dff69373d433ab9ef5378ad8df50ca6298caf15de6e52e24aaf54d5"
[[package]]
name = "errno"
version = "0.3.10"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "33d852cb9b869c2a9b3df2f71a3074817f01e1844f839a144f5fcef059a4eb5d"
dependencies = [
"libc",
"windows-sys",
]
[[package]]
name = "fastrand"
version = "2.3.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "37909eebbb50d72f9059c3b6d82c0463f2ff062c9e95845c43a6c9c0355411be"
[[package]]
name = "four-cc"
version = "0.1.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "3958af68a31b1d1384d3f39b6aa33eb14b6009065b5ca305ddd9712a4237124f"
[[package]]
name = "h264-reader"
version = "0.7.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "bd118dcc322cc71cfc33254a19ebece92cfaaf6d4b4793fec3f7f44fbc4150df"
dependencies = [
"bitstream-io",
"hex-slice",
"log",
"memchr",
"rfc6381-codec",
]
[[package]]
name = "hashbrown"
version = "0.15.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "bf151400ff0baff5465007dd2f3e717f3fe502074ca563069ce3a6629d07b289"
[[package]]
name = "heck"
version = "0.4.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "95505c38b4572b2d910cecb0281560f54b440a19336cbbcb27bf6ce6adc6f5a8"
[[package]]
name = "heck"
version = "0.5.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "2304e00983f87ffb38b55b444b5e3b60a884b5d30c0fca7d82fe33449bbe55ea"
[[package]]
name = "hex-slice"
version = "0.1.4"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "5491a308e0214554f07a81d8944abe45f552871c12e3c3c6e7e5d354039a6c4c"
[[package]]
name = "indexmap"
version = "2.7.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "62f822373a4fe84d4bb149bf54e584a7f4abec90e072ed49cda0edea5b95471f"
dependencies = [
"equivalent",
"hashbrown",
]
[[package]]
name = "is_terminal_polyfill"
version = "1.70.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "7943c866cc5cd64cbc25b2e01621d07fa8eb2a1a23160ee81ce38704e97b8ecf"
[[package]]
name = "itoa"
version = "1.0.14"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "d75a2a4b1b190afb6f5425f10f6a8f959d2ea0b9c2b1d79553551850539e4674"
[[package]]
name = "libc"
version = "0.2.169"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "b5aba8db14291edd000dfcc4d620c7ebfb122c613afb886ca8803fa4e128a20a"
[[package]]
name = "linux-raw-sys"
version = "0.4.14"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "78b3ae25bc7c8c38cec158d1f2757ee79e9b3740fbc7ccf0e59e4b08d793fa89"
[[package]]
name = "log"
version = "0.4.22"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "a7a70ba024b9dc04c27ea2f0c0548feb474ec5c54bba33a7f72f873a39d07b24"
[[package]]
name = "memchr"
version = "2.7.4"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "78ca9ab1a0babb1e7d5695e3530886289c18cf2f87ec19a575a0abdce112e3a3"
[[package]]
name = "mp4ra-rust"
version = "0.1.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "be9daf03b43bf3842962947c62ba40f411e46a58774c60838038f04a67d17626"
dependencies = [
"four-cc",
]
[[package]]
name = "mpeg4-audio-const"
version = "0.2.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "96a1fe2275b68991faded2c80aa4a33dba398b77d276038b8f50701a22e55918"
[[package]]
name = "once_cell"
version = "1.20.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "1261fe7e33c73b354eab43b1273a57c8f967d0391e80353e51f764ac02cf6775"
[[package]]
name = "proc-macro2"
version = "1.0.91"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "307e3004becf10f5a6e0d59d20f3cd28231b0e0827a96cd3e0ce6d14bc1e4bb3"
dependencies = [
"unicode-ident",
]
[[package]]
name = "quote"
version = "1.0.37"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "b5b9d34b8991d19d98081b46eacdd8eb58c6f2b201139f7c5f643cc155a633af"
dependencies = [
"proc-macro2",
]
[[package]]
name = "rfc6381-codec"
version = "0.1.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "4395f46a67f0d57c57f6a5361f3a9a0c0183a19cab3998892ecdc003de6d8037"
dependencies = [
"four-cc",
"mp4ra-rust",
"mpeg4-audio-const",
]
[[package]]
name = "rustix"
version = "0.38.42"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "f93dc38ecbab2eb790ff964bb77fa94faf256fd3e73285fd7ba0903b76bedb85"
dependencies = [
"bitflags",
"errno",
"libc",
"linux-raw-sys",
"windows-sys",
]
[[package]]
name = "rustversion"
version = "1.0.18"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "0e819f2bc632f285be6d7cd36e25940d45b2391dd6d9b939e79de557f7014248"
[[package]]
name = "ryu"
version = "1.0.18"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "f3cb5ba0dc43242ce17de99c180e96db90b235b8a9fdc9543c96d2209116bd9f"
[[package]]
name = "serde"
version = "1.0.216"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "0b9781016e935a97e8beecf0c933758c97a5520d32930e460142b4cd80c6338e"
dependencies = [
"serde_derive",
]
[[package]]
name = "serde_derive"
version = "1.0.216"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "46f859dbbf73865c6627ed570e78961cd3ac92407a2d117204c49232485da55e"
dependencies = [
"proc-macro2",
"quote",
"syn",
]
[[package]]
name = "serde_json"
version = "1.0.133"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "c7fceb2473b9166b2294ef05efcb65a3db80803f0b03ef86a5fc88a2b85ee377"
dependencies = [
"itoa",
"memchr",
"ryu",
"serde",
]
[[package]]
name = "serde_spanned"
version = "0.6.8"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "87607cb1398ed59d48732e575a4c28a7a8ebf2454b964fe3f224f2afc07909e1"
dependencies = [
"serde",
]
[[package]]
name = "strsim"
version = "0.11.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "7da8b5736845d9f2fcb837ea5d9e2628564b3b043a70948a3f0b778838c5fb4f"
[[package]]
name = "strum_macros"
version = "0.26.4"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "4c6bee85a5a24955dc440386795aa378cd9cf82acd5f764469152d2270e581be"
dependencies = [
"heck 0.5.0",
"proc-macro2",
"quote",
"rustversion",
"syn",
]
[[package]]
name = "syn"
version = "2.0.89"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "44d46482f1c1c87acd84dea20c1bf5ebff4c757009ed6bf19cfd36fb10e92c4e"
dependencies = [
"proc-macro2",
"quote",
"unicode-ident",
]
[[package]]
name = "tempfile"
version = "3.14.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "28cce251fcbc87fac86a866eeb0d6c2d536fc16d06f184bb61aeae11aa4cee0c"
dependencies = [
"cfg-if",
"fastrand",
"once_cell",
"rustix",
"windows-sys",
]
[[package]]
name = "thiserror"
version = "2.0.4"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "2f49a1853cf82743e3b7950f77e0f4d622ca36cf4317cba00c767838bac8d490"
dependencies = [
"thiserror-impl",
]
[[package]]
name = "thiserror-impl"
version = "2.0.4"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "8381894bb3efe0c4acac3ded651301ceee58a15d47c2e34885ed1908ad667061"
dependencies = [
"proc-macro2",
"quote",
"syn",
]
[[package]]
name = "toml"
version = "0.8.19"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "a1ed1f98e3fdc28d6d910e6737ae6ab1a93bf1985935a1193e68f93eeb68d24e"
dependencies = [
"serde",
"serde_spanned",
"toml_datetime",
"toml_edit",
]
[[package]]
name = "toml_datetime"
version = "0.6.8"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "0dd7358ecb8fc2f8d014bf86f6f638ce72ba252a2c3a2572f2a795f1d23efb41"
dependencies = [
"serde",
]
[[package]]
name = "toml_edit"
version = "0.22.22"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "4ae48d6208a266e853d946088ed816055e556cc6028c5e8e2b84d9fa5dd7c7f5"
dependencies = [
"indexmap",
"serde",
"serde_spanned",
"toml_datetime",
"winnow",
]
[[package]]
name = "unicode-ident"
version = "1.0.14"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "adb9e6ca4f869e1180728b7950e35922a7fc6397f7b641499e8f3ef06e50dc83"
[[package]]
name = "utf8parse"
version = "0.2.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "06abde3611657adf66d383f00b093d7faecc7fa57071cce2578660c9f1010821"
[[package]]
name = "uuid"
version = "1.11.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "f8c5f0a0af699448548ad1a2fbf920fb4bee257eae39953ba95cb84891a0446a"
[[package]]
name = "video-bytestream-tools"
version = "0.1.0"
dependencies = [
"byteorder",
"h264-reader",
"thiserror",
"uuid",
]
[[package]]
name = "webvtt-in-video-stream"
version = "0.1.0"
dependencies = [
"thiserror",
"video-bytestream-tools",
]
[[package]]
name = "windows-sys"
version = "0.59.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "1e38bc4d79ed67fd075bcc251a1c39b32a1776bbe92e5bef1f0bf1f8c531853b"
dependencies = [
"windows-targets",
]
[[package]]
name = "windows-targets"
version = "0.52.6"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "9b724f72796e036ab90c1021d4780d4d3d648aca59e491e6b98e725b84e99973"
dependencies = [
"windows_aarch64_gnullvm",
"windows_aarch64_msvc",
"windows_i686_gnu",
"windows_i686_gnullvm",
"windows_i686_msvc",
"windows_x86_64_gnu",
"windows_x86_64_gnullvm",
"windows_x86_64_msvc",
]
[[package]]
name = "windows_aarch64_gnullvm"
version = "0.52.6"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "32a4622180e7a0ec044bb555404c800bc9fd9ec262ec147edd5989ccd0c02cd3"
[[package]]
name = "windows_aarch64_msvc"
version = "0.52.6"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "09ec2a7bb152e2252b53fa7803150007879548bc709c039df7627cabbd05d469"
[[package]]
name = "windows_i686_gnu"
version = "0.52.6"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "8e9b5ad5ab802e97eb8e295ac6720e509ee4c243f69d781394014ebfe8bbfa0b"
[[package]]
name = "windows_i686_gnullvm"
version = "0.52.6"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "0eee52d38c090b3caa76c563b86c3a4bd71ef1a819287c19d586d7334ae8ed66"
[[package]]
name = "windows_i686_msvc"
version = "0.52.6"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "240948bc05c5e7c6dabba28bf89d89ffce3e303022809e73deaefe4f6ec56c66"
[[package]]
name = "windows_x86_64_gnu"
version = "0.52.6"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "147a5c80aabfbf0c7d901cb5895d1de30ef2907eb21fbbab29ca94c5b08b1a78"
[[package]]
name = "windows_x86_64_gnullvm"
version = "0.52.6"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "24d5b23dc417412679681396f2b49f3de8c1473deb516bd34410872eff51ed0d"
[[package]]
name = "windows_x86_64_msvc"
version = "0.52.6"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "589f6da84c646204747d1270a2a5661ea66ed1cced2631d546fdfb155959f9ec"
[[package]]
name = "winnow"
version = "0.6.20"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "36c1fec1a2bb5866f07c25f68c26e565c4c200aebb96d7e55710c19d3e8ac49b"
dependencies = [
"memchr",
]

View File

@@ -0,0 +1,27 @@
# C bindings crate for muxing WebVTT into a video byte stream.
[package]
edition = "2021"
name = "c-webvtt-in-video-stream"
version = "0.1.0"
# Built only as a static library so it can be linked into the C/C++ plugin.
[lib]
crate-type = ["staticlib"]
# Both profiles keep full debug info and abort on panic.
# NOTE(review): presumably `panic = "abort"` is chosen so a panic never unwinds
# across the `extern "C"` boundary - confirm.
[profile.release]
debug = 2
panic = "abort"
[profile.dev]
debug = 2
panic = "abort"
# The two path dependencies below are also workspace members.
[workspace]
members = ["webvtt-in-video-stream", "video-bytestream-tools"]
[dependencies]
h264-reader = "0.7.0"
strum_macros = "0.26.3"
video-bytestream-tools = {path = "./video-bytestream-tools"}
webvtt-in-video-stream = {path = "./webvtt-in-video-stream"}
# cbindgen is used by build.rs to generate the C header.
[build-dependencies]
cbindgen = "0.27.0"

View File

@@ -0,0 +1,8 @@
fn main() {
let crate_dir = std::env::var_os("CARGO_MANIFEST_DIR").unwrap();
match cbindgen::generate(crate_dir) {
Ok(bindings) => bindings.write_to_file("target/webvtt-in-sei.h"),
Err(cbindgen::Error::ParseSyntaxError { .. }) => return, // ignore in favor of cargo's syntax check
Err(err) => panic!("{:?}", err),
};
}

View File

@@ -0,0 +1,9 @@
# cbindgen configuration for the generated C header.
# Emit a C header that can also be included from C++.
cpp_compat = true
language = "c"
[parse]
# Besides this crate, also parse the `webvtt-in-video-stream` dependency.
include = ["webvtt-in-video-stream"]
parse_deps = true
[export]
# `CodecFlavor` is listed explicitly: the exported functions take the codec
# flavor as a plain `u8`, so the enum never appears in an exported signature.
include = ["CodecFlavor"]

220
deps/c-webvtt-in-video-stream/src/lib.rs vendored Normal file
View File

@@ -0,0 +1,220 @@
use std::{
error::Error,
ffi::{c_char, CStr},
time::Duration,
};
use strum_macros::FromRepr;
use video_bytestream_tools::{
h264::{self, H264ByteStreamWrite, NalHeader, NalUnitWrite, RbspWrite},
webvtt::WebvttWrite,
};
use webvtt_in_video_stream::{WebvttMuxer, WebvttMuxerBuilder, WebvttString};
/// Creates a heap-allocated [`WebvttMuxerBuilder`] and returns ownership of it
/// to the caller.
///
/// * `latency_to_video_in_msecs` - converted to a [`Duration`] in milliseconds.
/// * `send_frequency_hz` - forwarded unchanged to the builder.
/// * `video_frame_time_in_nsecs` - converted to a [`Duration`] in nanoseconds.
///
/// The returned builder is consumed by `webvtt_muxer_builder_create_muxer`.
#[no_mangle]
pub extern "C" fn webvtt_create_muxer_builder(
    latency_to_video_in_msecs: u16,
    send_frequency_hz: u8,
    video_frame_time_in_nsecs: u64,
) -> Box<WebvttMuxerBuilder> {
    let latency_to_video = Duration::from_millis(u64::from(latency_to_video_in_msecs));
    let video_frame_time = Duration::from_nanos(video_frame_time_in_nsecs);
    let builder = WebvttMuxerBuilder::new(latency_to_video, send_frequency_hz, video_frame_time);
    Box::new(builder)
}
/// Converts a C string pointer into a [`WebvttString`].
///
/// Returns `None` when `ptr` is null or when `WebvttString::from_string`
/// rejects the text. Bytes that are not valid UTF-8 are replaced lossily
/// instead of causing a failure.
///
/// A non-null `ptr` is handed to `CStr::from_ptr`, so it has to point to a
/// valid NUL-terminated string.
fn turn_into_webvtt_string(ptr: *const c_char) -> Option<WebvttString> {
    if ptr.is_null() {
        None
    } else {
        // Relies on the caller passing a valid NUL-terminated string (see above).
        let text = unsafe { CStr::from_ptr(ptr) }
            .to_string_lossy()
            .into_owned();
        WebvttString::from_string(text).ok()
    }
}
/// Adds a subtitle track to `builder`.
///
/// `name_ptr` and `language_ptr` are required: if either is null or cannot be
/// turned into a `WebvttString`, nothing is added and `false` is returned.
/// `assoc_language_ptr` and `characteristics_ptr` are optional; when they are
/// null or cannot be converted they are passed on as `None`.
///
/// Returns `true` only if the builder accepted the track; a null `builder`
/// yields `false`.
#[no_mangle]
pub extern "C" fn webvtt_muxer_builder_add_track(
    builder: Option<&mut WebvttMuxerBuilder>,
    default: bool,
    autoselect: bool,
    forced: bool,
    name_ptr: *const c_char,
    language_ptr: *const c_char,
    assoc_language_ptr: *const c_char,
    characteristics_ptr: *const c_char,
) -> bool {
    let builder = match builder {
        Some(builder) => builder,
        None => return false,
    };
    let name = match turn_into_webvtt_string(name_ptr) {
        Some(name) => name,
        None => return false,
    };
    let language = match turn_into_webvtt_string(language_ptr) {
        Some(language) => language,
        None => return false,
    };

    let outcome = builder.add_track(
        default,
        autoselect,
        forced,
        name,
        language,
        turn_into_webvtt_string(assoc_language_ptr),
        turn_into_webvtt_string(characteristics_ptr),
    );
    outcome.is_ok()
}
/// Consumes the builder and turns it into a heap-allocated [`WebvttMuxer`].
///
/// Takes ownership of `muxer_builder`; the pointer must not be used again
/// afterwards. A null builder yields a null muxer.
#[no_mangle]
pub extern "C" fn webvtt_muxer_builder_create_muxer(
    muxer_builder: Option<Box<WebvttMuxerBuilder>>,
) -> Option<Box<WebvttMuxer>> {
    let builder = muxer_builder?;
    Some(Box::new(builder.create_muxer()))
}
/// Releases a muxer created by `webvtt_muxer_builder_create_muxer`.
///
/// The boxed muxer is taken by value and dropped when the function returns;
/// passing null is a no-op.
#[no_mangle]
pub extern "C" fn webvtt_muxer_free(_: Option<Box<WebvttMuxer>>) {}
/// Queues a cue on the given track of `muxer`.
///
/// `start_time_in_msecs` and `duration_in_msecs` are converted to
/// [`Duration`]s in milliseconds. `text_ptr` is required.
///
/// Returns `false` when `muxer` is null, when `text_ptr` is null or cannot be
/// turned into a `WebvttString`, or when the muxer rejects the cue.
#[no_mangle]
pub extern "C" fn webvtt_muxer_add_cue(
    muxer: Option<&WebvttMuxer>,
    track: u8,
    start_time_in_msecs: u64,
    duration_in_msecs: u64,
    text_ptr: *const c_char,
) -> bool {
    let muxer = match muxer {
        Some(muxer) => muxer,
        None => return false,
    };
    let text = match turn_into_webvtt_string(text_ptr) {
        Some(text) => text,
        None => return false,
    };

    let start_time = Duration::from_millis(start_time_in_msecs);
    let duration = Duration::from_millis(duration_in_msecs);
    muxer.add_cue(track, start_time, duration, text).is_ok()
}
/// Output framing selected by the C caller, passed over FFI as its `u8`
/// discriminant and decoded with the derived `from_repr`.
///
/// The `H264AvccN` variants map to AVCC framing with a length size of `N`;
/// `H264AnnexB` maps to Annex B framing.
#[derive(FromRepr, Copy, Clone)]
#[repr(u8)]
enum CodecFlavor {
H264Avcc1,
H264Avcc2,
H264Avcc4,
H264AnnexB,
}
impl CodecFlavor {
    /// Maps the flat FFI enum onto the nested internal codec/framing
    /// representation.
    fn into_internal(self) -> CodecFlavorInternal {
        let h264_framing = match self {
            CodecFlavor::H264Avcc1 => CodecFlavorH264::Avcc(1),
            CodecFlavor::H264Avcc2 => CodecFlavorH264::Avcc(2),
            CodecFlavor::H264Avcc4 => CodecFlavorH264::Avcc(4),
            CodecFlavor::H264AnnexB => CodecFlavorH264::AnnexB,
        };
        CodecFlavorInternal::H264(h264_framing)
    }
}
/// H.264 framing variants used internally.
enum CodecFlavorH264 {
// AVCC framing; the value is handed to `AVCCWriter::new` as its length size
// (presumably the NAL length prefix size in bytes - TODO confirm).
Avcc(usize),
// Annex B framing, written through `AnnexBWriter`.
AnnexB,
}
/// Internal codec selection; H.264 is the only codec at present.
enum CodecFlavorInternal {
H264(CodecFlavorH264),
}
/// Owned byte buffer handed to C; read it with `webvtt_buffer_data` /
/// `webvtt_buffer_length` and release it with `webvtt_buffer_free`.
pub struct WebvttBuffer(Vec<u8>);
/// Tries to mux WebVTT data for `video_timestamp_in_nsecs` into a freshly
/// allocated buffer, framed according to `codec_flavor`.
///
/// `codec_flavor` is the `u8` discriminant of `CodecFlavor`. `add_header` is
/// forwarded to the muxer unchanged.
///
/// Returns `None` (null) when `muxer` is null, when `codec_flavor` is not a
/// valid `CodecFlavor` discriminant, when any writer or muxer step fails, or
/// when the muxer reports that it wrote nothing. Otherwise the returned buffer
/// is owned by the caller and must be released with `webvtt_buffer_free`.
#[no_mangle]
pub extern "C" fn webvtt_muxer_try_mux_into_bytestream(
muxer: Option<&WebvttMuxer>,
video_timestamp_in_nsecs: u64,
add_header: bool,
codec_flavor: u8,
) -> Option<Box<WebvttBuffer>> {
// Shared driver for all framings: `init` builds the payload writer on top of
// `buffer`, the muxer writes into it, and `finish` closes the NAL unit.
// Returns `Ok(false)` without calling `finish` when the muxer wrote nothing.
fn mux_into_bytestream<'a, W: WebvttWrite + 'a>(
muxer: &WebvttMuxer,
video_timestamp: Duration,
add_header: bool,
buffer: &'a mut Vec<u8>,
init: impl Fn(&'a mut Vec<u8>) -> Result<W, Box<dyn Error>>,
finish: impl Fn(W) -> Result<(), Box<dyn Error>>,
) -> Result<bool, Box<dyn Error>> {
let mut writer = init(buffer)?;
if !muxer.try_mux_into_bytestream(video_timestamp, add_header, &mut writer)? {
return Ok(false);
}
finish(writer)?;
Ok(true)
}
// Header for an SEI NAL unit with nal_ref_idc 0.
// NOTE(review): the unwrap presumably cannot fail for this fixed combination -
// confirm against `NalHeader::from_nal_unit_type_and_nal_ref_idc`.
fn create_nal_header() -> NalHeader {
NalHeader::from_nal_unit_type_and_nal_ref_idc(h264_reader::nal::UnitType::SEI, 0).unwrap()
}
// Option-returning body so that `?` can be used for the early exits.
fn inner(
muxer: Option<&WebvttMuxer>,
video_timestamp_in_nsecs: u64,
add_header: bool,
codec_flavor: u8,
) -> Option<Box<WebvttBuffer>> {
let muxer = muxer?;
let video_timestamp = Duration::from_nanos(video_timestamp_in_nsecs);
// Unknown discriminants are rejected here.
let codec_flavor = CodecFlavor::from_repr(codec_flavor)?;
let mut buffer = vec![];
// In both arms `.ok()?` drops the error detail and turns a failure into `None`.
let data_written = match codec_flavor.into_internal() {
CodecFlavorInternal::H264(CodecFlavorH264::AnnexB) => mux_into_bytestream(
muxer,
video_timestamp,
add_header,
&mut buffer,
|buffer| {
Ok(h264::annex_b::AnnexBWriter::new(buffer)
.start_write_nal_unit()?
.write_nal_header(create_nal_header())?)
},
|write| {
write.finish_rbsp()?;
Ok(())
},
)
.ok()?,
CodecFlavorInternal::H264(CodecFlavorH264::Avcc(length_size)) => mux_into_bytestream(
muxer,
video_timestamp,
add_header,
&mut buffer,
|buffer| {
Ok(h264::avcc::AVCCWriter::new(length_size, buffer)?
.start_write_nal_unit()?
.write_nal_header(create_nal_header())?)
},
|write| {
write.finish_rbsp()?;
Ok(())
},
)
.ok()?,
};
// Nothing was muxed: discard whatever `init` already put into the buffer.
if !data_written {
return None;
}
Some(Box::new(WebvttBuffer(buffer)))
}
inner(muxer, video_timestamp_in_nsecs, add_header, codec_flavor)
}
#[no_mangle]
pub extern "C" fn webvtt_buffer_data(buffer: Option<&WebvttBuffer>) -> *const u8 {
buffer.map(|b| b.0.as_ptr()).unwrap_or(std::ptr::null())
}
#[no_mangle]
pub extern "C" fn webvtt_buffer_length(buffer: Option<&WebvttBuffer>) -> usize {
buffer.map(|b| b.0.len()).unwrap_or(0)
}
/// Releases a buffer returned by `webvtt_muxer_try_mux_into_bytestream`.
///
/// The boxed buffer is taken by value and dropped when the function returns;
/// passing null is a no-op.
#[no_mangle]
pub extern "C" fn webvtt_buffer_free(_: Option<Box<WebvttBuffer>>) {}

View File

@@ -0,0 +1,10 @@
[package]
edition = "2021"
name = "video-bytestream-tools"
version = "0.1.0"
[dependencies]
byteorder = "1.5.0"
h264-reader = "0.7.0"
thiserror = "2.0.4"
uuid = "1.11.0"

View File

@@ -0,0 +1,361 @@
use crate::webvtt::{write_webvtt_header, write_webvtt_payload, WebvttTrack, WebvttWrite};
use byteorder::WriteBytesExt;
use h264_reader::nal::UnitType;
use std::{collections::VecDeque, io::Write, time::Duration};
type Result<T, E = std::io::Error> = std::result::Result<T, E>;
pub mod annex_b;
pub mod avcc;
/// First stage of the writer chain: a byte-stream writer that can begin a new
/// NAL unit.
pub trait H264ByteStreamWrite<W: ?Sized + Write> {
/// Writer returned for the NAL unit that was started.
type Writer: NalUnitWrite<W>;
/// Consumes the byte-stream writer and starts a NAL unit.
fn start_write_nal_unit(self) -> Result<Self::Writer>;
}
/// Blanket implementation: any `Write` can start a NAL unit directly. This
/// implementation only wraps the writer and emits no bytes of its own.
impl<W: Write> H264ByteStreamWrite<W> for W {
type Writer = NalUnitWriter<W>;
// Never fails here; the `Result` is required by the trait signature.
fn start_write_nal_unit(self) -> Result<Self::Writer> {
Ok(NalUnitWriter::new(self))
}
}
/// Validated combination of NAL unit type and `nal_ref_idc`, created through
/// `NalHeader::from_nal_unit_type_and_nal_ref_idc`.
#[derive(Debug, Clone, Copy)]
pub struct NalHeader {
// Unit type; its `id()` is OR-ed into the header byte.
nal_unit_type: UnitType,
// Checked to be below 4 at construction; shifted left by 5 in the header byte.
nal_ref_idc: u8,
}
/// Reasons why a `NalHeader` cannot be constructed.
#[derive(Debug, Clone, Copy)]
pub enum NalHeaderError {
/// `nal_ref_idc` was 4 or greater.
NalRefIdcOutOfRange(u8),
/// The unit type requires a zero (or a non-zero) `nal_ref_idc` and the
/// given value does not satisfy that.
InvalidNalRefIdcForNalUnitType {
nal_unit_type: UnitType,
nal_ref_idc: u8,
},
/// The unit type id was 0, or 32 or greater.
NalUnitTypeOutOfRange(UnitType),
}
impl NalHeader {
    /// Builds a header after validating the type / `nal_ref_idc` combination.
    ///
    /// Rules enforced:
    /// * `nal_ref_idc` must be below 4.
    /// * unit type ids 0 and 32 or greater are rejected.
    /// * unit type ids 6, 9, 10, 11 and 12 require `nal_ref_idc == 0`.
    /// * unit type id 5 requires `nal_ref_idc != 0`.
    /// * every other unit type accepts any in-range `nal_ref_idc`.
    pub fn from_nal_unit_type_and_nal_ref_idc(
        nal_unit_type: UnitType,
        nal_ref_idc: u8,
    ) -> Result<NalHeader, NalHeaderError> {
        if nal_ref_idc > 3 {
            return Err(NalHeaderError::NalRefIdcOutOfRange(nal_ref_idc));
        }

        // Decide whether this `nal_ref_idc` is permitted for the unit type.
        let ref_idc_allowed = match nal_unit_type.id() {
            0 => return Err(NalHeaderError::NalUnitTypeOutOfRange(nal_unit_type)),
            32.. => return Err(NalHeaderError::NalUnitTypeOutOfRange(nal_unit_type)),
            6 | 9..=12 => nal_ref_idc == 0,
            5 => nal_ref_idc != 0,
            _ => true,
        };

        if ref_idc_allowed {
            Ok(NalHeader {
                nal_unit_type,
                nal_ref_idc,
            })
        } else {
            Err(NalHeaderError::InvalidNalRefIdcForNalUnitType {
                nal_unit_type,
                nal_ref_idc,
            })
        }
    }

    /// Packs the header into one byte: `nal_ref_idc` shifted left by 5, OR-ed
    /// with the unit type id.
    fn as_header_byte(&self) -> u8 {
        let ref_idc_bits = self.nal_ref_idc << 5;
        ref_idc_bits | self.nal_unit_type.id()
    }
}
/// Writer stage for a NAL unit whose header has not been written yet.
pub struct NalUnitWriter<W: ?Sized + Write> {
// Underlying byte sink.
inner: W,
}
/// Second stage of the writer chain: writes the NAL header and yields the
/// payload writer.
pub trait NalUnitWrite<W: ?Sized + Write> {
/// Payload writer returned once the header has been written.
type Writer: RbspWrite<W>;
/// Consumes the NAL unit writer and writes `nal_header`.
fn write_nal_header(self, nal_header: NalHeader) -> Result<Self::Writer>;
}
impl<W: Write> NalUnitWriter<W> {
// Wraps `inner` without writing anything to it.
fn new(inner: W) -> Self {
Self { inner }
}
}
impl<W: Write> NalUnitWrite<W> for NalUnitWriter<W> {
type Writer = RbspWriter<W>;
fn write_nal_header(mut self, nal_header: NalHeader) -> Result<RbspWriter<W>> {
self.inner.write_u8(nal_header.as_header_byte())?;
Ok(RbspWriter::new(self.inner))
}
}
/// Payload writer that inserts an extra `3` byte whenever two zero bytes would
/// otherwise be followed by a byte in `0..=3` (see its `Write` impl).
pub struct RbspWriter<W: ?Sized + Write> {
// The most recently written payload bytes (at most two are kept between
// iterations); cleared whenever a `3` byte is inserted.
last_written: VecDeque<u8>,
// Underlying byte sink.
inner: W,
}
/// Final stage of the writer chain: finishes the payload and yields a writer
/// that can start the next NAL unit.
pub trait RbspWrite<W: ?Sized + Write> {
/// Byte-stream writer returned after the payload has been finished.
type Writer: H264ByteStreamWrite<W>;
/// Consumes the payload writer and terminates the payload.
fn finish_rbsp(self) -> Result<Self::Writer>;
}
impl<W: Write> RbspWriter<W> {
/// Wraps `inner` with an empty byte history; writes nothing.
pub fn new(inner: W) -> Self {
Self {
// Room for three entries: `write` pushes the new byte before trimming
// the history back to two.
last_written: VecDeque::with_capacity(3),
inner,
}
}
}
impl<W: Write> RbspWrite<W> for RbspWriter<W> {
type Writer = W;
fn finish_rbsp(mut self) -> Result<W> {
self.write_u8(0x80)?;
Ok(self.inner)
}
}
impl<W: ?Sized + Write> Write for RbspWriter<W> {
    /// Writes `buf` byte by byte, inserting an escape byte `3` before any byte
    /// in `0..=3` that directly follows two zero bytes.
    ///
    /// On success the whole of `buf` has been consumed; inserted escape bytes
    /// are not counted in the returned length.
    fn write(&mut self, buf: &[u8]) -> Result<usize> {
        for &byte in buf {
            let after_two_zeros = self.last_written.len() >= 2
                && self.last_written[0] == 0
                && self.last_written[1] == 0;
            if after_two_zeros && matches!(byte, 0..=3) {
                self.inner.write_u8(3)?;
                // The escape byte breaks the run of zeros, so start over.
                self.last_written.clear();
            }

            self.inner.write_u8(byte)?;

            // Remember only the two most recent payload bytes.
            self.last_written.push_back(byte);
            if self.last_written.len() > 2 {
                self.last_written.pop_front();
            }
        }
        Ok(buf.len())
    }

    /// Flushes the underlying writer.
    fn flush(&mut self) -> Result<()> {
        self.inner.flush()
    }
}
/// `Write` implementation that discards all data and only tallies how many
/// bytes it was given.
pub(crate) struct CountingSink {
    // Total number of bytes passed to `write` so far.
    count: usize,
}

impl CountingSink {
    /// Creates a sink whose tally starts at zero.
    pub fn new() -> Self {
        CountingSink { count: 0 }
    }

    /// Number of bytes written to this sink so far.
    pub fn count(&self) -> usize {
        self.count
    }
}

impl Write for CountingSink {
    /// Accepts the whole buffer and adds its length to the tally.
    fn write(&mut self, buf: &[u8]) -> Result<usize> {
        let accepted = buf.len();
        self.count += accepted;
        Ok(accepted)
    }

    /// Nothing is buffered, so flushing always succeeds.
    fn flush(&mut self) -> Result<()> {
        Ok(())
    }
}
/// Writes the payload type followed by the payload size, each in the same
/// variable-length form: one `255` byte for every full 255 contained in the
/// value, then a final byte (always below 255) holding the remainder.
pub(crate) fn write_sei_header<W: ?Sized + Write>(
    writer: &mut W,
    payload_type: usize,
    payload_size: usize,
) -> std::io::Result<()> {
    for value in [payload_type, payload_size] {
        let mut remaining = value;
        while remaining >= 255 {
            writer.write_u8(255)?;
            remaining -= 255;
        }
        // `remaining` is below 255 here, so the conversion cannot fail.
        writer.write_u8(remaining.try_into().unwrap())?;
    }
    Ok(())
}
/// Lets WebVTT data be written straight into a NAL payload: both methods
/// delegate to the free functions of the same name in `crate::webvtt`, passing
/// this writer as the sink (so its byte escaping applies to everything written).
impl<W: Write + ?Sized> WebvttWrite for RbspWriter<W> {
// Forwards all arguments unchanged to `crate::webvtt::write_webvtt_header`.
fn write_webvtt_header(
&mut self,
max_latency_to_video: Duration,
send_frequency_hz: u8,
subtitle_tracks: &[WebvttTrack],
) -> std::io::Result<()> {
write_webvtt_header(
self,
max_latency_to_video,
send_frequency_hz,
subtitle_tracks,
)
}
// Forwards all arguments unchanged to `crate::webvtt::write_webvtt_payload`.
fn write_webvtt_payload(
&mut self,
track_index: u8,
chunk_number: u64,
chunk_version: u8,
video_offset: Duration,
webvtt_payload: &str, // TODO: replace with string type that checks for interior NULs
) -> std::io::Result<()> {
write_webvtt_payload(
self,
track_index,
chunk_number,
chunk_version,
video_offset,
webvtt_payload,
)
}
}
#[cfg(test)]
mod tests {
use crate::{
h264::{NalHeader, NalUnitWrite, NalUnitWriter, RbspWrite},
webvtt::{WebvttWrite, PAYLOAD_GUID, USER_DATA_UNREGISTERED},
};
use byteorder::{BigEndian, ReadBytesExt};
use h264_reader::nal::{Nal, RefNal, UnitType};
use std::{io::Read, time::Duration};
// Writes a single WebVTT payload into an SEI NAL unit and reads it back with
// h264_reader to check the header, the size field and the leading payload fields.
#[test]
fn check_webvtt_sei() {
let mut writer = vec![];
let nalu_writer = NalUnitWriter::new(&mut writer);
let nal_unit_type = h264_reader::nal::UnitType::SEI;
let nal_ref_idc = 0;
let nal_header =
NalHeader::from_nal_unit_type_and_nal_ref_idc(nal_unit_type, nal_ref_idc).unwrap();
let mut payload_writer = nalu_writer.write_nal_header(nal_header).unwrap();
let track_index = 0;
let chunk_number = 1;
let chunk_version = 0;
let video_offset = Duration::from_millis(200);
let webvtt_payload = "Some unverified data";
payload_writer
.write_webvtt_payload(
track_index,
chunk_number,
chunk_version,
video_offset,
webvtt_payload,
)
.unwrap();
payload_writer.finish_rbsp().unwrap();
// Byte 0 is the NAL header written above; presumably bytes 1 and 2 are the
// one-byte payload type and payload size, which puts the GUID at offset 3 -
// TODO confirm against `write_webvtt_payload`.
assert!(&writer[3..19] == PAYLOAD_GUID.as_bytes());
let nal = RefNal::new(&writer, &[], true);
assert!(nal.is_complete());
assert!(nal.header().unwrap().nal_unit_type() == UnitType::SEI);
let mut byte_reader = nal.rbsp_bytes();
assert!(usize::from(byte_reader.read_u8().unwrap()) == USER_DATA_UNREGISTERED);
// Decode the variable-length size: add up bytes until one is below 255.
let mut length = 0;
loop {
let byte = byte_reader.read_u8().unwrap();
length += usize::from(byte);
if byte != 255 {
break;
}
}
// The extra byte is the 0x80 terminator appended by `finish_rbsp`.
assert!(length + 1 == byte_reader.clone().bytes().count());
// Skip the 16-byte GUID that was already checked above.
byte_reader.read_u128::<BigEndian>().unwrap();
assert!(track_index == byte_reader.read_u8().unwrap());
assert!(chunk_number == byte_reader.read_u64::<BigEndian>().unwrap());
assert!(chunk_version == byte_reader.read_u8().unwrap());
assert!(
u16::try_from(video_offset.as_millis()).unwrap()
== byte_reader.read_u16::<BigEndian>().unwrap()
);
println!("{writer:02x?}");
}
#[test]
fn check_webvtt_multi_sei() {
let mut writer = vec![];
let nalu_writer = NalUnitWriter::new(&mut writer);
let nal_unit_type = h264_reader::nal::UnitType::SEI;
let nal_ref_idc = 0;
let nal_header =
NalHeader::from_nal_unit_type_and_nal_ref_idc(nal_unit_type, nal_ref_idc).unwrap();
let mut payload_writer = nalu_writer.write_nal_header(nal_header).unwrap();
let track_index = 0;
let chunk_number = 1;
let chunk_version = 0;
let video_offset = Duration::from_millis(200);
let webvtt_payload = "Some unverified data";
payload_writer
.write_webvtt_payload(
track_index,
chunk_number,
chunk_version,
video_offset,
webvtt_payload,
)
.unwrap();
payload_writer
.write_webvtt_payload(1, 1, 0, video_offset, "Something else")
.unwrap();
payload_writer.finish_rbsp().unwrap();
assert!(&writer[3..19] == PAYLOAD_GUID.as_bytes());
let nal = RefNal::new(&writer, &[], true);
assert!(nal.is_complete());
assert!(nal.header().unwrap().nal_unit_type() == UnitType::SEI);
let mut byte_reader = nal.rbsp_bytes();
assert!(usize::from(byte_reader.read_u8().unwrap()) == USER_DATA_UNREGISTERED);
let mut _length = 0;
loop {
let byte = byte_reader.read_u8().unwrap();
_length += usize::from(byte);
if byte != 255 {
break;
}
}
byte_reader.read_u128::<BigEndian>().unwrap();
assert!(track_index == byte_reader.read_u8().unwrap());
assert!(chunk_number == byte_reader.read_u64::<BigEndian>().unwrap());
assert!(chunk_version == byte_reader.read_u8().unwrap());
assert!(
u16::try_from(video_offset.as_millis()).unwrap()
== byte_reader.read_u16::<BigEndian>().unwrap()
);
println!("{writer:02x?}");
}
}

View File

@@ -0,0 +1,100 @@
use super::{
H264ByteStreamWrite, NalHeader, NalUnitWrite, NalUnitWriter, RbspWrite, RbspWriter, Result,
};
use crate::webvtt::{WebvttTrack, WebvttWrite};
use byteorder::WriteBytesExt;
use std::{io::Write, time::Duration};
/// Byte-stream writer that precedes every NAL unit with a `00 00 01` start code.
pub struct AnnexBWriter<W: ?Sized + Write> {
// Set once the single extra zero byte in front of the first start code was written.
leading_zero_8bits_written: bool,
// Destination of all bytes.
inner: W,
}
impl<W: Write> AnnexBWriter<W> {
/// Wraps `inner`; the leading zero byte is recorded as not yet written.
pub fn new(inner: W) -> Self {
Self {
leading_zero_8bits_written: false,
inner,
}
}
}
impl<W: Write> H264ByteStreamWrite<W> for AnnexBWriter<W> {
    type Writer = AnnexBNalUnitWriter<W>;

    /// Emits the start code for the next NAL unit and hands the underlying
    /// writer over to a `NalUnitWriter`.
    ///
    /// A single zero byte is written first unless this writer has already
    /// recorded having done so, followed by the `00 00 01` start code.
    fn start_write_nal_unit(self) -> Result<AnnexBNalUnitWriter<W>> {
        let Self {
            leading_zero_8bits_written: zero_already_written,
            inner: mut sink,
        } = self;

        // The flag is not stored back: the writer is consumed by this call.
        if !zero_already_written {
            sink.write_u8(0)?;
        }
        sink.write_all(&[0, 0, 1])?;

        let inner = NalUnitWriter::new(sink);
        Ok(AnnexBNalUnitWriter { inner })
    }
}
/// NAL unit writer handed out by `AnnexBWriter` after the start code was written.
pub struct AnnexBNalUnitWriter<W: ?Sized + Write> {
// The generic NAL unit writer that performs the actual work.
inner: NalUnitWriter<W>,
}
impl<W: Write> AnnexBNalUnitWriter<W> {
// Private accessor for the wrapped writer; the leading underscore marks it as currently unused.
fn _nal_unit_writer(&mut self) -> &mut NalUnitWriter<W> {
&mut self.inner
}
}
impl<W: Write> NalUnitWrite<W> for AnnexBNalUnitWriter<W> {
    type Writer = AnnexBRbspWriter<W>;

    /// Writes `nal_header` through the wrapped NAL unit writer and wraps the
    /// resulting RBSP writer in its Annex B counterpart.
    fn write_nal_header(self, nal_header: NalHeader) -> Result<AnnexBRbspWriter<W>> {
        match self.inner.write_nal_header(nal_header) {
            Ok(inner) => Ok(AnnexBRbspWriter { inner }),
            Err(error) => Err(error),
        }
    }
}
/// RBSP (NAL unit body) writer for the Annex B byte-stream format.
pub struct AnnexBRbspWriter<W: ?Sized + Write> {
// The generic RBSP writer that performs the actual work.
inner: RbspWriter<W>,
}
// Intentionally empty: there are no inherent methods yet.
impl<W: ?Sized + Write> AnnexBRbspWriter<W> {}
impl<W: Write> RbspWrite<W> for AnnexBRbspWriter<W> {
    type Writer = AnnexBWriter<W>;

    /// Finishes the RBSP and returns a byte-stream writer for the next NAL unit.
    fn finish_rbsp(self) -> Result<Self::Writer> {
        let writer = self.inner.finish_rbsp()?;
        // NOTE(review): `AnnexBWriter::new` starts with
        // `leading_zero_8bits_written == false`, so the next NAL unit is again
        // preceded by the extra zero byte -- confirm this is intended.
        Ok(AnnexBWriter::new(writer))
    }
}
/// `WebvttWrite` for the Annex B RBSP writer; both methods delegate to the
/// wrapped `RbspWriter`.
impl<W: Write + ?Sized> WebvttWrite for AnnexBRbspWriter<W> {
/// Delegates to `RbspWriter::write_webvtt_header` with unchanged arguments.
fn write_webvtt_header(
&mut self,
max_latency_to_video: Duration,
send_frequency_hz: u8,
subtitle_tracks: &[WebvttTrack],
) -> std::io::Result<()> {
self.inner
.write_webvtt_header(max_latency_to_video, send_frequency_hz, subtitle_tracks)
}
/// Delegates to `RbspWriter::write_webvtt_payload` with unchanged arguments.
fn write_webvtt_payload(
&mut self,
track_index: u8,
chunk_number: u64,
chunk_version: u8,
video_offset: Duration,
webvtt_payload: &str, // TODO: replace with string type that checks for interior NULs
) -> std::io::Result<()> {
self.inner.write_webvtt_payload(
track_index,
chunk_number,
chunk_version,
video_offset,
webvtt_payload,
)
}
}

View File

@@ -0,0 +1,153 @@
use super::{
H264ByteStreamWrite, NalHeader, NalUnitWrite, NalUnitWriter, RbspWrite, RbspWriter, Result,
};
use crate::webvtt::{WebvttTrack, WebvttWrite};
use byteorder::{BigEndian, WriteBytesExt};
use std::{io::Write, time::Duration};
use thiserror::Error;
// Largest NAL unit size, in bytes, that a length prefix of a given width can
// encode. The four slots are laid out for prefix widths of 1, 2, 3 and 4 bytes
// (slot = width - 1); the slot for 3 bytes is 0 because `AVCCWriter::new`
// rejects that width.
const AVCC_MAX_LENGTH: [usize; 4] = [0xff, 0xff_ff, 0, 0xff_ff_ff_ff];
/// Byte-stream writer for length-prefixed (AVCC) NAL units.
pub struct AVCCWriter<W: ?Sized + Write> {
// Width of the length prefix in bytes; `new` only accepts 1, 2 or 4.
length_size: usize,
// Destination of the length prefix and the NAL unit bytes.
inner: W,
}
/// Returned by `AVCCWriter::new` for an unsupported length-prefix width.
#[derive(Error, Debug)]
#[error("AVCC length of {0} is unsupported")]
pub struct InvalidLengthError(pub usize);
/// Reported when a NAL unit would grow beyond what the length prefix can encode.
#[derive(Error, Debug)]
#[error("Tried to write {required} bytes which exceeds the max size of {max}")]
pub struct MaxNalUnitSizeExceededError {
// Largest permitted NAL unit size in bytes.
max: usize,
// Size in bytes the NAL unit would have had after the rejected write.
required: usize,
}
impl<W: Write> AVCCWriter<W> {
    /// Creates a writer that prefixes every NAL unit with a `length_size`-byte length.
    ///
    /// # Errors
    ///
    /// Returns `InvalidLengthError` unless `length_size` is 1, 2 or 4.
    pub fn new(length_size: usize, inner: W) -> Result<Self, InvalidLengthError> {
        if matches!(length_size, 1 | 2 | 4) {
            Ok(Self { length_size, inner })
        } else {
            Err(InvalidLengthError(length_size))
        }
    }
}
impl<W: Write> H264ByteStreamWrite<W> for AVCCWriter<W> {
    type Writer = AVCCNalUnitWriter<AVCCWriterBuffer<W>>;

    /// Starts a new NAL unit.
    ///
    /// The NAL unit is collected in an `AVCCWriterBuffer` that takes ownership
    /// of this writer, so nothing is written to the underlying sink here.
    fn start_write_nal_unit(self) -> Result<AVCCNalUnitWriter<AVCCWriterBuffer<W>>> {
        let buffer = AVCCWriterBuffer::new(self);
        let inner = NalUnitWriter::new(buffer);
        Ok(AVCCNalUnitWriter { inner })
    }
}
pub struct AVCCWriterBuffer<W: ?Sized + Write> {
avcc_buffer: Vec<u8>,
avcc_writer: AVCCWriter<W>,
}
impl<W: Write> AVCCWriterBuffer<W> {
fn new(avcc_writer: AVCCWriter<W>) -> Self {
Self {
avcc_buffer: vec![],
avcc_writer,
}
}
fn finish(mut self) -> Result<AVCCWriter<W>> {
match self.avcc_writer.length_size {
1 => self.write_u8(self.avcc_buffer.len().try_into().unwrap())?,
2 => self.write_u16::<BigEndian>(self.avcc_buffer.len().try_into().unwrap())?,
4 => self.write_u32::<BigEndian>(self.avcc_buffer.len().try_into().unwrap())?,
_ => unreachable!(),
}
self.avcc_writer.inner.write_all(&self.avcc_buffer)?;
Ok(self.avcc_writer)
}
}
impl<W: ?Sized + Write> Write for AVCCWriterBuffer<W> {
    /// Appends `buf` to the in-memory NAL unit.
    ///
    /// # Errors
    ///
    /// Returns an error wrapping `MaxNalUnitSizeExceededError`, without
    /// buffering anything, if the NAL unit would become larger than the
    /// configured length prefix can encode.
    fn write(&mut self, buf: &[u8]) -> std::io::Result<usize> {
        // `AVCC_MAX_LENGTH` holds the limits for prefix widths 1, 2, 3 and 4 in
        // slots 0..=3, so the slot is `length_size - 1`. Using `length_size`
        // directly picks the wrong limit for widths 1 and 2 and is out of
        // bounds for width 4.
        let max = AVCC_MAX_LENGTH[self.avcc_writer.length_size - 1];
        let required = self.avcc_buffer.len().saturating_add(buf.len());
        if required > max {
            Err(std::io::Error::other(MaxNalUnitSizeExceededError {
                max,
                required,
            }))
        } else {
            self.avcc_buffer.write(buf)
        }
    }

    /// Nothing to flush: data only reaches the underlying writer in `finish`.
    fn flush(&mut self) -> std::io::Result<()> {
        Ok(())
    }
}
/// NAL unit writer handed out by `AVCCWriter::start_write_nal_unit`.
pub struct AVCCNalUnitWriter<W: ?Sized + Write> {
// The generic NAL unit writer that performs the actual work.
inner: NalUnitWriter<W>,
}
impl<W: Write> AVCCNalUnitWriter<W> {
// Private accessor for the wrapped writer; the leading underscore marks it as currently unused.
fn _nal_unit_writer(&mut self) -> &mut NalUnitWriter<W> {
&mut self.inner
}
}
impl<W: Write> NalUnitWrite<W> for AVCCNalUnitWriter<AVCCWriterBuffer<W>> {
    type Writer = AVCCRbspWriter<AVCCWriterBuffer<W>>;

    /// Writes `nal_header` through the wrapped NAL unit writer and wraps the
    /// resulting RBSP writer in its AVCC counterpart.
    fn write_nal_header(
        self,
        nal_header: NalHeader,
    ) -> Result<AVCCRbspWriter<AVCCWriterBuffer<W>>> {
        match self.inner.write_nal_header(nal_header) {
            Ok(inner) => Ok(AVCCRbspWriter { inner }),
            Err(error) => Err(error),
        }
    }
}
/// RBSP (NAL unit body) writer for the length-prefixed AVCC format.
pub struct AVCCRbspWriter<W: ?Sized + Write> {
    /// The generic RBSP writer that performs the actual work.
    inner: RbspWriter<W>,
}

impl<W: Write> RbspWrite<W> for AVCCRbspWriter<AVCCWriterBuffer<W>> {
    type Writer = AVCCWriter<W>;

    /// Finishes the RBSP, then finishes the NAL unit buffer it was written
    /// into, which yields the `AVCCWriter` for the next NAL unit.
    fn finish_rbsp(self) -> Result<Self::Writer> {
        self.inner
            .finish_rbsp()
            .and_then(|nal_unit_buffer| nal_unit_buffer.finish())
    }
}
/// `WebvttWrite` for the AVCC RBSP writer; both methods delegate to the
/// wrapped `RbspWriter`.
impl<W: Write + ?Sized> WebvttWrite for AVCCRbspWriter<W> {
/// Delegates to `RbspWriter::write_webvtt_header` with unchanged arguments.
fn write_webvtt_header(
&mut self,
max_latency_to_video: Duration,
send_frequency_hz: u8,
subtitle_tracks: &[WebvttTrack],
) -> std::io::Result<()> {
self.inner
.write_webvtt_header(max_latency_to_video, send_frequency_hz, subtitle_tracks)
}
/// Delegates to `RbspWriter::write_webvtt_payload` with unchanged arguments.
fn write_webvtt_payload(
&mut self,
track_index: u8,
chunk_number: u64,
chunk_version: u8,
video_offset: Duration,
webvtt_payload: &str, // TODO: replace with string type that checks for interior NULs
) -> std::io::Result<()> {
self.inner.write_webvtt_payload(
track_index,
chunk_number,
chunk_version,
video_offset,
webvtt_payload,
)
}
}

View File

@@ -0,0 +1,2 @@
pub mod h264;
pub mod webvtt;

View File

@@ -0,0 +1,155 @@
use crate::h264::{write_sei_header, CountingSink};
use byteorder::{BigEndian, WriteBytesExt};
use std::{io::Write, time::Duration};
use uuid::{uuid, Uuid};
/// SEI payload type passed to `write_sei_header` for both WebVTT message kinds.
pub const USER_DATA_UNREGISTERED: usize = 5;
/// GUID written at the start of a WebVTT header message.
pub const HEADER_GUID: Uuid = uuid!("cc7124bd-5f1c-4592-b27a-e2d9d218ef9e");
/// GUID written at the start of a WebVTT payload (chunk) message.
pub const PAYLOAD_GUID: Uuid = uuid!("a0cb4dd1-9db2-4635-a76b-1c9fefd6c37b");
/// Extension of `Write` for emitting NUL-terminated strings.
trait WriteCStrExt: Write {
    /// Writes the UTF-8 bytes of `string` followed by a single zero byte.
    ///
    /// Interior NUL bytes are not checked for and are written as they are.
    fn write_c_str(&mut self, string: &str) -> std::io::Result<()> {
        self.write_all(string.as_bytes())?;
        self.write_all(&[0u8])
    }
}

// Make `write_c_str` available on every writer.
impl<W: Write + ?Sized> WriteCStrExt for W {}
/// Description of one subtitle track as serialized by `write_webvtt_header`.
///
/// The three booleans and the presence of the two optional strings are packed
/// into one flags byte; the strings follow as NUL-terminated strings.
pub struct WebvttTrack<'a> {
// Flag bit 0b1000_0000.
pub default: bool,
// Flag bit 0b0100_0000.
pub autoselect: bool,
// Flag bit 0b0010_0000.
pub forced: bool,
// Always written, directly after the flags byte.
pub name: &'a str,
// Always written, after `name`.
pub language: &'a str,
// Written only when present; presence sets flag bit 0b0001_0000.
pub assoc_language: Option<&'a str>,
// Written only when present; presence sets flag bit 0b0000_1000.
pub characteristics: Option<&'a str>,
}
/// Writes a WebVTT header SEI message: the SEI header (payload type and size)
/// followed by `HEADER_GUID`, the latency in milliseconds (big-endian `u16`),
/// the send frequency, the track count and one record per track.
///
/// The message body is produced twice: first into a `CountingSink` to learn
/// its size for the SEI header, then into `writer`.
///
/// # Errors
///
/// Returns `ErrorKind::InvalidInput`, before anything is written, if
/// `max_latency_to_video` exceeds `u16::MAX` milliseconds or if there are more
/// than 255 tracks. Errors of `writer` are passed through.
pub(crate) fn write_webvtt_header<W: Write + ?Sized>(
    writer: &mut W,
    max_latency_to_video: Duration,
    send_frequency_hz: u8,
    subtitle_tracks: &[WebvttTrack],
) -> std::io::Result<()> {
    /// Serializes the message body (everything after the SEI header).
    fn inner<W: ?Sized + Write>(
        writer: &mut W,
        max_latency_ms: u16,
        send_frequency_hz: u8,
        track_count: u8,
        subtitle_tracks: &[WebvttTrack],
    ) -> std::io::Result<()> {
        writer.write_all(HEADER_GUID.as_bytes())?;
        writer.write_u16::<BigEndian>(max_latency_ms)?;
        writer.write_u8(send_frequency_hz)?;
        writer.write_u8(track_count)?;
        for track in subtitle_tracks {
            let mut flags: u8 = 0;
            if track.default {
                flags |= 0b1000_0000;
            }
            if track.autoselect {
                flags |= 0b0100_0000;
            }
            if track.forced {
                flags |= 0b0010_0000;
            }
            if track.assoc_language.is_some() {
                flags |= 0b0001_0000;
            }
            if track.characteristics.is_some() {
                flags |= 0b0000_1000;
            }
            writer.write_u8(flags)?;
            writer.write_c_str(track.name)?;
            writer.write_c_str(track.language)?;
            if let Some(assoc_language) = track.assoc_language {
                writer.write_c_str(assoc_language)?;
            }
            if let Some(characteristics) = track.characteristics {
                writer.write_c_str(characteristics)?;
            }
        }
        Ok(())
    }

    // Validate the range-limited fields up front so that out-of-range input is
    // reported as an error instead of a panic, and nothing is written for it.
    let max_latency_ms = u16::try_from(max_latency_to_video.as_millis()).map_err(|_| {
        std::io::Error::new(
            std::io::ErrorKind::InvalidInput,
            "max_latency_to_video does not fit into 16 bits of milliseconds",
        )
    })?;
    let track_count = u8::try_from(subtitle_tracks.len()).map_err(|_| {
        std::io::Error::new(
            std::io::ErrorKind::InvalidInput,
            "more than 255 subtitle tracks",
        )
    })?;

    let mut count = CountingSink::new();
    inner(
        &mut count,
        max_latency_ms,
        send_frequency_hz,
        track_count,
        subtitle_tracks,
    )?;
    write_sei_header(writer, USER_DATA_UNREGISTERED, count.count())?;
    inner(
        writer,
        max_latency_ms,
        send_frequency_hz,
        track_count,
        subtitle_tracks,
    )
}
/// Writes a WebVTT payload SEI message: the SEI header (payload type and size)
/// followed by `PAYLOAD_GUID`, the track index, the chunk number (big-endian
/// `u64`), the chunk version, the video offset in milliseconds (big-endian
/// `u16`) and the NUL-terminated WebVTT text.
///
/// The message body is produced twice: first into a `CountingSink` to learn
/// its size for the SEI header, then into `writer`.
///
/// # Errors
///
/// Returns `ErrorKind::InvalidInput`, before anything is written, if
/// `video_offset` exceeds `u16::MAX` milliseconds. Errors of `writer` are
/// passed through.
pub(crate) fn write_webvtt_payload<W: Write + ?Sized>(
    writer: &mut W,
    track_index: u8,
    chunk_number: u64,
    chunk_version: u8,
    video_offset: Duration,
    webvtt_payload: &str, // TODO: replace with string type that checks for interior NULs
) -> std::io::Result<()> {
    /// Serializes the message body (everything after the SEI header).
    fn inner<W: ?Sized + Write>(
        writer: &mut W,
        track_index: u8,
        chunk_number: u64,
        chunk_version: u8,
        video_offset_ms: u16,
        webvtt_payload: &str,
    ) -> std::io::Result<()> {
        writer.write_all(PAYLOAD_GUID.as_bytes())?;
        writer.write_u8(track_index)?;
        writer.write_u64::<BigEndian>(chunk_number)?;
        writer.write_u8(chunk_version)?;
        writer.write_u16::<BigEndian>(video_offset_ms)?;
        writer.write_c_str(webvtt_payload)?;
        Ok(())
    }

    // Report an unrepresentable offset as an error instead of panicking, and
    // do so before any byte reaches `writer`.
    let video_offset_ms = u16::try_from(video_offset.as_millis()).map_err(|_| {
        std::io::Error::new(
            std::io::ErrorKind::InvalidInput,
            "video_offset does not fit into 16 bits of milliseconds",
        )
    })?;

    let mut count = CountingSink::new();
    inner(
        &mut count,
        track_index,
        chunk_number,
        chunk_version,
        video_offset_ms,
        webvtt_payload,
    )?;
    write_sei_header(writer, USER_DATA_UNREGISTERED, count.count())?;
    inner(
        writer,
        track_index,
        chunk_number,
        chunk_version,
        video_offset_ms,
        webvtt_payload,
    )
}
/// Sink for WebVTT-in-video-stream messages.
pub trait WebvttWrite {
/// Writes the header message announcing the latency, the send frequency
/// and the available subtitle tracks.
fn write_webvtt_header(
&mut self,
max_latency_to_video: Duration,
send_frequency_hz: u8,
subtitle_tracks: &[WebvttTrack],
) -> std::io::Result<()>;
/// Writes one chunk of WebVTT text for the track `track_index`.
fn write_webvtt_payload(
&mut self,
track_index: u8,
chunk_number: u64,
chunk_version: u8,
video_offset: Duration,
webvtt_payload: &str, // TODO: replace with string type that checks for interior NULs
) -> std::io::Result<()>;
}

View File

@@ -0,0 +1,8 @@
[package]
edition = "2021"
name = "webvtt-in-video-stream"
version = "0.1.0"
[dependencies]
thiserror = "2.0.4"
video-bytestream-tools = {path = "../video-bytestream-tools"}

View File

@@ -0,0 +1,277 @@
use std::{collections::VecDeque, sync::Mutex, time::Duration};
use video_bytestream_tools::webvtt::WebvttWrite;
/// Collects the settings and tracks from which a `WebvttMuxer` is created.
pub struct WebvttMuxerBuilder {
// Passed to the header message as the maximum latency to video and added to
// the due time of every chunk.
latency_to_video: Duration,
// Number of chunks per second; also written into the header message.
send_frequency_hz: u8,
// Duration of one video frame; twice this value is the slack used when
// deciding whether the next chunk is due.
video_frame_time: Duration,
tracks: Vec<WebvttMuxerTrack>,
}
// One subtitle track: its pending cues plus the metadata sent in the header.
struct WebvttMuxerTrack {
// Pending cues; `WebvttMuxer::add_cue` keeps them ordered by start time.
cues: VecDeque<WebvttCue>,
default: bool,
autoselect: bool,
forced: bool,
name: String,
language: String,
assoc_language: Option<String>,
characteristics: Option<String>,
}
/// Turns queued cues into WebVTT chunks; the mutable state sits behind a mutex
/// so the methods can take `&self`.
pub struct WebvttMuxer {
latency_to_video: Duration,
send_frequency_hz: u8,
video_frame_time: Duration,
inner: Mutex<WebvttMuxerInner>,
}
// Mutable muxer state guarded by `WebvttMuxer::inner`.
struct WebvttMuxerInner {
tracks: Vec<WebvttMuxerTrack>,
// Scratch buffer reused for the text of every chunk.
webvtt_buffer: String,
// Number of the chunk that will be written next.
next_chunk_number: u64,
// Timestamp of the first video frame seen; `None` until the first mux call.
first_video_timestamp: Option<Duration>,
}
// TODO: this should probably be moved into video-bytestream-tools instead
/// A string checked by `from_string` to contain no NUL bytes.
pub struct WebvttString(String);
// A single cue: start time, duration and text.
struct WebvttCue {
start_time: Duration,
duration: Duration,
text: WebvttString,
}
/// Error of `WebvttString::from_string`; hands the rejected string back.
pub struct NulError {
pub string: String,
// Byte offset of the first NUL in `string`.
pub nul_position: usize,
}
impl WebvttString {
    /// Wraps `string` in a `WebvttString` after checking it for NUL bytes.
    ///
    /// The WebVTT-in-SEI wire format terminates strings with NUL, so a string
    /// containing one cannot be represented.
    ///
    /// # Errors
    ///
    /// Returns a `NulError` carrying the original string and the byte position
    /// of its first NUL if the string contains any NUL byte.
    pub fn from_string(string: String) -> Result<Self, NulError> {
        match string.find('\0') {
            Some(nul_position) => Err(NulError {
                string,
                nul_position,
            }),
            None => Ok(WebvttString(string)),
        }
    }
}
/// Error of `WebvttMuxerBuilder::add_track` when the builder already holds 255
/// tracks; hands the string arguments of the rejected track back to the caller.
pub struct TooManySubtitleTracksError {
pub name: WebvttString,
pub language: WebvttString,
pub assoc_language: Option<WebvttString>,
pub characteristics: Option<WebvttString>,
}
impl WebvttMuxerBuilder {
pub fn new(
latency_to_video: Duration,
send_frequency_hz: u8,
video_frame_time: Duration,
) -> Self {
Self {
latency_to_video,
send_frequency_hz,
video_frame_time,
tracks: vec![],
}
}
// FIXME: split these arguments somehow?
#[allow(clippy::too_many_arguments)]
pub fn add_track(
&mut self,
default: bool,
autoselect: bool,
forced: bool,
name: WebvttString,
language: WebvttString,
assoc_language: Option<WebvttString>,
characteristics: Option<WebvttString>,
) -> Result<&mut Self, TooManySubtitleTracksError> {
if self.tracks.len() == 0xff {
return Err(TooManySubtitleTracksError {
name,
language,
assoc_language,
characteristics,
});
}
self.tracks.push(WebvttMuxerTrack {
cues: VecDeque::new(),
default,
autoselect,
forced,
name: name.0,
language: language.0,
assoc_language: assoc_language.map(|a| a.0),
characteristics: characteristics.map(|c| c.0),
});
Ok(self)
}
pub fn create_muxer(self) -> WebvttMuxer {
WebvttMuxer {
latency_to_video: self.latency_to_video,
send_frequency_hz: self.send_frequency_hz,
video_frame_time: self.video_frame_time,
inner: Mutex::new(WebvttMuxerInner {
tracks: self.tracks,
webvtt_buffer: String::new(),
next_chunk_number: 0,
first_video_timestamp: None,
}),
}
}
}
/// Error of `WebvttMuxer::add_cue`; carries the track index that does not exist.
pub struct InvalidWebvttTrack(pub u8);
impl WebvttMuxer {
    /// Queues a cue on `track`, keeping the track's cues ordered by start time.
    ///
    /// A cue is placed in front of the first queued cue that starts strictly
    /// later, so cues with equal start times keep their insertion order.
    ///
    /// # Errors
    ///
    /// Returns `InvalidWebvttTrack` if `track` is not a valid track index.
    pub fn add_cue(
        &self,
        track: u8,
        start_time: Duration,
        duration: Duration,
        text: WebvttString,
    ) -> Result<(), InvalidWebvttTrack> {
        let mut inner = self.inner.lock().unwrap();
        let tracks = &mut inner.tracks;
        let track = tracks
            .get_mut(usize::from(track))
            .ok_or(InvalidWebvttTrack(track))?;
        let cues = &mut track.cues;
        let index = cues
            .iter()
            .position(|c| c.start_time > start_time)
            .unwrap_or(cues.len());
        cues.insert(
            index,
            WebvttCue {
                start_time,
                duration,
                text,
            },
        );
        Ok(())
    }

    /// Renders the cues overlapping `[timestamp, timestamp + duration]` into
    /// `buffer` as WebVTT cue blocks and returns the rendered text.
    ///
    /// Cues that ended before `timestamp` are dropped from `cues`. The
    /// remaining cues have their times clamped to the chunk's time range.
    fn consume_cues_into_chunk<'a>(
        cues: &mut VecDeque<WebvttCue>,
        timestamp: Duration,
        duration: Duration,
        buffer: &'a mut String,
    ) -> &'a str {
        let chunk_end = timestamp + duration;

        // The queue is ordered by start time, not by end time, so an expired
        // cue can sit behind a long-running one. Removing expired cues only
        // from the front would leave such a cue in place and render it with an
        // end time before its start time.
        cues.retain(|cue| cue.start_time + cue.duration >= timestamp);

        buffer.clear();
        for cue in cues.iter() {
            if cue.start_time > chunk_end {
                // Ordered by start time: every later cue starts after the chunk too.
                break;
            }
            let cue_start = cue.start_time.max(timestamp);
            let cue_end = (cue.start_time + cue.duration).min(chunk_end);
            buffer.push_str(&format!(
                "{:0>2}:{:0>2}:{:0>2}.{:0>3} --> {:0>2}:{:0>2}:{:0>2}.{:0>3}\n{}\n\n",
                cue_start.as_secs() / 3600,
                cue_start.as_secs() % 3600 / 60,
                cue_start.as_secs() % 60,
                cue_start.as_millis() % 1000,
                cue_end.as_secs() / 3600,
                cue_end.as_secs() % 3600 / 60,
                cue_end.as_secs() % 60,
                cue_end.as_millis() % 1000,
                cue.text.0
            ))
        }
        buffer.as_str()
    }

    /// Writes the WebVTT data that is due for the video frame at
    /// `video_timestamp` into `writer`.
    ///
    /// If `add_header` is set, the header message is written first. One chunk
    /// per track is written when the next chunk's due time (first video
    /// timestamp + latency + chunk time) is at most two frame times ahead of
    /// `video_timestamp`; at most one chunk number is consumed per call.
    ///
    /// Returns `Ok(true)` if anything was written (header and/or chunks).
    ///
    /// # Errors
    ///
    /// Returns `ErrorKind::InvalidInput` if the muxer was configured with a
    /// send frequency of 0. Errors of `writer` are passed through; in that case
    /// the chunk number is not advanced.
    pub fn try_mux_into_bytestream(
        &self,
        video_timestamp: Duration,
        add_header: bool,
        writer: &mut impl WebvttWrite,
    ) -> std::io::Result<bool> {
        // A frequency of 0 has no finite chunk duration; computing it would panic.
        if self.send_frequency_hz == 0 {
            return Err(std::io::Error::new(
                std::io::ErrorKind::InvalidInput,
                "send_frequency_hz must not be 0",
            ));
        }

        let mut inner = self.inner.lock().unwrap();
        let WebvttMuxerInner {
            tracks,
            webvtt_buffer,
            next_chunk_number,
            first_video_timestamp,
        } = &mut *inner;
        if add_header {
            // TODO: cache this? forward iter instead?
            let webvtt_tracks = tracks
                .iter()
                .map(|track| video_bytestream_tools::webvtt::WebvttTrack {
                    default: track.default,
                    autoselect: track.autoselect,
                    forced: track.forced,
                    language: &track.language,
                    name: &track.name,
                    assoc_language: track.assoc_language.as_deref(),
                    characteristics: track.characteristics.as_deref(),
                })
                .collect::<Vec<_>>();
            writer.write_webvtt_header(
                self.latency_to_video,
                self.send_frequency_hz,
                &webvtt_tracks,
            )?;
        }
        let duration_between_sends =
            Duration::from_secs_f64(1. / f64::from(self.send_frequency_hz));
        // The first frame ever seen anchors the WebVTT timeline.
        let first_video_timestamp = *first_video_timestamp.get_or_insert(video_timestamp);
        let next_chunk_webvtt_timestamp =
            u32::try_from(*next_chunk_number).unwrap() * duration_between_sends;
        let next_chunk_video_timestamp =
            first_video_timestamp + self.latency_to_video + next_chunk_webvtt_timestamp;
        if next_chunk_video_timestamp > video_timestamp + self.video_frame_time * 2 {
            return Ok(add_header);
        }
        let chunk_number = *next_chunk_number;
        // With a latency below two frame times the chunk's own timestamp can
        // lie slightly after `video_timestamp`. A plain `Duration` subtraction
        // would panic in that case, so clamp the offset at zero.
        let video_offset =
            video_timestamp.saturating_sub(first_video_timestamp + next_chunk_webvtt_timestamp);
        // TODO: return an error type that allows skipping chunks if the writer fails?
        for (track_index, track) in tracks.iter_mut().enumerate() {
            let webvtt_payload = Self::consume_cues_into_chunk(
                &mut track.cues,
                next_chunk_webvtt_timestamp,
                duration_between_sends,
                webvtt_buffer,
            );
            writer.write_webvtt_payload(
                u8::try_from(track_index).unwrap(),
                chunk_number,
                0,
                video_offset,
                webvtt_payload,
            )?;
        }
        *next_chunk_number += 1;
        Ok(true)
    }
}

View File

@@ -28,10 +28,12 @@ MODULE_EXPORT const char *obs_module_description(void)
}
extern struct obs_source_info transcription_filter_info;
extern void load_packet_callback_functions();
/* Module entry point: registers the transcription filter source, calls
 * load_packet_callback_functions(), logs the plugin version and reports
 * success. */
bool obs_module_load(void)
{
obs_register_source(&transcription_filter_info);
load_packet_callback_functions();
obs_log(LOG_INFO, "plugin loaded successfully (version %s)", PLUGIN_VERSION);
return true;
}

View File

@@ -3,6 +3,7 @@
#endif
#include <obs.h>
#include <obs.hpp>
#include <obs-frontend-api.h>
#include <curl/curl.h>
@@ -18,6 +19,7 @@
#include "transcription-utils.h"
#include "translation/translation.h"
#include "translation/translation-includes.h"
#include "whisper-utils/whisper-language.h"
#include "whisper-utils/whisper-utils.h"
#include "whisper-utils/whisper-model-utils.h"
#include "translation/language_codes.h"
@@ -200,8 +202,10 @@ void send_translated_sentence_to_file(struct transcription_filter_data *gf,
// add a postfix to the file name (without extension) with the translation target language
std::string translated_file_path = "";
std::string output_file_path = gf->output_file_path;
std::string file_extension =
output_file_path.substr(output_file_path.find_last_of(".") + 1);
auto point_pos = output_file_path.find_last_of(".");
std::string file_extension = point_pos != output_file_path.npos
? output_file_path.substr(point_pos + 1)
: "";
std::string file_name =
output_file_path.substr(0, output_file_path.find_last_of("."));
translated_file_path = file_name + "_" + target_lang + "." + file_extension;
@@ -229,7 +233,43 @@ void send_caption_to_stream(DetectionResultWithText result, const std::string &s
}
}
void set_text_callback(struct transcription_filter_data *gf,
#ifdef ENABLE_WEBVTT
/**
 * @brief Queue a caption as a WebVTT cue on every active output that carries a
 *        track for the caption's language.
 *
 * The cue ends at @p possible_end_ts_ms and lasts as long as the detection
 * result. Times handed to the muxer are relative to the output's start; a
 * caption that began before the output started is trimmed to the output's
 * start, and one that ended before the output started is skipped.
 *
 * @param possible_end_ts_ms end of the caption in milliseconds (assumed to
 *        share the time base of webvtt_output::start_timestamp_ms)
 * @param result detection result providing language and start/end timestamps
 * @param str_copy caption text
 * @param gf filter state holding the active outputs
 */
void send_caption_to_webvtt(uint64_t possible_end_ts_ms, DetectionResultWithText result,
			    const std::string &str_copy, transcription_filter_data &gf)
{
	// Unsigned arithmetic: guard both subtractions against wrapping around.
	const uint64_t caption_duration =
		result.end_timestamp_ms >= result.start_timestamp_ms
			? result.end_timestamp_ms - result.start_timestamp_ms
			: 0;
	const uint64_t caption_start_ts =
		possible_end_ts_ms >= caption_duration ? possible_end_ts_ms - caption_duration : 0;

	auto lock = std::unique_lock(gf.active_outputs_mutex);
	for (auto &output : gf.active_outputs) {
		if (!gf.webvtt_caption_to_recording &&
		    output.output_type == transcription_filter_data::webvtt_output_type::Recording)
			continue;
		if (!gf.webvtt_caption_to_stream &&
		    output.output_type == transcription_filter_data::webvtt_output_type::Streaming)
			continue;

		auto lang_to_track = output.language_to_track.find(result.language);
		if (lang_to_track == output.language_to_track.end())
			continue;

		// The caption ended before this output started: nothing to show.
		// Without this check the trimmed duration below would wrap around.
		if (possible_end_ts_ms < output.start_timestamp_ms)
			continue;

		// Trim a caption that began before the output started.
		const uint64_t segment_start_ts = caption_start_ts < output.start_timestamp_ms
							  ? output.start_timestamp_ms
							  : caption_start_ts;
		const uint64_t duration = possible_end_ts_ms - segment_start_ts;

		for (auto &muxer : output.webvtt_muxer) {
			if (!muxer)
				continue;

			webvtt_muxer_add_cue(muxer.get(), lang_to_track->second,
					     segment_start_ts - output.start_timestamp_ms, duration,
					     str_copy.c_str());
		}
	}
}
#endif
void set_text_callback(uint64_t possible_end_ts, struct transcription_filter_data *gf,
const DetectionResultWithText &resultIn)
{
DetectionResultWithText result = resultIn;
@@ -263,6 +303,11 @@ void set_text_callback(struct transcription_filter_data *gf,
}
}
#ifdef ENABLE_WEBVTT
if (result.result == DETECTION_RESULT_SPEECH)
send_caption_to_webvtt(possible_end_ts, result, str_copy, *gf);
#endif
bool should_translate_local =
gf->translate_only_full_sentences ? result.result == DETECTION_RESULT_SPEECH : true;
@@ -303,7 +348,21 @@ void set_text_callback(struct transcription_filter_data *gf,
if (should_translate_cloud) {
send_sentence_to_cloud_translation_async(
str_copy, gf, result.language,
[gf, result](const std::string &translated_sentence_cloud) {
[gf, result,
possible_end_ts](const std::string &translated_sentence_cloud) {
#ifdef ENABLE_WEBVTT
if (result.result == DETECTION_RESULT_SPEECH) {
auto target_lang = language_codes_to_whisper.find(
gf->translate_cloud_target_language);
if (target_lang != language_codes_to_whisper.end()) {
auto res_copy = result;
res_copy.language = target_lang->second;
send_caption_to_webvtt(possible_end_ts, res_copy,
translated_sentence_cloud,
*gf);
}
}
#endif
if (gf->translate_cloud_output != "none") {
send_caption_to_source(gf->translate_cloud_output,
translated_sentence_cloud, gf);
@@ -335,6 +394,18 @@ void set_text_callback(struct transcription_filter_data *gf,
}
}
#ifdef ENABLE_WEBVTT
if (should_translate_local && result.result == DETECTION_RESULT_SPEECH) {
auto target_lang = language_codes_to_whisper.find(gf->target_lang);
if (target_lang != language_codes_to_whisper.end()) {
auto res_copy = result;
res_copy.language = target_lang->second;
send_caption_to_webvtt(possible_end_ts, res_copy, translated_sentence_local,
*gf);
}
}
#endif
if (gf->caption_to_stream && result.result == DETECTION_RESULT_SPEECH) {
// TODO: add support for partial transcriptions
send_caption_to_stream(result, str_copy, gf);
@@ -361,6 +432,155 @@ void set_text_callback(struct transcription_filter_data *gf,
}
};
#ifdef ENABLE_WEBVTT
/**
 * @brief Packet callback: appends WebVTT data to encoded video packets of a
 *        registered output.
 *
 * On the first packet of an output, one WebVTT muxer is created per H.264
 * video encoder, with one track per entry of gf.active_languages. For every
 * later video packet the muxer of the packet's track is asked for data; if it
 * returns a buffer, the packet's data is replaced by a copy of the original
 * data followed by that buffer.
 *
 * @param output output the packet belongs to
 * @param pkt encoded packet; modified in place when WebVTT data is appended
 * @param pkt_time timing information; only cts is used
 * @param param the transcription_filter_data passed at registration
 */
void output_packet_added_callback(obs_output_t *output, struct encoder_packet *pkt,
				  struct encoder_packet_time *pkt_time, void *param)
{
	if (!pkt || !pkt_time)
		return;

	if (pkt->type != OBS_ENCODER_VIDEO)
		return;

	if (pkt->track_idx >= MAX_OUTPUT_VIDEO_ENCODERS)
		return;

	auto &gf = *static_cast<transcription_filter_data *>(param);

	auto lock = std::unique_lock(gf.active_outputs_mutex);
	auto it = std::find_if(gf.active_outputs.begin(), gf.active_outputs.end(), [&](auto &val) {
		return obs_weak_output_references_output(val.output, output);
	});
	if (it == gf.active_outputs.end())
		return;

	if (!it->initialized) {
		it->initialized = true;

		auto settings_lock = std::unique_lock(gf.webvtt_settings_mutex);
		for (size_t i = 0; i < MAX_OUTPUT_VIDEO_ENCODERS; i++) {
			auto encoder = obs_output_get_video_encoder2(output, i);
			if (!encoder)
				continue;

			// Only H.264 gets a muxer; every other codec (including
			// av1 and hevc) is skipped.
			const char *codec = obs_encoder_get_codec(encoder);
			if (strcmp(codec, "h264") != 0)
				continue;
			it->codec_flavor[i] = H264AnnexB;

			auto video = obs_encoder_video(encoder);
			auto voi = video_output_get_info(video);

			auto muxer_builder = webvtt_create_muxer_builder(
				gf.latency_to_video_in_msecs, gf.send_frequency_hz,
				util_mul_div64(1000000000ULL, voi->fps_den, voi->fps_num));
			uint8_t track_index = 0;
			// FIXME: this may be too lazy, i.e. languages should probably be locked in the signal handler instead
			for (auto &lang : gf.active_languages) {
				auto lang_it = whisper_available_lang_reverse.find(lang);
				// Compare against end() of the map that was searched;
				// an iterator of another container is never a valid
				// "not found" marker.
				if (lang_it == whisper_available_lang_reverse.end()) {
					obs_log(LOG_WARNING,
						"requested language '%s' unknown, track not added",
						lang.c_str());
					continue;
				}
				webvtt_muxer_builder_add_track(muxer_builder, false, false, false,
							       lang_it->second.c_str(),
							       lang.c_str(), nullptr, nullptr);
				it->language_to_track[lang] = track_index++;
			}
			it->webvtt_muxer[i].reset(webvtt_muxer_builder_create_muxer(muxer_builder));
		}
	}

	auto &muxer = it->webvtt_muxer[pkt->track_idx];
	if (!muxer)
		return;

	std::unique_ptr<WebvttBuffer, webvtt_buffer_deleter> buffer{
		webvtt_muxer_try_mux_into_bytestream(muxer.get(), pkt_time->cts, pkt->keyframe,
						     it->codec_flavor[pkt->track_idx])};

	if (!buffer)
		return;

	// The new allocation starts with a reference counter of 1, followed by
	// the packet bytes; pkt->data points just past the counter.
	long ref = 1;

	DARRAY(uint8_t) out_data;
	da_init(out_data);
	da_reserve(out_data, sizeof(ref) + pkt->size + webvtt_buffer_length(buffer.get()));

	// Copy the original packet
	da_push_back_array(out_data, (uint8_t *)&ref, sizeof(ref));
	da_push_back_array(out_data, pkt->data, pkt->size);
	da_push_back_array(out_data, webvtt_buffer_data(buffer.get()),
			   webvtt_buffer_length(buffer.get()));

	// Release the old data, then restore the packet's other fields from the
	// copy before pointing it at the new data.
	auto old_pkt = *pkt;
	obs_encoder_packet_release(pkt);
	*pkt = old_pkt;
	pkt->data = (uint8_t *)out_data.array + sizeof(ref);
	pkt->size = out_data.num - sizeof(ref);
}
/**
 * @brief Start feeding WebVTT data into @p output.
 *
 * Does nothing if the packet-callback registration function is unavailable or
 * if WebVTT captions are disabled for this kind of output. Otherwise the
 * output is recorded in gf.active_outputs with the current time as its start
 * and the packet callback is registered on it.
 */
void add_webvtt_output(transcription_filter_data &gf, obs_output_t *output,
		       transcription_filter_data::webvtt_output_type output_type)
{
	using output_kind = transcription_filter_data::webvtt_output_type;

	if (!obs_output_add_packet_callback_)
		return;

	const bool recording_disabled = output_type == output_kind::Recording &&
					!gf.webvtt_caption_to_recording;
	const bool streaming_disabled = output_type == output_kind::Streaming &&
					!gf.webvtt_caption_to_stream;
	if (recording_disabled || streaming_disabled)
		return;

	// Taken before waiting for the lock.
	const auto start_ms = now_ms();

	auto lock = std::unique_lock(gf.active_outputs_mutex);
	auto &entry = gf.active_outputs.emplace_back();
	entry.output = obs_output_get_weak_output(output);
	entry.output_type = output_type;
	entry.start_timestamp_ms = start_ms;

	obs_output_add_packet_callback_(output, output_packet_added_callback, &gf);
}
/**
 * @brief Stop feeding WebVTT data into @p output.
 *
 * Unregisters the packet callback from the first active entry that refers to
 * @p output and removes that entry. Does nothing if the unregistration
 * function is unavailable or no entry refers to @p output.
 */
void remove_webvtt_output(transcription_filter_data &gf, obs_output_t *output)
{
	if (!obs_output_remove_packet_callback_)
		return;

	auto lock = std::unique_lock(gf.active_outputs_mutex);
	const auto entry = std::find_if(gf.active_outputs.begin(), gf.active_outputs.end(),
					[output](auto &candidate) {
						return obs_weak_output_references_output(
							candidate.output, output);
					});
	if (entry == gf.active_outputs.end())
		return;

	obs_output_remove_packet_callback_(output, output_packet_added_callback, &gf);
	gf.active_outputs.erase(entry);
}
/**
 * @brief Unregister the packet callback from every active output that still exists.
 *
 * The unnamed lock parameter is not used in the body; presumably it documents
 * that the caller must hold gf.active_outputs_mutex -- confirm with callers.
 * The entries of gf.active_outputs are left in place.
 */
void remove_all_webvtt_outputs(std::unique_lock<std::mutex> & /*active_outputs_lock*/,
			       transcription_filter_data &gf)
{
	// Same guard as in remove_webvtt_output: the function pointer may be null.
	if (!obs_output_remove_packet_callback_)
		return;

	for (auto &output : gf.active_outputs) {
		auto obs_output = OBSOutputAutoRelease{obs_weak_output_get_output(output.output)};
		if (!obs_output)
			continue;

		obs_output_remove_packet_callback_(obs_output, output_packet_added_callback, &gf);
	}
}
#endif
/**
* @brief Callback function to handle recording state changes in OBS.
*
@@ -383,6 +603,10 @@ void recording_state_callback(enum obs_frontend_event event, void *data)
struct transcription_filter_data *gf_ =
static_cast<struct transcription_filter_data *>(data);
if (event == OBS_FRONTEND_EVENT_RECORDING_STARTING) {
#ifdef ENABLE_WEBVTT
add_webvtt_output(*gf_, OBSOutputAutoRelease{obs_frontend_get_recording_output()},
transcription_filter_data::webvtt_output_type::Recording);
#endif
if (gf_->save_srt && gf_->save_only_while_recording &&
gf_->output_file_path != "") {
obs_log(gf_->log_level, "Recording started. Resetting srt file.");
@@ -395,6 +619,11 @@ void recording_state_callback(enum obs_frontend_event event, void *data)
gf_->sentence_number = 1;
gf_->start_timestamp_ms = now_ms();
}
} else if (event == OBS_FRONTEND_EVENT_RECORDING_STOPPING) {
#ifdef ENABLE_WEBVTT
remove_webvtt_output(*gf_,
OBSOutputAutoRelease{obs_frontend_get_recording_output()});
#endif
} else if (event == OBS_FRONTEND_EVENT_RECORDING_STOPPED) {
if (!gf_->save_only_while_recording || !gf_->rename_file_to_match_recording) {
return;
@@ -428,6 +657,16 @@ void recording_state_callback(enum obs_frontend_event event, void *data)
newPath = recordingPath.parent_path() / newPath.filename();
fs::rename(outputPath, newPath);
} else if (event == OBS_FRONTEND_EVENT_STREAMING_STARTING) {
#ifdef ENABLE_WEBVTT
add_webvtt_output(*gf_, OBSOutputAutoRelease{obs_frontend_get_streaming_output()},
transcription_filter_data::webvtt_output_type::Streaming);
#endif
} else if (event == OBS_FRONTEND_EVENT_STREAMING_STOPPING) {
#ifdef ENABLE_WEBVTT
remove_webvtt_output(*gf_,
OBSOutputAutoRelease{obs_frontend_get_streaming_output()});
#endif
}
}
@@ -462,10 +701,8 @@ void reset_caption_state(transcription_filter_data *gf_)
if (gf_->info_buffer.data != nullptr) {
circlebuf_free(&gf_->info_buffer);
}
if (gf_->whisper_buffer.data != nullptr) {
circlebuf_free(&gf_->whisper_buffer);
}
}
gf_->clear_buffers = true;
}
void media_play_callback(void *data_, calldata_t *cd)

View File

@@ -1,6 +1,11 @@
#ifndef TRANSCRIPTION_FILTER_DATA_H
#define TRANSCRIPTION_FILTER_DATA_H
#ifdef ENABLE_WEBVTT
#include <obs.hpp>
#include <webvtt-in-sei.h>
#endif
#include <util/circlebuf.h>
#include <util/darray.h>
#include <media-io/audio-resampler.h>
@@ -22,6 +27,68 @@
#include "translation/cloud-translation/translation-cloud.h"
#define MAX_PREPROC_CHANNELS 10
#define MAX_WEBVTT_TRACKS 5
#if !defined(LIBOBS_MAJOR_VERSION) || LIBOBS_MAJOR_VERSION < 31
/* Timing information accompanying an encoded packet. This local definition is
 * only compiled when LIBOBS_MAJOR_VERSION is undefined or below 31. */
struct encoder_packet_time {
/* PTS used to associate uncompressed frames with encoded packets. */
int64_t pts;
/* Composition timestamp is when the frame was rendered,
 * captured via os_gettime_ns().
 */
uint64_t cts;
/* FER (Frame Encode Request) is when the frame was
 * submitted to the encoder for encoding via the encode
 * callback (e.g. encode_texture2()), captured via os_gettime_ns().
 */
uint64_t fer;
/* FERC (Frame Encode Request Complete) is when
 * the associated FER event completed. If the encode
 * is synchronous with the call, this means FERC - FER
 * measures the actual encode time, otherwise if the
 * encode is asynchronous, it measures the pipeline
 * delay between encode request and encode complete.
 * FERC is also captured via os_gettime_ns().
 */
uint64_t ferc;
/* PIR (Packet Interleave Request) is when the encoded packet
 * is interleaved with the stream. PIR is captured via
 * os_gettime_ns(). The difference between PIR and CTS gives
 * the total latency between frame rendering
 * and packet interleaving.
 */
uint64_t pir;
};
#endif
// Signature of the function used to register a packet callback on an output.
using obs_output_add_packet_callback_t =
void(obs_output_t *output,
void (*packet_cb)(obs_output_t *output, struct encoder_packet *pkt,
struct encoder_packet_time *pkt_time, void *param),
void *param);
// Signature of the function used to unregister a packet callback again.
using obs_output_remove_packet_callback_t =
void(obs_output_t *output,
void (*packet_cb)(obs_output_t *output, struct encoder_packet *pkt,
struct encoder_packet_time *pkt_time, void *param),
void *param);
// Pointers to the two functions; callers test them for null before use.
// NOTE(review): presumably assigned by load_packet_callback_functions() -- confirm.
extern obs_output_add_packet_callback_t *obs_output_add_packet_callback_;
extern obs_output_remove_packet_callback_t *obs_output_remove_packet_callback_;
extern "C" void load_packet_callback_functions();
#ifdef ENABLE_WEBVTT
// Deleter functor for std::unique_ptr<WebvttMuxer, ...>: releases the muxer
// through webvtt_muxer_free().
struct webvtt_muxer_deleter {
void operator()(WebvttMuxer *m) { webvtt_muxer_free(m); }
};
// Deleter functor for std::unique_ptr<WebvttBuffer, ...>: releases the buffer
// through webvtt_buffer_free().
struct webvtt_buffer_deleter {
void operator()(WebvttBuffer *b) { webvtt_buffer_free(b); }
};
#endif
struct transcription_filter_data {
obs_source_t *context; // obs filter source (this filter)
@@ -47,6 +114,7 @@ struct transcription_filter_data {
float *copy_buffers[MAX_PREPROC_CHANNELS];
struct circlebuf info_buffer;
struct circlebuf input_buffers[MAX_PREPROC_CHANNELS];
std::atomic<bool> clear_buffers;
struct circlebuf whisper_buffer;
/* Resampler */
@@ -138,6 +206,36 @@ struct transcription_filter_data {
TokenBufferSegmentation buffered_output_output_type =
TokenBufferSegmentation::SEGMENTATION_TOKEN;
#ifdef ENABLE_WEBVTT
// Kind of OBS output a webvtt_output entry refers to.
// NOTE(review): presumably selected by the "webvtt_caption_to_stream" /
// "webvtt_caption_to_recording" settings - confirm against the code that
// creates the entries.
enum struct webvtt_output_type {
Streaming,
Recording,
};
// Per-output WebVTT state; instances are stored in active_outputs.
struct webvtt_output {
// Weak reference to the OBS output this entry belongs to.
OBSWeakOutputAutoRelease output;
// NOTE(review): no default initializer - must be set when the entry is created.
webvtt_output_type output_type;
// NOTE(review): no default initializer; presumably the output's start time
// in milliseconds - confirm the clock source at the point of assignment.
uint64_t start_timestamp_ms;
bool initialized = false;
// Language string -> track number (assumed to index into the muxer's
// tracks - TODO confirm).
std::map<std::string, uint8_t> language_to_track;
// One muxer slot per possible video encoder of the output; each muxer is
// released via webvtt_muxer_free() by the deleter.
std::unique_ptr<WebvttMuxer, webvtt_muxer_deleter>
webvtt_muxer[MAX_OUTPUT_VIDEO_ENCODERS];
// Codec flavor per video encoder slot, value-initialized.
CodecFlavor codec_flavor[MAX_OUTPUT_VIDEO_ENCODERS] = {};
};
// Locked by transcription_filter_destroy() while it removes and clears
// active_outputs.
std::mutex active_outputs_mutex;
std::vector<webvtt_output> active_outputs;
// Held by transcription_filter_update() while it rewrites the three
// settings below.
std::mutex webvtt_settings_mutex;
// From setting "webvtt_latency_to_video_in_msecs", clamped to the uint16_t range.
uint16_t latency_to_video_in_msecs;
// From setting "webvtt_send_frequency_hz", clamped to [1, 255].
uint8_t send_frequency_hz;
// Non-empty, de-duplicated values of the "webvtt_language_<i>" settings,
// in slot order.
std::vector<std::string> active_languages;
// Mirror the boolean settings of the same names; written in
// transcription_filter_update() without taking webvtt_settings_mutex.
std::atomic<bool> webvtt_caption_to_stream;
std::atomic<bool> webvtt_caption_to_recording;
#endif
// ctor
transcription_filter_data() : whisper_buf_mutex(), whisper_ctx_mutex(), wshiper_thread_cv()
{
@@ -161,7 +259,8 @@ struct transcription_filter_audio_info {
};
// Callback sent when the transcription has a new result
void set_text_callback(struct transcription_filter_data *gf, const DetectionResultWithText &str);
void set_text_callback(uint64_t possible_end_ts, struct transcription_filter_data *gf,
const DetectionResultWithText &str);
void clear_current_caption(transcription_filter_data *gf_);
// Callback sent when the VAD finds an audio chunk. Sample rate = WHISPER_SAMPLE_RATE, channels = 1

View File

@@ -2,6 +2,7 @@
#include <obs.h>
#include <obs-module.h>
#include <obs-frontend-api.h>
#include <util/dstr.hpp>
#include "transcription-filter-data.h"
#include "transcription-filter.h"
@@ -411,6 +412,45 @@ void add_translation_group_properties(obs_properties_t *ppts)
MT_("translation_no_repeat_ngram_size"), 1, 10, 1);
}
#ifdef ENABLE_WEBVTT
/**
 * Adds the checkable "webvtt_enable" property group to ppts.
 *
 * The group contains two booleans (caption to stream / to recording), two
 * integer sliders (latency in milliseconds, send frequency in Hz) and
 * MAX_WEBVTT_TRACKS language drop-downs named "webvtt_language_<index>".
 *
 * @param ppts property set the group is attached to
 */
void add_webvtt_group_properties(obs_properties_t *ppts)
{
	obs_properties_t *const group = obs_properties_create();
	obs_properties_add_group(ppts, "webvtt_enable", MT_("webvtt_group"), OBS_GROUP_CHECKABLE,
				 group);

	obs_properties_add_bool(group, "webvtt_caption_to_stream",
				MT_("webvtt_caption_to_stream"));
	obs_properties_add_bool(group, "webvtt_caption_to_recording",
				MT_("webvtt_caption_to_recording"));

	// Slider upper bounds are the maxima of uint16_t and uint8_t respectively.
	const int max_latency_ms = std::numeric_limits<uint16_t>::max();
	const int max_send_hz = std::numeric_limits<uint8_t>::max();
	obs_properties_add_int_slider(group, "webvtt_latency_to_video_in_msecs",
				      MT_("webvtt_latency_to_video_in_msecs"), 0, max_latency_ms,
				      1);
	obs_properties_add_int_slider(group, "webvtt_send_frequency_hz",
				      MT_("webvtt_send_frequency_hz"), 1, max_send_hz, 1);

	// Scratch strings reused for every track to build its setting name and label.
	DStr track_number, setting_name, label;
	for (size_t track = 0; track != MAX_WEBVTT_TRACKS; ++track) {
		// The label shows a 1-based track number; the setting key is 0-based.
		dstr_printf(track_number, "%zu", track + 1);
		dstr_printf(setting_name, "webvtt_language_%zu", track);
		dstr_copy(label, MT_("webvtt_language_description"));
		dstr_replace(label, "$1", track_number->array);

		obs_property_t *const language_list = obs_properties_add_list(
			group, setting_name->array, label->array, OBS_COMBO_TYPE_LIST,
			OBS_COMBO_FORMAT_STRING);

		// An empty value stands for "no language selected" in this slot.
		obs_property_list_add_string(language_list, "None", "");
		for (const auto &[display_name, language_code] : whisper_available_lang_reverse) {
			// "auto" is not offered as a track language.
			if (language_code != "auto")
				obs_property_list_add_string(language_list, display_name.c_str(),
							     language_code.c_str());
		}
	}
}
#endif
void add_file_output_group_properties(obs_properties_t *ppts)
{
// create a file output group
@@ -617,6 +657,9 @@ obs_properties_t *transcription_filter_properties(void *data)
add_transcription_group_properties(ppts, gf);
add_translation_group_properties(ppts);
add_translation_cloud_group_properties(ppts);
#ifdef ENABLE_WEBVTT
add_webvtt_group_properties(ppts);
#endif
add_file_output_group_properties(ppts);
add_buffered_output_group_properties(ppts);
add_advanced_group_properties(ppts, gf);
@@ -698,6 +741,10 @@ void transcription_filter_defaults(obs_data_t *s)
"{\n\t\"text\":\"{{sentence}}\",\n\t\"target\":\"{{target_language}}\"\n}");
obs_data_set_default_string(s, "translate_cloud_response_json_path", "translations.0.text");
// webvtt options
obs_data_set_default_int(s, "webvtt_latency_to_video_in_msecs", 10'000);
obs_data_set_default_int(s, "webvtt_send_frequency_hz", 2);
// Whisper parameters
apply_whisper_params_defaults_on_settings(s);
}

View File

@@ -1,5 +1,7 @@
#include <obs-module.h>
#include <obs-frontend-api.h>
#include <util/dstr.hpp>
#include <util/platform.h>
#include <algorithm>
#include <fstream>
@@ -132,6 +134,12 @@ void transcription_filter_remove(void *data, obs_source_t *source)
disconnect_source_signals(gf, source);
}
#ifdef ENABLE_WEBVTT
void remove_all_webvtt_outputs(std::unique_lock<std::mutex> &active_outputs_lock,
transcription_filter_data &gf);
#endif
void transcription_filter_destroy(void *data)
{
struct transcription_filter_data *gf =
@@ -159,6 +167,14 @@ void transcription_filter_destroy(void *data)
circlebuf_free(&gf->resampled_buffer);
#ifdef ENABLE_WEBVTT
{
auto lock = std::unique_lock(gf->active_outputs_mutex);
remove_all_webvtt_outputs(lock, *gf);
gf->active_outputs.clear();
}
#endif
if (gf->captions_monitor.isEnabled()) {
gf->captions_monitor.stopThread();
}
@@ -179,6 +195,41 @@ void transcription_filter_update(void *data, obs_data_t *s)
gf->vad_mode = (int)obs_data_get_int(s, "vad_mode");
gf->log_words = obs_data_get_bool(s, "log_words");
gf->caption_to_stream = obs_data_get_bool(s, "caption_to_stream");
#ifdef ENABLE_WEBVTT
gf->webvtt_caption_to_stream = obs_data_get_bool(s, "webvtt_caption_to_stream");
gf->webvtt_caption_to_recording = obs_data_get_bool(s, "webvtt_caption_to_recording");
{
auto lock = std::unique_lock(gf->webvtt_settings_mutex);
gf->latency_to_video_in_msecs = static_cast<uint16_t>(std::max(
0ll, std::min(static_cast<long long>(std::numeric_limits<uint16_t>::max()),
obs_data_get_int(s, "webvtt_latency_to_video_in_msecs"))));
gf->send_frequency_hz = static_cast<uint8_t>(std::max(
1ll, std::min(static_cast<long long>(std::numeric_limits<uint8_t>::max()),
obs_data_get_int(s, "webvtt_send_frequency_hz"))));
gf->active_languages.clear();
DStr name_buffer;
for (size_t i = 0; i < MAX_WEBVTT_TRACKS; i++) {
dstr_printf(name_buffer, "webvtt_language_%zu", i);
if (!obs_data_has_user_value(s, name_buffer->array))
continue;
std::string lang = obs_data_get_string(s, name_buffer->array);
if (lang.empty())
continue;
if (std::find(gf->active_languages.begin(), gf->active_languages.end(),
lang) != gf->active_languages.end()) {
obs_log(LOG_WARNING, "Not adding duplicate language '%s'",
lang.c_str());
continue;
}
gf->active_languages.push_back(lang);
}
}
#endif
gf->save_to_file = obs_data_get_bool(s, "file_output_enable");
gf->save_srt = obs_data_get_bool(s, "subtitle_save_srt");
gf->truncate_output_file = obs_data_get_bool(s, "truncate_output_file");
@@ -557,3 +608,25 @@ void transcription_filter_hide(void *data)
static_cast<struct transcription_filter_data *>(data);
obs_log(gf->log_level, "filter hide");
}
obs_output_add_packet_callback_t *obs_output_add_packet_callback_ = nullptr;
obs_output_remove_packet_callback_t *obs_output_remove_packet_callback_ = nullptr;
void load_packet_callback_functions()
{
auto libobs = os_dlopen("obs");
if (!libobs)
return;
auto add_callback = os_dlsym(libobs, "obs_output_add_packet_callback");
auto remove_callback = os_dlsym(libobs, "obs_output_remove_packet_callback");
if (!add_callback || !remove_callback)
return;
obs_output_add_packet_callback_ =
reinterpret_cast<obs_output_add_packet_callback_t *>(add_callback);
obs_output_remove_packet_callback_ =
reinterpret_cast<obs_output_remove_packet_callback_t *>(remove_callback);
obs_log(LOG_INFO, "loaded callbacks");
}

View File

@@ -341,11 +341,13 @@ void run_inference_and_callbacks(transcription_filter_data *gf, uint64_t start_o
pcm32f_size * sizeof(float));
}
auto inference_start_ts = now_ms();
struct DetectionResultWithText inference_result =
run_whisper_inference(gf, pcm32f_data, pcm32f_size_with_silence, start_offset_ms,
end_offset_ms, vad_state);
// output inference result to a text source
set_text_callback(gf, inference_result);
set_text_callback(inference_start_ts, gf, inference_result);
if (gf->enable_audio_chunks_callback && vad_state != VAD_STATE_PARTIAL) {
audio_chunk_callback(gf, pcm32f_data, pcm32f_size_with_silence, vad_state,
@@ -386,6 +388,13 @@ void whisper_loop(void *data)
}
}
if (gf->clear_buffers) {
circlebuf_pop_front(&gf->resampled_buffer, nullptr, 0);
circlebuf_pop_front(&gf->whisper_buffer, nullptr, 0);
current_vad_state = {false, now_ms(), 0, 0};
gf->clear_buffers = false;
}
if (gf->vad_mode == VAD_MODE_HYBRID) {
current_vad_state = hybrid_vad_segmentation(gf, current_vad_state);
} else if (gf->vad_mode == VAD_MODE_ACTIVE) {