Files
local_transcription/utils.py
AtHeartEngineer a44ef06a41 rt
2024-03-14 22:29:13 -04:00

44 lines
1.5 KiB
Python

import numpy as np
from pyannote.core import Annotation, SlidingWindowFeature, SlidingWindow
def concat(chunks, collar=0.05):
"""
Concatenate predictions and audio
given a list of `(diarization, waveform)` pairs
and merge contiguous single-speaker regions
with pauses shorter than `collar` seconds.
"""
first_annotation = chunks[0][0]
first_waveform = chunks[0][1]
annotation = Annotation(uri=first_annotation.uri)
data = []
for ann, wav in chunks:
annotation.update(ann)
data.append(wav.data)
annotation = annotation.support(collar)
window = SlidingWindow(
first_waveform.sliding_window.duration,
first_waveform.sliding_window.step,
first_waveform.sliding_window.start,
)
data = np.concatenate(data, axis=0)
return annotation, SlidingWindowFeature(data, window)
def colorize_transcription(transcription):
"""
Unify a speaker-aware transcription represented as
a list of `(speaker: int, text: str)` pairs
into a single text colored by speakers.
"""
colors = 2 * [
"bright_red", "bright_blue", "bright_green", "orange3", "deep_pink1",
"yellow2", "magenta", "cyan", "bright_magenta", "dodger_blue2"
]
result = []
for speaker, text in transcription:
if speaker == -1:
# No speakerfound for this text, use default terminal color
result.append(text)
else:
result.append(f"[{colors[speaker]}]{text}")
return "\n".join(result)