import numpy as np
from pyannote.core import Annotation, SlidingWindowFeature, SlidingWindow


def concat(chunks, collar=0.05):
    """
    Concatenate predictions and audio given a list of
    `(diarization, waveform)` pairs and merge contiguous single-speaker
    regions with pauses shorter than `collar` seconds.

    Parameters
    ----------
    chunks : sequence of `(diarization, waveform)` pairs
        Each diarization is merged into a single annotation via
        `Annotation.update`; each waveform must expose `.data` and, for the
        first pair, `.sliding_window`.
        NOTE(review): presumably the chunks are consecutive in time and share
        the same sliding window resolution -- confirm against callers.
    collar : float, optional
        Value forwarded to `Annotation.support`. Defaults to 0.05.

    Returns
    -------
    tuple
        `(annotation, SlidingWindowFeature)` where the feature holds every
        chunk's data concatenated along axis 0.

    Raises
    ------
    IndexError
        If `chunks` is empty (the first pair is needed for the URI and the
        sliding window).
    """
    first_annotation = chunks[0][0]
    first_waveform = chunks[0][1]

    annotation = Annotation(uri=first_annotation.uri)
    data = []
    for ann, wav in chunks:
        annotation.update(ann)
        data.append(wav.data)
    annotation = annotation.support(collar)

    # The output window reuses the first chunk's resolution and start time;
    # the data of the following chunks is simply appended after it.
    first_window = first_waveform.sliding_window
    window = SlidingWindow(
        first_window.duration,
        first_window.step,
        first_window.start,
    )
    data = np.concatenate(data, axis=0)
    return annotation, SlidingWindowFeature(data, window)


def colorize_transcription(transcription):
    """
    Unify a speaker-aware transcription represented as a list of
    `(speaker: int, text: str)` pairs into a single text colored by speakers.

    Each pair becomes one output line. A speaker of `-1` means that no speaker
    was found for the text, which is then emitted without any color tag.
    Otherwise the text is prefixed with a `[color]` tag.
    NOTE(review): the tag format and color names look like Rich console
    markup -- confirm against the code that prints the result.

    Colors are assigned by speaker index and cycle through the palette, so any
    number of speakers is supported (speakers that are a multiple of the
    palette size apart share a color).

    Returns
    -------
    str
        The lines joined with newline characters (empty string for an empty
        transcription).
    """
    palette = (
        "bright_red",
        "bright_blue",
        "bright_green",
        "orange3",
        "deep_pink1",
        "yellow2",
        "magenta",
        "cyan",
        "bright_magenta",
        "dodger_blue2",
    )
    result = []
    for speaker, text in transcription:
        if speaker == -1:
            # No speaker found for this text, use default terminal color
            result.append(text)
        else:
            # Wrap around the palette instead of failing on large indices.
            color = palette[speaker % len(palette)]
            result.append(f"[{color}]{text}")
    return "\n".join(result)