From cdc31903c236b3345bbc59f759e9419b72ed98b8 Mon Sep 17 00:00:00 2001
From: Peter Steinberger
Date: Sat, 14 Feb 2026 15:03:40 +0000
Subject: [PATCH] refactor(media-understanding): share gemini inline-data helper

---
Reviewer notes (ignored by `git am`, not part of the commit message):
* Moves the duplicated Gemini `generateContent` request/response handling
  out of audio.ts and video.ts into generateGeminiInlineDataText() in the
  new inline-data.ts. Each caller now passes its own defaults (base URL,
  model, prompt, MIME type) and its two error strings to the helper.
* NOTE(review): this copy was rebuilt from a whitespace-collapsed paste in
  which angle-bracket spans had been stripped. The type arguments in
  `Promise<AudioTranscriptionResult>`, `Promise<VideoDescriptionResult>`
  and `Record<string, string>` were restored from the visible imports and
  usage, and indentation was re-created as 2-space Prettier style. The
  author e-mail on the `From:` header is still missing. Confirm against
  commit cdc31903c2 before applying.

 .../providers/google/audio.ts       | 93 +++--------------
 .../providers/google/inline-data.ts | 99 +++++++++++++++++++
 .../providers/google/video.ts       | 93 +++--------------
 3 files changed, 121 insertions(+), 164 deletions(-)
 create mode 100644 src/media-understanding/providers/google/inline-data.ts

diff --git a/src/media-understanding/providers/google/audio.ts b/src/media-understanding/providers/google/audio.ts
index e677a31366..5173ad3f09 100644
--- a/src/media-understanding/providers/google/audio.ts
+++ b/src/media-understanding/providers/google/audio.ts
@@ -1,92 +1,21 @@
 import type { AudioTranscriptionRequest, AudioTranscriptionResult } from "../../types.js";
-import { normalizeGoogleModelId } from "../../../agents/models-config.providers.js";
-import { fetchWithTimeoutGuarded, normalizeBaseUrl, readErrorResponse } from "../shared.js";
+import { generateGeminiInlineDataText } from "./inline-data.js";
 
 export const DEFAULT_GOOGLE_AUDIO_BASE_URL = "https://generativelanguage.googleapis.com/v1beta";
 const DEFAULT_GOOGLE_AUDIO_MODEL = "gemini-3-flash-preview";
 const DEFAULT_GOOGLE_AUDIO_PROMPT = "Transcribe the audio.";
 
-function resolveModel(model?: string): string {
-  const trimmed = model?.trim();
-  if (!trimmed) {
-    return DEFAULT_GOOGLE_AUDIO_MODEL;
-  }
-  return normalizeGoogleModelId(trimmed);
-}
-
-function resolvePrompt(prompt?: string): string {
-  const trimmed = prompt?.trim();
-  return trimmed || DEFAULT_GOOGLE_AUDIO_PROMPT;
-}
-
 export async function transcribeGeminiAudio(
   params: AudioTranscriptionRequest,
 ): Promise<AudioTranscriptionResult> {
-  const fetchFn = params.fetchFn ?? fetch;
-  const baseUrl = normalizeBaseUrl(params.baseUrl, DEFAULT_GOOGLE_AUDIO_BASE_URL);
-  const allowPrivate = Boolean(params.baseUrl?.trim());
-  const model = resolveModel(params.model);
-  const url = `${baseUrl}/models/${model}:generateContent`;
-
-  const headers = new Headers(params.headers);
-  if (!headers.has("content-type")) {
-    headers.set("content-type", "application/json");
-  }
-  if (!headers.has("x-goog-api-key")) {
-    headers.set("x-goog-api-key", params.apiKey);
-  }
-
-  const body = {
-    contents: [
-      {
-        role: "user",
-        parts: [
-          { text: resolvePrompt(params.prompt) },
-          {
-            inline_data: {
-              mime_type: params.mime ?? "audio/wav",
-              data: params.buffer.toString("base64"),
-            },
-          },
-        ],
-      },
-    ],
-  };
-
-  const { response: res, release } = await fetchWithTimeoutGuarded(
-    url,
-    {
-      method: "POST",
-      headers,
-      body: JSON.stringify(body),
-    },
-    params.timeoutMs,
-    fetchFn,
-    allowPrivate ? { ssrfPolicy: { allowPrivateNetwork: true } } : undefined,
-  );
-
-  try {
-    if (!res.ok) {
-      const detail = await readErrorResponse(res);
-      const suffix = detail ? `: ${detail}` : "";
-      throw new Error(`Audio transcription failed (HTTP ${res.status})${suffix}`);
-    }
-
-    const payload = (await res.json()) as {
-      candidates?: Array<{
-        content?: { parts?: Array<{ text?: string }> };
-      }>;
-    };
-    const parts = payload.candidates?.[0]?.content?.parts ?? [];
-    const text = parts
-      .map((part) => part?.text?.trim())
-      .filter(Boolean)
-      .join("\n");
-    if (!text) {
-      throw new Error("Audio transcription response missing text");
-    }
-    return { text, model };
-  } finally {
-    await release();
-  }
+  const { text, model } = await generateGeminiInlineDataText({
+    ...params,
+    defaultBaseUrl: DEFAULT_GOOGLE_AUDIO_BASE_URL,
+    defaultModel: DEFAULT_GOOGLE_AUDIO_MODEL,
+    defaultPrompt: DEFAULT_GOOGLE_AUDIO_PROMPT,
+    defaultMime: "audio/wav",
+    httpErrorLabel: "Audio transcription failed",
+    missingTextError: "Audio transcription response missing text",
+  });
+  return { text, model };
 }
diff --git a/src/media-understanding/providers/google/inline-data.ts b/src/media-understanding/providers/google/inline-data.ts
new file mode 100644
index 0000000000..4f5df896a0
--- /dev/null
+++ b/src/media-understanding/providers/google/inline-data.ts
@@ -0,0 +1,99 @@
+import { normalizeGoogleModelId } from "../../../agents/models-config.providers.js";
+import { fetchWithTimeoutGuarded, normalizeBaseUrl, readErrorResponse } from "../shared.js";
+
+export async function generateGeminiInlineDataText(params: {
+  buffer: Buffer;
+  mime?: string;
+  apiKey: string;
+  baseUrl?: string;
+  headers?: Record<string, string>;
+  model?: string;
+  prompt?: string;
+  timeoutMs: number;
+  fetchFn?: typeof fetch;
+  defaultBaseUrl: string;
+  defaultModel: string;
+  defaultPrompt: string;
+  defaultMime: string;
+  httpErrorLabel: string;
+  missingTextError: string;
+}): Promise<{ text: string; model: string }> {
+  const fetchFn = params.fetchFn ?? fetch;
+  const baseUrl = normalizeBaseUrl(params.baseUrl, params.defaultBaseUrl);
+  const allowPrivate = Boolean(params.baseUrl?.trim());
+  const model = (() => {
+    const trimmed = params.model?.trim();
+    if (!trimmed) {
+      return params.defaultModel;
+    }
+    return normalizeGoogleModelId(trimmed);
+  })();
+  const url = `${baseUrl}/models/${model}:generateContent`;
+
+  const headers = new Headers(params.headers);
+  if (!headers.has("content-type")) {
+    headers.set("content-type", "application/json");
+  }
+  if (!headers.has("x-goog-api-key")) {
+    headers.set("x-goog-api-key", params.apiKey);
+  }
+
+  const prompt = (() => {
+    const trimmed = params.prompt?.trim();
+    return trimmed || params.defaultPrompt;
+  })();
+
+  const body = {
+    contents: [
+      {
+        role: "user",
+        parts: [
+          { text: prompt },
+          {
+            inline_data: {
+              mime_type: params.mime ?? params.defaultMime,
+              data: params.buffer.toString("base64"),
+            },
+          },
+        ],
+      },
+    ],
+  };
+
+  const { response: res, release } = await fetchWithTimeoutGuarded(
+    url,
+    {
+      method: "POST",
+      headers,
+      body: JSON.stringify(body),
+    },
+    params.timeoutMs,
+    fetchFn,
+    allowPrivate ? { ssrfPolicy: { allowPrivateNetwork: true } } : undefined,
+  );
+
+  try {
+    if (!res.ok) {
+      const detail = await readErrorResponse(res);
+      const suffix = detail ? `: ${detail}` : "";
+      throw new Error(`${params.httpErrorLabel} (HTTP ${res.status})${suffix}`);
+    }
+
+    const payload = (await res.json()) as {
+      candidates?: Array<{
+        content?: { parts?: Array<{ text?: string }> };
+      }>;
+    };
+    const parts = payload.candidates?.[0]?.content?.parts ?? [];
+    const text = parts
+      .map((part) => part?.text?.trim())
+      .filter(Boolean)
+      .join("\n");
+    if (!text) {
+      throw new Error(params.missingTextError);
+    }
+    return { text, model };
+  } finally {
+    await release();
+  }
+}
diff --git a/src/media-understanding/providers/google/video.ts b/src/media-understanding/providers/google/video.ts
index 339c11ae91..edbeccf028 100644
--- a/src/media-understanding/providers/google/video.ts
+++ b/src/media-understanding/providers/google/video.ts
@@ -1,92 +1,21 @@
 import type { VideoDescriptionRequest, VideoDescriptionResult } from "../../types.js";
-import { normalizeGoogleModelId } from "../../../agents/models-config.providers.js";
-import { fetchWithTimeoutGuarded, normalizeBaseUrl, readErrorResponse } from "../shared.js";
+import { generateGeminiInlineDataText } from "./inline-data.js";
 
 export const DEFAULT_GOOGLE_VIDEO_BASE_URL = "https://generativelanguage.googleapis.com/v1beta";
 const DEFAULT_GOOGLE_VIDEO_MODEL = "gemini-3-flash-preview";
 const DEFAULT_GOOGLE_VIDEO_PROMPT = "Describe the video.";
 
-function resolveModel(model?: string): string {
-  const trimmed = model?.trim();
-  if (!trimmed) {
-    return DEFAULT_GOOGLE_VIDEO_MODEL;
-  }
-  return normalizeGoogleModelId(trimmed);
-}
-
-function resolvePrompt(prompt?: string): string {
-  const trimmed = prompt?.trim();
-  return trimmed || DEFAULT_GOOGLE_VIDEO_PROMPT;
-}
-
 export async function describeGeminiVideo(
   params: VideoDescriptionRequest,
 ): Promise<VideoDescriptionResult> {
-  const fetchFn = params.fetchFn ?? fetch;
-  const baseUrl = normalizeBaseUrl(params.baseUrl, DEFAULT_GOOGLE_VIDEO_BASE_URL);
-  const allowPrivate = Boolean(params.baseUrl?.trim());
-  const model = resolveModel(params.model);
-  const url = `${baseUrl}/models/${model}:generateContent`;
-
-  const headers = new Headers(params.headers);
-  if (!headers.has("content-type")) {
-    headers.set("content-type", "application/json");
-  }
-  if (!headers.has("x-goog-api-key")) {
-    headers.set("x-goog-api-key", params.apiKey);
-  }
-
-  const body = {
-    contents: [
-      {
-        role: "user",
-        parts: [
-          { text: resolvePrompt(params.prompt) },
-          {
-            inline_data: {
-              mime_type: params.mime ?? "video/mp4",
-              data: params.buffer.toString("base64"),
-            },
-          },
-        ],
-      },
-    ],
-  };
-
-  const { response: res, release } = await fetchWithTimeoutGuarded(
-    url,
-    {
-      method: "POST",
-      headers,
-      body: JSON.stringify(body),
-    },
-    params.timeoutMs,
-    fetchFn,
-    allowPrivate ? { ssrfPolicy: { allowPrivateNetwork: true } } : undefined,
-  );
-
-  try {
-    if (!res.ok) {
-      const detail = await readErrorResponse(res);
-      const suffix = detail ? `: ${detail}` : "";
-      throw new Error(`Video description failed (HTTP ${res.status})${suffix}`);
-    }
-
-    const payload = (await res.json()) as {
-      candidates?: Array<{
-        content?: { parts?: Array<{ text?: string }> };
-      }>;
-    };
-    const parts = payload.candidates?.[0]?.content?.parts ?? [];
-    const text = parts
-      .map((part) => part?.text?.trim())
-      .filter(Boolean)
-      .join("\n");
-    if (!text) {
-      throw new Error("Video description response missing text");
-    }
-    return { text, model };
-  } finally {
-    await release();
-  }
+  const { text, model } = await generateGeminiInlineDataText({
+    ...params,
+    defaultBaseUrl: DEFAULT_GOOGLE_VIDEO_BASE_URL,
+    defaultModel: DEFAULT_GOOGLE_VIDEO_MODEL,
+    defaultPrompt: DEFAULT_GOOGLE_VIDEO_PROMPT,
+    defaultMime: "video/mp4",
+    httpErrorLabel: "Video description failed",
+    missingTextError: "Video description response missing text",
+  });
+  return { text, model };
 }