refactor(media-understanding): share gemini inline-data helper

This commit is contained in:
Peter Steinberger
2026-02-14 15:03:40 +00:00
parent d1f36bfd84
commit cdc31903c2
3 changed files with 121 additions and 164 deletions

View File

@@ -1,92 +1,21 @@
import type { AudioTranscriptionRequest, AudioTranscriptionResult } from "../../types.js";
import { normalizeGoogleModelId } from "../../../agents/models-config.providers.js";
import { fetchWithTimeoutGuarded, normalizeBaseUrl, readErrorResponse } from "../shared.js";
import { generateGeminiInlineDataText } from "./inline-data.js";
/**
 * Base URL used for Gemini audio requests. It is supplied as the fallback
 * whenever the caller's `baseUrl` is resolved.
 */
export const DEFAULT_GOOGLE_AUDIO_BASE_URL = "https://generativelanguage.googleapis.com/v1beta";
/** Model id used when the request's `model` is missing or blank. */
const DEFAULT_GOOGLE_AUDIO_MODEL = "gemini-3-flash-preview";
/** Prompt sent alongside the audio when the request's `prompt` is missing or blank. */
const DEFAULT_GOOGLE_AUDIO_PROMPT = "Transcribe the audio.";
/**
 * Transcribes an audio buffer with Gemini.
 *
 * The request/response handling lives in the shared
 * `generateGeminiInlineDataText` helper; this wrapper forwards every field of
 * `params` unchanged and only contributes the audio-specific defaults (base
 * URL, model, prompt, MIME type) and the wording of the two error messages.
 *
 * Model and prompt trimming/defaulting is done by the helper, so no local
 * `resolveModel` / `resolvePrompt` helpers are needed here.
 *
 * @param params - The transcription request (buffer, credentials, optional
 *   overrides); spread as-is into the helper call.
 * @returns The `text` and `model` reported by the helper.
 * @throws Whatever the helper rejects with, e.g. an `Error` starting with
 *   "Audio transcription failed" for a non-OK HTTP response, or
 *   "Audio transcription response missing text".
 */
export async function transcribeGeminiAudio(
  params: AudioTranscriptionRequest,
): Promise<AudioTranscriptionResult> {
  const { text, model } = await generateGeminiInlineDataText({
    ...params,
    defaultBaseUrl: DEFAULT_GOOGLE_AUDIO_BASE_URL,
    defaultModel: DEFAULT_GOOGLE_AUDIO_MODEL,
    defaultPrompt: DEFAULT_GOOGLE_AUDIO_PROMPT,
    defaultMime: "audio/wav",
    httpErrorLabel: "Audio transcription failed",
    missingTextError: "Audio transcription response missing text",
  });
  // Return only the documented result fields, not anything extra the helper may add.
  return { text, model };
}

View File

@@ -0,0 +1,99 @@
import { normalizeGoogleModelId } from "../../../agents/models-config.providers.js";
import { fetchWithTimeoutGuarded, normalizeBaseUrl, readErrorResponse } from "../shared.js";
/**
 * Issues one Gemini `generateContent` request that pairs a text prompt with a
 * single base64-encoded inline media payload, then extracts the text of the
 * first candidate.
 *
 * Defaults: a missing or blank `model` / `prompt` falls back to
 * `defaultModel` / `defaultPrompt` (a non-blank model is passed through
 * `normalizeGoogleModelId`); a null/undefined `mime` falls back to
 * `defaultMime`. Caller-supplied `content-type` and `x-goog-api-key` headers
 * take precedence over the built-in values.
 *
 * @param params - Request fields plus the per-media defaults and error wording.
 * @returns The trimmed, newline-joined non-empty text parts of the first
 *   candidate, and the model id that was placed in the request URL.
 * @throws Error `${httpErrorLabel} (HTTP <status>)`, followed by `: <detail>`
 *   when an error detail could be read, if the response is not OK.
 * @throws Error with message `missingTextError` if no non-empty text part is
 *   present on the first candidate.
 */
export async function generateGeminiInlineDataText(params: {
  buffer: Buffer;
  mime?: string;
  apiKey: string;
  baseUrl?: string;
  headers?: Record<string, string>;
  model?: string;
  prompt?: string;
  timeoutMs: number;
  fetchFn?: typeof fetch;
  defaultBaseUrl: string;
  defaultModel: string;
  defaultPrompt: string;
  defaultMime: string;
  httpErrorLabel: string;
  missingTextError: string;
}): Promise<{ text: string; model: string }> {
  type GenerateContentPayload = {
    candidates?: Array<{
      content?: { parts?: Array<{ text?: string }> };
    }>;
  };

  const doFetch = params.fetchFn ?? fetch;
  const endpointBase = normalizeBaseUrl(params.baseUrl, params.defaultBaseUrl);
  // A non-blank caller-supplied base URL opts the request into the
  // allowPrivateNetwork SSRF policy further down.
  const hasCustomBaseUrl = Boolean(params.baseUrl?.trim());

  const requestedModel = params.model?.trim();
  const model = requestedModel ? normalizeGoogleModelId(requestedModel) : params.defaultModel;
  const endpoint = `${endpointBase}/models/${model}:generateContent`;

  // Only fill in headers the caller did not provide.
  const requestHeaders = new Headers(params.headers);
  if (!requestHeaders.has("content-type")) {
    requestHeaders.set("content-type", "application/json");
  }
  if (!requestHeaders.has("x-goog-api-key")) {
    requestHeaders.set("x-goog-api-key", params.apiKey);
  }

  const requestedPrompt = params.prompt?.trim();
  const promptText = requestedPrompt ? requestedPrompt : params.defaultPrompt;

  const mediaPart = {
    inline_data: {
      mime_type: params.mime ?? params.defaultMime,
      data: params.buffer.toString("base64"),
    },
  };
  const serializedBody = JSON.stringify({
    contents: [{ role: "user", parts: [{ text: promptText }, mediaPart] }],
  });

  const guardOptions = hasCustomBaseUrl
    ? { ssrfPolicy: { allowPrivateNetwork: true } }
    : undefined;
  const { response, release } = await fetchWithTimeoutGuarded(
    endpoint,
    { method: "POST", headers: requestHeaders, body: serializedBody },
    params.timeoutMs,
    doFetch,
    guardOptions,
  );

  try {
    if (!response.ok) {
      const detail = await readErrorResponse(response);
      const headline = `${params.httpErrorLabel} (HTTP ${response.status})`;
      throw new Error(detail ? `${headline}: ${detail}` : headline);
    }

    const payload = (await response.json()) as GenerateContentPayload;
    // Only the first candidate is consulted; blank parts are dropped.
    const candidateParts = payload.candidates?.[0]?.content?.parts ?? [];
    const trimmedPieces = candidateParts.map((piece) => piece?.text?.trim());
    const text = trimmedPieces.filter(Boolean).join("\n");
    if (!text) {
      throw new Error(params.missingTextError);
    }
    return { text, model };
  } finally {
    // Runs on success and on every failure path above.
    await release();
  }
}

View File

@@ -1,92 +1,21 @@
import type { VideoDescriptionRequest, VideoDescriptionResult } from "../../types.js";
import { normalizeGoogleModelId } from "../../../agents/models-config.providers.js";
import { fetchWithTimeoutGuarded, normalizeBaseUrl, readErrorResponse } from "../shared.js";
import { generateGeminiInlineDataText } from "./inline-data.js";
/**
 * Base URL used for Gemini video requests. It is supplied as the fallback
 * whenever the caller's `baseUrl` is resolved.
 */
export const DEFAULT_GOOGLE_VIDEO_BASE_URL = "https://generativelanguage.googleapis.com/v1beta";
/** Model id used when the request's `model` is missing or blank. */
const DEFAULT_GOOGLE_VIDEO_MODEL = "gemini-3-flash-preview";
/** Prompt sent alongside the video when the request's `prompt` is missing or blank. */
const DEFAULT_GOOGLE_VIDEO_PROMPT = "Describe the video.";
/**
 * Describes a video buffer with Gemini.
 *
 * The request/response handling lives in the shared
 * `generateGeminiInlineDataText` helper; this wrapper forwards every field of
 * `params` unchanged and only contributes the video-specific defaults (base
 * URL, model, prompt, MIME type) and the wording of the two error messages.
 *
 * Model and prompt trimming/defaulting is done by the helper, so no local
 * `resolveModel` / `resolvePrompt` helpers are needed here.
 *
 * @param params - The description request (buffer, credentials, optional
 *   overrides); spread as-is into the helper call.
 * @returns The `text` and `model` reported by the helper.
 * @throws Whatever the helper rejects with, e.g. an `Error` starting with
 *   "Video description failed" for a non-OK HTTP response, or
 *   "Video description response missing text".
 */
export async function describeGeminiVideo(
  params: VideoDescriptionRequest,
): Promise<VideoDescriptionResult> {
  const { text, model } = await generateGeminiInlineDataText({
    ...params,
    defaultBaseUrl: DEFAULT_GOOGLE_VIDEO_BASE_URL,
    defaultModel: DEFAULT_GOOGLE_VIDEO_MODEL,
    defaultPrompt: DEFAULT_GOOGLE_VIDEO_PROMPT,
    defaultMime: "video/mp4",
    httpErrorLabel: "Video description failed",
    missingTextError: "Video description response missing text",
  });
  // Return only the documented result fields, not anything extra the helper may add.
  return { text, model };
}