mirror of
https://github.com/openclaw/openclaw.git
synced 2026-02-19 18:39:20 -05:00
refactor(media-understanding): share gemini inline-data helper
This commit is contained in:
@@ -1,92 +1,21 @@
|
||||
import type { AudioTranscriptionRequest, AudioTranscriptionResult } from "../../types.js";
|
||||
import { normalizeGoogleModelId } from "../../../agents/models-config.providers.js";
|
||||
import { fetchWithTimeoutGuarded, normalizeBaseUrl, readErrorResponse } from "../shared.js";
|
||||
import { generateGeminiInlineDataText } from "./inline-data.js";
|
||||
|
||||
/** Base URL used for audio requests when the caller does not supply one. */
export const DEFAULT_GOOGLE_AUDIO_BASE_URL = "https://generativelanguage.googleapis.com/v1beta";
/** Model id used for audio requests when the caller does not name one. */
const DEFAULT_GOOGLE_AUDIO_MODEL = "gemini-3-flash-preview";
/** Prompt text used for audio requests when the caller does not supply one. */
const DEFAULT_GOOGLE_AUDIO_PROMPT = "Transcribe the audio.";
|
||||
|
||||
/**
 * Picks the model id for an audio transcription request.
 *
 * @param model - Optional caller-supplied model id.
 * @returns `DEFAULT_GOOGLE_AUDIO_MODEL` when `model` is missing or blank;
 *   otherwise the trimmed id passed through `normalizeGoogleModelId`.
 */
function resolveModel(model?: string): string {
  const trimmed = model?.trim();
  if (!trimmed) {
    return DEFAULT_GOOGLE_AUDIO_MODEL;
  }
  return normalizeGoogleModelId(trimmed);
}
|
||||
|
||||
/**
 * Picks the prompt text for an audio transcription request.
 *
 * @param prompt - Optional caller-supplied prompt.
 * @returns The trimmed prompt, or `DEFAULT_GOOGLE_AUDIO_PROMPT` when the
 *   prompt is missing or blank.
 */
function resolvePrompt(prompt?: string): string {
  const trimmed = prompt?.trim();
  // `||` is intentional: an empty string after trimming must fall back too.
  return trimmed || DEFAULT_GOOGLE_AUDIO_PROMPT;
}
|
||||
|
||||
/**
 * Transcribes an audio buffer by delegating to `generateGeminiInlineDataText`.
 *
 * The request is spread into the helper call unchanged; this function only
 * adds the audio-specific defaults (base URL, model, prompt, MIME type) and
 * the audio-specific error wording.
 *
 * @param params - The transcription request forwarded to the helper.
 * @returns The `text` and `model` reported by the helper.
 * @throws Whatever `generateGeminiInlineDataText` rejects with.
 */
export async function transcribeGeminiAudio(
  params: AudioTranscriptionRequest,
): Promise<AudioTranscriptionResult> {
  const { text, model } = await generateGeminiInlineDataText({
    ...params,
    defaultBaseUrl: DEFAULT_GOOGLE_AUDIO_BASE_URL,
    defaultModel: DEFAULT_GOOGLE_AUDIO_MODEL,
    defaultPrompt: DEFAULT_GOOGLE_AUDIO_PROMPT,
    defaultMime: "audio/wav",
    httpErrorLabel: "Audio transcription failed",
    missingTextError: "Audio transcription response missing text",
  });
  return { text, model };
}
|
||||
|
||||
src/media-understanding/providers/google/inline-data.ts (new file, 99 lines)
@@ -0,0 +1,99 @@
|
||||
import { normalizeGoogleModelId } from "../../../agents/models-config.providers.js";
|
||||
import { fetchWithTimeoutGuarded, normalizeBaseUrl, readErrorResponse } from "../shared.js";
|
||||
|
||||
/**
 * Posts one inline media buffer together with a text prompt to a
 * `<baseUrl>/models/<model>:generateContent` endpoint and returns the text
 * found in the first candidate of the JSON response.
 *
 * Blank or missing `model`, `prompt`, and `mime` values fall back to the
 * corresponding `default*` parameter. Caller-supplied `content-type` and
 * `x-goog-api-key` headers are kept; otherwise they are set to
 * `application/json` and `params.apiKey`.
 *
 * @param params - Request data (`buffer`, `mime`, `apiKey`, `baseUrl`,
 *   `headers`, `model`, `prompt`, `timeoutMs`, `fetchFn`) plus the
 *   per-media defaults and error wording (`defaultBaseUrl`, `defaultModel`,
 *   `defaultPrompt`, `defaultMime`, `httpErrorLabel`, `missingTextError`).
 * @returns The non-empty text parts of the first candidate, each trimmed and
 *   joined with newlines, together with the model id that was used.
 * @throws Error `"<httpErrorLabel> (HTTP <status>)[: <detail>]"` when the
 *   response is not OK, and `Error(missingTextError)` when the response
 *   carries no text.
 */
export async function generateGeminiInlineDataText(params: {
  buffer: Buffer;
  mime?: string;
  apiKey: string;
  baseUrl?: string;
  headers?: Record<string, string>;
  model?: string;
  prompt?: string;
  timeoutMs: number;
  fetchFn?: typeof fetch;
  defaultBaseUrl: string;
  defaultModel: string;
  defaultPrompt: string;
  defaultMime: string;
  httpErrorLabel: string;
  missingTextError: string;
}): Promise<{ text: string; model: string }> {
  const fetchFn = params.fetchFn ?? fetch;
  const baseUrl = normalizeBaseUrl(params.baseUrl, params.defaultBaseUrl);
  // Only an explicitly supplied base URL opts the guarded fetch into the
  // `allowPrivateNetwork` SSRF policy; the default endpoint never does.
  const allowPrivate = Boolean(params.baseUrl?.trim());

  const requestedModel = params.model?.trim();
  const model = requestedModel ? normalizeGoogleModelId(requestedModel) : params.defaultModel;
  const url = `${baseUrl}/models/${model}:generateContent`;

  // Caller-provided headers win; fill in only what is missing.
  const headers = new Headers(params.headers);
  if (!headers.has("content-type")) {
    headers.set("content-type", "application/json");
  }
  if (!headers.has("x-goog-api-key")) {
    headers.set("x-goog-api-key", params.apiKey);
  }

  // `||` (not `??`) so that blank strings fall back to the defaults as well.
  const prompt = params.prompt?.trim() || params.defaultPrompt;
  const mimeType = params.mime?.trim() || params.defaultMime;

  const body = {
    contents: [
      {
        role: "user",
        parts: [
          { text: prompt },
          {
            inline_data: {
              mime_type: mimeType,
              data: params.buffer.toString("base64"),
            },
          },
        ],
      },
    ],
  };

  const { response: res, release } = await fetchWithTimeoutGuarded(
    url,
    {
      method: "POST",
      headers,
      body: JSON.stringify(body),
    },
    params.timeoutMs,
    fetchFn,
    allowPrivate ? { ssrfPolicy: { allowPrivateNetwork: true } } : undefined,
  );

  try {
    if (!res.ok) {
      const detail = await readErrorResponse(res);
      const suffix = detail ? `: ${detail}` : "";
      throw new Error(`${params.httpErrorLabel} (HTTP ${res.status})${suffix}`);
    }

    const payload = (await res.json()) as {
      candidates?: Array<{
        content?: { parts?: Array<{ text?: string }> };
      }>;
    };
    // Only the first candidate is consulted.
    const parts = payload.candidates?.[0]?.content?.parts ?? [];
    const text = parts
      .map((part) => part?.text?.trim())
      .filter(Boolean)
      .join("\n");
    if (!text) {
      throw new Error(params.missingTextError);
    }
    return { text, model };
  } finally {
    // Always hand the guarded-fetch resources back, on success and on error.
    await release();
  }
}
|
||||
@@ -1,92 +1,21 @@
|
||||
import type { VideoDescriptionRequest, VideoDescriptionResult } from "../../types.js";
|
||||
import { normalizeGoogleModelId } from "../../../agents/models-config.providers.js";
|
||||
import { fetchWithTimeoutGuarded, normalizeBaseUrl, readErrorResponse } from "../shared.js";
|
||||
import { generateGeminiInlineDataText } from "./inline-data.js";
|
||||
|
||||
/** Base URL used for video requests when the caller does not supply one. */
export const DEFAULT_GOOGLE_VIDEO_BASE_URL = "https://generativelanguage.googleapis.com/v1beta";
/** Model id used for video requests when the caller does not name one. */
const DEFAULT_GOOGLE_VIDEO_MODEL = "gemini-3-flash-preview";
/** Prompt text used for video requests when the caller does not supply one. */
const DEFAULT_GOOGLE_VIDEO_PROMPT = "Describe the video.";
|
||||
|
||||
/**
 * Picks the model id for a video description request.
 *
 * @param model - Optional caller-supplied model id.
 * @returns `DEFAULT_GOOGLE_VIDEO_MODEL` when `model` is missing or blank;
 *   otherwise the trimmed id passed through `normalizeGoogleModelId`.
 */
function resolveModel(model?: string): string {
  const trimmed = model?.trim();
  if (!trimmed) {
    return DEFAULT_GOOGLE_VIDEO_MODEL;
  }
  return normalizeGoogleModelId(trimmed);
}
|
||||
|
||||
/**
 * Picks the prompt text for a video description request.
 *
 * @param prompt - Optional caller-supplied prompt.
 * @returns The trimmed prompt, or `DEFAULT_GOOGLE_VIDEO_PROMPT` when the
 *   prompt is missing or blank.
 */
function resolvePrompt(prompt?: string): string {
  const trimmed = prompt?.trim();
  // `||` is intentional: an empty string after trimming must fall back too.
  return trimmed || DEFAULT_GOOGLE_VIDEO_PROMPT;
}
|
||||
|
||||
/**
 * Describes a video buffer by delegating to `generateGeminiInlineDataText`.
 *
 * The request is spread into the helper call unchanged; this function only
 * adds the video-specific defaults (base URL, model, prompt, MIME type) and
 * the video-specific error wording.
 *
 * @param params - The description request forwarded to the helper.
 * @returns The `text` and `model` reported by the helper.
 * @throws Whatever `generateGeminiInlineDataText` rejects with.
 */
export async function describeGeminiVideo(
  params: VideoDescriptionRequest,
): Promise<VideoDescriptionResult> {
  const { text, model } = await generateGeminiInlineDataText({
    ...params,
    defaultBaseUrl: DEFAULT_GOOGLE_VIDEO_BASE_URL,
    defaultModel: DEFAULT_GOOGLE_VIDEO_MODEL,
    defaultPrompt: DEFAULT_GOOGLE_VIDEO_PROMPT,
    defaultMime: "video/mp4",
    httpErrorLabel: "Video description failed",
    missingTextError: "Video description response missing text",
  });
  return { text, model };
}
|
||||
|
||||
Reference in New Issue
Block a user