mirror of
https://github.com/openclaw/openclaw.git
synced 2026-04-03 03:03:24 -04:00
fix(agents): restore multi-image image tool schema contract
This commit is contained in:
@@ -18,6 +18,7 @@ async function writeAuthProfiles(agentDir: string, profiles: unknown) {
|
||||
|
||||
const ONE_PIXEL_PNG_B64 =
|
||||
"iVBORw0KGgoAAAANSUhEUgAAAAEAAAABCAQAAAC1HAwCAAAAC0lEQVR42mP8/woAAn8B9FD5fHAAAAAASUVORK5CYII=";
|
||||
const ONE_PIXEL_GIF_B64 = "R0lGODlhAQABAIABAP///wAAACwAAAAAAQABAAACAkQBADs=";
|
||||
|
||||
async function withTempWorkspacePng(
|
||||
cb: (args: { workspaceDir: string; imagePath: string }) => Promise<void>,
|
||||
@@ -78,6 +79,25 @@ async function expectImageToolExecOk(
|
||||
});
|
||||
}
|
||||
|
||||
/**
 * Collects the location of every JSON-Schema union keyword (`anyOf`, `oneOf`,
 * `allOf`) reachable from `schema`.
 *
 * The walk is depth-first in insertion order: a keyword's own path is reported
 * before any keyword nested beneath it. Object keys are appended as `.key`
 * and array positions as `[index]`.
 *
 * Keys that sit directly inside a name-to-schema map (`properties`,
 * `patternProperties`, `$defs`, `definitions`, `dependentSchemas`) are
 * user-chosen names rather than schema keywords, so a property that merely
 * happens to be called `anyOf` is not reported; its schema is still searched.
 *
 * @param schema - Any value; non-objects (including `null`) yield no results.
 *   The root value is treated as a schema, not as a name map.
 * @param path - Label used as the prefix of every reported path.
 * @returns Paths of all union keywords found, in traversal order.
 */
function findSchemaUnionKeywords(schema: unknown, path = "root"): string[] {
  const unionKeywords = new Set(["anyOf", "oneOf", "allOf"]);
  const nameMapKeywords = new Set([
    "properties",
    "patternProperties",
    "$defs",
    "definitions",
    "dependentSchemas",
  ]);
  // Only the objects on the current descent are tracked, so a subtree shared
  // by several parents is still reported under each of its paths while a
  // genuine cycle is cut instead of overflowing the stack.
  const ancestors = new Set<object>();
  const found: string[] = [];

  const walk = (node: unknown, nodePath: string, keysAreNames: boolean): void => {
    if (!node || typeof node !== "object") {
      return;
    }
    if (ancestors.has(node)) {
      return;
    }
    ancestors.add(node);

    if (Array.isArray(node)) {
      node.forEach((item, index) => walk(item, `${nodePath}[${index}]`, false));
    } else {
      for (const [key, value] of Object.entries(node as Record<string, unknown>)) {
        const nextPath = `${nodePath}.${key}`;
        if (!keysAreNames && unionKeywords.has(key)) {
          found.push(nextPath);
        }
        // The value of e.g. `properties` maps names to schemas; the values of
        // those names are ordinary schemas again.
        const childKeysAreNames =
          !keysAreNames && nameMapKeywords.has(key) && !Array.isArray(value);
        walk(value, nextPath, childKeysAreNames);
      }
    }

    ancestors.delete(node);
  };

  walk(schema, path, false);
  return found;
}
|
||||
|
||||
describe("image tool implicit imageModel config", () => {
|
||||
const priorFetch = global.fetch;
|
||||
|
||||
@@ -211,6 +231,66 @@ describe("image tool implicit imageModel config", () => {
|
||||
expect(tool?.description).toContain("Only use this tool when images were NOT already provided");
|
||||
});
|
||||
|
||||
it("exposes an Anthropic-safe image schema without union keywords", async () => {
|
||||
const agentDir = await fs.mkdtemp(path.join(os.tmpdir(), "openclaw-image-"));
|
||||
try {
|
||||
const cfg = createMinimaxImageConfig();
|
||||
const tool = createImageTool({ config: cfg, agentDir });
|
||||
expect(tool).not.toBeNull();
|
||||
if (!tool) {
|
||||
throw new Error("expected image tool");
|
||||
}
|
||||
|
||||
const violations = findSchemaUnionKeywords(tool.parameters, "image.parameters");
|
||||
expect(violations).toEqual([]);
|
||||
|
||||
const schema = tool.parameters as {
|
||||
properties?: Record<string, unknown>;
|
||||
};
|
||||
const imageSchema = schema.properties?.image as { type?: unknown } | undefined;
|
||||
const imagesSchema = schema.properties?.images as
|
||||
| { type?: unknown; items?: unknown }
|
||||
| undefined;
|
||||
const imageItems = imagesSchema?.items as { type?: unknown } | undefined;
|
||||
|
||||
expect(imageSchema?.type).toBe("string");
|
||||
expect(imagesSchema?.type).toBe("array");
|
||||
expect(imageItems?.type).toBe("string");
|
||||
} finally {
|
||||
await fs.rm(agentDir, { recursive: true, force: true });
|
||||
}
|
||||
});
|
||||
|
||||
it("keeps an Anthropic-safe image schema snapshot", async () => {
|
||||
const agentDir = await fs.mkdtemp(path.join(os.tmpdir(), "openclaw-image-"));
|
||||
try {
|
||||
const cfg = createMinimaxImageConfig();
|
||||
const tool = createImageTool({ config: cfg, agentDir });
|
||||
expect(tool).not.toBeNull();
|
||||
if (!tool) {
|
||||
throw new Error("expected image tool");
|
||||
}
|
||||
|
||||
expect(JSON.parse(JSON.stringify(tool.parameters))).toEqual({
|
||||
type: "object",
|
||||
properties: {
|
||||
prompt: { type: "string" },
|
||||
image: { description: "Single image path or URL.", type: "string" },
|
||||
images: {
|
||||
description: "Multiple image paths or URLs (up to maxImages, default 20).",
|
||||
type: "array",
|
||||
items: { type: "string" },
|
||||
},
|
||||
model: { type: "string" },
|
||||
maxBytesMb: { type: "number" },
|
||||
maxImages: { type: "number" },
|
||||
},
|
||||
});
|
||||
} finally {
|
||||
await fs.rm(agentDir, { recursive: true, force: true });
|
||||
}
|
||||
});
|
||||
|
||||
it("allows workspace images outside default local media roots", async () => {
|
||||
await withTempWorkspacePng(async ({ workspaceDir, imagePath }) => {
|
||||
const fetch = stubMinimaxOkFetch();
|
||||
@@ -412,7 +492,7 @@ describe("image tool MiniMax VLM routing", () => {
|
||||
return { fetch, tool };
|
||||
}
|
||||
|
||||
it("calls /v1/coding_plan/vlm for minimax image models", async () => {
|
||||
it("accepts image for single-image requests and calls /v1/coding_plan/vlm", async () => {
|
||||
const { fetch, tool } = await createMinimaxVlmFixture({ status_code: 0, status_msg: "" });
|
||||
|
||||
const res = await tool.execute("t1", {
|
||||
@@ -434,6 +514,59 @@ describe("image tool MiniMax VLM routing", () => {
|
||||
expect(text).toBe("ok");
|
||||
});
|
||||
|
||||
it("accepts images[] for multi-image requests", async () => {
|
||||
const { fetch, tool } = await createMinimaxVlmFixture({ status_code: 0, status_msg: "" });
|
||||
|
||||
const res = await tool.execute("t1", {
|
||||
prompt: "Compare these images.",
|
||||
images: [`data:image/png;base64,${pngB64}`, `data:image/gif;base64,${ONE_PIXEL_GIF_B64}`],
|
||||
});
|
||||
|
||||
expect(fetch).toHaveBeenCalledTimes(1);
|
||||
const details = res.details as
|
||||
| {
|
||||
images?: Array<{ image: string }>;
|
||||
}
|
||||
| undefined;
|
||||
expect(details?.images).toHaveLength(2);
|
||||
});
|
||||
|
||||
it("combines image + images with dedupe and enforces maxImages", async () => {
|
||||
const { fetch, tool } = await createMinimaxVlmFixture({ status_code: 0, status_msg: "" });
|
||||
|
||||
const deduped = await tool.execute("t1", {
|
||||
prompt: "Compare these images.",
|
||||
image: `data:image/png;base64,${pngB64}`,
|
||||
images: [
|
||||
`data:image/png;base64,${pngB64}`,
|
||||
`data:image/gif;base64,${ONE_PIXEL_GIF_B64}`,
|
||||
`data:image/gif;base64,${ONE_PIXEL_GIF_B64}`,
|
||||
],
|
||||
});
|
||||
|
||||
expect(fetch).toHaveBeenCalledTimes(1);
|
||||
const dedupedDetails = deduped.details as
|
||||
| {
|
||||
images?: Array<{ image: string }>;
|
||||
}
|
||||
| undefined;
|
||||
expect(dedupedDetails?.images).toHaveLength(2);
|
||||
|
||||
const tooMany = await tool.execute("t2", {
|
||||
prompt: "Compare these images.",
|
||||
image: `data:image/png;base64,${pngB64}`,
|
||||
images: [`data:image/gif;base64,${ONE_PIXEL_GIF_B64}`],
|
||||
maxImages: 1,
|
||||
});
|
||||
|
||||
expect(fetch).toHaveBeenCalledTimes(1);
|
||||
expect(tooMany.details).toMatchObject({
|
||||
error: "too_many_images",
|
||||
count: 2,
|
||||
max: 1,
|
||||
});
|
||||
});
|
||||
|
||||
it("surfaces MiniMax API errors from /v1/coding_plan/vlm", async () => {
|
||||
const { tool } = await createMinimaxVlmFixture({ status_code: 1004, status_msg: "bad key" });
|
||||
|
||||
|
||||
@@ -1,7 +1,9 @@
|
||||
import path from "node:path";
|
||||
import { type Api, type Context, complete, type Model } from "@mariozechner/pi-ai";
|
||||
import { Type } from "@sinclair/typebox";
|
||||
import path from "node:path";
|
||||
import type { OpenClawConfig } from "../../config/config.js";
|
||||
import type { SandboxFsBridge } from "../sandbox/fs-bridge.js";
|
||||
import type { AnyAgentTool } from "./common.js";
|
||||
import { resolveUserPath } from "../../utils.js";
|
||||
import { getDefaultLocalRoots, loadWebMedia } from "../../web/media.js";
|
||||
import { ensureAuthProfileStore, listProfilesForProvider } from "../auth-profiles.js";
|
||||
@@ -12,9 +14,7 @@ import { runWithImageModelFallback } from "../model-fallback.js";
|
||||
import { resolveConfiguredModelRef } from "../model-selection.js";
|
||||
import { ensureOpenClawModelsJson } from "../models-config.js";
|
||||
import { discoverAuthStorage, discoverModels } from "../pi-model-discovery.js";
|
||||
import type { SandboxFsBridge } from "../sandbox/fs-bridge.js";
|
||||
import { normalizeWorkspaceDir } from "../workspace-dir.js";
|
||||
import type { AnyAgentTool } from "./common.js";
|
||||
import {
|
||||
coerceImageAssistantText,
|
||||
coerceImageModelConfig,
|
||||
@@ -358,8 +358,8 @@ export function createImageTool(options?: {
|
||||
// If model has native vision, images in the prompt are auto-injected
|
||||
// so this tool is only needed when image wasn't provided in the prompt
|
||||
const description = options?.modelHasVision
|
||||
? "Analyze one or more images with a vision model. Pass a single image path/URL or an array of up to 20. Only use this tool when images were NOT already provided in the user's message. Images mentioned in the prompt are automatically visible to you."
|
||||
: "Analyze one or more images with the configured image model (agents.defaults.imageModel). Pass a single image path/URL or an array of up to 20. Provide a prompt describing what to analyze.";
|
||||
? "Analyze one or more images with a vision model. Use image for a single path/URL, or images for multiple (up to 20). Only use this tool when images were NOT already provided in the user's message. Images mentioned in the prompt are automatically visible to you."
|
||||
: "Analyze one or more images with the configured image model (agents.defaults.imageModel). Use image for a single path/URL, or images for multiple (up to 20). Provide a prompt describing what to analyze.";
|
||||
|
||||
const localRoots = (() => {
|
||||
const roots = getDefaultLocalRoots();
|
||||
@@ -376,7 +376,12 @@ export function createImageTool(options?: {
|
||||
description,
|
||||
parameters: Type.Object({
|
||||
prompt: Type.Optional(Type.String()),
|
||||
image: Type.String({ description: "Image path or URL (pass multiple as comma-separated)" }),
|
||||
image: Type.Optional(Type.String({ description: "Single image path or URL." })),
|
||||
images: Type.Optional(
|
||||
Type.Array(Type.String(), {
|
||||
description: "Multiple image paths or URLs (up to maxImages, default 20).",
|
||||
}),
|
||||
),
|
||||
model: Type.Optional(Type.String()),
|
||||
maxBytesMb: Type.Optional(Type.Number()),
|
||||
maxImages: Type.Optional(Type.Number()),
|
||||
@@ -384,17 +389,28 @@ export function createImageTool(options?: {
|
||||
execute: async (_toolCallId, args) => {
|
||||
const record = args && typeof args === "object" ? (args as Record<string, unknown>) : {};
|
||||
|
||||
// MARK: - Normalize image input (string | string[])
|
||||
const rawImageInput = record.image;
|
||||
const imageInputs: string[] = (() => {
|
||||
if (typeof rawImageInput === "string") {
|
||||
return [rawImageInput];
|
||||
// MARK: - Normalize image + images input and dedupe while preserving order
|
||||
const imageCandidates: string[] = [];
|
||||
if (typeof record.image === "string") {
|
||||
imageCandidates.push(record.image);
|
||||
}
|
||||
if (Array.isArray(record.images)) {
|
||||
imageCandidates.push(...record.images.filter((v): v is string => typeof v === "string"));
|
||||
}
|
||||
|
||||
const seenImages = new Set<string>();
|
||||
const imageInputs: string[] = [];
|
||||
for (const candidate of imageCandidates) {
|
||||
const trimmedCandidate = candidate.trim();
|
||||
const normalizedForDedupe = trimmedCandidate.startsWith("@")
|
||||
? trimmedCandidate.slice(1).trim()
|
||||
: trimmedCandidate;
|
||||
if (!normalizedForDedupe || seenImages.has(normalizedForDedupe)) {
|
||||
continue;
|
||||
}
|
||||
if (Array.isArray(rawImageInput)) {
|
||||
return rawImageInput.filter((v): v is string => typeof v === "string");
|
||||
}
|
||||
return [];
|
||||
})();
|
||||
seenImages.add(normalizedForDedupe);
|
||||
imageInputs.push(trimmedCandidate);
|
||||
}
|
||||
if (imageInputs.length === 0) {
|
||||
throw new Error("image required");
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user