From 33ee8bbf1d448bfccc48ca31aa79d89fd3c59f64 Mon Sep 17 00:00:00 2001 From: Liu Yuan Date: Tue, 10 Feb 2026 10:38:09 +0800 Subject: [PATCH] feat: add zai/glm-4.6v image understanding support (#10267) Fixes #10265. Thanks @liuy. --- CHANGELOG.md | 1 + src/agents/tools/image-tool.test.ts | 17 +++++++++++++++++ src/agents/tools/image-tool.ts | 2 ++ src/media-understanding/defaults.ts | 17 +++++++++++++++++ src/media-understanding/providers/index.ts | 2 ++ src/media-understanding/providers/zai/index.ts | 8 ++++++++ src/media-understanding/runner.ts | 14 ++++---------- 7 files changed, 51 insertions(+), 10 deletions(-) create mode 100644 src/media-understanding/providers/zai/index.ts diff --git a/CHANGELOG.md b/CHANGELOG.md index 7213b32666..8db03f0c7f 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -13,6 +13,7 @@ Docs: https://docs.openclaw.ai - Gateway: add agent management RPC methods for the web UI (`agents.create`, `agents.update`, `agents.delete`). (#11045) Thanks @advaitpaliwal. - Web UI: show a Compaction divider in chat history. (#11341) Thanks @Takhoffman. - Agents: include runtime shell in agent envelopes. (#1835) Thanks @Takhoffman. +- Agents: auto-select `zai/glm-4.6v` for image understanding when ZAI is primary provider. (#10267) Thanks @liuy. - Paths: add `OPENCLAW_HOME` for overriding the home directory used by internal path resolution. (#12091) Thanks @sebslight. ### Fixes diff --git a/src/agents/tools/image-tool.test.ts b/src/agents/tools/image-tool.test.ts index e9e4661fd0..921246f94c 100644 --- a/src/agents/tools/image-tool.test.ts +++ b/src/agents/tools/image-tool.test.ts @@ -22,6 +22,8 @@ describe("image tool implicit imageModel config", () => { vi.stubEnv("ANTHROPIC_API_KEY", ""); vi.stubEnv("ANTHROPIC_OAUTH_TOKEN", ""); vi.stubEnv("MINIMAX_API_KEY", ""); + vi.stubEnv("ZAI_API_KEY", ""); + vi.stubEnv("Z_AI_API_KEY", ""); // Avoid implicit Copilot provider discovery hitting the network in tests. 
vi.stubEnv("COPILOT_GITHUB_TOKEN", ""); vi.stubEnv("GH_TOKEN", ""); @@ -58,6 +60,21 @@ describe("image tool implicit imageModel config", () => { expect(createImageTool({ config: cfg, agentDir })).not.toBeNull(); }); + it("pairs zai primary with glm-4.6v (and fallbacks) when auth exists", async () => { + const agentDir = await fs.mkdtemp(path.join(os.tmpdir(), "openclaw-image-")); + vi.stubEnv("ZAI_API_KEY", "zai-test"); + vi.stubEnv("OPENAI_API_KEY", "openai-test"); + vi.stubEnv("ANTHROPIC_API_KEY", "anthropic-test"); + const cfg: OpenClawConfig = { + agents: { defaults: { model: { primary: "zai/glm-4.7" } } }, + }; + expect(resolveImageModelConfigForTool({ cfg, agentDir })).toEqual({ + primary: "zai/glm-4.6v", + fallbacks: ["openai/gpt-5-mini", "anthropic/claude-opus-4-5"], + }); + expect(createImageTool({ config: cfg, agentDir })).not.toBeNull(); + }); + it("pairs a custom provider when it declares an image-capable model", async () => { const agentDir = await fs.mkdtemp(path.join(os.tmpdir(), "openclaw-image-")); await writeAuthProfiles(agentDir, { diff --git a/src/agents/tools/image-tool.ts b/src/agents/tools/image-tool.ts index 8af8b16ac7..6f71314262 100644 --- a/src/agents/tools/image-tool.ts +++ b/src/agents/tools/image-tool.ts @@ -116,6 +116,8 @@ export function resolveImageModelConfigForTool(params: { preferred = "minimax/MiniMax-VL-01"; } else if (providerOk && providerVisionFromConfig) { preferred = providerVisionFromConfig; + } else if (primary.provider === "zai" && providerOk) { + preferred = "zai/glm-4.6v"; } else if (primary.provider === "openai" && openaiOk) { preferred = "openai/gpt-5-mini"; } else if (primary.provider === "anthropic" && anthropicOk) { diff --git a/src/media-understanding/defaults.ts b/src/media-understanding/defaults.ts index b4e443d20d..1e3d352a7b 100644 --- a/src/media-understanding/defaults.ts +++ b/src/media-understanding/defaults.ts @@ -32,5 +32,22 @@ export const DEFAULT_AUDIO_MODELS: Record<string, string> = { openai:
"gpt-4o-mini-transcribe", deepgram: "nova-3", }; + +export const AUTO_AUDIO_KEY_PROVIDERS = ["openai", "groq", "deepgram", "google"] as const; +export const AUTO_IMAGE_KEY_PROVIDERS = [ + "openai", + "anthropic", + "google", + "minimax", + "zai", +] as const; +export const AUTO_VIDEO_KEY_PROVIDERS = ["google"] as const; +export const DEFAULT_IMAGE_MODELS: Record<string, string> = { + openai: "gpt-5-mini", + anthropic: "claude-opus-4-6", + google: "gemini-3-flash-preview", + minimax: "MiniMax-VL-01", + zai: "glm-4.6v", +}; export const CLI_OUTPUT_MAX_BUFFER = 5 * MB; export const DEFAULT_MEDIA_CONCURRENCY = 2; diff --git a/src/media-understanding/providers/index.ts b/src/media-understanding/providers/index.ts index 5fc5bd02ed..d64e5f94c6 100644 --- a/src/media-understanding/providers/index.ts +++ b/src/media-understanding/providers/index.ts @@ -6,6 +6,7 @@ import { googleProvider } from "./google/index.js"; import { groqProvider } from "./groq/index.js"; import { minimaxProvider } from "./minimax/index.js"; import { openaiProvider } from "./openai/index.js"; +import { zaiProvider } from "./zai/index.js"; const PROVIDERS: MediaUnderstandingProvider[] = [ groqProvider, @@ -13,6 +14,7 @@ const PROVIDERS: MediaUnderstandingProvider[] = [ googleProvider, anthropicProvider, minimaxProvider, + zaiProvider, deepgramProvider, ]; diff --git a/src/media-understanding/providers/zai/index.ts b/src/media-understanding/providers/zai/index.ts new file mode 100644 index 0000000000..337ea0a685 --- /dev/null +++ b/src/media-understanding/providers/zai/index.ts @@ -0,0 +1,8 @@ +import type { MediaUnderstandingProvider } from "../../types.js"; +import { describeImageWithModel } from "../image.js"; + +export const zaiProvider: MediaUnderstandingProvider = { + id: "zai", + capabilities: ["image"], + describeImage: describeImageWithModel, +}; diff --git a/src/media-understanding/runner.ts b/src/media-understanding/runner.ts index 142584d035..5881e85809 100644 --- a/src/media-understanding/runner.ts +++
b/src/media-understanding/runner.ts @@ -27,8 +27,12 @@ import { logVerbose, shouldLogVerbose } from "../globals.js"; import { runExec } from "../process/exec.js"; import { MediaAttachmentCache, normalizeAttachments, selectAttachments } from "./attachments.js"; import { + AUTO_AUDIO_KEY_PROVIDERS, + AUTO_IMAGE_KEY_PROVIDERS, + AUTO_VIDEO_KEY_PROVIDERS, CLI_OUTPUT_MAX_BUFFER, DEFAULT_AUDIO_MODELS, + DEFAULT_IMAGE_MODELS, DEFAULT_TIMEOUT_SECONDS, } from "./defaults.js"; import { isMediaUnderstandingSkipError, MediaUnderstandingSkipError } from "./errors.js"; @@ -48,16 +52,6 @@ import { } from "./resolve.js"; import { estimateBase64Size, resolveVideoMaxBase64Bytes } from "./video.js"; -const AUTO_AUDIO_KEY_PROVIDERS = ["openai", "groq", "deepgram", "google"] as const; -const AUTO_IMAGE_KEY_PROVIDERS = ["openai", "anthropic", "google", "minimax"] as const; -const AUTO_VIDEO_KEY_PROVIDERS = ["google"] as const; -const DEFAULT_IMAGE_MODELS: Record<string, string> = { - openai: "gpt-5-mini", - anthropic: "claude-opus-4-6", - google: "gemini-3-flash-preview", - minimax: "MiniMax-VL-01", -}; - export type ActiveMediaModel = { provider: string; model?: string;