diff --git a/src/media-understanding/runner.auto-audio.test.ts b/src/media-understanding/runner.auto-audio.test.ts
index fa437011f9..c21841dc38 100644
--- a/src/media-understanding/runner.auto-audio.test.ts
+++ b/src/media-understanding/runner.auto-audio.test.ts
@@ -11,40 +11,66 @@ import {
   runCapability,
 } from "./runner.js";

+async function withAudioFixture(
+  run: (params: {
+    ctx: MsgContext;
+    media: ReturnType<typeof normalizeMediaAttachments>;
+    cache: ReturnType<typeof createMediaAttachmentCache>;
+  }) => Promise<void>,
+) {
+  const originalPath = process.env.PATH;
+  process.env.PATH = "/usr/bin:/bin";
+  const tmpPath = path.join(os.tmpdir(), `openclaw-auto-audio-${Date.now()}.wav`);
+  await fs.writeFile(tmpPath, Buffer.from("RIFF"));
+  const ctx: MsgContext = { MediaPath: tmpPath, MediaType: "audio/wav" };
+  const media = normalizeMediaAttachments(ctx);
+  const cache = createMediaAttachmentCache(media);
+
+  try {
+    await run({ ctx, media, cache });
+  } finally {
+    process.env.PATH = originalPath;
+    await cache.cleanup();
+    await fs.unlink(tmpPath).catch(() => {});
+  }
+}
+
+function createOpenAiAudioProvider(
+  transcribeAudio: (req: { model?: string }) => Promise<{ text: string; model: string }>,
+) {
+  return buildProviderRegistry({
+    openai: {
+      id: "openai",
+      capabilities: ["audio"],
+      transcribeAudio,
+    },
+  });
+}
+
+function createOpenAiAudioCfg(extra?: Partial<OpenClawConfig>): OpenClawConfig {
+  return {
+    models: {
+      providers: {
+        openai: {
+          apiKey: "test-key",
+          models: [],
+        },
+      },
+    },
+    ...extra,
+  } as unknown as OpenClawConfig;
+}
+
 describe("runCapability auto audio entries", () => {
   it("uses provider keys to auto-enable audio transcription", async () => {
-    const originalPath = process.env.PATH;
-    process.env.PATH = "/usr/bin:/bin";
-    const tmpPath = path.join(os.tmpdir(), `openclaw-auto-audio-${Date.now()}.wav`);
-    await fs.writeFile(tmpPath, Buffer.from("RIFF"));
-    const ctx: MsgContext = { MediaPath: tmpPath, MediaType: "audio/wav" };
-    const media = normalizeMediaAttachments(ctx);
-    const cache = createMediaAttachmentCache(media);
+    await withAudioFixture(async ({ ctx, media, cache }) => {
+      let seenModel: string | undefined;
+      const providerRegistry = createOpenAiAudioProvider(async (req) => {
+        seenModel = req.model;
+        return { text: "ok", model: req.model ?? "unknown" };
+      });
+      const cfg = createOpenAiAudioCfg();

-    let seenModel: string | undefined;
-    const providerRegistry = buildProviderRegistry({
-      openai: {
-        id: "openai",
-        capabilities: ["audio"],
-        transcribeAudio: async (req) => {
-          seenModel = req.model;
-          return { text: "ok", model: req.model };
-        },
-      },
-    });
-
-    const cfg = {
-      models: {
-        providers: {
-          openai: {
-            apiKey: "test-key",
-            models: [],
-          },
-        },
-      },
-    } as unknown as OpenClawConfig;
-
-    try {
       const result = await runCapability({
         capability: "audio",
         cfg,
@@ -56,49 +82,25 @@ describe("runCapability auto audio entries", () => {
       expect(result.outputs[0]?.text).toBe("ok");
       expect(seenModel).toBe("gpt-4o-mini-transcribe");
       expect(result.decision.outcome).toBe("success");
-    } finally {
-      process.env.PATH = originalPath;
-      await cache.cleanup();
-      await fs.unlink(tmpPath).catch(() => {});
-    }
+    });
   });

   it("skips auto audio when disabled", async () => {
-    const originalPath = process.env.PATH;
-    process.env.PATH = "/usr/bin:/bin";
-    const tmpPath = path.join(os.tmpdir(), `openclaw-auto-audio-${Date.now()}.wav`);
-    await fs.writeFile(tmpPath, Buffer.from("RIFF"));
-    const ctx: MsgContext = { MediaPath: tmpPath, MediaType: "audio/wav" };
-    const media = normalizeMediaAttachments(ctx);
-    const cache = createMediaAttachmentCache(media);
-
-    const providerRegistry = buildProviderRegistry({
-      openai: {
-        id: "openai",
-        capabilities: ["audio"],
-        transcribeAudio: async () => ({ text: "ok", model: "whisper-1" }),
-      },
-    });
-
-    const cfg = {
-      models: {
-        providers: {
-          openai: {
-            apiKey: "test-key",
-            models: [],
+    await withAudioFixture(async ({ ctx, media, cache }) => {
+      const providerRegistry = createOpenAiAudioProvider(async () => ({
+        text: "ok",
+        model: "whisper-1",
+      }));
+      const cfg = createOpenAiAudioCfg({
+        tools: {
+          media: {
+            audio: {
+              enabled: false,
+            },
           },
         },
-      },
-      tools: {
-        media: {
-          audio: {
-            enabled: false,
-          },
-        },
-      },
-    } as unknown as OpenClawConfig;
+      });

-    try {
       const result = await runCapability({
         capability: "audio",
         cfg,
@@ -109,10 +111,37 @@
       });
       expect(result.outputs).toHaveLength(0);
       expect(result.decision.outcome).toBe("disabled");
-    } finally {
-      process.env.PATH = originalPath;
-      await cache.cleanup();
-      await fs.unlink(tmpPath).catch(() => {});
-    }
+    });
+  });
+
+  it("prefers explicitly configured audio model entries", async () => {
+    await withAudioFixture(async ({ ctx, media, cache }) => {
+      let seenModel: string | undefined;
+      const providerRegistry = createOpenAiAudioProvider(async (req) => {
+        seenModel = req.model;
+        return { text: "ok", model: req.model ?? "unknown" };
+      });
+      const cfg = createOpenAiAudioCfg({
+        tools: {
+          media: {
+            audio: {
+              models: [{ provider: "openai", model: "whisper-1" }],
+            },
+          },
+        },
+      });
+
+      const result = await runCapability({
+        capability: "audio",
+        cfg,
+        ctx,
+        attachments: cache,
+        media,
+        providerRegistry,
+      });
+
+      expect(result.outputs[0]?.text).toBe("ok");
+      expect(seenModel).toBe("whisper-1");
+    });
   });
 });