From 0585e181f818cb11757815ea6aef255ca79eb0d6 Mon Sep 17 00:00:00 2001 From: Peter Steinberger Date: Thu, 23 Apr 2026 05:28:20 +0100 Subject: [PATCH] fix(media): prefer provider stt before local whisper --- CHANGELOG.md | 1 + .../runner.auto-audio.test.ts | 39 ++++++++++++++++++- src/media-understanding/runner.ts | 4 ++ 3 files changed, 43 insertions(+), 1 deletion(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index c3ed545cddf..cb6f09bdfdf 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -37,6 +37,7 @@ Docs: https://docs.openclaw.ai ### Fixes +- Media understanding/audio: prefer configured or key-backed STT providers before auto-detected local Whisper CLIs, so installed local transcription tools no longer shadow API providers such as Groq/OpenAI in `tools.media.audio` auto mode. Fixes #68727. - Providers/OpenAI: lock the auth picker wording for OpenAI API key, Codex browser login, and Codex device pairing so the setup choices no longer imply a mixed Codex/API-key auth path. (#67848) Thanks @tmlxrd. - Agents/BTW: route `/btw` side questions through provider stream registration with the session workspace, so Ollama provider URL construction and workspace-scoped hooks apply correctly. Fixes #68336. (#70413) Thanks @suboss87. - Agents/sessions: make session transcript write locks non-reentrant by default, so same-process transcript writers contend unless a helper explicitly opts into nested lock ownership. 
diff --git a/src/media-understanding/runner.auto-audio.test.ts b/src/media-understanding/runner.auto-audio.test.ts index 3bed355e0a3..4bfe8893daf 100644 --- a/src/media-understanding/runner.auto-audio.test.ts +++ b/src/media-understanding/runner.auto-audio.test.ts @@ -4,7 +4,7 @@ import path from "node:path"; import { describe, expect, it, vi } from "vitest"; import type { OpenClawConfig } from "../config/types.js"; import { withEnvAsync } from "../test-utils/env.js"; -import { runCapability } from "./runner.js"; +import { clearMediaUnderstandingBinaryCacheForTests, runCapability } from "./runner.js"; import { withAudioFixture } from "./runner.test-utils.js"; import type { AudioTranscriptionRequest, MediaUnderstandingProvider } from "./types.js"; @@ -52,6 +52,12 @@ function createOpenAiAudioCfg(extra?: Partial<OpenClawConfig>): OpenClawConfig { } as unknown as OpenClawConfig; } +async function createMockExecutable(dir: string, name: string) { + const executablePath = path.join(dir, name); + await fs.writeFile(executablePath, "#!/bin/sh\necho mocked-local-whisper\n", { mode: 0o755 }); + return executablePath; +} + async function runAutoAudioCase(params: { transcribeAudio: (req: AudioTranscriptionRequest) => Promise<{ text: string; model: string }>; cfgExtra?: Partial<OpenClawConfig>; @@ -89,6 +95,37 @@ describe("runCapability auto audio entries", () => { expect(result.decision.outcome).toBe("success"); }); + it("prefers provider keys over auto-detected local whisper", async () => { + const binDir = await fs.mkdtemp(path.join(os.tmpdir(), "openclaw-auto-audio-bin-")); + try { + await createMockExecutable(binDir, "whisper"); + clearMediaUnderstandingBinaryCacheForTests(); + let seenModel: string | undefined; + const result = await withEnvAsync( + { + PATH: binDir, + SHERPA_ONNX_MODEL_DIR: undefined, + WHISPER_CPP_MODEL: undefined, + GEMINI_API_KEY: undefined, + }, + async () => + await runAutoAudioCase({ + transcribeAudio: async (req) => { + seenModel = req.model; + return { text: "provider
transcription", model: req.model ?? "unknown" }; + }, + }), + ); + + expect(result.outputs[0]?.provider).toBe("openai"); + expect(result.outputs[0]?.text).toBe("provider transcription"); + expect(seenModel).toBe("gpt-4o-transcribe"); + } finally { + clearMediaUnderstandingBinaryCacheForTests(); + await fs.rm(binDir, { recursive: true, force: true }); + } + }); + it("skips auto audio when disabled", async () => { const result = await runAutoAudioCase({ transcribeAudio: async () => ({ diff --git a/src/media-understanding/runner.ts b/src/media-understanding/runner.ts index c02f28e7fe9..6f6f74be2bd 100644 --- a/src/media-understanding/runner.ts +++ b/src/media-understanding/runner.ts @@ -510,6 +510,10 @@ async function resolveAutoEntries(params: { return [activeEntry]; } if (params.capability === "audio") { + const keyEntry = await resolveKeyEntry(params); + if (keyEntry) { + return [keyEntry]; + } const localAudio = await resolveLocalAudioEntry(); if (localAudio) { return [localAudio];