fix(media): prefer provider stt before local whisper

This commit is contained in:
Peter Steinberger
2026-04-23 05:28:20 +01:00
parent fdf97a8784
commit 0585e181f8
3 changed files with 43 additions and 1 deletion

View File

@@ -37,6 +37,7 @@ Docs: https://docs.openclaw.ai
### Fixes
- Media understanding/audio: prefer configured or key-backed STT providers before auto-detected local Whisper CLIs, so installed local transcription tools no longer shadow API providers such as Groq/OpenAI in `tools.media.audio` auto mode. Fixes #68727.
- Providers/OpenAI: lock the auth picker wording for OpenAI API key, Codex browser login, and Codex device pairing so the setup choices no longer imply a mixed Codex/API-key auth path. (#67848) Thanks @tmlxrd.
- Agents/BTW: route `/btw` side questions through provider stream registration with the session workspace, so Ollama provider URL construction and workspace-scoped hooks apply correctly. Fixes #68336. (#70413) Thanks @suboss87.
- Agents/sessions: make session transcript write locks non-reentrant by default, so same-process transcript writers contend unless a helper explicitly opts into nested lock ownership.

View File

@@ -4,7 +4,7 @@ import path from "node:path";
import { describe, expect, it, vi } from "vitest";
import type { OpenClawConfig } from "../config/types.js";
import { withEnvAsync } from "../test-utils/env.js";
import { runCapability } from "./runner.js";
import { clearMediaUnderstandingBinaryCacheForTests, runCapability } from "./runner.js";
import { withAudioFixture } from "./runner.test-utils.js";
import type { AudioTranscriptionRequest, MediaUnderstandingProvider } from "./types.js";
@@ -52,6 +52,12 @@ function createOpenAiAudioCfg(extra?: Partial<OpenClawConfig>): OpenClawConfig {
} as unknown as OpenClawConfig;
}
/**
 * Write a stub shell script named `name` into `dir` and return its absolute path.
 * The script is marked executable (0o755) so PATH-based binary detection finds it.
 */
async function createMockExecutable(dir: string, name: string) {
  const target = path.join(dir, name);
  const script = "#!/bin/sh\necho mocked-local-whisper\n";
  await fs.writeFile(target, script, { mode: 0o755 });
  return target;
}
async function runAutoAudioCase(params: {
transcribeAudio: (req: AudioTranscriptionRequest) => Promise<{ text: string; model: string }>;
cfgExtra?: Partial<OpenClawConfig>;
@@ -89,6 +95,37 @@ describe("runCapability auto audio entries", () => {
expect(result.decision.outcome).toBe("success");
});
// Regression test for #68727: an auto-detected local whisper CLI on PATH must
// not shadow a configured/key-backed STT provider when audio mode is "auto".
it("prefers provider keys over auto-detected local whisper", async () => {
const binDir = await fs.mkdtemp(path.join(os.tmpdir(), "openclaw-auto-audio-bin-"));
try {
// Plant a fake `whisper` executable, then reset the binary-detection cache so
// the auto resolver actually re-scans the temp PATH set up below.
await createMockExecutable(binDir, "whisper");
clearMediaUnderstandingBinaryCacheForTests();
let seenModel: string | undefined;
const result = await withEnvAsync(
{
// PATH contains only the mock whisper binary; the other STT-related env
// vars are cleared so only the key-backed OpenAI provider stays eligible.
PATH: binDir,
SHERPA_ONNX_MODEL_DIR: undefined,
WHISPER_CPP_MODEL: undefined,
GEMINI_API_KEY: undefined,
},
async () =>
await runAutoAudioCase({
// Record the model the provider was asked for, to assert on it below.
transcribeAudio: async (req) => {
seenModel = req.model;
return { text: "provider transcription", model: req.model ?? "unknown" };
},
}),
);
// The key-backed provider must win over the local whisper CLI on PATH.
expect(result.outputs[0]?.provider).toBe("openai");
expect(result.outputs[0]?.text).toBe("provider transcription");
expect(seenModel).toBe("gpt-4o-transcribe");
} finally {
// Always reset the detection cache and remove the temp dir, even on failure,
// so later tests do not see the mock binary.
clearMediaUnderstandingBinaryCacheForTests();
await fs.rm(binDir, { recursive: true, force: true });
}
});
it("skips auto audio when disabled", async () => {
const result = await runAutoAudioCase({
transcribeAudio: async () => ({

View File

@@ -510,6 +510,10 @@ async function resolveAutoEntries(params: {
return [activeEntry];
}
if (params.capability === "audio") {
const keyEntry = await resolveKeyEntry(params);
if (keyEntry) {
return [keyEntry];
}
const localAudio = await resolveLocalAudioEntry();
if (localAudio) {
return [localAudio];