fix(media): prefer provider stt before local whisper

This commit is contained in:
Peter Steinberger
2026-04-23 05:28:20 +01:00
parent fdf97a8784
commit 0585e181f8
3 changed files with 43 additions and 1 deletion

View File

@@ -37,6 +37,7 @@ Docs: https://docs.openclaw.ai
### Fixes
- Media understanding/audio: prefer configured or key-backed STT providers before auto-detected local Whisper CLIs, so installed local transcription tools no longer shadow API providers such as Groq/OpenAI in `tools.media.audio` auto mode. Fixes #68727.
- Providers/OpenAI: lock the auth picker wording for OpenAI API key, Codex browser login, and Codex device pairing so the setup choices no longer imply a mixed Codex/API-key auth path. (#67848) Thanks @tmlxrd.
- Agents/BTW: route `/btw` side questions through provider stream registration with the session workspace, so Ollama provider URL construction and workspace-scoped hooks apply correctly. Fixes #68336. (#70413) Thanks @suboss87.
- Agents/sessions: make session transcript write locks non-reentrant by default, so same-process transcript writers contend unless a helper explicitly opts into nested lock ownership.

View File

@@ -4,7 +4,7 @@ import path from "node:path";
import { describe, expect, it, vi } from "vitest";
import type { OpenClawConfig } from "../config/types.js";
import { withEnvAsync } from "../test-utils/env.js";
import { runCapability } from "./runner.js";
import { clearMediaUnderstandingBinaryCacheForTests, runCapability } from "./runner.js";
import { withAudioFixture } from "./runner.test-utils.js";
import type { AudioTranscriptionRequest, MediaUnderstandingProvider } from "./types.js";
@@ -52,6 +52,12 @@ function createOpenAiAudioCfg(extra?: Partial<OpenClawConfig>): OpenClawConfig {
} as unknown as OpenClawConfig;
}
/**
 * Write a stub shell script named `name` into `dir` and return its absolute path.
 * The script is marked executable (0o755) so PATH-based binary detection finds it.
 */
async function createMockExecutable(dir: string, name: string) {
  const target = path.join(dir, name);
  const script = "#!/bin/sh\necho mocked-local-whisper\n";
  await fs.writeFile(target, script, { mode: 0o755 });
  return target;
}
async function runAutoAudioCase(params: {
transcribeAudio: (req: AudioTranscriptionRequest) => Promise<{ text: string; model: string }>;
cfgExtra?: Partial<OpenClawConfig>;
@@ -89,6 +95,37 @@ describe("runCapability auto audio entries", () => {
expect(result.decision.outcome).toBe("success");
});
// Regression test for #68727: an auto-detected local whisper CLI on PATH must
// not shadow a configured/key-backed STT provider when audio mode is "auto".
it("prefers provider keys over auto-detected local whisper", async () => {
const binDir = await fs.mkdtemp(path.join(os.tmpdir(), "openclaw-auto-audio-bin-"));
try {
// Plant a fake `whisper` executable, then reset the binary-detection cache so
// the auto resolver actually re-scans the temp PATH set up below.
await createMockExecutable(binDir, "whisper");
clearMediaUnderstandingBinaryCacheForTests();
let seenModel: string | undefined;
const result = await withEnvAsync(
{
// PATH contains only the mock whisper binary; the other STT-related env
// vars are cleared so only the key-backed OpenAI provider stays eligible.
PATH: binDir,
SHERPA_ONNX_MODEL_DIR: undefined,
WHISPER_CPP_MODEL: undefined,
GEMINI_API_KEY: undefined,
},
async () =>
await runAutoAudioCase({
// Record the model the provider was asked for, to assert on it below.
transcribeAudio: async (req) => {
seenModel = req.model;
return { text: "provider transcription", model: req.model ?? "unknown" };
},
}),
);
// The key-backed provider must win over the local whisper CLI on PATH.
expect(result.outputs[0]?.provider).toBe("openai");
expect(result.outputs[0]?.text).toBe("provider transcription");
expect(seenModel).toBe("gpt-4o-transcribe");
} finally {
// Always reset the detection cache and remove the temp dir, even on failure,
// so later tests do not see the mock binary.
clearMediaUnderstandingBinaryCacheForTests();
await fs.rm(binDir, { recursive: true, force: true });
}
});
it("skips auto audio when disabled", async () => {
const result = await runAutoAudioCase({
transcribeAudio: async () => ({

View File

@@ -510,6 +510,10 @@ async function resolveAutoEntries(params: {
return [activeEntry];
}
if (params.capability === "audio") {
const keyEntry = await resolveKeyEntry(params);
if (keyEntry) {
return [keyEntry];
}
const localAudio = await resolveLocalAudioEntry();
if (localAudio) {
return [localAudio];