mirror of
https://github.com/openclaw/openclaw.git
synced 2026-05-06 05:20:43 +00:00
fix(media): prefer provider stt before local whisper
This commit is contained in:
@@ -37,6 +37,7 @@ Docs: https://docs.openclaw.ai
|
||||
|
||||
### Fixes
|
||||
|
||||
- Media understanding/audio: prefer configured or key-backed STT providers before auto-detected local Whisper CLIs, so installed local transcription tools no longer shadow API providers such as Groq/OpenAI in `tools.media.audio` auto mode. Fixes #68727.
|
||||
- Providers/OpenAI: lock the auth picker wording for OpenAI API key, Codex browser login, and Codex device pairing so the setup choices no longer imply a mixed Codex/API-key auth path. (#67848) Thanks @tmlxrd.
|
||||
- Agents/BTW: route `/btw` side questions through provider stream registration with the session workspace, so Ollama provider URL construction and workspace-scoped hooks apply correctly. Fixes #68336. (#70413) Thanks @suboss87.
|
||||
- Agents/sessions: make session transcript write locks non-reentrant by default, so same-process transcript writers contend unless a helper explicitly opts into nested lock ownership.
|
||||
|
||||
@@ -4,7 +4,7 @@ import path from "node:path";
|
||||
import { describe, expect, it, vi } from "vitest";
|
||||
import type { OpenClawConfig } from "../config/types.js";
|
||||
import { withEnvAsync } from "../test-utils/env.js";
|
||||
import { runCapability } from "./runner.js";
|
||||
import { clearMediaUnderstandingBinaryCacheForTests, runCapability } from "./runner.js";
|
||||
import { withAudioFixture } from "./runner.test-utils.js";
|
||||
import type { AudioTranscriptionRequest, MediaUnderstandingProvider } from "./types.js";
|
||||
|
||||
@@ -52,6 +52,12 @@ function createOpenAiAudioCfg(extra?: Partial<OpenClawConfig>): OpenClawConfig {
|
||||
} as unknown as OpenClawConfig;
|
||||
}
|
||||
|
||||
/**
 * Writes a stub shell script named `name` into `dir` and returns its full path.
 *
 * The script only prints `mocked-local-whisper`; tests place it on `PATH` to stand in
 * for a locally installed CLI.
 *
 * @param dir - Existing directory that receives the script.
 * @param name - File name to give the script (for example the CLI being impersonated).
 * @returns The path of the written script (`dir` joined with `name`).
 */
async function createMockExecutable(dir: string, name: string): Promise<string> {
  const executablePath = path.join(dir, name);
  await fs.writeFile(executablePath, "#!/bin/sh\necho mocked-local-whisper\n", { mode: 0o755 });
  // The `mode` passed to `writeFile` only applies when the file is created and is filtered
  // by the process umask, so set the permission bits explicitly to guarantee the stub stays
  // executable regardless of the environment the tests run in.
  await fs.chmod(executablePath, 0o755);
  return executablePath;
}
|
||||
|
||||
async function runAutoAudioCase(params: {
|
||||
transcribeAudio: (req: AudioTranscriptionRequest) => Promise<{ text: string; model: string }>;
|
||||
cfgExtra?: Partial<OpenClawConfig>;
|
||||
@@ -89,6 +95,37 @@ describe("runCapability auto audio entries", () => {
|
||||
expect(result.decision.outcome).toBe("success");
|
||||
});
|
||||
|
||||
// Regression case: a `whisper` executable discoverable on PATH must not win over the
// provider-backed transcription path when auto audio runs.
it("prefers provider keys over auto-detected local whisper", async () => {
  const fakeBinDir = await fs.mkdtemp(path.join(os.tmpdir(), "openclaw-auto-audio-bin-"));
  // PATH points only at the directory holding the stub; the remaining variables are passed
  // as `undefined` (presumably so `withEnvAsync` unsets them for the run — confirm in helper).
  const envOverrides = {
    PATH: fakeBinDir,
    SHERPA_ONNX_MODEL_DIR: undefined,
    WHISPER_CPP_MODEL: undefined,
    GEMINI_API_KEY: undefined,
  };

  try {
    await createMockExecutable(fakeBinDir, "whisper");
    // Reset the binary cache before the run (presumably so earlier lookups are not reused).
    clearMediaUnderstandingBinaryCacheForTests();

    // Record the model the provider stub is asked for so it can be asserted below.
    let requestedModel: string | undefined;
    const transcribeAudio = async (req: AudioTranscriptionRequest) => {
      requestedModel = req.model;
      return { text: "provider transcription", model: req.model ?? "unknown" };
    };

    const result = await withEnvAsync(envOverrides, async () => {
      return await runAutoAudioCase({ transcribeAudio });
    });

    const firstOutput = result.outputs[0];
    expect(firstOutput?.provider).toBe("openai");
    expect(firstOutput?.text).toBe("provider transcription");
    expect(requestedModel).toBe("gpt-4o-transcribe");
  } finally {
    // Always reset the cache and remove the temporary directory, even when assertions fail.
    clearMediaUnderstandingBinaryCacheForTests();
    await fs.rm(fakeBinDir, { recursive: true, force: true });
  }
});
|
||||
|
||||
it("skips auto audio when disabled", async () => {
|
||||
const result = await runAutoAudioCase({
|
||||
transcribeAudio: async () => ({
|
||||
|
||||
@@ -510,6 +510,10 @@ async function resolveAutoEntries(params: {
|
||||
return [activeEntry];
|
||||
}
|
||||
if (params.capability === "audio") {
|
||||
const keyEntry = await resolveKeyEntry(params);
|
||||
if (keyEntry) {
|
||||
return [keyEntry];
|
||||
}
|
||||
const localAudio = await resolveLocalAudioEntry();
|
||||
if (localAudio) {
|
||||
return [localAudio];
|
||||
|
||||
Reference in New Issue
Block a user