fix(openai): route Codex audio to transcription model

This commit is contained in:
Vincent Koc
2026-05-04 17:14:08 -07:00
committed by GitHub
parent a491090b48
commit 2cb03ee7b5
8 changed files with 150 additions and 19 deletions

View File

@@ -58,9 +58,9 @@ const mediaMetadataPlugins = vi.hoisted(() => [
autoPriority: { image: 10, audio: 10 },
},
"openai-codex": {
capabilities: ["image"],
defaultModels: { image: "gpt-5.5" },
autoPriority: { image: 20 },
capabilities: ["image", "audio"],
defaultModels: { image: "gpt-5.5", audio: "gpt-4o-transcribe" },
autoPriority: { image: 20, audio: 20 },
},
opencode: { capabilities: ["image"], defaultModels: { image: "gpt-5-nano" } },
"opencode-go": { capabilities: ["image"], defaultModels: { image: "kimi-k2.6" } },
@@ -108,6 +108,9 @@ describe("resolveDefaultMediaModel", () => {
expect(resolveDefaultMediaModel({ providerId: "mistral", capability: "audio" })).toBe(
"voxtral-mini-latest",
);
expect(resolveDefaultMediaModel({ providerId: "openai-codex", capability: "audio" })).toBe(
"gpt-4o-transcribe",
);
});
it("resolves bundled image defaults beyond the historical core set", () => {
@@ -136,6 +139,7 @@ describe("resolveAutoMediaKeyProviders", () => {
it("keeps the bundled audio fallback order", () => {
expect(resolveAutoMediaKeyProviders({ capability: "audio" })).toEqual([
"openai",
"openai-codex",
"xai",
"google",
"mistral",

View File

@@ -95,6 +95,52 @@ describe("runCapability auto audio entries", () => {
expect(result.decision.outcome).toBe("success");
});
// Regression test: the active model passed to runCapability is the Codex chat
// model ("gpt-5.5"), but the audio run is expected to report and use the
// provider's declared audio default instead.
it("uses the provider audio default instead of the active Codex chat model", async () => {
  // Audio default declared by the stub provider below; every assertion checks against it.
  const transcriptionModel = "gpt-4o-transcribe";

  // Captured inside the fixture callback so they can be asserted on afterwards.
  let capabilityResult: Awaited<ReturnType<typeof runCapability>> | undefined;
  let modelSeenByProvider: string | undefined;

  await withAudioFixture("openclaw-auto-audio-codex", async ({ ctx, media, cache }) => {
    // Config carries only an API key for openai-codex and an empty model list.
    const config = {
      models: {
        providers: {
          "openai-codex": {
            apiKey: "codex-test-key", // pragma: allowlist secret
            models: [],
          },
        },
      },
    } as unknown as OpenClawConfig;

    // Stub provider: records the model it was asked to transcribe with and echoes it back.
    const registry = createProviderRegistry({
      "openai-codex": {
        id: "openai-codex",
        capabilities: ["image", "audio"],
        defaultModels: { image: "gpt-5.5", audio: transcriptionModel },
        transcribeAudio: async (req) => {
          modelSeenByProvider = req.model;
          return { text: "codex audio", model: req.model ?? "unknown" };
        },
      },
    });

    capabilityResult = await runCapability({
      capability: "audio",
      cfg: config,
      ctx,
      attachments: cache,
      media,
      providerRegistry: registry,
      // The active model is the chat model, not a transcription model.
      activeModel: { provider: "openai-codex", model: "gpt-5.5" },
    });
  });

  // The reported output must name the audio default, not "gpt-5.5".
  expect(capabilityResult?.outputs[0]).toMatchObject({
    provider: "openai-codex",
    model: transcriptionModel,
    text: "codex audio",
  });
  // The provider itself must also have been called with the audio default.
  expect(modelSeenByProvider).toBe(transcriptionModel);
});
it("prefers provider keys over auto-detected local whisper", async () => {
const binDir = await fs.mkdtemp(path.join(os.tmpdir(), "openclaw-auto-audio-bin-"));
try {

View File

@@ -749,16 +749,24 @@ async function resolveActiveModelEntry(params: {
if (!hasAuth) {
return null;
}
const model =
params.capability === "image"
? await resolveAutoImageModelId({
cfg: params.cfg,
providerId,
providerRegistry: params.providerRegistry,
explicitModel: params.activeModel?.model,
})
: params.activeModel?.model;
if (params.capability === "image" && !model) {
let model: string | undefined;
if (params.capability === "image") {
model = await resolveAutoImageModelId({
cfg: params.cfg,
providerId,
providerRegistry: params.providerRegistry,
explicitModel: params.activeModel?.model,
});
} else if (params.capability === "audio") {
model = resolveDefaultMediaModelFromRegistry({
providerId,
capability: "audio",
providerRegistry: params.providerRegistry,
});
} else {
model = params.activeModel?.model;
}
if ((params.capability === "image" || params.capability === "audio") && !model) {
return null;
}
return {