fix(openai): route Codex audio to transcription model

2026-05-06 12:30:44 +00:00 · 2026-05-04 17:14:08 -07:00
parent a491090b48
commit 2cb03ee7b5
8 changed files with 150 additions and 19 deletions
--- a/src/media-understanding/defaults.test.ts
+++ b/src/media-understanding/defaults.test.ts
@@ -58,9 +58,9 @@ const mediaMetadataPlugins = vi.hoisted(() => [
        autoPriority: { image: 10, audio: 10 },
      },
      "openai-codex": {
-        capabilities: ["image"],
-        defaultModels: { image: "gpt-5.5" },
-        autoPriority: { image: 20 },
+        capabilities: ["image", "audio"],
+        defaultModels: { image: "gpt-5.5", audio: "gpt-4o-transcribe" },
+        autoPriority: { image: 20, audio: 20 },
      },
      opencode: { capabilities: ["image"], defaultModels: { image: "gpt-5-nano" } },
      "opencode-go": { capabilities: ["image"], defaultModels: { image: "kimi-k2.6" } },
@@ -108,6 +108,9 @@ describe("resolveDefaultMediaModel", () => {
    expect(resolveDefaultMediaModel({ providerId: "mistral", capability: "audio" })).toBe(
      "voxtral-mini-latest",
    );
+    expect(resolveDefaultMediaModel({ providerId: "openai-codex", capability: "audio" })).toBe(
+      "gpt-4o-transcribe",
+    );
  });

  it("resolves bundled image defaults beyond the historical core set", () => {
@@ -136,6 +139,7 @@ describe("resolveAutoMediaKeyProviders", () => {
  it("keeps the bundled audio fallback order", () => {
    expect(resolveAutoMediaKeyProviders({ capability: "audio" })).toEqual([
      "openai",
+      "openai-codex",
      "xai",
      "google",
      "mistral",
--- a/src/media-understanding/runner.auto-audio.test.ts
+++ b/src/media-understanding/runner.auto-audio.test.ts
@@ -95,6 +95,52 @@ describe("runCapability auto audio entries", () => {
    expect(result.decision.outcome).toBe("success");
  });

+  it("uses the provider audio default instead of the active Codex chat model", async () => {
+    let runResult: Awaited<ReturnType<typeof runCapability>> | undefined;
+    let seenModel: string | undefined; // model id the stub provider actually receives
+
+    await withAudioFixture("openclaw-auto-audio-codex", async ({ ctx, media, cache }) => {
+      const providerRegistry = createProviderRegistry({
+        "openai-codex": {
+          id: "openai-codex",
+          capabilities: ["image", "audio"],
+          defaultModels: { image: "gpt-5.5", audio: "gpt-4o-transcribe" },
+          transcribeAudio: async (req) => {
+            seenModel = req.model; // record the requested model for the final assertion
+            return { text: "codex audio", model: req.model ?? "unknown" };
+          },
+        },
+      });
+      const cfg = {
+        models: {
+          providers: {
+            "openai-codex": {
+              apiKey: "codex-test-key", // pragma: allowlist secret
+              models: [],
+            },
+          },
+        },
+      } as unknown as OpenClawConfig;
+
+      runResult = await runCapability({
+        capability: "audio",
+        cfg,
+        ctx,
+        attachments: cache,
+        media,
+        providerRegistry,
+        activeModel: { provider: "openai-codex", model: "gpt-5.5" }, // chat model, not the audio default
+      });
+    });
+
+    expect(runResult?.outputs[0]).toMatchObject({
+      provider: "openai-codex",
+      model: "gpt-4o-transcribe",
+      text: "codex audio",
+    });
+    expect(seenModel).toBe("gpt-4o-transcribe"); // stub was invoked with the audio default
+  });
+
  it("prefers provider keys over auto-detected local whisper", async () => {
    const binDir = await fs.mkdtemp(path.join(os.tmpdir(), "openclaw-auto-audio-bin-"));
    try {
--- a/src/media-understanding/runner.ts
+++ b/src/media-understanding/runner.ts
@@ -749,16 +749,24 @@ async function resolveActiveModelEntry(params: {
  if (!hasAuth) {
    return null;
  }
-  const model =
-    params.capability === "image"
-      ? await resolveAutoImageModelId({
-          cfg: params.cfg,
-          providerId,
-          providerRegistry: params.providerRegistry,
-          explicitModel: params.activeModel?.model,
-        })
-      : params.activeModel?.model;
-  if (params.capability === "image" && !model) {
+  let model: string | undefined;
+  if (params.capability === "image") {
+    model = await resolveAutoImageModelId({
+      cfg: params.cfg,
+      providerId,
+      providerRegistry: params.providerRegistry,
+      explicitModel: params.activeModel?.model,
+    });
+  } else if (params.capability === "audio") {
+    model = resolveDefaultMediaModelFromRegistry({
+      providerId,
+      capability: "audio",
+      providerRegistry: params.providerRegistry,
+    });
+  } else {
+    model = params.activeModel?.model;
+  }
+  if ((params.capability === "image" || params.capability === "audio") && !model) {
    return null;
  }
  return {