mirror of
https://github.com/openclaw/openclaw.git
synced 2026-05-06 07:00:43 +00:00
fix: handle reasoning-only image responses (#69444)
Signed-off-by: sallyom <somalley@redhat.com>
This commit is contained in:
@@ -8,6 +8,76 @@ import { coerceToolModelConfig, type ToolModelConfig } from "./model-config.help
|
||||
|
||||
export type ImageModelConfig = ToolModelConfig;

// Provider-emitted `thinkingSignature` strings that mark a thinking block as
// reasoning-only fallback output (the model produced reasoning but no text).
const IMAGE_REASONING_FALLBACK_SIGNATURES = new Set([
  "reasoning_content",
  "reasoning",
  "reasoning_details",
  "reasoning_text",
]);

// Stop scanning a message's content after this many blocks so detection stays
// bounded on pathological responses.
const MAX_IMAGE_REASONING_FALLBACK_BLOCKS = 50;
// JSON signatures longer than this are not parsed; a regex marker probe is
// used instead (see hasResponsesReasoningSignatureMarkers).
const MAX_IMAGE_REASONING_SIGNATURE_PARSE_CHARS = 2_048;
// Cap on how much of an oversized signature the marker regexes will scan.
const MAX_IMAGE_REASONING_SIGNATURE_SCAN_CHARS = 65_536;
|
||||
|
||||
function hasResponsesReasoningSignatureMarkers(value: string): boolean {
|
||||
const scanned = value.slice(0, MAX_IMAGE_REASONING_SIGNATURE_SCAN_CHARS);
|
||||
return /"id"\s*:\s*"rs_/.test(scanned) && /"type"\s*:\s*"reasoning(?:[."])/.test(scanned);
|
||||
}
|
||||
|
||||
function isImageReasoningFallbackSignature(value: unknown): boolean {
|
||||
if (!value) {
|
||||
return false;
|
||||
}
|
||||
if (typeof value === "string") {
|
||||
if (IMAGE_REASONING_FALLBACK_SIGNATURES.has(value)) {
|
||||
return true;
|
||||
}
|
||||
const trimmed = value.trim();
|
||||
if (!trimmed.startsWith("{") || !trimmed.endsWith("}")) {
|
||||
return false;
|
||||
}
|
||||
if (trimmed.length > MAX_IMAGE_REASONING_SIGNATURE_PARSE_CHARS) {
|
||||
return hasResponsesReasoningSignatureMarkers(trimmed);
|
||||
}
|
||||
try {
|
||||
return isImageReasoningFallbackSignature(JSON.parse(trimmed));
|
||||
} catch {
|
||||
return false;
|
||||
}
|
||||
}
|
||||
if (typeof value !== "object") {
|
||||
return false;
|
||||
}
|
||||
const record = value as { id?: unknown; type?: unknown };
|
||||
const id = typeof record.id === "string" ? record.id : "";
|
||||
const type = typeof record.type === "string" ? record.type : "";
|
||||
return id.startsWith("rs_") && (type === "reasoning" || type.startsWith("reasoning."));
|
||||
}
|
||||
|
||||
export function hasImageReasoningOnlyResponse(message: AssistantMessage): boolean {
|
||||
if (extractAssistantText(message).trim() || !Array.isArray(message.content)) {
|
||||
return false;
|
||||
}
|
||||
let checkedBlocks = 0;
|
||||
for (const block of message.content) {
|
||||
checkedBlocks += 1;
|
||||
if (checkedBlocks > MAX_IMAGE_REASONING_FALLBACK_BLOCKS) {
|
||||
break;
|
||||
}
|
||||
if (!block || typeof block !== "object") {
|
||||
continue;
|
||||
}
|
||||
const record = block as { type?: unknown; thinking?: unknown; thinkingSignature?: unknown };
|
||||
if (
|
||||
record.type === "thinking" &&
|
||||
typeof record.thinking === "string" &&
|
||||
isImageReasoningFallbackSignature(record.thinkingSignature)
|
||||
) {
|
||||
return true;
|
||||
}
|
||||
}
|
||||
return false;
|
||||
}
|
||||
|
||||
export function decodeDataUrl(
|
||||
dataUrl: string,
|
||||
opts?: { maxBytes?: number },
|
||||
|
||||
@@ -1439,4 +1439,114 @@ describe("image tool response validation", () => {
|
||||
});
|
||||
expect(text).toBe("hello");
|
||||
});
|
||||
|
||||
  // Each legacy provider signature string flags the response as reasoning-only
  // and makes text coercion throw the retryable "no text" error.
  it.each(["reasoning_content", "reasoning", "reasoning_details", "reasoning_text"])(
    "detects %s as a retryable image reasoning-only response",
    (thinkingSignature) => {
      const message = createAssistantMessage({
        content: [
          {
            type: "thinking",
            thinking: " <think>private</think> maybe a cat ",
            thinkingSignature,
          },
        ],
      });
      expect(__testing.hasImageReasoningOnlyResponse(message as never)).toBe(true);
      expect(() =>
        __testing.coerceImageAssistantText({
          provider: "openai",
          model: "gpt-5.4-mini",
          message: message as never,
        }),
      ).toThrow(/returned no text/i);
    },
  );
|
||||
|
||||
  // Responses-style signatures are recognized both as a JSON string and as a
  // plain object with an rs_ id and a reasoning type.
  it.each([
    JSON.stringify({ id: "rs_123", type: "reasoning" }),
    { id: "rs_456", type: "reasoning.encrypted" },
  ])(
    "detects Responses reasoning signature as a retryable image reasoning-only response",
    (thinkingSignature) => {
      const message = createAssistantMessage({
        content: [
          {
            type: "thinking",
            thinking: " <think>private</think> maybe a cat ",
            thinkingSignature,
          },
        ],
      });
      expect(__testing.hasImageReasoningOnlyResponse(message as never)).toBe(true);
      expect(() =>
        __testing.coerceImageAssistantText({
          provider: "openai",
          model: "gpt-5.4-mini",
          message: message as never,
        }),
      ).toThrow(/returned no text/i);
    },
  );
|
||||
|
||||
  // Signatures above the parse cap (2_048 chars) must still be detected via
  // the regex marker probe rather than a full JSON.parse.
  it("detects oversized JSON reasoning signatures without parsing the whole payload", () => {
    const message = createAssistantMessage({
      content: [
        {
          type: "thinking",
          thinking: "retryable",
          thinkingSignature: JSON.stringify({
            id: "rs_123",
            summary: [{ text: "x".repeat(2_100) }],
            type: "reasoning",
          }),
        },
      ],
    });

    expect(__testing.hasImageReasoningOnlyResponse(message as never)).toBe(true);
  });
|
||||
|
||||
  // Oversized JSON lacking the rs_/reasoning markers must not be treated as a
  // reasoning signature.
  it("ignores oversized JSON signatures without Responses reasoning markers", () => {
    const message = createAssistantMessage({
      content: [
        {
          type: "thinking",
          thinking: "retryable",
          thinkingSignature: `{"id":"not-reasoning","summary":"${"x".repeat(2_100)}"}`,
        },
      ],
    });

    expect(__testing.hasImageReasoningOnlyResponse(message as never)).toBe(false);
  });
|
||||
|
||||
  // An empty thinking string still counts: the signature alone marks the
  // block as reasoning-only.
  it("detects signed reasoning-only responses with empty summary text", () => {
    const message = createAssistantMessage({
      content: [
        {
          type: "thinking",
          thinking: "",
          thinkingSignature: "reasoning_content",
        },
      ],
    });

    expect(__testing.hasImageReasoningOnlyResponse(message as never)).toBe(true);
  });
|
||||
|
||||
  // The scan is bounded: a matching block positioned after the 50-block cap
  // is deliberately ignored, so detection returns false here.
  it("bounds reasoning-only detection before scanning every block", () => {
    const message = createAssistantMessage({
      content: [
        ...Array.from({ length: 50 }, () => ({ type: "thinking", thinking: "untagged" })),
        {
          type: "thinking",
          thinking: "retryable",
          thinkingSignature: "reasoning_content",
        },
      ],
    });

    expect(__testing.hasImageReasoningOnlyResponse(message as never)).toBe(false);
  });
|
||||
});
|
||||
|
||||
@@ -19,6 +19,7 @@ import {
|
||||
coerceImageAssistantText,
|
||||
coerceImageModelConfig,
|
||||
decodeDataUrl,
|
||||
hasImageReasoningOnlyResponse,
|
||||
type ImageModelConfig,
|
||||
resolveProviderVisionModelFromConfig,
|
||||
} from "./image-tool.helpers.js";
|
||||
@@ -58,6 +59,7 @@ const imageToolProviderDeps = {
|
||||
export const __testing = {
|
||||
decodeDataUrl,
|
||||
coerceImageAssistantText,
|
||||
hasImageReasoningOnlyResponse,
|
||||
resolveImageToolMaxTokens,
|
||||
setProviderDepsForTest(overrides?: {
|
||||
buildProviderRegistry?: typeof buildProviderRegistry;
|
||||
|
||||
@@ -233,6 +233,123 @@ describe("describeImageWithModel", () => {
|
||||
expect(context?.messages?.[0]?.content).toHaveLength(1);
|
||||
});
|
||||
|
||||
  // Drives a reasoning-only first completion followed by a successful retry
  // and verifies the retry's onPayload hook strips reasoning fields — and, on
  // known native OpenAI Responses routes, pins `reasoning: { effort: "none" }`.
  it.each([
    {
      name: "direct OpenAI Responses baseUrl",
      provider: "openai",
      model: {
        api: "openai-responses",
        provider: "openai",
        id: "gpt-5.4-mini",
        input: ["text", "image"],
        baseUrl: "https://api.openai.com/v1",
      },
      expectedRetryPayload: {
        reasoning: { effort: "none" },
      },
    },
    {
      name: "default OpenAI Responses route without explicit baseUrl",
      provider: "openai",
      model: {
        api: "openai-responses",
        provider: "openai",
        id: "gpt-5.4-mini",
        input: ["text", "image"],
      },
      expectedRetryPayload: {
        reasoning: { effort: "none" },
      },
    },
    {
      name: "azure-openai provider using openai-responses api",
      provider: "azure-openai",
      model: {
        api: "openai-responses",
        provider: "azure-openai",
        id: "gpt-5.4-mini",
        input: ["text", "image"],
        baseUrl: "https://myresource.openai.azure.com/openai/v1",
      },
      expectedRetryPayload: {
        reasoning: { effort: "none" },
      },
    },
    {
      name: "proxy-like openai-responses route",
      provider: "openai",
      model: {
        api: "openai-responses",
        provider: "openai",
        id: "gpt-5.4-mini",
        input: ["text", "image"],
        baseUrl: "https://proxy.example.com/v1",
      },
      // Unknown (proxy) route: reasoning fields are removed but not re-pinned.
      expectedRetryPayload: {},
    },
  ])(
    "retries reasoning-only image responses with reasoning disabled for $name",
    async ({ provider, model, expectedRetryPayload }) => {
      discoverModelsMock.mockReturnValue({
        find: vi.fn(() => model),
      });
      // First completion yields only a signed thinking block (no text); the
      // second completion is the retry and returns usable text.
      completeMock
        .mockResolvedValueOnce({
          role: "assistant",
          api: model.api,
          provider: model.provider,
          model: model.id,
          stopReason: "stop",
          timestamp: Date.now(),
          content: [
            {
              type: "thinking",
              thinking: "internal image reasoning",
              thinkingSignature: "reasoning_content",
            },
          ],
        })
        .mockResolvedValueOnce({
          role: "assistant",
          api: model.api,
          provider: model.provider,
          model: model.id,
          stopReason: "stop",
          timestamp: Date.now(),
          content: [{ type: "text", text: "retry ok" }],
        });

      const result = await describeImageWithModel({
        cfg: {},
        agentDir: "/tmp/openclaw-agent",
        provider,
        model: model.id,
        buffer: Buffer.from("png-bytes"),
        fileName: "image.png",
        mime: "image/png",
        prompt: "Describe the image.",
        timeoutMs: 1000,
      });

      expect(result).toEqual({
        text: "retry ok",
        model: model.id,
      });
      expect(completeMock).toHaveBeenCalledTimes(2);
      // The retry call must carry an onPayload hook that rewrites the payload.
      const [, , retryOptions] = completeMock.mock.calls[1] ?? [];
      expect(retryOptions?.onPayload).toEqual(expect.any(Function));
      const retryPayload = await retryOptions?.onPayload?.(
        {
          reasoning: { effort: "high", summary: "auto" },
          reasoning_effort: "high",
          include: ["reasoning.encrypted_content"],
        },
        completeMock.mock.calls[1]?.[0],
      );
      expect(retryPayload).toEqual(expectedRetryPayload);
    },
  );
|
||||
|
||||
it("normalizes deprecated google flash ids before lookup and keeps profile auth selection", async () => {
|
||||
const findMock = vi.fn((provider: string, modelId: string) => {
|
||||
expect(provider).toBe("google");
|
||||
|
||||
@@ -1,4 +1,4 @@
|
||||
import type { Api, Context, Model } from "@mariozechner/pi-ai";
|
||||
import type { Api, Context, Model, ProviderStreamOptions } from "@mariozechner/pi-ai";
|
||||
import { complete } from "@mariozechner/pi-ai";
|
||||
import { isMinimaxVlmModel, minimaxUnderstandImage } from "../agents/minimax-vlm.js";
|
||||
import {
|
||||
@@ -8,7 +8,11 @@ import {
|
||||
} from "../agents/model-auth.js";
|
||||
import { normalizeModelRef } from "../agents/model-selection.js";
|
||||
import { ensureOpenClawModelsJson } from "../agents/models-config.js";
|
||||
import { coerceImageAssistantText } from "../agents/tools/image-tool.helpers.js";
|
||||
import { resolveProviderRequestCapabilities } from "../agents/provider-attribution.js";
|
||||
import {
|
||||
coerceImageAssistantText,
|
||||
hasImageReasoningOnlyResponse,
|
||||
} from "../agents/tools/image-tool.helpers.js";
|
||||
import type {
|
||||
ImageDescriptionRequest,
|
||||
ImageDescriptionResult,
|
||||
@@ -36,6 +40,60 @@ function resolveImageToolMaxTokens(modelMaxTokens: number | undefined, requested
|
||||
return Math.min(requestedMaxTokens, modelMaxTokens);
|
||||
}
|
||||
|
||||
function isRecord(value: unknown): value is Record<string, unknown> {
|
||||
return Boolean(value) && typeof value === "object" && !Array.isArray(value);
|
||||
}
|
||||
|
||||
function isNativeResponsesReasoningPayload(model: Model<Api>): boolean {
|
||||
if (
|
||||
model.api !== "openai-responses" &&
|
||||
model.api !== "azure-openai-responses" &&
|
||||
model.api !== "openai-codex-responses"
|
||||
) {
|
||||
return false;
|
||||
}
|
||||
return resolveProviderRequestCapabilities({
|
||||
provider: model.provider,
|
||||
api: model.api,
|
||||
baseUrl: model.baseUrl,
|
||||
capability: "image",
|
||||
transport: "media-understanding",
|
||||
}).usesKnownNativeOpenAIRoute;
|
||||
}
|
||||
|
||||
function removeReasoningInclude(value: unknown): unknown {
|
||||
if (!Array.isArray(value)) {
|
||||
return value;
|
||||
}
|
||||
const next = value.filter((entry) => entry !== "reasoning.encrypted_content");
|
||||
return next.length > 0 ? next : undefined;
|
||||
}
|
||||
|
||||
function disableReasoningForImageRetryPayload(payload: unknown, model: Model<Api>): unknown {
|
||||
if (!isRecord(payload)) {
|
||||
return undefined;
|
||||
}
|
||||
const next = { ...payload };
|
||||
delete next.reasoning;
|
||||
delete next.reasoning_effort;
|
||||
|
||||
const include = removeReasoningInclude(next.include);
|
||||
if (include === undefined) {
|
||||
delete next.include;
|
||||
} else {
|
||||
next.include = include;
|
||||
}
|
||||
|
||||
if (isNativeResponsesReasoningPayload(model)) {
|
||||
next.reasoning = { effort: "none" };
|
||||
}
|
||||
return next;
|
||||
}
|
||||
|
||||
function isImageModelNoTextError(err: unknown): boolean {
|
||||
return err instanceof Error && /^Image model returned no text\b/.test(err.message);
|
||||
}
|
||||
|
||||
async function resolveImageRuntime(params: {
|
||||
cfg: ImageDescriptionRequest["cfg"];
|
||||
agentDir: string;
|
||||
@@ -195,19 +253,41 @@ export async function describeImagesWithModel(
|
||||
params.timeoutMs > 0
|
||||
? setTimeout(() => controller.abort(), params.timeoutMs)
|
||||
: undefined;
|
||||
const message = await complete(model, context, {
|
||||
apiKey,
|
||||
maxTokens: resolveImageToolMaxTokens(model.maxTokens, params.maxTokens ?? 512),
|
||||
signal: controller.signal,
|
||||
}).finally(() => {
|
||||
|
||||
const maxTokens = resolveImageToolMaxTokens(model.maxTokens, params.maxTokens ?? 512);
|
||||
const completeImage = async (onPayload?: ProviderStreamOptions["onPayload"]) =>
|
||||
await complete(model, context, {
|
||||
apiKey,
|
||||
maxTokens,
|
||||
signal: controller.signal,
|
||||
...(onPayload ? { onPayload } : {}),
|
||||
});
|
||||
|
||||
try {
|
||||
const message = await completeImage();
|
||||
try {
|
||||
const text = coerceImageAssistantText({
|
||||
message,
|
||||
provider: model.provider,
|
||||
model: model.id,
|
||||
});
|
||||
return { text, model: model.id };
|
||||
} catch (err) {
|
||||
if (!isImageModelNoTextError(err) || !hasImageReasoningOnlyResponse(message)) {
|
||||
throw err;
|
||||
}
|
||||
}
|
||||
|
||||
const retryMessage = await completeImage(disableReasoningForImageRetryPayload);
|
||||
const text = coerceImageAssistantText({
|
||||
message: retryMessage,
|
||||
provider: model.provider,
|
||||
model: model.id,
|
||||
});
|
||||
return { text, model: model.id };
|
||||
} finally {
|
||||
clearTimeout(timeout);
|
||||
});
|
||||
const text = coerceImageAssistantText({
|
||||
message,
|
||||
provider: model.provider,
|
||||
model: model.id,
|
||||
});
|
||||
return { text, model: model.id };
|
||||
}
|
||||
}
|
||||
|
||||
export async function describeImageWithModel(
|
||||
|
||||
Reference in New Issue
Block a user