diff --git a/src/agents/tools/image-tool.helpers.ts b/src/agents/tools/image-tool.helpers.ts
index 1cee912b11c..ead4656b080 100644
--- a/src/agents/tools/image-tool.helpers.ts
+++ b/src/agents/tools/image-tool.helpers.ts
@@ -8,6 +8,76 @@ import { coerceToolModelConfig, type ToolModelConfig } from "./model-config.help
export type ImageModelConfig = ToolModelConfig;
+const IMAGE_REASONING_FALLBACK_SIGNATURES = new Set([
+ "reasoning_content",
+ "reasoning",
+ "reasoning_details",
+ "reasoning_text",
+]);
+const MAX_IMAGE_REASONING_FALLBACK_BLOCKS = 50;
+const MAX_IMAGE_REASONING_SIGNATURE_PARSE_CHARS = 2_048;
+const MAX_IMAGE_REASONING_SIGNATURE_SCAN_CHARS = 65_536;
+
+function hasResponsesReasoningSignatureMarkers(value: string): boolean {
+ const scanned = value.slice(0, MAX_IMAGE_REASONING_SIGNATURE_SCAN_CHARS);
+ return /"id"\s*:\s*"rs_/.test(scanned) && /"type"\s*:\s*"reasoning(?:[."])/.test(scanned);
+}
+
+function isImageReasoningFallbackSignature(value: unknown): boolean {
+ if (!value) {
+ return false;
+ }
+ if (typeof value === "string") {
+ if (IMAGE_REASONING_FALLBACK_SIGNATURES.has(value)) {
+ return true;
+ }
+ const trimmed = value.trim();
+ if (!trimmed.startsWith("{") || !trimmed.endsWith("}")) {
+ return false;
+ }
+ if (trimmed.length > MAX_IMAGE_REASONING_SIGNATURE_PARSE_CHARS) {
+ return hasResponsesReasoningSignatureMarkers(trimmed);
+ }
+ try {
+ return isImageReasoningFallbackSignature(JSON.parse(trimmed));
+ } catch {
+ return false;
+ }
+ }
+ if (typeof value !== "object") {
+ return false;
+ }
+ const record = value as { id?: unknown; type?: unknown };
+ const id = typeof record.id === "string" ? record.id : "";
+ const type = typeof record.type === "string" ? record.type : "";
+ return id.startsWith("rs_") && (type === "reasoning" || type.startsWith("reasoning."));
+}
+
+export function hasImageReasoningOnlyResponse(message: AssistantMessage): boolean {
+ if (extractAssistantText(message).trim() || !Array.isArray(message.content)) {
+ return false;
+ }
+ let checkedBlocks = 0;
+ for (const block of message.content) {
+ checkedBlocks += 1;
+ if (checkedBlocks > MAX_IMAGE_REASONING_FALLBACK_BLOCKS) {
+ break;
+ }
+ if (!block || typeof block !== "object") {
+ continue;
+ }
+ const record = block as { type?: unknown; thinking?: unknown; thinkingSignature?: unknown };
+ if (
+ record.type === "thinking" &&
+ typeof record.thinking === "string" &&
+ isImageReasoningFallbackSignature(record.thinkingSignature)
+ ) {
+ return true;
+ }
+ }
+ return false;
+}
+
export function decodeDataUrl(
dataUrl: string,
opts?: { maxBytes?: number },
diff --git a/src/agents/tools/image-tool.test.ts b/src/agents/tools/image-tool.test.ts
index f3c2acd1de8..62aa6c48790 100644
--- a/src/agents/tools/image-tool.test.ts
+++ b/src/agents/tools/image-tool.test.ts
@@ -1439,4 +1439,114 @@ describe("image tool response validation", () => {
});
expect(text).toBe("hello");
});
+
+ it.each(["reasoning_content", "reasoning", "reasoning_details", "reasoning_text"])(
+ "detects %s as a retryable image reasoning-only response",
+ (thinkingSignature) => {
+ const message = createAssistantMessage({
+ content: [
+ {
+ type: "thinking",
+ thinking: " private maybe a cat ",
+ thinkingSignature,
+ },
+ ],
+ });
+ expect(__testing.hasImageReasoningOnlyResponse(message as never)).toBe(true);
+ expect(() =>
+ __testing.coerceImageAssistantText({
+ provider: "openai",
+ model: "gpt-5.4-mini",
+ message: message as never,
+ }),
+ ).toThrow(/returned no text/i);
+ },
+ );
+
+ it.each([
+ JSON.stringify({ id: "rs_123", type: "reasoning" }),
+ { id: "rs_456", type: "reasoning.encrypted" },
+ ])(
+ "detects Responses reasoning signature as a retryable image reasoning-only response",
+ (thinkingSignature) => {
+ const message = createAssistantMessage({
+ content: [
+ {
+ type: "thinking",
+ thinking: " private maybe a cat ",
+ thinkingSignature,
+ },
+ ],
+ });
+ expect(__testing.hasImageReasoningOnlyResponse(message as never)).toBe(true);
+ expect(() =>
+ __testing.coerceImageAssistantText({
+ provider: "openai",
+ model: "gpt-5.4-mini",
+ message: message as never,
+ }),
+ ).toThrow(/returned no text/i);
+ },
+ );
+
+ it("detects oversized JSON reasoning signatures without parsing the whole payload", () => {
+ const message = createAssistantMessage({
+ content: [
+ {
+ type: "thinking",
+ thinking: "retryable",
+ thinkingSignature: JSON.stringify({
+ id: "rs_123",
+ summary: [{ text: "x".repeat(2_100) }],
+ type: "reasoning",
+ }),
+ },
+ ],
+ });
+
+ expect(__testing.hasImageReasoningOnlyResponse(message as never)).toBe(true);
+ });
+
+ it("ignores oversized JSON signatures without Responses reasoning markers", () => {
+ const message = createAssistantMessage({
+ content: [
+ {
+ type: "thinking",
+ thinking: "retryable",
+ thinkingSignature: `{"id":"not-reasoning","summary":"${"x".repeat(2_100)}"}`,
+ },
+ ],
+ });
+
+ expect(__testing.hasImageReasoningOnlyResponse(message as never)).toBe(false);
+ });
+
+ it("detects signed reasoning-only responses with empty summary text", () => {
+ const message = createAssistantMessage({
+ content: [
+ {
+ type: "thinking",
+ thinking: "",
+ thinkingSignature: "reasoning_content",
+ },
+ ],
+ });
+
+ expect(__testing.hasImageReasoningOnlyResponse(message as never)).toBe(true);
+ });
+
+ it("bounds reasoning-only detection before scanning every block", () => {
+ const message = createAssistantMessage({
+ content: [
+ ...Array.from({ length: 50 }, () => ({ type: "thinking", thinking: "untagged" })),
+ {
+ type: "thinking",
+ thinking: "retryable",
+ thinkingSignature: "reasoning_content",
+ },
+ ],
+ });
+
+ expect(__testing.hasImageReasoningOnlyResponse(message as never)).toBe(false);
+ });
});
diff --git a/src/agents/tools/image-tool.ts b/src/agents/tools/image-tool.ts
index e94481447b7..c34a5590bbc 100644
--- a/src/agents/tools/image-tool.ts
+++ b/src/agents/tools/image-tool.ts
@@ -19,6 +19,7 @@ import {
coerceImageAssistantText,
coerceImageModelConfig,
decodeDataUrl,
+ hasImageReasoningOnlyResponse,
type ImageModelConfig,
resolveProviderVisionModelFromConfig,
} from "./image-tool.helpers.js";
@@ -58,6 +59,7 @@ const imageToolProviderDeps = {
export const __testing = {
decodeDataUrl,
coerceImageAssistantText,
+ hasImageReasoningOnlyResponse,
resolveImageToolMaxTokens,
setProviderDepsForTest(overrides?: {
buildProviderRegistry?: typeof buildProviderRegistry;
diff --git a/src/media-understanding/image.test.ts b/src/media-understanding/image.test.ts
index 16750172c2b..d603eb3969a 100644
--- a/src/media-understanding/image.test.ts
+++ b/src/media-understanding/image.test.ts
@@ -233,6 +233,123 @@ describe("describeImageWithModel", () => {
expect(context?.messages?.[0]?.content).toHaveLength(1);
});
+ it.each([
+ {
+ name: "direct OpenAI Responses baseUrl",
+ provider: "openai",
+ model: {
+ api: "openai-responses",
+ provider: "openai",
+ id: "gpt-5.4-mini",
+ input: ["text", "image"],
+ baseUrl: "https://api.openai.com/v1",
+ },
+ expectedRetryPayload: {
+ reasoning: { effort: "none" },
+ },
+ },
+ {
+ name: "default OpenAI Responses route without explicit baseUrl",
+ provider: "openai",
+ model: {
+ api: "openai-responses",
+ provider: "openai",
+ id: "gpt-5.4-mini",
+ input: ["text", "image"],
+ },
+ expectedRetryPayload: {
+ reasoning: { effort: "none" },
+ },
+ },
+ {
+ name: "azure-openai provider using openai-responses api",
+ provider: "azure-openai",
+ model: {
+ api: "openai-responses",
+ provider: "azure-openai",
+ id: "gpt-5.4-mini",
+ input: ["text", "image"],
+ baseUrl: "https://myresource.openai.azure.com/openai/v1",
+ },
+ expectedRetryPayload: {
+ reasoning: { effort: "none" },
+ },
+ },
+ {
+ name: "proxy-like openai-responses route",
+ provider: "openai",
+ model: {
+ api: "openai-responses",
+ provider: "openai",
+ id: "gpt-5.4-mini",
+ input: ["text", "image"],
+ baseUrl: "https://proxy.example.com/v1",
+ },
+ expectedRetryPayload: {},
+ },
+ ])(
+ "retries reasoning-only image responses with reasoning disabled for $name",
+ async ({ provider, model, expectedRetryPayload }) => {
+ discoverModelsMock.mockReturnValue({
+ find: vi.fn(() => model),
+ });
+ completeMock
+ .mockResolvedValueOnce({
+ role: "assistant",
+ api: model.api,
+ provider: model.provider,
+ model: model.id,
+ stopReason: "stop",
+ timestamp: Date.now(),
+ content: [
+ {
+ type: "thinking",
+ thinking: "internal image reasoning",
+ thinkingSignature: "reasoning_content",
+ },
+ ],
+ })
+ .mockResolvedValueOnce({
+ role: "assistant",
+ api: model.api,
+ provider: model.provider,
+ model: model.id,
+ stopReason: "stop",
+ timestamp: Date.now(),
+ content: [{ type: "text", text: "retry ok" }],
+ });
+
+ const result = await describeImageWithModel({
+ cfg: {},
+ agentDir: "/tmp/openclaw-agent",
+ provider,
+ model: model.id,
+ buffer: Buffer.from("png-bytes"),
+ fileName: "image.png",
+ mime: "image/png",
+ prompt: "Describe the image.",
+ timeoutMs: 1000,
+ });
+
+ expect(result).toEqual({
+ text: "retry ok",
+ model: model.id,
+ });
+ expect(completeMock).toHaveBeenCalledTimes(2);
+ const [, , retryOptions] = completeMock.mock.calls[1] ?? [];
+ expect(retryOptions?.onPayload).toEqual(expect.any(Function));
+ const retryPayload = await retryOptions?.onPayload?.(
+ {
+ reasoning: { effort: "high", summary: "auto" },
+ reasoning_effort: "high",
+ include: ["reasoning.encrypted_content"],
+ },
+ completeMock.mock.calls[1]?.[0],
+ );
+ expect(retryPayload).toEqual(expectedRetryPayload);
+ },
+ );
+
it("normalizes deprecated google flash ids before lookup and keeps profile auth selection", async () => {
const findMock = vi.fn((provider: string, modelId: string) => {
expect(provider).toBe("google");
diff --git a/src/media-understanding/image.ts b/src/media-understanding/image.ts
index 4de9c000da5..5fd24d8b26e 100644
--- a/src/media-understanding/image.ts
+++ b/src/media-understanding/image.ts
@@ -1,4 +1,4 @@
-import type { Api, Context, Model } from "@mariozechner/pi-ai";
+import type { Api, Context, Model, ProviderStreamOptions } from "@mariozechner/pi-ai";
import { complete } from "@mariozechner/pi-ai";
import { isMinimaxVlmModel, minimaxUnderstandImage } from "../agents/minimax-vlm.js";
import {
@@ -8,7 +8,11 @@ import {
} from "../agents/model-auth.js";
import { normalizeModelRef } from "../agents/model-selection.js";
import { ensureOpenClawModelsJson } from "../agents/models-config.js";
-import { coerceImageAssistantText } from "../agents/tools/image-tool.helpers.js";
+import { resolveProviderRequestCapabilities } from "../agents/provider-attribution.js";
+import {
+ coerceImageAssistantText,
+ hasImageReasoningOnlyResponse,
+} from "../agents/tools/image-tool.helpers.js";
import type {
ImageDescriptionRequest,
ImageDescriptionResult,
@@ -36,6 +40,60 @@ function resolveImageToolMaxTokens(modelMaxTokens: number | undefined, requested
return Math.min(requestedMaxTokens, modelMaxTokens);
}
+function isRecord(value: unknown): value is Record<string, unknown> {
+ return Boolean(value) && typeof value === "object" && !Array.isArray(value);
+}
+
+function isNativeResponsesReasoningPayload(model: Model): boolean {
+ if (
+ model.api !== "openai-responses" &&
+ model.api !== "azure-openai-responses" &&
+ model.api !== "openai-codex-responses"
+ ) {
+ return false;
+ }
+ return resolveProviderRequestCapabilities({
+ provider: model.provider,
+ api: model.api,
+ baseUrl: model.baseUrl,
+ capability: "image",
+ transport: "media-understanding",
+ }).usesKnownNativeOpenAIRoute;
+}
+
+function removeReasoningInclude(value: unknown): unknown {
+ if (!Array.isArray(value)) {
+ return value;
+ }
+ const next = value.filter((entry) => entry !== "reasoning.encrypted_content");
+ return next.length > 0 ? next : undefined;
+}
+
+function disableReasoningForImageRetryPayload(payload: unknown, model: Model): unknown {
+ if (!isRecord(payload)) {
+ return undefined;
+ }
+ const next = { ...payload };
+ delete next.reasoning;
+ delete next.reasoning_effort;
+
+ const include = removeReasoningInclude(next.include);
+ if (include === undefined) {
+ delete next.include;
+ } else {
+ next.include = include;
+ }
+
+ if (isNativeResponsesReasoningPayload(model)) {
+ next.reasoning = { effort: "none" };
+ }
+ return next;
+}
+
+function isImageModelNoTextError(err: unknown): boolean {
+ return err instanceof Error && /^Image model returned no text\b/.test(err.message);
+}
+
async function resolveImageRuntime(params: {
cfg: ImageDescriptionRequest["cfg"];
agentDir: string;
@@ -195,19 +253,41 @@ export async function describeImagesWithModel(
params.timeoutMs > 0
? setTimeout(() => controller.abort(), params.timeoutMs)
: undefined;
- const message = await complete(model, context, {
- apiKey,
- maxTokens: resolveImageToolMaxTokens(model.maxTokens, params.maxTokens ?? 512),
- signal: controller.signal,
- }).finally(() => {
+
+ const maxTokens = resolveImageToolMaxTokens(model.maxTokens, params.maxTokens ?? 512);
+ const completeImage = async (onPayload?: ProviderStreamOptions["onPayload"]) =>
+ await complete(model, context, {
+ apiKey,
+ maxTokens,
+ signal: controller.signal,
+ ...(onPayload ? { onPayload } : {}),
+ });
+
+ try {
+ const message = await completeImage();
+ try {
+ const text = coerceImageAssistantText({
+ message,
+ provider: model.provider,
+ model: model.id,
+ });
+ return { text, model: model.id };
+ } catch (err) {
+ if (!isImageModelNoTextError(err) || !hasImageReasoningOnlyResponse(message)) {
+ throw err;
+ }
+ }
+
+ const retryMessage = await completeImage(disableReasoningForImageRetryPayload);
+ const text = coerceImageAssistantText({
+ message: retryMessage,
+ provider: model.provider,
+ model: model.id,
+ });
+ return { text, model: model.id };
+ } finally {
clearTimeout(timeout);
- });
- const text = coerceImageAssistantText({
- message,
- provider: model.provider,
- model: model.id,
- });
- return { text, model: model.id };
+ }
}
export async function describeImageWithModel(