Retry reasoning-only image responses with reasoning disabled.

src/agents/tools/image-tool.helpers.ts
  Adds hasImageReasoningOnlyResponse(message). It returns true when the
  assistant message has no extractable text and, within the first 50 content
  blocks, contains a "thinking" block whose `thinking` is a string and whose
  `thinkingSignature` is either one of the names "reasoning_content",
  "reasoning", "reasoning_details", "reasoning_text", or a Responses reasoning
  item (an object, or a JSON string of one, whose id starts with "rs_" and
  whose type is "reasoning" or starts with "reasoning."). JSON strings longer
  than 2,048 characters are not parsed; their first 65,536 characters are
  scanned with regular expressions for the same id/type markers instead.

src/agents/tools/image-tool.ts
  Imports the new helper and exposes it through `__testing`.

src/media-understanding/image.ts
  describeImagesWithModel now retries once when coerceImageAssistantText throws
  an "Image model returned no text" error and the first message is
  reasoning-only. The retry passes an onPayload hook that drops `reasoning`,
  `reasoning_effort` and the "reasoning.encrypted_content" include entry, and
  sets `reasoning: { effort: "none" }` only when the model uses a Responses API
  and resolveProviderRequestCapabilities reports usesKnownNativeOpenAIRoute.
  Both attempts share one abort controller; the timeout is cleared in `finally`.

src/agents/tools/image-tool.test.ts, src/media-understanding/image.test.ts
  Cover signature detection, the oversized-signature scan, the 50-block bound,
  and the retry payload for native, Azure and proxy Responses routes.

Note: the final image.ts hunk carries a single line of leading context, and its
header ranges are stated to match.

diff --git a/src/agents/tools/image-tool.helpers.ts b/src/agents/tools/image-tool.helpers.ts
index 1cee912b11c..ead4656b080 100644
--- a/src/agents/tools/image-tool.helpers.ts
+++ b/src/agents/tools/image-tool.helpers.ts
@@ -8,6 +8,76 @@ import { coerceToolModelConfig, type ToolModelConfig } from "./model-config.help
 
 export type ImageModelConfig = ToolModelConfig;
 
+const IMAGE_REASONING_FALLBACK_SIGNATURES = new Set([
+  "reasoning_content",
+  "reasoning",
+  "reasoning_details",
+  "reasoning_text",
+]);
+const MAX_IMAGE_REASONING_FALLBACK_BLOCKS = 50;
+const MAX_IMAGE_REASONING_SIGNATURE_PARSE_CHARS = 2_048;
+const MAX_IMAGE_REASONING_SIGNATURE_SCAN_CHARS = 65_536;
+
+function hasResponsesReasoningSignatureMarkers(value: string): boolean {
+  const scanned = value.slice(0, MAX_IMAGE_REASONING_SIGNATURE_SCAN_CHARS);
+  return /"id"\s*:\s*"rs_/.test(scanned) && /"type"\s*:\s*"reasoning(?:[."])/.test(scanned);
+}
+
+function isImageReasoningFallbackSignature(value: unknown): boolean {
+  if (!value) {
+    return false;
+  }
+  if (typeof value === "string") {
+    if (IMAGE_REASONING_FALLBACK_SIGNATURES.has(value)) {
+      return true;
+    }
+    const trimmed = value.trim();
+    if (!trimmed.startsWith("{") || !trimmed.endsWith("}")) {
+      return false;
+    }
+    if (trimmed.length > MAX_IMAGE_REASONING_SIGNATURE_PARSE_CHARS) {
+      return hasResponsesReasoningSignatureMarkers(trimmed);
+    }
+    try {
+      return isImageReasoningFallbackSignature(JSON.parse(trimmed));
+    } catch {
+      return false;
+    }
+  }
+  if (typeof value !== "object") {
+    return false;
+  }
+  const record = value as { id?: unknown; type?: unknown };
+  const id = typeof record.id === "string" ? record.id : "";
+  const type = typeof record.type === "string" ? record.type : "";
+  return id.startsWith("rs_") && (type === "reasoning" || type.startsWith("reasoning."));
+}
+
+export function hasImageReasoningOnlyResponse(message: AssistantMessage): boolean {
+  if (extractAssistantText(message).trim() || !Array.isArray(message.content)) {
+    return false;
+  }
+  let checkedBlocks = 0;
+  for (const block of message.content) {
+    checkedBlocks += 1;
+    if (checkedBlocks > MAX_IMAGE_REASONING_FALLBACK_BLOCKS) {
+      break;
+    }
+    if (!block || typeof block !== "object") {
+      continue;
+    }
+    const record = block as { type?: unknown; thinking?: unknown; thinkingSignature?: unknown };
+    if (
+      record.type === "thinking" &&
+      typeof record.thinking === "string" &&
+      isImageReasoningFallbackSignature(record.thinkingSignature)
+    ) {
+      return true;
+    }
+  }
+  return false;
+}
+
 export function decodeDataUrl(
   dataUrl: string,
   opts?: { maxBytes?: number },
diff --git a/src/agents/tools/image-tool.test.ts b/src/agents/tools/image-tool.test.ts
index f3c2acd1de8..62aa6c48790 100644
--- a/src/agents/tools/image-tool.test.ts
+++ b/src/agents/tools/image-tool.test.ts
@@ -1439,4 +1439,114 @@ describe("image tool response validation", () => {
     });
     expect(text).toBe("hello");
   });
+
+  it.each(["reasoning_content", "reasoning", "reasoning_details", "reasoning_text"])(
+    "detects %s as a retryable image reasoning-only response",
+    (thinkingSignature) => {
+      const message = createAssistantMessage({
+        content: [
+          {
+            type: "thinking",
+            thinking: " private maybe a cat ",
+            thinkingSignature,
+          },
+        ],
+      });
+      expect(__testing.hasImageReasoningOnlyResponse(message as never)).toBe(true);
+      expect(() =>
+        __testing.coerceImageAssistantText({
+          provider: "openai",
+          model: "gpt-5.4-mini",
+          message: message as never,
+        }),
+      ).toThrow(/returned no text/i);
+    },
+  );
+
+  it.each([
+    JSON.stringify({ id: "rs_123", type: "reasoning" }),
+    { id: "rs_456", type: "reasoning.encrypted" },
+  ])(
+    "detects Responses reasoning signature as a retryable image reasoning-only response",
+    (thinkingSignature) => {
+      const message = createAssistantMessage({
+        content: [
+          {
+            type: "thinking",
+            thinking: " private maybe a cat ",
+            thinkingSignature,
+          },
+        ],
+      });
+      expect(__testing.hasImageReasoningOnlyResponse(message as never)).toBe(true);
+      expect(() =>
+        __testing.coerceImageAssistantText({
+          provider: "openai",
+          model: "gpt-5.4-mini",
+          message: message as never,
+        }),
+      ).toThrow(/returned no text/i);
+    },
+  );
+
+  it("detects oversized JSON reasoning signatures without parsing the whole payload", () => {
+    const message = createAssistantMessage({
+      content: [
+        {
+          type: "thinking",
+          thinking: "retryable",
+          thinkingSignature: JSON.stringify({
+            id: "rs_123",
+            summary: [{ text: "x".repeat(2_100) }],
+            type: "reasoning",
+          }),
+        },
+      ],
+    });
+
+    expect(__testing.hasImageReasoningOnlyResponse(message as never)).toBe(true);
+  });
+
+  it("ignores oversized JSON signatures without Responses reasoning markers", () => {
+    const message = createAssistantMessage({
+      content: [
+        {
+          type: "thinking",
+          thinking: "retryable",
+          thinkingSignature: `{"id":"not-reasoning","summary":"${"x".repeat(2_100)}"}`,
+        },
+      ],
+    });
+
+    expect(__testing.hasImageReasoningOnlyResponse(message as never)).toBe(false);
+  });
+
+  it("detects signed reasoning-only responses with empty summary text", () => {
+    const message = createAssistantMessage({
+      content: [
+        {
+          type: "thinking",
+          thinking: "",
+          thinkingSignature: "reasoning_content",
+        },
+      ],
+    });
+
+    expect(__testing.hasImageReasoningOnlyResponse(message as never)).toBe(true);
+  });
+
+  it("bounds reasoning-only detection before scanning every block", () => {
+    const message = createAssistantMessage({
+      content: [
+        ...Array.from({ length: 50 }, () => ({ type: "thinking", thinking: "untagged" })),
+        {
+          type: "thinking",
+          thinking: "retryable",
+          thinkingSignature: "reasoning_content",
+        },
+      ],
+    });
+
+    expect(__testing.hasImageReasoningOnlyResponse(message as never)).toBe(false);
+  });
 });
diff --git a/src/agents/tools/image-tool.ts b/src/agents/tools/image-tool.ts
index e94481447b7..c34a5590bbc 100644
--- a/src/agents/tools/image-tool.ts
+++ b/src/agents/tools/image-tool.ts
@@ -19,6 +19,7 @@ import {
   coerceImageAssistantText,
   coerceImageModelConfig,
   decodeDataUrl,
+  hasImageReasoningOnlyResponse,
   type ImageModelConfig,
   resolveProviderVisionModelFromConfig,
 } from "./image-tool.helpers.js";
@@ -58,6 +59,7 @@ const imageToolProviderDeps = {
 export const __testing = {
   decodeDataUrl,
   coerceImageAssistantText,
+  hasImageReasoningOnlyResponse,
   resolveImageToolMaxTokens,
   setProviderDepsForTest(overrides?: {
     buildProviderRegistry?: typeof buildProviderRegistry;
diff --git a/src/media-understanding/image.test.ts b/src/media-understanding/image.test.ts
index 16750172c2b..d603eb3969a 100644
--- a/src/media-understanding/image.test.ts
+++ b/src/media-understanding/image.test.ts
@@ -233,6 +233,123 @@ describe("describeImageWithModel", () => {
     expect(context?.messages?.[0]?.content).toHaveLength(1);
   });
 
+  it.each([
+    {
+      name: "direct OpenAI Responses baseUrl",
+      provider: "openai",
+      model: {
+        api: "openai-responses",
+        provider: "openai",
+        id: "gpt-5.4-mini",
+        input: ["text", "image"],
+        baseUrl: "https://api.openai.com/v1",
+      },
+      expectedRetryPayload: {
+        reasoning: { effort: "none" },
+      },
+    },
+    {
+      name: "default OpenAI Responses route without explicit baseUrl",
+      provider: "openai",
+      model: {
+        api: "openai-responses",
+        provider: "openai",
+        id: "gpt-5.4-mini",
+        input: ["text", "image"],
+      },
+      expectedRetryPayload: {
+        reasoning: { effort: "none" },
+      },
+    },
+    {
+      name: "azure-openai provider using openai-responses api",
+      provider: "azure-openai",
+      model: {
+        api: "openai-responses",
+        provider: "azure-openai",
+        id: "gpt-5.4-mini",
+        input: ["text", "image"],
+        baseUrl: "https://myresource.openai.azure.com/openai/v1",
+      },
+      expectedRetryPayload: {
+        reasoning: { effort: "none" },
+      },
+    },
+    {
+      name: "proxy-like openai-responses route",
+      provider: "openai",
+      model: {
+        api: "openai-responses",
+        provider: "openai",
+        id: "gpt-5.4-mini",
+        input: ["text", "image"],
+        baseUrl: "https://proxy.example.com/v1",
+      },
+      expectedRetryPayload: {},
+    },
+  ])(
+    "retries reasoning-only image responses with reasoning disabled for $name",
+    async ({ provider, model, expectedRetryPayload }) => {
+      discoverModelsMock.mockReturnValue({
+        find: vi.fn(() => model),
+      });
+      completeMock
+        .mockResolvedValueOnce({
+          role: "assistant",
+          api: model.api,
+          provider: model.provider,
+          model: model.id,
+          stopReason: "stop",
+          timestamp: Date.now(),
+          content: [
+            {
+              type: "thinking",
+              thinking: "internal image reasoning",
+              thinkingSignature: "reasoning_content",
+            },
+          ],
+        })
+        .mockResolvedValueOnce({
+          role: "assistant",
+          api: model.api,
+          provider: model.provider,
+          model: model.id,
+          stopReason: "stop",
+          timestamp: Date.now(),
+          content: [{ type: "text", text: "retry ok" }],
+        });
+
+      const result = await describeImageWithModel({
+        cfg: {},
+        agentDir: "/tmp/openclaw-agent",
+        provider,
+        model: model.id,
+        buffer: Buffer.from("png-bytes"),
+        fileName: "image.png",
+        mime: "image/png",
+        prompt: "Describe the image.",
+        timeoutMs: 1000,
+      });
+
+      expect(result).toEqual({
+        text: "retry ok",
+        model: model.id,
+      });
+      expect(completeMock).toHaveBeenCalledTimes(2);
+      const [, , retryOptions] = completeMock.mock.calls[1] ?? [];
+      expect(retryOptions?.onPayload).toEqual(expect.any(Function));
+      const retryPayload = await retryOptions?.onPayload?.(
+        {
+          reasoning: { effort: "high", summary: "auto" },
+          reasoning_effort: "high",
+          include: ["reasoning.encrypted_content"],
+        },
+        completeMock.mock.calls[1]?.[0],
+      );
+      expect(retryPayload).toEqual(expectedRetryPayload);
+    },
+  );
+
   it("normalizes deprecated google flash ids before lookup and keeps profile auth selection", async () => {
     const findMock = vi.fn((provider: string, modelId: string) => {
       expect(provider).toBe("google");
diff --git a/src/media-understanding/image.ts b/src/media-understanding/image.ts
index 4de9c000da5..5fd24d8b26e 100644
--- a/src/media-understanding/image.ts
+++ b/src/media-understanding/image.ts
@@ -1,4 +1,4 @@
-import type { Api, Context, Model } from "@mariozechner/pi-ai";
+import type { Api, Context, Model, ProviderStreamOptions } from "@mariozechner/pi-ai";
 import { complete } from "@mariozechner/pi-ai";
 import { isMinimaxVlmModel, minimaxUnderstandImage } from "../agents/minimax-vlm.js";
 import {
@@ -8,7 +8,11 @@ import {
 } from "../agents/model-auth.js";
 import { normalizeModelRef } from "../agents/model-selection.js";
 import { ensureOpenClawModelsJson } from "../agents/models-config.js";
-import { coerceImageAssistantText } from "../agents/tools/image-tool.helpers.js";
+import { resolveProviderRequestCapabilities } from "../agents/provider-attribution.js";
+import {
+  coerceImageAssistantText,
+  hasImageReasoningOnlyResponse,
+} from "../agents/tools/image-tool.helpers.js";
 import type {
   ImageDescriptionRequest,
   ImageDescriptionResult,
@@ -36,6 +40,60 @@ function resolveImageToolMaxTokens(modelMaxTokens: number | undefined, requested
   return Math.min(requestedMaxTokens, modelMaxTokens);
 }
 
+function isRecord(value: unknown): value is Record<string, unknown> {
+  return Boolean(value) && typeof value === "object" && !Array.isArray(value);
+}
+
+function isNativeResponsesReasoningPayload(model: Model<Api>): boolean {
+  if (
+    model.api !== "openai-responses" &&
+    model.api !== "azure-openai-responses" &&
+    model.api !== "openai-codex-responses"
+  ) {
+    return false;
+  }
+  return resolveProviderRequestCapabilities({
+    provider: model.provider,
+    api: model.api,
+    baseUrl: model.baseUrl,
+    capability: "image",
+    transport: "media-understanding",
+  }).usesKnownNativeOpenAIRoute;
+}
+
+function removeReasoningInclude(value: unknown): unknown {
+  if (!Array.isArray(value)) {
+    return value;
+  }
+  const next = value.filter((entry) => entry !== "reasoning.encrypted_content");
+  return next.length > 0 ? next : undefined;
+}
+
+function disableReasoningForImageRetryPayload(payload: unknown, model: Model<Api>): unknown {
+  if (!isRecord(payload)) {
+    return undefined;
+  }
+  const next = { ...payload };
+  delete next.reasoning;
+  delete next.reasoning_effort;
+
+  const include = removeReasoningInclude(next.include);
+  if (include === undefined) {
+    delete next.include;
+  } else {
+    next.include = include;
+  }
+
+  if (isNativeResponsesReasoningPayload(model)) {
+    next.reasoning = { effort: "none" };
+  }
+  return next;
+}
+
+function isImageModelNoTextError(err: unknown): boolean {
+  return err instanceof Error && /^Image model returned no text\b/.test(err.message);
+}
+
 async function resolveImageRuntime(params: {
   cfg: ImageDescriptionRequest["cfg"];
   agentDir: string;
@@ -197,17 +255,39 @@ export async function describeImagesWithModel(
     params.timeoutMs > 0 ? setTimeout(() => controller.abort(), params.timeoutMs) : undefined;
-  const message = await complete(model, context, {
-    apiKey,
-    maxTokens: resolveImageToolMaxTokens(model.maxTokens, params.maxTokens ?? 512),
-    signal: controller.signal,
-  }).finally(() => {
+
+  const maxTokens = resolveImageToolMaxTokens(model.maxTokens, params.maxTokens ?? 512);
+  const completeImage = async (onPayload?: ProviderStreamOptions["onPayload"]) =>
+    await complete(model, context, {
+      apiKey,
+      maxTokens,
+      signal: controller.signal,
+      ...(onPayload ? { onPayload } : {}),
+    });
+
+  try {
+    const message = await completeImage();
+    try {
+      const text = coerceImageAssistantText({
+        message,
+        provider: model.provider,
+        model: model.id,
+      });
+      return { text, model: model.id };
+    } catch (err) {
+      if (!isImageModelNoTextError(err) || !hasImageReasoningOnlyResponse(message)) {
+        throw err;
+      }
+    }
+
+    const retryMessage = await completeImage(disableReasoningForImageRetryPayload);
+    const text = coerceImageAssistantText({
+      message: retryMessage,
+      provider: model.provider,
+      model: model.id,
+    });
+    return { text, model: model.id };
+  } finally {
     clearTimeout(timeout);
-  });
-  const text = coerceImageAssistantText({
-    message,
-    provider: model.provider,
-    model: model.id,
-  });
-  return { text, model: model.id };
+  }
 }
 
 export async function describeImageWithModel(