fix: handle reasoning-only image responses (#69444)

Signed-off-by: sallyom <somalley@redhat.com>
Sally O'Malley authored on 2026-04-21 00:20:23 -04:00; committed by GitHub
parent 1303b03241
commit 62aff9aa56
5 changed files with 393 additions and 14 deletions

View File

@@ -8,6 +8,76 @@ import { coerceToolModelConfig, type ToolModelConfig } from "./model-config.help
export type ImageModelConfig = ToolModelConfig;
const IMAGE_REASONING_FALLBACK_SIGNATURES = new Set([
"reasoning_content",
"reasoning",
"reasoning_details",
"reasoning_text",
]);
const MAX_IMAGE_REASONING_FALLBACK_BLOCKS = 50;
const MAX_IMAGE_REASONING_SIGNATURE_PARSE_CHARS = 2_048;
const MAX_IMAGE_REASONING_SIGNATURE_SCAN_CHARS = 65_536;
function hasResponsesReasoningSignatureMarkers(value: string): boolean {
const scanned = value.slice(0, MAX_IMAGE_REASONING_SIGNATURE_SCAN_CHARS);
return /"id"\s*:\s*"rs_/.test(scanned) && /"type"\s*:\s*"reasoning(?:[."])/.test(scanned);
}
function isImageReasoningFallbackSignature(value: unknown): boolean {
if (!value) {
return false;
}
if (typeof value === "string") {
if (IMAGE_REASONING_FALLBACK_SIGNATURES.has(value)) {
return true;
}
const trimmed = value.trim();
if (!trimmed.startsWith("{") || !trimmed.endsWith("}")) {
return false;
}
if (trimmed.length > MAX_IMAGE_REASONING_SIGNATURE_PARSE_CHARS) {
return hasResponsesReasoningSignatureMarkers(trimmed);
}
try {
return isImageReasoningFallbackSignature(JSON.parse(trimmed));
} catch {
return false;
}
}
if (typeof value !== "object") {
return false;
}
const record = value as { id?: unknown; type?: unknown };
const id = typeof record.id === "string" ? record.id : "";
const type = typeof record.type === "string" ? record.type : "";
return id.startsWith("rs_") && (type === "reasoning" || type.startsWith("reasoning."));
}
export function hasImageReasoningOnlyResponse(message: AssistantMessage): boolean {
if (extractAssistantText(message).trim() || !Array.isArray(message.content)) {
return false;
}
let checkedBlocks = 0;
for (const block of message.content) {
checkedBlocks += 1;
if (checkedBlocks > MAX_IMAGE_REASONING_FALLBACK_BLOCKS) {
break;
}
if (!block || typeof block !== "object") {
continue;
}
const record = block as { type?: unknown; thinking?: unknown; thinkingSignature?: unknown };
if (
record.type === "thinking" &&
typeof record.thinking === "string" &&
isImageReasoningFallbackSignature(record.thinkingSignature)
) {
return true;
}
}
return false;
}
export function decodeDataUrl(
dataUrl: string,
opts?: { maxBytes?: number },

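For orientation, and not part of the diff: a minimal usage sketch of the new detection helper, assuming an AssistantMessage whose content is an array of typed blocks (the same shape the tests below construct). The object literals and the "as never" casts are illustrative only.

// Illustrative only — hand-built literals, not real provider output.
const reasoningOnly = {
  role: "assistant",
  content: [
    { type: "thinking", thinking: "internal notes", thinkingSignature: "reasoning_content" },
  ],
};
// No assistant text, and the thinking block carries a known reasoning signature,
// so the message is classified as a retryable reasoning-only response.
hasImageReasoningOnlyResponse(reasoningOnly as never); // true

const withText = {
  role: "assistant",
  content: [{ type: "text", text: "a cat on a sofa" }],
};
// The message already contains assistant text (and no signed thinking block),
// so detection returns false and no retry is attempted.
hasImageReasoningOnlyResponse(withText as never); // false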
View File

@@ -1439,4 +1439,114 @@ describe("image tool response validation", () => {
});
expect(text).toBe("hello");
});
it.each(["reasoning_content", "reasoning", "reasoning_details", "reasoning_text"])(
"detects %s as a retryable image reasoning-only response",
(thinkingSignature) => {
const message = createAssistantMessage({
content: [
{
type: "thinking",
thinking: " <think>private</think> maybe a cat ",
thinkingSignature,
},
],
});
expect(__testing.hasImageReasoningOnlyResponse(message as never)).toBe(true);
expect(() =>
__testing.coerceImageAssistantText({
provider: "openai",
model: "gpt-5.4-mini",
message: message as never,
}),
).toThrow(/returned no text/i);
},
);
it.each([
JSON.stringify({ id: "rs_123", type: "reasoning" }),
{ id: "rs_456", type: "reasoning.encrypted" },
])(
"detects Responses reasoning signature as a retryable image reasoning-only response",
(thinkingSignature) => {
const message = createAssistantMessage({
content: [
{
type: "thinking",
thinking: " <think>private</think> maybe a cat ",
thinkingSignature,
},
],
});
expect(__testing.hasImageReasoningOnlyResponse(message as never)).toBe(true);
expect(() =>
__testing.coerceImageAssistantText({
provider: "openai",
model: "gpt-5.4-mini",
message: message as never,
}),
).toThrow(/returned no text/i);
},
);
it("detects oversized JSON reasoning signatures without parsing the whole payload", () => {
const message = createAssistantMessage({
content: [
{
type: "thinking",
thinking: "retryable",
thinkingSignature: JSON.stringify({
id: "rs_123",
summary: [{ text: "x".repeat(2_100) }],
type: "reasoning",
}),
},
],
});
expect(__testing.hasImageReasoningOnlyResponse(message as never)).toBe(true);
});
it("ignores oversized JSON signatures without Responses reasoning markers", () => {
const message = createAssistantMessage({
content: [
{
type: "thinking",
thinking: "retryable",
thinkingSignature: `{"id":"not-reasoning","summary":"${"x".repeat(2_100)}"}`,
},
],
});
expect(__testing.hasImageReasoningOnlyResponse(message as never)).toBe(false);
});
it("detects signed reasoning-only responses with empty summary text", () => {
const message = createAssistantMessage({
content: [
{
type: "thinking",
thinking: "",
thinkingSignature: "reasoning_content",
},
],
});
expect(__testing.hasImageReasoningOnlyResponse(message as never)).toBe(true);
});
it("bounds reasoning-only detection before scanning every block", () => {
const message = createAssistantMessage({
content: [
...Array.from({ length: 50 }, () => ({ type: "thinking", thinking: "untagged" })),
{
type: "thinking",
thinking: "retryable",
thinkingSignature: "reasoning_content",
},
],
});
expect(__testing.hasImageReasoningOnlyResponse(message as never)).toBe(false);
});
});

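Background for the two oversized-signature tests above, not part of the diff: once a JSON-looking thinkingSignature exceeds MAX_IMAGE_REASONING_SIGNATURE_PARSE_CHARS (2,048 characters), the helper skips JSON.parse and instead runs hasResponsesReasoningSignatureMarkers over at most the first 65,536 characters. A rough sketch of what that regex fast path accepts and rejects follows; the function is module-private in the commit, so calling it directly as shown is illustrative only.

// Both markers must be present: an "rs_"-prefixed id and a "reasoning" type.
hasResponsesReasoningSignatureMarkers('{"id":"rs_123","type":"reasoning","summary":[{"text":"..."}]}'); // true
hasResponsesReasoningSignatureMarkers('{"id":"rs_456","type":"reasoning.encrypted"}'); // true
// No "rs_" id, so an oversized payload like this is not treated as a reasoning signature.
hasResponsesReasoningSignatureMarkers('{"id":"not-reasoning","summary":"padding"}'); // false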
View File

@@ -19,6 +19,7 @@ import {
coerceImageAssistantText,
coerceImageModelConfig,
decodeDataUrl,
hasImageReasoningOnlyResponse,
type ImageModelConfig,
resolveProviderVisionModelFromConfig,
} from "./image-tool.helpers.js";
@@ -58,6 +59,7 @@ const imageToolProviderDeps = {
export const __testing = {
decodeDataUrl,
coerceImageAssistantText,
hasImageReasoningOnlyResponse,
resolveImageToolMaxTokens,
setProviderDepsForTest(overrides?: {
buildProviderRegistry?: typeof buildProviderRegistry;

View File

@@ -233,6 +233,123 @@ describe("describeImageWithModel", () => {
expect(context?.messages?.[0]?.content).toHaveLength(1);
});
it.each([
{
name: "direct OpenAI Responses baseUrl",
provider: "openai",
model: {
api: "openai-responses",
provider: "openai",
id: "gpt-5.4-mini",
input: ["text", "image"],
baseUrl: "https://api.openai.com/v1",
},
expectedRetryPayload: {
reasoning: { effort: "none" },
},
},
{
name: "default OpenAI Responses route without explicit baseUrl",
provider: "openai",
model: {
api: "openai-responses",
provider: "openai",
id: "gpt-5.4-mini",
input: ["text", "image"],
},
expectedRetryPayload: {
reasoning: { effort: "none" },
},
},
{
name: "azure-openai provider using openai-responses api",
provider: "azure-openai",
model: {
api: "openai-responses",
provider: "azure-openai",
id: "gpt-5.4-mini",
input: ["text", "image"],
baseUrl: "https://myresource.openai.azure.com/openai/v1",
},
expectedRetryPayload: {
reasoning: { effort: "none" },
},
},
{
name: "proxy-like openai-responses route",
provider: "openai",
model: {
api: "openai-responses",
provider: "openai",
id: "gpt-5.4-mini",
input: ["text", "image"],
baseUrl: "https://proxy.example.com/v1",
},
expectedRetryPayload: {},
},
])(
"retries reasoning-only image responses with reasoning disabled for $name",
async ({ provider, model, expectedRetryPayload }) => {
discoverModelsMock.mockReturnValue({
find: vi.fn(() => model),
});
completeMock
.mockResolvedValueOnce({
role: "assistant",
api: model.api,
provider: model.provider,
model: model.id,
stopReason: "stop",
timestamp: Date.now(),
content: [
{
type: "thinking",
thinking: "internal image reasoning",
thinkingSignature: "reasoning_content",
},
],
})
.mockResolvedValueOnce({
role: "assistant",
api: model.api,
provider: model.provider,
model: model.id,
stopReason: "stop",
timestamp: Date.now(),
content: [{ type: "text", text: "retry ok" }],
});
const result = await describeImageWithModel({
cfg: {},
agentDir: "/tmp/openclaw-agent",
provider,
model: model.id,
buffer: Buffer.from("png-bytes"),
fileName: "image.png",
mime: "image/png",
prompt: "Describe the image.",
timeoutMs: 1000,
});
expect(result).toEqual({
text: "retry ok",
model: model.id,
});
expect(completeMock).toHaveBeenCalledTimes(2);
const [, , retryOptions] = completeMock.mock.calls[1] ?? [];
expect(retryOptions?.onPayload).toEqual(expect.any(Function));
const retryPayload = await retryOptions?.onPayload?.(
{
reasoning: { effort: "high", summary: "auto" },
reasoning_effort: "high",
include: ["reasoning.encrypted_content"],
},
completeMock.mock.calls[1]?.[0],
);
expect(retryPayload).toEqual(expectedRetryPayload);
},
);
it("normalizes deprecated google flash ids before lookup and keeps profile auth selection", async () => {
const findMock = vi.fn((provider: string, modelId: string) => {
expect(provider).toBe("google");

View File

@@ -1,4 +1,4 @@
import type { Api, Context, Model } from "@mariozechner/pi-ai";
import type { Api, Context, Model, ProviderStreamOptions } from "@mariozechner/pi-ai";
import { complete } from "@mariozechner/pi-ai";
import { isMinimaxVlmModel, minimaxUnderstandImage } from "../agents/minimax-vlm.js";
import {
@@ -8,7 +8,11 @@ import {
} from "../agents/model-auth.js";
import { normalizeModelRef } from "../agents/model-selection.js";
import { ensureOpenClawModelsJson } from "../agents/models-config.js";
import { coerceImageAssistantText } from "../agents/tools/image-tool.helpers.js";
import { resolveProviderRequestCapabilities } from "../agents/provider-attribution.js";
import {
coerceImageAssistantText,
hasImageReasoningOnlyResponse,
} from "../agents/tools/image-tool.helpers.js";
import type {
ImageDescriptionRequest,
ImageDescriptionResult,
@@ -36,6 +40,60 @@ function resolveImageToolMaxTokens(modelMaxTokens: number | undefined, requested
return Math.min(requestedMaxTokens, modelMaxTokens);
}
function isRecord(value: unknown): value is Record<string, unknown> {
return Boolean(value) && typeof value === "object" && !Array.isArray(value);
}
function isNativeResponsesReasoningPayload(model: Model<Api>): boolean {
if (
model.api !== "openai-responses" &&
model.api !== "azure-openai-responses" &&
model.api !== "openai-codex-responses"
) {
return false;
}
return resolveProviderRequestCapabilities({
provider: model.provider,
api: model.api,
baseUrl: model.baseUrl,
capability: "image",
transport: "media-understanding",
}).usesKnownNativeOpenAIRoute;
}
function removeReasoningInclude(value: unknown): unknown {
if (!Array.isArray(value)) {
return value;
}
const next = value.filter((entry) => entry !== "reasoning.encrypted_content");
return next.length > 0 ? next : undefined;
}
function disableReasoningForImageRetryPayload(payload: unknown, model: Model<Api>): unknown {
if (!isRecord(payload)) {
return undefined;
}
const next = { ...payload };
delete next.reasoning;
delete next.reasoning_effort;
const include = removeReasoningInclude(next.include);
if (include === undefined) {
delete next.include;
} else {
next.include = include;
}
if (isNativeResponsesReasoningPayload(model)) {
next.reasoning = { effort: "none" };
}
return next;
}
function isImageModelNoTextError(err: unknown): boolean {
return err instanceof Error && /^Image model returned no text\b/.test(err.message);
}
async function resolveImageRuntime(params: {
cfg: ImageDescriptionRequest["cfg"];
agentDir: string;
@@ -195,19 +253,41 @@ export async function describeImagesWithModel(
params.timeoutMs > 0
? setTimeout(() => controller.abort(), params.timeoutMs)
: undefined;
const message = await complete(model, context, {
apiKey,
maxTokens: resolveImageToolMaxTokens(model.maxTokens, params.maxTokens ?? 512),
signal: controller.signal,
}).finally(() => {
const maxTokens = resolveImageToolMaxTokens(model.maxTokens, params.maxTokens ?? 512);
const completeImage = async (onPayload?: ProviderStreamOptions["onPayload"]) =>
await complete(model, context, {
apiKey,
maxTokens,
signal: controller.signal,
...(onPayload ? { onPayload } : {}),
});
try {
const message = await completeImage();
try {
const text = coerceImageAssistantText({
message,
provider: model.provider,
model: model.id,
});
return { text, model: model.id };
} catch (err) {
if (!isImageModelNoTextError(err) || !hasImageReasoningOnlyResponse(message)) {
throw err;
}
}
const retryMessage = await completeImage(disableReasoningForImageRetryPayload);
const text = coerceImageAssistantText({
message: retryMessage,
provider: model.provider,
model: model.id,
});
return { text, model: model.id };
} finally {
clearTimeout(timeout);
});
const text = coerceImageAssistantText({
message,
provider: model.provider,
model: model.id,
});
return { text, model: model.id };
}
}
export async function describeImageWithModel(
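
To make the retry payload concrete, matching the expectedRetryPayload cases in the tests above: a stripped-down, self-contained restatement of what disableReasoningForImageRetryPayload does. It is illustrative only; the shipped helper additionally calls resolveProviderRequestCapabilities to decide whether the route counts as a known native OpenAI Responses route, which the nativeResponsesRoute flag below stands in for.

// Illustrative sketch, not the shipped helper.
function sketchDisableReasoningForRetry(
  payload: Record<string, unknown>,
  nativeResponsesRoute: boolean,
): Record<string, unknown> {
  const next = { ...payload };
  // Drop both reasoning knobs so the retry asks for a plain text answer.
  delete next.reasoning;
  delete next.reasoning_effort;
  // Strip the encrypted-reasoning include entry; drop include entirely if that empties it.
  const includeValue = next.include;
  if (Array.isArray(includeValue)) {
    const filtered = includeValue.filter((entry) => entry !== "reasoning.encrypted_content");
    if (filtered.length > 0) {
      next.include = filtered;
    } else {
      delete next.include;
    }
  }
  // Known native OpenAI Responses routes get reasoning pinned to "none" rather than omitted.
  if (nativeResponsesRoute) {
    next.reasoning = { effort: "none" };
  }
  return next;
}

// With the payload the test above feeds to onPayload:
//   { reasoning: { effort: "high", summary: "auto" }, reasoning_effort: "high",
//     include: ["reasoning.encrypted_content"] }
// nativeResponsesRoute === true  -> { reasoning: { effort: "none" } }
// nativeResponsesRoute === false -> {}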