mirror of
https://github.com/openclaw/openclaw.git
synced 2026-05-06 07:00:43 +00:00
fix: handle reasoning-only image responses (#69444)
Signed-off-by: sallyom <somalley@redhat.com>
This commit is contained in:
@@ -8,6 +8,76 @@ import { coerceToolModelConfig, type ToolModelConfig } from "./model-config.help
|
||||
|
||||
export type ImageModelConfig = ToolModelConfig;

// Provider-emitted `thinkingSignature` strings that mark a thinking block as
// reasoning-only fallback output (the model produced reasoning but no text).
const IMAGE_REASONING_FALLBACK_SIGNATURES = new Set([
  "reasoning_content",
  "reasoning",
  "reasoning_details",
  "reasoning_text",
]);

// Stop scanning a message's content after this many blocks so detection stays
// bounded on pathological responses.
const MAX_IMAGE_REASONING_FALLBACK_BLOCKS = 50;
// JSON signatures longer than this are not parsed; a regex marker probe is
// used instead (see hasResponsesReasoningSignatureMarkers).
const MAX_IMAGE_REASONING_SIGNATURE_PARSE_CHARS = 2_048;
// Cap on how much of an oversized signature the marker regexes will scan.
const MAX_IMAGE_REASONING_SIGNATURE_SCAN_CHARS = 65_536;
|
||||
|
||||
function hasResponsesReasoningSignatureMarkers(value: string): boolean {
|
||||
const scanned = value.slice(0, MAX_IMAGE_REASONING_SIGNATURE_SCAN_CHARS);
|
||||
return /"id"\s*:\s*"rs_/.test(scanned) && /"type"\s*:\s*"reasoning(?:[."])/.test(scanned);
|
||||
}
|
||||
|
||||
function isImageReasoningFallbackSignature(value: unknown): boolean {
|
||||
if (!value) {
|
||||
return false;
|
||||
}
|
||||
if (typeof value === "string") {
|
||||
if (IMAGE_REASONING_FALLBACK_SIGNATURES.has(value)) {
|
||||
return true;
|
||||
}
|
||||
const trimmed = value.trim();
|
||||
if (!trimmed.startsWith("{") || !trimmed.endsWith("}")) {
|
||||
return false;
|
||||
}
|
||||
if (trimmed.length > MAX_IMAGE_REASONING_SIGNATURE_PARSE_CHARS) {
|
||||
return hasResponsesReasoningSignatureMarkers(trimmed);
|
||||
}
|
||||
try {
|
||||
return isImageReasoningFallbackSignature(JSON.parse(trimmed));
|
||||
} catch {
|
||||
return false;
|
||||
}
|
||||
}
|
||||
if (typeof value !== "object") {
|
||||
return false;
|
||||
}
|
||||
const record = value as { id?: unknown; type?: unknown };
|
||||
const id = typeof record.id === "string" ? record.id : "";
|
||||
const type = typeof record.type === "string" ? record.type : "";
|
||||
return id.startsWith("rs_") && (type === "reasoning" || type.startsWith("reasoning."));
|
||||
}
|
||||
|
||||
export function hasImageReasoningOnlyResponse(message: AssistantMessage): boolean {
|
||||
if (extractAssistantText(message).trim() || !Array.isArray(message.content)) {
|
||||
return false;
|
||||
}
|
||||
let checkedBlocks = 0;
|
||||
for (const block of message.content) {
|
||||
checkedBlocks += 1;
|
||||
if (checkedBlocks > MAX_IMAGE_REASONING_FALLBACK_BLOCKS) {
|
||||
break;
|
||||
}
|
||||
if (!block || typeof block !== "object") {
|
||||
continue;
|
||||
}
|
||||
const record = block as { type?: unknown; thinking?: unknown; thinkingSignature?: unknown };
|
||||
if (
|
||||
record.type === "thinking" &&
|
||||
typeof record.thinking === "string" &&
|
||||
isImageReasoningFallbackSignature(record.thinkingSignature)
|
||||
) {
|
||||
return true;
|
||||
}
|
||||
}
|
||||
return false;
|
||||
}
|
||||
|
||||
export function decodeDataUrl(
|
||||
dataUrl: string,
|
||||
opts?: { maxBytes?: number },
|
||||
|
||||
@@ -1439,4 +1439,114 @@ describe("image tool response validation", () => {
|
||||
});
|
||||
expect(text).toBe("hello");
|
||||
});
|
||||
|
||||
  // Each legacy provider signature string flags the response as reasoning-only
  // and makes text coercion throw the retryable "no text" error.
  it.each(["reasoning_content", "reasoning", "reasoning_details", "reasoning_text"])(
    "detects %s as a retryable image reasoning-only response",
    (thinkingSignature) => {
      const message = createAssistantMessage({
        content: [
          {
            type: "thinking",
            thinking: " <think>private</think> maybe a cat ",
            thinkingSignature,
          },
        ],
      });
      expect(__testing.hasImageReasoningOnlyResponse(message as never)).toBe(true);
      expect(() =>
        __testing.coerceImageAssistantText({
          provider: "openai",
          model: "gpt-5.4-mini",
          message: message as never,
        }),
      ).toThrow(/returned no text/i);
    },
  );
|
||||
|
||||
  // Responses-style signatures are recognized both as a JSON string and as a
  // plain object with an rs_ id and a reasoning type.
  it.each([
    JSON.stringify({ id: "rs_123", type: "reasoning" }),
    { id: "rs_456", type: "reasoning.encrypted" },
  ])(
    "detects Responses reasoning signature as a retryable image reasoning-only response",
    (thinkingSignature) => {
      const message = createAssistantMessage({
        content: [
          {
            type: "thinking",
            thinking: " <think>private</think> maybe a cat ",
            thinkingSignature,
          },
        ],
      });
      expect(__testing.hasImageReasoningOnlyResponse(message as never)).toBe(true);
      expect(() =>
        __testing.coerceImageAssistantText({
          provider: "openai",
          model: "gpt-5.4-mini",
          message: message as never,
        }),
      ).toThrow(/returned no text/i);
    },
  );
|
||||
|
||||
  // Signatures above the parse cap (2_048 chars) must still be detected via
  // the regex marker probe rather than a full JSON.parse.
  it("detects oversized JSON reasoning signatures without parsing the whole payload", () => {
    const message = createAssistantMessage({
      content: [
        {
          type: "thinking",
          thinking: "retryable",
          thinkingSignature: JSON.stringify({
            id: "rs_123",
            summary: [{ text: "x".repeat(2_100) }],
            type: "reasoning",
          }),
        },
      ],
    });

    expect(__testing.hasImageReasoningOnlyResponse(message as never)).toBe(true);
  });
|
||||
|
||||
  // Oversized JSON lacking the rs_/reasoning markers must not be treated as a
  // reasoning signature.
  it("ignores oversized JSON signatures without Responses reasoning markers", () => {
    const message = createAssistantMessage({
      content: [
        {
          type: "thinking",
          thinking: "retryable",
          thinkingSignature: `{"id":"not-reasoning","summary":"${"x".repeat(2_100)}"}`,
        },
      ],
    });

    expect(__testing.hasImageReasoningOnlyResponse(message as never)).toBe(false);
  });
|
||||
|
||||
  // An empty thinking string still counts: the signature alone marks the
  // block as reasoning-only.
  it("detects signed reasoning-only responses with empty summary text", () => {
    const message = createAssistantMessage({
      content: [
        {
          type: "thinking",
          thinking: "",
          thinkingSignature: "reasoning_content",
        },
      ],
    });

    expect(__testing.hasImageReasoningOnlyResponse(message as never)).toBe(true);
  });
|
||||
|
||||
  // The scan is bounded: a matching block positioned after the 50-block cap
  // is deliberately ignored, so detection returns false here.
  it("bounds reasoning-only detection before scanning every block", () => {
    const message = createAssistantMessage({
      content: [
        ...Array.from({ length: 50 }, () => ({ type: "thinking", thinking: "untagged" })),
        {
          type: "thinking",
          thinking: "retryable",
          thinkingSignature: "reasoning_content",
        },
      ],
    });

    expect(__testing.hasImageReasoningOnlyResponse(message as never)).toBe(false);
  });
|
||||
});
|
||||
|
||||
@@ -19,6 +19,7 @@ import {
|
||||
coerceImageAssistantText,
|
||||
coerceImageModelConfig,
|
||||
decodeDataUrl,
|
||||
hasImageReasoningOnlyResponse,
|
||||
type ImageModelConfig,
|
||||
resolveProviderVisionModelFromConfig,
|
||||
} from "./image-tool.helpers.js";
|
||||
@@ -58,6 +59,7 @@ const imageToolProviderDeps = {
|
||||
export const __testing = {
|
||||
decodeDataUrl,
|
||||
coerceImageAssistantText,
|
||||
hasImageReasoningOnlyResponse,
|
||||
resolveImageToolMaxTokens,
|
||||
setProviderDepsForTest(overrides?: {
|
||||
buildProviderRegistry?: typeof buildProviderRegistry;
|
||||
|
||||
@@ -233,6 +233,123 @@ describe("describeImageWithModel", () => {
|
||||
expect(context?.messages?.[0]?.content).toHaveLength(1);
|
||||
});
|
||||
|
||||
  // Drives a reasoning-only first completion followed by a successful retry
  // and verifies the retry's onPayload hook strips reasoning fields — and, on
  // known native OpenAI Responses routes, pins `reasoning: { effort: "none" }`.
  it.each([
    {
      name: "direct OpenAI Responses baseUrl",
      provider: "openai",
      model: {
        api: "openai-responses",
        provider: "openai",
        id: "gpt-5.4-mini",
        input: ["text", "image"],
        baseUrl: "https://api.openai.com/v1",
      },
      expectedRetryPayload: {
        reasoning: { effort: "none" },
      },
    },
    {
      name: "default OpenAI Responses route without explicit baseUrl",
      provider: "openai",
      model: {
        api: "openai-responses",
        provider: "openai",
        id: "gpt-5.4-mini",
        input: ["text", "image"],
      },
      expectedRetryPayload: {
        reasoning: { effort: "none" },
      },
    },
    {
      name: "azure-openai provider using openai-responses api",
      provider: "azure-openai",
      model: {
        api: "openai-responses",
        provider: "azure-openai",
        id: "gpt-5.4-mini",
        input: ["text", "image"],
        baseUrl: "https://myresource.openai.azure.com/openai/v1",
      },
      expectedRetryPayload: {
        reasoning: { effort: "none" },
      },
    },
    {
      name: "proxy-like openai-responses route",
      provider: "openai",
      model: {
        api: "openai-responses",
        provider: "openai",
        id: "gpt-5.4-mini",
        input: ["text", "image"],
        baseUrl: "https://proxy.example.com/v1",
      },
      // Unknown (proxy) route: reasoning fields are removed but not re-pinned.
      expectedRetryPayload: {},
    },
  ])(
    "retries reasoning-only image responses with reasoning disabled for $name",
    async ({ provider, model, expectedRetryPayload }) => {
      discoverModelsMock.mockReturnValue({
        find: vi.fn(() => model),
      });
      // First completion yields only a signed thinking block (no text); the
      // second completion is the retry and returns usable text.
      completeMock
        .mockResolvedValueOnce({
          role: "assistant",
          api: model.api,
          provider: model.provider,
          model: model.id,
          stopReason: "stop",
          timestamp: Date.now(),
          content: [
            {
              type: "thinking",
              thinking: "internal image reasoning",
              thinkingSignature: "reasoning_content",
            },
          ],
        })
        .mockResolvedValueOnce({
          role: "assistant",
          api: model.api,
          provider: model.provider,
          model: model.id,
          stopReason: "stop",
          timestamp: Date.now(),
          content: [{ type: "text", text: "retry ok" }],
        });

      const result = await describeImageWithModel({
        cfg: {},
        agentDir: "/tmp/openclaw-agent",
        provider,
        model: model.id,
        buffer: Buffer.from("png-bytes"),
        fileName: "image.png",
        mime: "image/png",
        prompt: "Describe the image.",
        timeoutMs: 1000,
      });

      expect(result).toEqual({
        text: "retry ok",
        model: model.id,
      });
      expect(completeMock).toHaveBeenCalledTimes(2);
      // The retry call must carry an onPayload hook that rewrites the payload.
      const [, , retryOptions] = completeMock.mock.calls[1] ?? [];
      expect(retryOptions?.onPayload).toEqual(expect.any(Function));
      const retryPayload = await retryOptions?.onPayload?.(
        {
          reasoning: { effort: "high", summary: "auto" },
          reasoning_effort: "high",
          include: ["reasoning.encrypted_content"],
        },
        completeMock.mock.calls[1]?.[0],
      );
      expect(retryPayload).toEqual(expectedRetryPayload);
    },
  );
|
||||
|
||||
it("normalizes deprecated google flash ids before lookup and keeps profile auth selection", async () => {
|
||||
const findMock = vi.fn((provider: string, modelId: string) => {
|
||||
expect(provider).toBe("google");
|
||||
|
||||
@@ -1,4 +1,4 @@
|
||||
import type { Api, Context, Model } from "@mariozechner/pi-ai";
|
||||
import type { Api, Context, Model, ProviderStreamOptions } from "@mariozechner/pi-ai";
|
||||
import { complete } from "@mariozechner/pi-ai";
|
||||
import { isMinimaxVlmModel, minimaxUnderstandImage } from "../agents/minimax-vlm.js";
|
||||
import {
|
||||
@@ -8,7 +8,11 @@ import {
|
||||
} from "../agents/model-auth.js";
|
||||
import { normalizeModelRef } from "../agents/model-selection.js";
|
||||
import { ensureOpenClawModelsJson } from "../agents/models-config.js";
|
||||
import { coerceImageAssistantText } from "../agents/tools/image-tool.helpers.js";
|
||||
import { resolveProviderRequestCapabilities } from "../agents/provider-attribution.js";
|
||||
import {
|
||||
coerceImageAssistantText,
|
||||
hasImageReasoningOnlyResponse,
|
||||
} from "../agents/tools/image-tool.helpers.js";
|
||||
import type {
|
||||
ImageDescriptionRequest,
|
||||
ImageDescriptionResult,
|
||||
@@ -36,6 +40,60 @@ function resolveImageToolMaxTokens(modelMaxTokens: number | undefined, requested
|
||||
return Math.min(requestedMaxTokens, modelMaxTokens);
|
||||
}
|
||||
|
||||
function isRecord(value: unknown): value is Record<string, unknown> {
|
||||
return Boolean(value) && typeof value === "object" && !Array.isArray(value);
|
||||
}
|
||||
|
||||
function isNativeResponsesReasoningPayload(model: Model<Api>): boolean {
|
||||
if (
|
||||
model.api !== "openai-responses" &&
|
||||
model.api !== "azure-openai-responses" &&
|
||||
model.api !== "openai-codex-responses"
|
||||
) {
|
||||
return false;
|
||||
}
|
||||
return resolveProviderRequestCapabilities({
|
||||
provider: model.provider,
|
||||
api: model.api,
|
||||
baseUrl: model.baseUrl,
|
||||
capability: "image",
|
||||
transport: "media-understanding",
|
||||
}).usesKnownNativeOpenAIRoute;
|
||||
}
|
||||
|
||||
function removeReasoningInclude(value: unknown): unknown {
|
||||
if (!Array.isArray(value)) {
|
||||
return value;
|
||||
}
|
||||
const next = value.filter((entry) => entry !== "reasoning.encrypted_content");
|
||||
return next.length > 0 ? next : undefined;
|
||||
}
|
||||
|
||||
function disableReasoningForImageRetryPayload(payload: unknown, model: Model<Api>): unknown {
|
||||
if (!isRecord(payload)) {
|
||||
return undefined;
|
||||
}
|
||||
const next = { ...payload };
|
||||
delete next.reasoning;
|
||||
delete next.reasoning_effort;
|
||||
|
||||
const include = removeReasoningInclude(next.include);
|
||||
if (include === undefined) {
|
||||
delete next.include;
|
||||
} else {
|
||||
next.include = include;
|
||||
}
|
||||
|
||||
if (isNativeResponsesReasoningPayload(model)) {
|
||||
next.reasoning = { effort: "none" };
|
||||
}
|
||||
return next;
|
||||
}
|
||||
|
||||
function isImageModelNoTextError(err: unknown): boolean {
|
||||
return err instanceof Error && /^Image model returned no text\b/.test(err.message);
|
||||
}
|
||||
|
||||
async function resolveImageRuntime(params: {
|
||||
cfg: ImageDescriptionRequest["cfg"];
|
||||
agentDir: string;
|
||||
@@ -195,19 +253,41 @@ export async function describeImagesWithModel(
|
||||
params.timeoutMs > 0
|
||||
? setTimeout(() => controller.abort(), params.timeoutMs)
|
||||
: undefined;
|
||||
const message = await complete(model, context, {
|
||||
apiKey,
|
||||
maxTokens: resolveImageToolMaxTokens(model.maxTokens, params.maxTokens ?? 512),
|
||||
signal: controller.signal,
|
||||
}).finally(() => {
|
||||
|
||||
const maxTokens = resolveImageToolMaxTokens(model.maxTokens, params.maxTokens ?? 512);
|
||||
const completeImage = async (onPayload?: ProviderStreamOptions["onPayload"]) =>
|
||||
await complete(model, context, {
|
||||
apiKey,
|
||||
maxTokens,
|
||||
signal: controller.signal,
|
||||
...(onPayload ? { onPayload } : {}),
|
||||
});
|
||||
|
||||
try {
|
||||
const message = await completeImage();
|
||||
try {
|
||||
const text = coerceImageAssistantText({
|
||||
message,
|
||||
provider: model.provider,
|
||||
model: model.id,
|
||||
});
|
||||
return { text, model: model.id };
|
||||
} catch (err) {
|
||||
if (!isImageModelNoTextError(err) || !hasImageReasoningOnlyResponse(message)) {
|
||||
throw err;
|
||||
}
|
||||
}
|
||||
|
||||
const retryMessage = await completeImage(disableReasoningForImageRetryPayload);
|
||||
const text = coerceImageAssistantText({
|
||||
message: retryMessage,
|
||||
provider: model.provider,
|
||||
model: model.id,
|
||||
});
|
||||
return { text, model: model.id };
|
||||
} finally {
|
||||
clearTimeout(timeout);
|
||||
});
|
||||
const text = coerceImageAssistantText({
|
||||
message,
|
||||
provider: model.provider,
|
||||
model: model.id,
|
||||
});
|
||||
return { text, model: model.id };
|
||||
}
|
||||
}
|
||||
|
||||
export async function describeImageWithModel(
|
||||
|
||||
Reference in New Issue
Block a user