From 9d3c56d236fbbf706ca676c4ef247f6697ae8d83 Mon Sep 17 00:00:00 2001 From: Altay Date: Sat, 25 Apr 2026 01:37:28 +0300 Subject: [PATCH] fix: don't classify 400/422 with no body as format error (#67024) * fix: keep no-body 400/422 failover errors out of format * fix: keep failover changelog entry in unreleased fixes --- CHANGELOG.md | 4 + src/agents/failover-error.test.ts | 109 +++++++++++++++- src/agents/failover-error.ts | 122 ++++++++++++++++-- ...dded-helpers.isbillingerrormessage.test.ts | 26 +++- src/agents/pi-embedded-helpers/errors.ts | 36 ++++++ 5 files changed, 280 insertions(+), 17 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 4d3f7304fbe..80eddf22c4b 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -8,6 +8,10 @@ Docs: https://docs.openclaw.ai - Matrix: require full cross-signing identity trust for self-device verification and add `openclaw matrix verify self` so operators can establish that trust from the CLI. (#70401) Thanks @gumadeiras. +### Fixes + +- Agents/failover: stop body-less HTTP 400/422 proxy failures from defaulting to `"format"` classification, so embedded retries surface the opaque provider failure instead of falling into a compaction loop. Fixes #66462. (#67024) Thanks @altaywtf and @HongzhuLiu. + ## 2026.4.24 ### Breaking diff --git a/src/agents/failover-error.test.ts b/src/agents/failover-error.test.ts index da9bc7e2723..86d6f167cb2 100644 --- a/src/agents/failover-error.test.ts +++ b/src/agents/failover-error.test.ts @@ -73,8 +73,105 @@ describe("failover-error", () => { expect(resolveFailoverReasonFromError({ status: 408 })).toBe("timeout"); expect(resolveFailoverReasonFromError({ status: 410 })).toBe("timeout"); expect(resolveFailoverReasonFromError({ status: 499 })).toBe("timeout"); - expect(resolveFailoverReasonFromError({ status: 400 })).toBe("format"); - expect(resolveFailoverReasonFromError({ status: 422 })).toBe("format"); + // 400/422 with no body returns null — avoids triggering a compaction loop + // when the provider returns an empty or wrapper-only 400/422 (e.g. + // transient proxy issue). + expect(resolveFailoverReasonFromError({ status: 400 })).toBeNull(); + expect(resolveFailoverReasonFromError({ status: 422 })).toBeNull(); + expect( + resolveFailoverReasonFromError({ + status: 400, + message: "400 status code (no body)", + }), + ).toBeNull(); + expect( + resolveFailoverReasonFromError({ + status: 422, + message: "HTTP 422: No body", + }), + ).toBeNull(); + expect( + resolveFailoverReasonFromError({ + status: 422, + message: "HTTP 422: No response body", + }), + ).toBeNull(); + expect( + resolveFailoverReasonFromError({ + status: 422, + message: "Error: HTTP 422: No response body", + }), + ).toBeNull(); + expect(resolveFailoverReasonFromError({ message: "400 status code (no body)" })).toBeNull(); + expect(resolveFailoverReasonFromError({ message: "HTTP 422: No body" })).toBeNull(); + expect(resolveFailoverReasonFromError({ message: "HTTP 422: No response body" })).toBeNull(); + expect( + resolveFailoverReasonFromError({ + message: "outer wrapper", + cause: { + status: 422, + message: "HTTP 422: No response body", + }, + }), + ).toBeNull(); + expect( + resolveFailoverReasonFromError({ + status: 422, + message: "check open ai req parameter error", + cause: { + status: 422, + message: "HTTP 422: No response body", + }, + }), + ).toBeNull(); + expect( + resolveFailoverReasonFromError({ + status: 422, + message: "check open ai req parameter error", + cause: new Error("No response body"), + }), + ).toBeNull(); + expect( + resolveFailoverReasonFromError({ + status: 422, + message: "Unprocessable Entity", + error: { + message: "HTTP 422: No response body", + }, + }), + ).toBeNull(); + expect( + resolveFailoverReasonFromError({ + status: 422, + message: "Unprocessable Entity", + cause: { + message: "Unprocessable Entity", + error: { + message: "HTTP 422: No response body", + }, + }, + }), + ).toBeNull(); + expect( + resolveFailoverReasonFromError({ + status: 422, + error: { + message: "missing required property", + }, + cause: {}, + }), + ).toBe("format"); + expect( + resolveFailoverReasonFromError({ + status: 422, + error: { + message: "missing required property", + }, + cause: { + message: "HTTP 422: No response body", + }, + }), + ).toBe("format"); // Transient server errors (500/502/503/504) should trigger failover as timeout. expect(resolveFailoverReasonFromError({ status: 500 })).toBe("timeout"); expect(resolveFailoverReasonFromError({ status: 502 })).toBe("timeout"); @@ -87,6 +184,14 @@ describe("failover-error", () => { expect(resolveFailoverReasonFromError({ status: 529 })).toBe("overloaded"); }); + it("stops on cyclic cause chains", () => { + const first: { cause?: unknown } = {}; + const second: { cause?: unknown } = { cause: first }; + first.cause = second; + + expect(resolveFailoverReasonFromError(first)).toBeNull(); + }); + it("treats session-specific HTTP 410s differently from generic 410s", () => { expect( resolveFailoverReasonFromError({ diff --git a/src/agents/failover-error.ts b/src/agents/failover-error.ts index 94e43251435..c66ab11ee3b 100644 --- a/src/agents/failover-error.ts +++ b/src/agents/failover-error.ts @@ -1,6 +1,7 @@ import { readErrorName } from "../infra/errors.js"; import { classifyFailoverSignal, + isUnclassifiedNoBodyHttpSignal, type FailoverClassification, type FailoverSignal, } from "./pi-embedded-helpers/errors.js"; @@ -8,6 +9,7 @@ import { isTimeoutErrorMessage } from "./pi-embedded-helpers/errors.js"; import type { FailoverReason } from "./pi-embedded-helpers/types.js"; const ABORT_TIMEOUT_RE = /request was aborted|request aborted/i; +const MAX_FAILOVER_CAUSE_DEPTH = 25; export class FailoverError extends Error { readonly reason: FailoverReason; @@ -186,11 +188,14 @@ function getErrorMessage(err: unknown): string { return findErrorProperty(err, readDirectErrorMessage) ?? ""; } -function getErrorCause(err: unknown): unknown { - if (!err || typeof err !== "object" || !("cause" in err)) { - return undefined; - } - return (err as { cause?: unknown }).cause; +function normalizeDirectErrorSignal(err: unknown): FailoverSignal { + const message = readDirectErrorMessage(err); + return { + status: readDirectStatusCode(err), + code: readDirectErrorCode(err), + message: message || undefined, + provider: readDirectProvider(err), + }; } function hasTimeoutHint(err: unknown): boolean { @@ -239,7 +244,72 @@ function normalizeErrorSignal(err: unknown): FailoverSignal { }; } -function resolveFailoverClassificationFromError(err: unknown): FailoverClassification | null { +function getNestedErrorCandidates(err: unknown): unknown[] { + if (!err || typeof err !== "object") { + return []; + } + const candidate = err as { error?: unknown; cause?: unknown }; + return [candidate.error, candidate.cause].filter( + (value): value is unknown => value !== undefined && value !== err, + ); +} + +function isFormatClassification(classification: FailoverClassification | null): boolean { + return classification?.kind === "reason" && classification.reason === "format"; +} + +function decideNestedFormatOverride( + candidate: unknown, + inheritedStatus: number | undefined, + seen: Set, + depth: number, +): boolean | null { + if (depth > MAX_FAILOVER_CAUSE_DEPTH) { + return null; + } + if (candidate && typeof candidate === "object") { + if (seen.has(candidate)) { + return null; + } + seen.add(candidate); + } + + const directSignal = normalizeDirectErrorSignal(candidate); + const nestedCandidates = getNestedErrorCandidates(candidate); + const nestedStatus = directSignal.status ?? inheritedStatus; + const hasDirectMessage = Boolean(directSignal.message?.trim()); + if ( + hasDirectMessage && + isUnclassifiedNoBodyHttpSignal({ ...directSignal, status: nestedStatus }) + ) { + return true; + } + if (hasDirectMessage && (nestedCandidates.length === 0 || classifyFailoverSignal(directSignal))) { + return false; + } + for (const nestedCandidate of nestedCandidates) { + const decision = decideNestedFormatOverride(nestedCandidate, nestedStatus, seen, depth + 1); + if (decision !== null) { + return decision; + } + } + return null; +} + +function resolveFailoverClassificationFromErrorInternal( + err: unknown, + seen: Set, + depth: number, +): FailoverClassification | null { + if (depth > MAX_FAILOVER_CAUSE_DEPTH) { + return null; + } + if (err && typeof err === "object") { + if (seen.has(err)) { + return null; + } + seen.add(err); + } if (isFailoverError(err)) { return { kind: "reason", @@ -247,14 +317,36 @@ function resolveFailoverClassificationFromError(err: unknown): FailoverClassific }; } - const classification = classifyFailoverSignal(normalizeErrorSignal(err)); + const signal = normalizeErrorSignal(err); + const classification = classifyFailoverSignal(signal); + const nestedCandidates = getNestedErrorCandidates(err); + if (!classification || classification.kind === "context_overflow") { - // Let wrapped causes override parent timeout/overflow guesses. - const cause = getErrorCause(err); - if (cause && cause !== err) { - const causeClassification = resolveFailoverClassificationFromError(cause); - if (causeClassification) { - return causeClassification; + for (const candidate of nestedCandidates) { + const nestedClassification = resolveFailoverClassificationFromErrorInternal( + candidate, + seen, + depth + 1, + ); + if (nestedClassification) { + return nestedClassification; + } + } + } + + if (isFormatClassification(classification)) { + for (const candidate of nestedCandidates) { + const shouldClearFormat = decideNestedFormatOverride( + candidate, + signal.status, + seen, + depth + 1, + ); + if (shouldClearFormat === true) { + return null; + } + if (shouldClearFormat === false) { + break; } } } @@ -272,6 +364,10 @@ function resolveFailoverClassificationFromError(err: unknown): FailoverClassific return null; } +function resolveFailoverClassificationFromError(err: unknown): FailoverClassification | null { + return resolveFailoverClassificationFromErrorInternal(err, new Set(), 0); +} + export function resolveFailoverReasonFromError(err: unknown): FailoverReason | null { return failoverReasonFromClassification(resolveFailoverClassificationFromError(err)); } diff --git a/src/agents/pi-embedded-helpers.isbillingerrormessage.test.ts b/src/agents/pi-embedded-helpers.isbillingerrormessage.test.ts index e8cd0dbc88d..705461d5c3d 100644 --- a/src/agents/pi-embedded-helpers.isbillingerrormessage.test.ts +++ b/src/agents/pi-embedded-helpers.isbillingerrormessage.test.ts @@ -575,8 +575,21 @@ describe("classifyFailoverReasonFromHttpStatus", () => { expect(classifyFailoverReasonFromHttpStatus(401, "invalid_api_key")).toBe("auth"); }); - it("treats HTTP 422 as format error", () => { - expect(classifyFailoverReasonFromHttpStatus(422)).toBe("format"); + it("treats body-less HTTP 422 as unknown instead of format", () => { + expect(classifyFailoverReasonFromHttpStatus(422)).toBeNull(); + }); + + it("treats no-body HTTP 400/422 wrappers as unknown instead of format", () => { + expect(classifyFailoverReasonFromHttpStatus(400, "No body response")).toBeNull(); + expect(classifyFailoverReasonFromHttpStatus(400, "400 status code (no body)")).toBeNull(); + expect(classifyFailoverReasonFromHttpStatus(422, "HTTP 422: No body")).toBeNull(); + expect(classifyFailoverReasonFromHttpStatus(422, "HTTP 422: No response body")).toBeNull(); + expect( + classifyFailoverReasonFromHttpStatus(422, "Error: HTTP 422: No response body"), + ).toBeNull(); + }); + + it("treats HTTP 422 with an unclassifiable body as format error", () => { expect(classifyFailoverReasonFromHttpStatus(422, "check open ai req parameter error")).toBe( "format", ); @@ -686,6 +699,15 @@ describe("classifyFailoverReason", () => { expect(classifyFailoverReason("HTTP 404: No body")).toBe("model_not_found"); }); + it("keeps HTTP 400/422 no-body wrappers out of the format bucket", () => { + expect(classifyFailoverReason("400 status code (no body)")).toBeNull(); + expect(classifyFailoverReason("HTTP 400: No body")).toBeNull(); + expect(classifyFailoverReason("422 status code (no body)")).toBeNull(); + expect(classifyFailoverReason("HTTP 422: No body")).toBeNull(); + expect(classifyFailoverReason("HTTP 422: No response body")).toBeNull(); + expect(classifyFailoverReason("Error: HTTP 422: No response body")).toBeNull(); + }); + it("preserves session and auth billing signals on HTTP 404 text", () => { expect(classifyFailoverReason("HTTP 404: session not found")).toBe("session_expired"); expect(classifyFailoverReason("HTTP 404: invalid_api_key")).toBe("auth"); diff --git a/src/agents/pi-embedded-helpers/errors.ts b/src/agents/pi-embedded-helpers/errors.ts index 2243c44cf92..44086dba835 100644 --- a/src/agents/pi-embedded-helpers/errors.ts +++ b/src/agents/pi-embedded-helpers/errors.ts @@ -330,6 +330,8 @@ const REPLAY_INVALID_RE = /\bprevious_response_id\b.*\b(?:invalid|unknown|not found|does not exist|expired|mismatch)\b|\btool_(?:use|call)\.(?:input|arguments)\b.*\b(?:missing|required)\b|\bincorrect role information\b|\broles must alternate\b|\binput item id does not belong to this connection\b/i; const SANDBOX_BLOCKED_RE = /\bapproval is required\b|\bapproval timed out\b|\bapproval was denied\b|\bblocked by sandbox\b|\bsandbox\b.*\b(?:blocked|denied|forbidden|disabled|not allowed)\b/i; +const NO_BODY_HTTP_WRAPPER_RE = + /^(?:no body(?: response)?|no response body|status code \(no body\))$/i; function stripErrorPrefix(raw: string): string { return raw.replace(/^error:\s*/i, "").trim(); @@ -342,6 +344,31 @@ function inferSignalStatus(signal: FailoverSignal): number | undefined { return extractLeadingHttpStatus(stripErrorPrefix(signal.message?.trim() ?? ""))?.code; } +function isExplicitNoBodyHttpMessage(raw: string | undefined, status?: number): boolean { + const trimmed = raw?.trim(); + if (!trimmed) { + return false; + } + const candidate = extractLeadingHttpStatus(trimmed) ? trimmed : stripErrorPrefix(trimmed); + const leadingStatus = extractLeadingHttpStatus(candidate); + if (leadingStatus) { + if (typeof status === "number" && leadingStatus.code !== status) { + return false; + } + return NO_BODY_HTTP_WRAPPER_RE.test(leadingStatus.rest); + } + return NO_BODY_HTTP_WRAPPER_RE.test(candidate); +} + +export function isUnclassifiedNoBodyHttpSignal(signal: FailoverSignal): boolean { + const status = inferSignalStatus(signal); + if (status !== 400 && status !== 422) { + return false; + } + const message = signal.message?.trim(); + return !message || isExplicitNoBodyHttpMessage(message, status); +} + function isHtmlErrorResponse(raw: string, status?: number): boolean { const trimmed = raw.trim(); if (!trimmed) { @@ -683,6 +710,15 @@ function classifyFailoverClassificationFromHttpStatus( if (messageClassification) { return messageClassification; } + // When the response has no body at all, or only surfaces as an HTTP wrapper + // like "400 status code (no body)", return null instead of defaulting to + // "format". These shapes are likely transient proxy issues — classifying + // them as "format" triggers a compaction loop that cannot recover. + if (isUnclassifiedNoBodyHttpSignal({ status, message })) { + return null; + } + // Body exists but couldn't be classified — still treat as format error + // since the provider rejected the request schema. return toReasonClassification("format"); } return null;