From 95916daf8d71d4b8b89a187a78dd7990c36b6482 Mon Sep 17 00:00:00 2001 From: openperf <16864032@qq.com> Date: Mon, 4 May 2026 19:36:02 +0800 Subject: [PATCH] fix(auth-profiles): exclude format rejections from profile cooldown MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit A format-classified failure means the provider rejected the request payload shape (e.g. an assistant-prefill 400 when a session transcript ends with a stream-error placeholder turn). That is a per-session transcript-shape problem, not a profile-wide reliability signal. Mark the reason with the existing transport-timeout exclusion so a single bad session no longer cascades to a profile cooldown that takes down every other healthy session sharing the same auth profile or, when all profiles share the same fault, the whole provider for the backoff window. Refs #77228 — addresses the cascading-cooldown amplifier only. The other two items in the same issue (the prefill placeholder leaving transcripts ending in assistant, and the auto-repair filling the JSONL with null-role entries) are separate failure modes and remain open. --- .../run/auth-profile-failure-policy.test.ts | 19 +++++++++++++++++++ .../run/auth-profile-failure-policy.ts | 17 +++++++++++++++-- 2 files changed, 34 insertions(+), 2 deletions(-) diff --git a/src/agents/pi-embedded-runner/run/auth-profile-failure-policy.test.ts b/src/agents/pi-embedded-runner/run/auth-profile-failure-policy.test.ts index f4994d1245b..c499ee6bc24 100644 --- a/src/agents/pi-embedded-runner/run/auth-profile-failure-policy.test.ts +++ b/src/agents/pi-embedded-runner/run/auth-profile-failure-policy.test.ts @@ -39,4 +39,23 @@ describe("resolveAuthProfileFailureReason", () => { }), ).toBeNull(); }); + + it("does not persist request-shape (format) rejections as auth-profile health (#77228)", () => { + // A format rejection (e.g. the github-copilot prefill-strict 400 + // "conversation must end with a user message" reported in #77228) is + // a per-session transcript-shape problem; cascading it to a profile + // cooldown blocks every other healthy session sharing the same auth + // profile and can take down the whole provider for the backoff window. + expect( + resolveAuthProfileFailureReason({ + failoverReason: "format", + }), + ).toBeNull(); + expect( + resolveAuthProfileFailureReason({ + failoverReason: "format", + policy: "shared", + }), + ).toBeNull(); + }); }); diff --git a/src/agents/pi-embedded-runner/run/auth-profile-failure-policy.ts b/src/agents/pi-embedded-runner/run/auth-profile-failure-policy.ts index ddd199ba2bc..8ffd9f2d788 100644 --- a/src/agents/pi-embedded-runner/run/auth-profile-failure-policy.ts +++ b/src/agents/pi-embedded-runner/run/auth-profile-failure-policy.ts @@ -6,8 +6,21 @@ export function resolveAuthProfileFailureReason(params: { failoverReason: FailoverReason | null; policy?: AuthProfileFailurePolicy; }): AuthProfileFailureReason | null { - // Helper-local runs and transport timeouts should not poison shared provider auth health. - if (params.policy === "local" || !params.failoverReason || params.failoverReason === "timeout") { + // Helper-local runs, transport timeouts, and request-shape ("format") rejections + // should not poison shared provider auth health. A `format` failure means the + // provider rejected the request payload (e.g. an assistant-prefill 400 from a + // strict provider when a session transcript ends with a stream-error placeholder + // turn) — that is a per-session transcript-shape problem, not a profile-wide + // reliability signal. Cascading it to a profile cooldown blocks every other + // healthy session sharing the same auth profile and, when all profiles share + // the same fault, takes down the entire provider for the configured backoff + // window (#77228). + if ( + params.policy === "local" || + !params.failoverReason || + params.failoverReason === "timeout" || + params.failoverReason === "format" + ) { return null; } return params.failoverReason;