diff --git a/src/agents/pi-embedded-runner/run/auth-profile-failure-policy.test.ts b/src/agents/pi-embedded-runner/run/auth-profile-failure-policy.test.ts index f4994d1245b..c499ee6bc24 100644 --- a/src/agents/pi-embedded-runner/run/auth-profile-failure-policy.test.ts +++ b/src/agents/pi-embedded-runner/run/auth-profile-failure-policy.test.ts @@ -39,4 +39,23 @@ describe("resolveAuthProfileFailureReason", () => { }), ).toBeNull(); }); + + it("does not persist request-shape (format) rejections as auth-profile health (#77228)", () => { + // A format rejection (e.g. the github-copilot prefill-strict 400 + // "conversation must end with a user message" reported in #77228) is + // a per-session transcript-shape problem; cascading it to a profile + // cooldown blocks every other healthy session sharing the same auth + // profile and can take down the whole provider for the backoff window. + expect( + resolveAuthProfileFailureReason({ + failoverReason: "format", + }), + ).toBeNull(); + expect( + resolveAuthProfileFailureReason({ + failoverReason: "format", + policy: "shared", + }), + ).toBeNull(); + }); }); diff --git a/src/agents/pi-embedded-runner/run/auth-profile-failure-policy.ts b/src/agents/pi-embedded-runner/run/auth-profile-failure-policy.ts index ddd199ba2bc..8ffd9f2d788 100644 --- a/src/agents/pi-embedded-runner/run/auth-profile-failure-policy.ts +++ b/src/agents/pi-embedded-runner/run/auth-profile-failure-policy.ts @@ -6,8 +6,21 @@ export function resolveAuthProfileFailureReason(params: { failoverReason: FailoverReason | null; policy?: AuthProfileFailurePolicy; }): AuthProfileFailureReason | null { - // Helper-local runs and transport timeouts should not poison shared provider auth health. - if (params.policy === "local" || !params.failoverReason || params.failoverReason === "timeout") { + // Helper-local runs, transport timeouts, and request-shape ("format") rejections + // should not poison shared provider auth health. A `format` failure means the + // provider rejected the request payload (e.g. an assistant-prefill 400 from a + // strict provider when a session transcript ends with a stream-error placeholder + // turn) — that is a per-session transcript-shape problem, not a profile-wide + // reliability signal. Cascading it to a profile cooldown blocks every other + // healthy session sharing the same auth profile and, when all profiles share + // the same fault, takes down the entire provider for the configured backoff + // window (#77228). + if ( + params.policy === "local" || + !params.failoverReason || + params.failoverReason === "timeout" || + params.failoverReason === "format" + ) { return null; } return params.failoverReason;