fix(auth-profiles): exclude format rejections from profile cooldown

A format-classified failure means the provider rejected the request payload
shape (e.g. an assistant-prefill 400 when a session transcript ends with a
stream-error placeholder turn). That is a per-session transcript-shape
problem, not a profile-wide reliability signal. Add the `format` reason to
the existing transport-timeout exclusion so a single bad session no longer
cascades into a profile cooldown that takes down every other healthy session
sharing the same auth profile or, when all profiles share the same fault,
the whole provider for the backoff window.

Refs #77228 — addresses the cascading-cooldown amplifier only.
The other two items in the same issue (the prefill placeholder leaving
transcripts ending in assistant, and the auto-repair filling the JSONL
with null-role entries) are separate failure modes and remain open.
This commit is contained in:
openperf
2026-05-04 19:36:02 +08:00
parent 1c924c3c12
commit 95916daf8d
2 changed files with 34 additions and 2 deletions

View File

@@ -39,4 +39,23 @@ describe("resolveAuthProfileFailureReason", () => {
}),
).toBeNull();
});
it("does not persist request-shape (format) rejections as auth-profile health (#77228)", () => {
  // Regression guard for #77228: a "format" failover reason (for example the
  // github-copilot prefill-strict 400 "conversation must end with a user
  // message") describes one session's transcript shape, not the health of the
  // auth profile. It must resolve to null so that it is never recorded as a
  // profile failure, which would put every healthy session on that profile
  // (and possibly the whole provider) into cooldown for the backoff window.

  // Policy omitted: exercises the caller-default path.
  const withDefaultPolicy = resolveAuthProfileFailureReason({
    failoverReason: "format",
  });

  // Policy set to "shared" explicitly: the exclusion must still apply.
  const withSharedPolicy = resolveAuthProfileFailureReason({
    failoverReason: "format",
    policy: "shared",
  });

  expect(withDefaultPolicy).toBeNull();
  expect(withSharedPolicy).toBeNull();
});
});

View File

@@ -6,8 +6,21 @@ export function resolveAuthProfileFailureReason(params: {
failoverReason: FailoverReason | null;
policy?: AuthProfileFailurePolicy;
}): AuthProfileFailureReason | null {
// Helper-local runs and transport timeouts should not poison shared provider auth health.
if (params.policy === "local" || !params.failoverReason || params.failoverReason === "timeout") {
// Helper-local runs, transport timeouts, and request-shape ("format") rejections
// should not poison shared provider auth health. A `format` failure means the
// provider rejected the request payload (e.g. an assistant-prefill 400 from a
// strict provider when a session transcript ends with a stream-error placeholder
// turn) — that is a per-session transcript-shape problem, not a profile-wide
// reliability signal. Cascading it to a profile cooldown blocks every other
// healthy session sharing the same auth profile and, when all profiles share
// the same fault, takes down the entire provider for the configured backoff
// window (#77228).
if (
params.policy === "local" ||
!params.failoverReason ||
params.failoverReason === "timeout" ||
params.failoverReason === "format"
) {
return null;
}
return params.failoverReason;