fix(agents): escalate LLM idle timeout to model fallback after profile rotation

When the LLM idle watchdog fires (the model produced no tokens for N seconds),
idleTimedOut is set in handleAssistantFailover, but it was never passed into
resolveRunFailoverDecision. As a result, shouldRotateAssistant saw neither
failoverReason nor timedOut (the run-budget timeout) set and returned false,
so the decision fell through to continue_normal: the agent silently froze
without surfacing an error or advancing the fallback chain.

Fixes #76877 (regression since 2026.4.24).

Changes:
- failover-policy.ts: add idleTimedOut to AssistantDecisionParams; include it
  in shouldRotateAssistant and reason selection in resolveRunFailoverDecision
- assistant-failover.ts: pass idleTimedOut into resolveRunFailoverDecision
- failover-policy.test.ts: 4 new cases for idle timeout path; update existing
  assistant stage cases with the new required field (idleTimedOut: false)
This commit is contained in:
Jim Dawdy
2026-05-10 18:20:48 -05:00
committed by Ayaan Zaidi
parent ba17ddaef3
commit 5ca95b2012
3 changed files with 101 additions and 2 deletions

View File

@@ -190,6 +190,7 @@ export async function handleAssistantFailover(params: {
failoverFailure: params.failoverFailure,
failoverReason: params.failoverReason,
timedOut: params.timedOut,
idleTimedOut: params.idleTimedOut,
timedOutDuringCompaction: params.timedOutDuringCompaction,
timedOutDuringToolExecution: params.timedOutDuringToolExecution,
profileRotated: true,

View File

@@ -106,6 +106,7 @@ describe("resolveRunFailoverDecision", () => {
failoverFailure: false,
failoverReason: "rate_limit",
timedOut: false,
idleTimedOut: false,
timedOutDuringCompaction: false,
timedOutDuringToolExecution: false,
profileRotated: false,
@@ -167,6 +168,7 @@ describe("resolveRunFailoverDecision", () => {
failoverFailure: false,
failoverReason: "rate_limit",
timedOut: false,
idleTimedOut: false,
timedOutDuringCompaction: false,
timedOutDuringToolExecution: false,
profileRotated: true,
@@ -187,6 +189,7 @@ describe("resolveRunFailoverDecision", () => {
failoverFailure: false,
failoverReason: null,
timedOut: false,
idleTimedOut: false,
timedOutDuringCompaction: false,
timedOutDuringToolExecution: false,
profileRotated: false,
@@ -223,6 +226,7 @@ describe("resolveRunFailoverDecision", () => {
failoverFailure: false,
failoverReason: null,
timedOut: true,
idleTimedOut: false,
timedOutDuringCompaction: false,
timedOutDuringToolExecution: true,
profileRotated: false,
@@ -242,6 +246,7 @@ describe("resolveRunFailoverDecision", () => {
failoverFailure: false,
failoverReason: null,
timedOut: true,
idleTimedOut: false,
timedOutDuringCompaction: false,
timedOutDuringToolExecution: true,
profileRotated: true,
@@ -261,6 +266,7 @@ describe("resolveRunFailoverDecision", () => {
failoverFailure: false,
failoverReason: null,
timedOut: true,
idleTimedOut: false,
timedOutDuringCompaction: false,
timedOutDuringToolExecution: false,
profileRotated: false,
@@ -281,6 +287,95 @@ describe("resolveRunFailoverDecision", () => {
failoverFailure: false,
failoverReason: null,
timedOut: true,
idleTimedOut: false,
timedOutDuringCompaction: false,
timedOutDuringToolExecution: false,
profileRotated: false,
}),
).toEqual({
action: "surface_error",
reason: null,
});
});
});
it("rotates profile on LLM idle timeout before falling back", () => {
  // Scenario: the idle watchdog fired (idleTimedOut) because the model produced
  // no tokens, while no provider API error was classified (failoverReason is
  // null) and the run-budget timeout did not trip (timedOut is false).
  // Prior to the fix this combination made shouldRotateAssistant return false,
  // so the decision became continue_normal and the agent froze silently.
  const decision = resolveRunFailoverDecision({
    stage: "assistant",
    aborted: false,
    externalAbort: false,
    fallbackConfigured: true,
    failoverFailure: false,
    failoverReason: null,
    timedOut: false,
    idleTimedOut: true,
    timedOutDuringCompaction: false,
    timedOutDuringToolExecution: false,
    profileRotated: false,
  });

  // No profile has been rotated yet, so rotation is expected before any fallback.
  expect(decision).toEqual({ action: "rotate_profile", reason: null });
});
it("escalates LLM idle timeout to fallback_model after profile rotation is exhausted", () => {
  // Same idle-timeout input as the rotation case, except the profile has already
  // been rotated and a fallback model is configured.
  const decision = resolveRunFailoverDecision({
    stage: "assistant",
    aborted: false,
    externalAbort: false,
    fallbackConfigured: true,
    failoverFailure: false,
    failoverReason: null,
    timedOut: false,
    idleTimedOut: true,
    timedOutDuringCompaction: false,
    timedOutDuringToolExecution: false,
    profileRotated: true,
  });

  // The idle timeout alone must yield the "timeout" reason, even though
  // failoverReason is null and timedOut is false.
  expect(decision).toEqual({ action: "fallback_model", reason: "timeout" });
});
it("surfaces error on LLM idle timeout when no fallback is configured and rotation is exhausted", () => {
  // Idle timeout with the profile already rotated and no fallback model
  // configured: nothing is left to try.
  const decision = resolveRunFailoverDecision({
    stage: "assistant",
    aborted: false,
    externalAbort: false,
    fallbackConfigured: false,
    failoverFailure: false,
    failoverReason: null,
    timedOut: false,
    idleTimedOut: true,
    timedOutDuringCompaction: false,
    timedOutDuringToolExecution: false,
    profileRotated: true,
  });

  // The decision is expected to surface the error, carrying a null reason.
  expect(decision).toEqual({ action: "surface_error", reason: null });
});
it("does not escalate LLM idle timeout after an external abort", () => {
expect(
resolveRunFailoverDecision({
stage: "assistant",
aborted: false,
externalAbort: true,
fallbackConfigured: true,
failoverFailure: false,
failoverReason: null,
timedOut: false,
idleTimedOut: true,
timedOutDuringCompaction: false,
timedOutDuringToolExecution: false,
profileRotated: false,

View File

@@ -57,6 +57,7 @@ type AssistantDecisionParams = {
failoverFailure: boolean;
failoverReason: FailoverReason | null;
timedOut: boolean;
idleTimedOut: boolean;
timedOutDuringCompaction: boolean;
timedOutDuringToolExecution: boolean;
profileRotated: boolean;
@@ -98,7 +99,8 @@ function shouldRotateAssistant(params: AssistantDecisionParams): boolean {
}
return (
(!params.aborted && (params.failoverFailure || params.failoverReason !== null)) ||
(params.timedOut && !params.timedOutDuringCompaction && !params.timedOutDuringToolExecution)
(params.timedOut && !params.timedOutDuringCompaction && !params.timedOutDuringToolExecution) ||
params.idleTimedOut
);
}
@@ -178,7 +180,8 @@ export function resolveRunFailoverDecision(params: RunFailoverDecisionParams): R
if (assistantShouldRotate && params.fallbackConfigured) {
return {
action: "fallback_model",
reason: params.timedOut ? "timeout" : (params.failoverReason ?? "unknown"),
reason:
params.timedOut || params.idleTimedOut ? "timeout" : (params.failoverReason ?? "unknown"),
};
}
if (!assistantShouldRotate) {