From 5ca95b2012cac97ccd17bd71e68f25064cb4b1e3 Mon Sep 17 00:00:00 2001 From: Jim Dawdy Date: Sun, 10 May 2026 18:20:48 -0500 Subject: [PATCH] fix(agents): escalate LLM idle timeout to model fallback after profile rotation When the LLM idle watchdog fires (model produced no tokens for N seconds), idleTimedOut is set in handleAssistantFailover but was never passed into resolveRunFailoverDecision. As a result, shouldRotateAssistant saw neither failoverReason nor timedOut (the run-budget timeout) set, returned false, and the decision fell through to continue_normal -- the agent silently froze without surfacing an error or advancing the fallback chain. Fixes #76877 (regression since 2026.4.24). Changes: - failover-policy.ts: add idleTimedOut to AssistantDecisionParams; include it in shouldRotateAssistant and reason selection in resolveRunFailoverDecision - assistant-failover.ts: pass idleTimedOut into resolveRunFailoverDecision - failover-policy.test.ts: 4 new cases for idle timeout path; update existing assistant stage cases with the new required field (idleTimedOut: false) --- .../run/assistant-failover.ts | 1 + .../run/failover-policy.test.ts | 94 +++++++++++++++++++ .../pi-embedded-runner/run/failover-policy.ts | 7 +- 3 files changed, 100 insertions(+), 2 deletions(-) diff --git a/src/agents/pi-embedded-runner/run/assistant-failover.ts b/src/agents/pi-embedded-runner/run/assistant-failover.ts index 00674f20319..1dd0bf17121 100644 --- a/src/agents/pi-embedded-runner/run/assistant-failover.ts +++ b/src/agents/pi-embedded-runner/run/assistant-failover.ts @@ -190,6 +190,7 @@ export async function handleAssistantFailover(params: { failoverFailure: params.failoverFailure, failoverReason: params.failoverReason, timedOut: params.timedOut, + idleTimedOut: params.idleTimedOut, timedOutDuringCompaction: params.timedOutDuringCompaction, timedOutDuringToolExecution: params.timedOutDuringToolExecution, profileRotated: true, diff --git 
a/src/agents/pi-embedded-runner/run/failover-policy.test.ts b/src/agents/pi-embedded-runner/run/failover-policy.test.ts index 88048e7c6d0..ec8790ed028 100644 --- a/src/agents/pi-embedded-runner/run/failover-policy.test.ts +++ b/src/agents/pi-embedded-runner/run/failover-policy.test.ts @@ -106,6 +106,7 @@ describe("resolveRunFailoverDecision", () => { failoverFailure: false, failoverReason: "rate_limit", timedOut: false, + idleTimedOut: false, timedOutDuringCompaction: false, timedOutDuringToolExecution: false, profileRotated: false, @@ -167,6 +168,7 @@ describe("resolveRunFailoverDecision", () => { failoverFailure: false, failoverReason: "rate_limit", timedOut: false, + idleTimedOut: false, timedOutDuringCompaction: false, timedOutDuringToolExecution: false, profileRotated: true, @@ -187,6 +189,7 @@ describe("resolveRunFailoverDecision", () => { failoverFailure: false, failoverReason: null, timedOut: false, + idleTimedOut: false, timedOutDuringCompaction: false, timedOutDuringToolExecution: false, profileRotated: false, @@ -223,6 +226,7 @@ describe("resolveRunFailoverDecision", () => { failoverFailure: false, failoverReason: null, timedOut: true, + idleTimedOut: false, timedOutDuringCompaction: false, timedOutDuringToolExecution: true, profileRotated: false, @@ -242,6 +246,7 @@ describe("resolveRunFailoverDecision", () => { failoverFailure: false, failoverReason: null, timedOut: true, + idleTimedOut: false, timedOutDuringCompaction: false, timedOutDuringToolExecution: true, profileRotated: true, @@ -261,6 +266,7 @@ describe("resolveRunFailoverDecision", () => { failoverFailure: false, failoverReason: null, timedOut: true, + idleTimedOut: false, timedOutDuringCompaction: false, timedOutDuringToolExecution: false, profileRotated: false, @@ -281,6 +287,94 @@ describe("resolveRunFailoverDecision", () => { failoverFailure: false, failoverReason: null, timedOut: true, + idleTimedOut: false, + timedOutDuringCompaction: false, + timedOutDuringToolExecution: false, + 
profileRotated: false, + }), + ).toEqual({ + action: "surface_error", + reason: null, + }); + }); + + it("rotates profile on LLM idle timeout before falling back", () => { + // idleTimedOut = model produced no tokens; no provider API error was classified. + // Before this fix, failoverReason=null + timedOut=false → shouldRotateAssistant=false + // → continue_normal, causing a silent agent freeze. + expect( + resolveRunFailoverDecision({ + stage: "assistant", + aborted: false, + externalAbort: false, + fallbackConfigured: true, + failoverFailure: false, + failoverReason: null, + timedOut: false, + idleTimedOut: true, + timedOutDuringCompaction: false, + timedOutDuringToolExecution: false, + profileRotated: false, + }), + ).toEqual({ + action: "rotate_profile", + reason: null, + }); + }); + + it("escalates LLM idle timeout to fallback_model after profile rotation is exhausted", () => { + expect( + resolveRunFailoverDecision({ + stage: "assistant", + aborted: false, + externalAbort: false, + fallbackConfigured: true, + failoverFailure: false, + failoverReason: null, + timedOut: false, + idleTimedOut: true, + timedOutDuringCompaction: false, + timedOutDuringToolExecution: false, + profileRotated: true, + }), + ).toEqual({ + action: "fallback_model", + reason: "timeout", + }); + }); + + it("surfaces error on LLM idle timeout when no fallback is configured and rotation is exhausted", () => { + expect( + resolveRunFailoverDecision({ + stage: "assistant", + aborted: false, + externalAbort: false, + fallbackConfigured: false, + failoverFailure: false, + failoverReason: null, + timedOut: false, + idleTimedOut: true, + timedOutDuringCompaction: false, + timedOutDuringToolExecution: false, + profileRotated: true, + }), + ).toEqual({ + action: "surface_error", + reason: null, + }); + }); + + it("does not escalate LLM idle timeout after an external abort", () => { + expect( + resolveRunFailoverDecision({ + stage: "assistant", + aborted: false, + externalAbort: true, + 
fallbackConfigured: true, + failoverFailure: false, + failoverReason: null, + timedOut: false, + idleTimedOut: true, timedOutDuringCompaction: false, timedOutDuringToolExecution: false, profileRotated: false, diff --git a/src/agents/pi-embedded-runner/run/failover-policy.ts b/src/agents/pi-embedded-runner/run/failover-policy.ts index 82619369f0c..658a698054b 100644 --- a/src/agents/pi-embedded-runner/run/failover-policy.ts +++ b/src/agents/pi-embedded-runner/run/failover-policy.ts @@ -57,6 +57,7 @@ type AssistantDecisionParams = { failoverFailure: boolean; failoverReason: FailoverReason | null; timedOut: boolean; + idleTimedOut: boolean; timedOutDuringCompaction: boolean; timedOutDuringToolExecution: boolean; profileRotated: boolean; @@ -98,7 +99,8 @@ function shouldRotateAssistant(params: AssistantDecisionParams): boolean { } return ( (!params.aborted && (params.failoverFailure || params.failoverReason !== null)) || - (params.timedOut && !params.timedOutDuringCompaction && !params.timedOutDuringToolExecution) + (params.timedOut && !params.timedOutDuringCompaction && !params.timedOutDuringToolExecution) || + params.idleTimedOut ); } @@ -178,7 +180,8 @@ export function resolveRunFailoverDecision(params: RunFailoverDecisionParams): R if (assistantShouldRotate && params.fallbackConfigured) { return { action: "fallback_model", - reason: params.timedOut ? "timeout" : (params.failoverReason ?? "unknown"), + reason: + params.timedOut || params.idleTimedOut ? "timeout" : (params.failoverReason ?? "unknown"), }; } if (!assistantShouldRotate) {