From 43a003b8a0629ad2c73947b2a863d99dd506bcb9 Mon Sep 17 00:00:00 2001 From: Vincent Koc Date: Sun, 26 Apr 2026 14:45:02 -0700 Subject: [PATCH] fix: short-circuit live model switch fallback redirects (#72375) --- docs/concepts/model-failover.md | 1 + src/agents/model-fallback.test.ts | 28 ++++++++++++++++++++++++++-- src/agents/model-fallback.ts | 24 +++++++++++++----------- 3 files changed, 40 insertions(+), 13 deletions(-) diff --git a/docs/concepts/model-failover.md b/docs/concepts/model-failover.md index 3b200709505..2c30b582d19 100644 --- a/docs/concepts/model-failover.md +++ b/docs/concepts/model-failover.md @@ -265,6 +265,7 @@ That means fallback retries have to coordinate with live model switching: - System-driven model changes such as fallback rotation, heartbeat overrides, or compaction never mark a pending live switch on their own. - Before a fallback retry starts, the reply runner persists the selected fallback override fields to the session entry. - Live-session reconciliation prefers persisted session overrides over stale runtime model fields. +- If a live-switch error points at a later candidate in the active fallback chain, OpenClaw jumps directly to that selected model instead of walking unrelated candidates first. - If the fallback attempt fails, the runner rolls back only the override fields it wrote, and only if they still match that failed candidate. This prevents the classic race: diff --git a/src/agents/model-fallback.test.ts b/src/agents/model-fallback.test.ts index 663e1ad32cd..f0bca92435e 100644 --- a/src/agents/model-fallback.test.ts +++ b/src/agents/model-fallback.test.ts @@ -663,7 +663,7 @@ describe("runWithModelFallback", () => { expect(run).toHaveBeenCalledTimes(1); }); - it("treats LiveSessionModelSwitchError as failover on last candidate (#58466)", async () => { + it("treats LiveSessionModelSwitchError as failover on last candidate (#58496 family)", async () => { const cfg = makeCfg(); const switchError = new LiveSessionModelSwitchError({ provider: "anthropic", @@ -689,7 +689,7 @@ describe("runWithModelFallback", () => { expect(run).toHaveBeenCalledTimes(1); }); - it("continues fallback chain past LiveSessionModelSwitchError to next candidate (#58466)", async () => { + it("continues fallback chain past LiveSessionModelSwitchError to next candidate (#58496 family)", async () => { const cfg = makeCfg(); const switchError = new LiveSessionModelSwitchError({ provider: "anthropic", @@ -756,6 +756,30 @@ describe("runWithModelFallback", () => { ]); }); + it("does not redirect stale live-session switch errors back to the current candidate (#58496 family)", async () => { + const cfg = makeCfg(); + const switchError = new LiveSessionModelSwitchError({ + provider: "openai", + model: "gpt-4.1-mini", + }); + const run = vi.fn().mockRejectedValueOnce(switchError).mockResolvedValueOnce("ok"); + + const result = await runWithModelFallback({ + cfg, + provider: "openai", + model: "gpt-4.1-mini", + run, + }); + + expect(result.result).toBe("ok"); + expect(result.provider).toBe("anthropic"); + expect(result.model).toBe("claude-haiku-3-5"); + expect(run.mock.calls).toEqual([ + ["openai", "gpt-4.1-mini"], + ["anthropic", "claude-haiku-3-5"], + ]); + }); + it("falls back on auth errors", async () => { await expectFallsBackToHaiku({ provider: "openai", diff --git a/src/agents/model-fallback.ts b/src/agents/model-fallback.ts index 5bc291b8fc9..40c3d1c5ebb 100644 --- a/src/agents/model-fallback.ts +++ b/src/agents/model-fallback.ts @@ -326,16 +326,19 @@ function recordFailedCandidateAttempt(params: { }); } -function findLaterLiveSessionModelSwitchCandidateIndex(params: { +function findLiveSessionModelSwitchRedirectIndex(params: { error: LiveSessionModelSwitchError; candidates: ModelCandidate[]; currentIndex: number; }): number | null { const targetKey = modelKey(params.error.provider, params.error.model); - const targetIndex = params.candidates.findIndex( - (candidate) => modelKey(candidate.provider, candidate.model) === targetKey, - ); - return targetIndex > params.currentIndex ? targetIndex : null; + for (let i = params.currentIndex + 1; i < params.candidates.length; i += 1) { + const candidate = params.candidates[i]; + if (modelKey(candidate.provider, candidate.model) === targetKey) { + return i; + } + } + return null; } function throwFallbackFailureSummary(params: { @@ -930,13 +933,12 @@ export async function runWithModelFallback(params: { model: candidate.model, }) ?? err; - // LiveSessionModelSwitchError during fallback means the session's - // persisted model conflicts with this fallback candidate. Treat it - // as a known failover so the chain continues to the next candidate - // instead of re-throwing and triggering infinite retry loops in the - // outer runner. (#58466) + // LiveSessionModelSwitchError during fallback may point at a later + // candidate that is already the active live-session selection. Jump + // there directly. Stale same/earlier targets remain a known failover + // so the outer runner cannot loop on the conflicting model. if (err instanceof LiveSessionModelSwitchError) { - const liveSwitchTargetIndex = findLaterLiveSessionModelSwitchCandidateIndex({ + const liveSwitchTargetIndex = findLiveSessionModelSwitchRedirectIndex({ error: err, candidates, currentIndex: i,