From 9611260225ed32ad5a7339c5d0ca205d70370dd9 Mon Sep 17 00:00:00 2001 From: Peter Steinberger Date: Mon, 27 Apr 2026 09:18:54 +0100 Subject: [PATCH] fix: retry primary after auto model fallback --- CHANGELOG.md | 1 + docs/concepts/model-failover.md | 6 +- .../agent-command.live-model-switch.test.ts | 112 +++++++++++++++++- src/agents/agent-command.ts | 33 +++++- 4 files changed, 145 insertions(+), 7 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 98caeae33d0..be9849b719f 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -132,6 +132,7 @@ Docs: https://docs.openclaw.ai - CLI/models: include explicitly configured provider models in `openclaw models list --provider ` without requiring the full catalog path, so configured Ollama models are visible. Fixes #65207. Thanks @drzeast-png. - Docker/QA: add observability coverage to the normal Docker aggregate so QA-lab OTEL and Prometheus diagnostics run inside Docker. Thanks @vincentkoc. - Auto-reply: poison inbound message dedupe after replay-unsafe provider/runtime failures so retries stay safe before visible progress but cannot duplicate messages after block output, tool side effects, or session progress. Fixes #69303; keeps #58549 and #64606 as duplicate validation. Thanks @martingarramon, @NikolaFC, and @zeroth-blip. +- Agents/model fallback: clear auto-persisted fallback model overrides before the next run so recovered primaries are retried instead of leaving sessions permanently pinned to the fallback provider. Fixes #72697. Thanks @kibedu. - Agents/model fallback: jump directly to a known later live-session model redirect instead of walking unrelated fallback candidates, while preserving the already-landed live-session/fallback loop guard. Fixes #57471; related loop family already closed via #58496. Thanks @yuxiaoyang2007-prog. - Gateway/Bonjour: keep @homebridge/ciao cancellation handlers registered across advertiser restarts so late probing cancellations cannot crash Linux and other mDNS-churned gateways. Thanks @codex. - Plugins/startup: load the default `memory-core` slot during Gateway startup when permitted so active-memory recall can call `memory_search` and `memory_get` without requiring an explicit `plugins.slots.memory` entry, while preserving `plugins.slots.memory: "none"`. Thanks @codex. diff --git a/docs/concepts/model-failover.md b/docs/concepts/model-failover.md index 2c30b582d19..66eadf6bddd 100644 --- a/docs/concepts/model-failover.md +++ b/docs/concepts/model-failover.md @@ -21,7 +21,7 @@ For a normal text run, OpenClaw evaluates candidates in this order: - Resolve the active session model and auth-profile preference. + Resolve the active session model and auth-profile preference. A session override with `modelOverrideSource: "auto"` came from an earlier fallback, so the next run clears it first and retries the configured primary; user-selected overrides stay sticky. Build the model candidate chain from the currently selected session model, then `agents.defaults.model.fallbacks` in order, ending with the configured primary when the run started from an override. @@ -33,7 +33,7 @@ For a normal text run, OpenClaw evaluates candidates in this order: If that provider is exhausted with a failover-worthy error, move to the next model candidate. - Persist the selected fallback override before the retry starts so other session readers see the same provider/model the runner is about to use. + Persist the selected fallback override before the retry starts so other session readers see the same provider/model the runner is about to use. The persisted model override is marked `modelOverrideSource: "auto"`. If the fallback candidate fails, roll back only the fallback-owned session override fields when they still match that failed candidate. @@ -264,6 +264,8 @@ That means fallback retries have to coordinate with live model switching: - Only explicit user-driven model changes mark a pending live switch. That includes `/model`, `session_status(model=...)`, and `sessions.patch`. - System-driven model changes such as fallback rotation, heartbeat overrides, or compaction never mark a pending live switch on their own. - Before a fallback retry starts, the reply runner persists the selected fallback override fields to the session entry. +- On the next run, auto fallback overrides are cleared before model selection so the configured primary is retried. If it is still unhealthy, the fallback loop records a fresh auto override for that new attempt. +- User model overrides (`modelOverrideSource: "user"`) and legacy overrides without a source field remain persistent across turns. - Live-session reconciliation prefers persisted session overrides over stale runtime model fields. - If a live-switch error points at a later candidate in the active fallback chain, OpenClaw jumps directly to that selected model instead of walking unrelated candidates first. - If the fallback attempt fails, the runner rolls back only the override fields it wrote, and only if they still match that failed candidate. diff --git a/src/agents/agent-command.live-model-switch.test.ts b/src/agents/agent-command.live-model-switch.test.ts index 03bbef69dfc..21f1cc5a1ca 100644 --- a/src/agents/agent-command.live-model-switch.test.ts +++ b/src/agents/agent-command.live-model-switch.test.ts @@ -15,11 +15,15 @@ const state = vi.hoisted(() => ({ registerAgentRunContextMock: vi.fn(), clearAgentRunContextMock: vi.fn(), updateSessionStoreAfterAgentRunMock: vi.fn(), + updateSessionStoreFileMock: vi.fn(), deliverAgentCommandResultMock: vi.fn(), clearSessionAuthProfileOverrideMock: vi.fn(), + applyModelOverrideToSessionEntryMock: vi.fn((..._args: unknown[]) => ({ updated: false })), authProfileStoreMock: { profiles: {} } as { profiles: Record }, sessionEntryMock: undefined as unknown, sessionStoreMock: undefined as unknown, + sessionStorePathMock: undefined as string | undefined, + persistedSessionStoreMock: undefined as Record | undefined, })); vi.mock("./model-fallback.js", () => ({ @@ -73,7 +77,7 @@ vi.mock("./command/session.js", () => ({ skillsSnapshot: { prompt: "", skills: [], version: 0 }, }, sessionStore: state.sessionStoreMock, - storePath: undefined, + storePath: state.sessionStorePathMock, isNewSession: false, persistedThinking: undefined, persistedVerbose: undefined, @@ -177,6 +181,10 @@ vi.mock("../config/sessions.js", () => ({ ), })); +vi.mock("../config/sessions/store.js", () => ({ + updateSessionStore: (...args: unknown[]) => state.updateSessionStoreFileMock(...args), +})); + vi.mock("../config/sessions/transcript-resolve.runtime.js", () => ({ resolveSessionTranscriptFile: async () => ({ sessionFile: "/tmp/session.jsonl", @@ -231,7 +239,8 @@ vi.mock("../sessions/level-overrides.js", () => ({ })); vi.mock("../sessions/model-overrides.js", () => ({ - applyModelOverrideToSessionEntry: () => ({ updated: false }), + applyModelOverrideToSessionEntry: (...args: unknown[]) => + state.applyModelOverrideToSessionEntryMock(...args), })); vi.mock("../sessions/send-policy.js", () => ({ @@ -400,6 +409,38 @@ function expectFallbackOverrideCalls(first: boolean, second: boolean) { }); } +function useRealisticDefaultModelOverrideReset() { + state.applyModelOverrideToSessionEntryMock.mockImplementation((params: unknown) => { + const { entry, selection } = params as { + entry: { + providerOverride?: string; + modelOverride?: string; + modelOverrideSource?: string; + authProfileOverride?: string; + authProfileOverrideSource?: string; + }; + selection: { isDefault?: boolean }; + }; + if (!selection.isDefault) { + return { updated: false }; + } + const before = { ...entry }; + delete entry.providerOverride; + delete entry.modelOverride; + delete entry.modelOverrideSource; + delete entry.authProfileOverride; + delete entry.authProfileOverrideSource; + return { + updated: + before.providerOverride !== entry.providerOverride || + before.modelOverride !== entry.modelOverride || + before.modelOverrideSource !== entry.modelOverrideSource || + before.authProfileOverride !== entry.authProfileOverride || + before.authProfileOverrideSource !== entry.authProfileOverrideSource, + }; + }); +} + describe("agentCommand – LiveSessionModelSwitchError retry", () => { beforeEach(() => { vi.clearAllMocks(); @@ -430,6 +471,13 @@ describe("agentCommand – LiveSessionModelSwitchError retry", () => { state.authProfileStoreMock = { profiles: {} }; state.sessionEntryMock = undefined; state.sessionStoreMock = undefined; + state.sessionStorePathMock = undefined; + state.persistedSessionStoreMock = undefined; + state.applyModelOverrideToSessionEntryMock.mockReturnValue({ updated: false }); + state.updateSessionStoreFileMock.mockImplementation( + async (_path: string, fn: (store: Record) => unknown) => + fn(state.persistedSessionStoreMock ?? {}), + ); state.deliverAgentCommandResultMock.mockResolvedValue(undefined); state.updateSessionStoreAfterAgentRunMock.mockResolvedValue(undefined); }); @@ -642,4 +690,64 @@ describe("agentCommand – LiveSessionModelSwitchError retry", () => { expectFallbackOverrideCalls(false, true); }); + + it("clears auto-fallback model overrides before the next command retries primary", async () => { + useRealisticDefaultModelOverrideReset(); + + const sessionEntry = { + sessionId: "session-1", + updatedAt: Date.now(), + providerOverride: "openai", + modelOverride: "gpt-5.4", + modelOverrideSource: "auto", + authProfileOverride: "openai:default", + authProfileOverrideSource: "auto", + skillsSnapshot: { prompt: "", skills: [], version: 0 }, + }; + state.sessionEntryMock = sessionEntry; + state.sessionStoreMock = { "agent:main": sessionEntry }; + state.sessionStorePathMock = "/tmp/sessions.json"; + state.persistedSessionStoreMock = { "agent:main": { ...sessionEntry } }; + state.runWithModelFallbackMock.mockImplementation(async (params: FallbackRunnerParams) => { + const result = await params.run(params.provider, params.model); + return { + result, + provider: params.provider, + model: params.model, + attempts: [], + }; + }); + state.runAgentAttemptMock.mockResolvedValue(makeSuccessResult("anthropic", "claude")); + + await agentCommand({ + message: "hello", + sessionKey: "agent:main", + senderIsOwner: true, + }); + + const fallbackParams = state.runWithModelFallbackMock.mock.calls[0]?.[0] as + | FallbackRunnerParams + | undefined; + expect(fallbackParams).toMatchObject({ + provider: "anthropic", + model: "claude", + }); + expect(state.resolveEffectiveModelFallbacksMock).toHaveBeenCalledWith( + expect.objectContaining({ hasSessionModelOverride: false }), + ); + const activeStore = state.sessionStoreMock as Record; + const persistedStore = state.persistedSessionStoreMock as Record; + expect(activeStore["agent:main"]).toMatchObject({ + sessionId: "session-1", + }); + expect(activeStore["agent:main"].providerOverride).toBeUndefined(); + expect(activeStore["agent:main"].modelOverride).toBeUndefined(); + expect(activeStore["agent:main"].modelOverrideSource).toBeUndefined(); + expect(persistedStore["agent:main"]).toMatchObject({ + sessionId: "session-1", + }); + expect(persistedStore["agent:main"].providerOverride).toBeUndefined(); + expect(persistedStore["agent:main"].modelOverride).toBeUndefined(); + expect(persistedStore["agent:main"].modelOverrideSource).toBeUndefined(); + }); }); diff --git a/src/agents/agent-command.ts b/src/agents/agent-command.ts index af28a3d17dd..aefc34ef313 100644 --- a/src/agents/agent-command.ts +++ b/src/agents/agent-command.ts @@ -191,6 +191,7 @@ type PersistSessionEntryParams = { type OverrideFieldClearedByDelete = | "providerOverride" | "modelOverride" + | "modelOverrideSource" | "authProfileOverride" | "authProfileOverrideSource" | "authProfileOverrideCompactionCount" @@ -202,6 +203,7 @@ type OverrideFieldClearedByDelete = const OVERRIDE_FIELDS_CLEARED_BY_DELETE: OverrideFieldClearedByDelete[] = [ "providerOverride", "modelOverride", + "modelOverrideSource", "authProfileOverride", "authProfileOverrideSource", "authProfileOverrideCompactionCount", @@ -722,7 +724,28 @@ async function agentCommandInternal( allowAnyModel = allowed.allowAny ?? false; } - if (sessionEntry && sessionStore && sessionKey && hasStoredOverride) { + const hasAutoStoredOverride = + sessionEntry?.modelOverrideSource === "auto" && Boolean(sessionEntry.modelOverride?.trim()); + + if (sessionEntry && sessionStore && sessionKey && hasAutoStoredOverride) { + const { updated } = applyModelOverrideToSessionEntry({ + entry: sessionEntry, + selection: { provider: defaultProvider, model: defaultModel, isDefault: true }, + }); + if (updated) { + sessionStore[sessionKey] = sessionEntry; + if (storePath) { + await persistSessionEntry({ + sessionStore, + sessionKey, + storePath, + entry: sessionEntry, + }); + } + } + } + + if (sessionEntry && sessionStore && sessionKey && hasStoredOverride && !hasAutoStoredOverride) { const entry = sessionEntry; const overrideProvider = sessionEntry.providerOverride?.trim() || defaultProvider; const overrideModel = sessionEntry.modelOverride?.trim(); @@ -746,8 +769,12 @@ async function agentCommandInternal( } } - const storedProviderOverride = sessionEntry?.providerOverride?.trim(); - let storedModelOverride = sessionEntry?.modelOverride?.trim(); + const storedProviderOverride = hasAutoStoredOverride + ? undefined + : sessionEntry?.providerOverride?.trim(); + let storedModelOverride = hasAutoStoredOverride + ? undefined + : sessionEntry?.modelOverride?.trim(); if (storedModelOverride) { const candidateProvider = storedProviderOverride || defaultProvider; const normalizedStored = normalizeModelRef(candidateProvider, storedModelOverride);