fix: short-circuit live model switch fallback redirects (#72375)

This commit is contained in:
Vincent Koc
2026-04-26 14:45:02 -07:00
committed by GitHub
parent fa85e6c26e
commit 43a003b8a0
3 changed files with 40 additions and 13 deletions

View File

@@ -265,6 +265,7 @@ That means fallback retries have to coordinate with live model switching:
- System-driven model changes such as fallback rotation, heartbeat overrides, or compaction never mark a pending live switch on their own.
- Before a fallback retry starts, the reply runner persists the selected fallback override fields to the session entry.
- Live-session reconciliation prefers persisted session overrides over stale runtime model fields.
- If a live-switch error points at a later candidate in the active fallback chain, OpenClaw jumps directly to that selected model instead of walking unrelated candidates first.
- If the fallback attempt fails, the runner rolls back only the override fields it wrote, and only if they still match that failed candidate.
This prevents the classic race:

View File

@@ -663,7 +663,7 @@ describe("runWithModelFallback", () => {
expect(run).toHaveBeenCalledTimes(1);
});
it("treats LiveSessionModelSwitchError as failover on last candidate (#58466)", async () => {
it("treats LiveSessionModelSwitchError as failover on last candidate (#58496 family)", async () => {
const cfg = makeCfg();
const switchError = new LiveSessionModelSwitchError({
provider: "anthropic",
@@ -689,7 +689,7 @@ describe("runWithModelFallback", () => {
expect(run).toHaveBeenCalledTimes(1);
});
it("continues fallback chain past LiveSessionModelSwitchError to next candidate (#58466)", async () => {
it("continues fallback chain past LiveSessionModelSwitchError to next candidate (#58496 family)", async () => {
const cfg = makeCfg();
const switchError = new LiveSessionModelSwitchError({
provider: "anthropic",
@@ -756,6 +756,30 @@ describe("runWithModelFallback", () => {
]);
});
it("does not redirect stale live-session switch errors back to the current candidate (#58496 family)", async () => {
const cfg = makeCfg();
const switchError = new LiveSessionModelSwitchError({
provider: "openai",
model: "gpt-4.1-mini",
});
const run = vi.fn().mockRejectedValueOnce(switchError).mockResolvedValueOnce("ok");
const result = await runWithModelFallback({
cfg,
provider: "openai",
model: "gpt-4.1-mini",
run,
});
expect(result.result).toBe("ok");
expect(result.provider).toBe("anthropic");
expect(result.model).toBe("claude-haiku-3-5");
expect(run.mock.calls).toEqual([
["openai", "gpt-4.1-mini"],
["anthropic", "claude-haiku-3-5"],
]);
});
it("falls back on auth errors", async () => {
await expectFallsBackToHaiku({
provider: "openai",

View File

@@ -326,16 +326,19 @@ function recordFailedCandidateAttempt(params: {
});
}
function findLaterLiveSessionModelSwitchCandidateIndex(params: {
function findLiveSessionModelSwitchRedirectIndex(params: {
error: LiveSessionModelSwitchError;
candidates: ModelCandidate[];
currentIndex: number;
}): number | null {
const targetKey = modelKey(params.error.provider, params.error.model);
const targetIndex = params.candidates.findIndex(
(candidate) => modelKey(candidate.provider, candidate.model) === targetKey,
);
return targetIndex > params.currentIndex ? targetIndex : null;
for (let i = params.currentIndex + 1; i < params.candidates.length; i += 1) {
const candidate = params.candidates[i];
if (modelKey(candidate.provider, candidate.model) === targetKey) {
return i;
}
}
return null;
}
function throwFallbackFailureSummary(params: {
@@ -930,13 +933,12 @@ export async function runWithModelFallback<T>(params: {
model: candidate.model,
}) ?? err;
// LiveSessionModelSwitchError during fallback means the session's
// persisted model conflicts with this fallback candidate. Treat it
// as a known failover so the chain continues to the next candidate
// instead of re-throwing and triggering infinite retry loops in the
// outer runner. (#58466)
// LiveSessionModelSwitchError during fallback may point at a later
// candidate that is already the active live-session selection. Jump
// there directly. Stale same/earlier targets remain a known failover
// so the outer runner cannot loop on the conflicting model.
if (err instanceof LiveSessionModelSwitchError) {
const liveSwitchTargetIndex = findLaterLiveSessionModelSwitchCandidateIndex({
const liveSwitchTargetIndex = findLiveSessionModelSwitchRedirectIndex({
error: err,
candidates,
currentIndex: i,