fix: record model fallback steps in trajectories

This commit is contained in:
Peter Steinberger
2026-04-28 05:05:02 +01:00
parent 714f3b59cc
commit ab95812d65
7 changed files with 167 additions and 45 deletions

View File

@@ -44,6 +44,7 @@ Docs: https://docs.openclaw.ai
- Gateway/hooks: route non-delivered hook completion and error summaries to the target agent's main session instead of the default agent session, preserving multi-agent hook isolation. Fixes #24693; carries forward #68667. Thanks @abersonFAC and @bluesky6868.
- Discord: own the Carbon interaction listener and hand off Discord slash/component handling asynchronously, so compaction or long session locks no longer trip `InteractionEventListener` listener timeouts. Fixes #73204. Thanks @slideshow-dingo.
- Compaction/diagnostics: keep unknown compaction failure classifications stable while logging sanitized detail for unclassified provider errors such as missing Ollama provider adapters. Thanks @gzsiang.
- Models/fallbacks: record first-class `model.fallback_step` trajectory events with from/to models, failure detail, chain position, and final outcome so support exports preserve the primary model failure even when a later fallback also fails. Fixes #71744. Thanks @nikolaykazakovvs-ux.
- Gateway/startup: keep value-option foreground starts on the gateway fast path and skip proxy bootstrap unless proxy env is configured, reducing normal gateway startup RSS and avoiding full CLI graph loading. Thanks @vincentkoc.
- Heartbeat/models: show heartbeat model bleed guidance on context-overflow resets when the last runtime model matches configured `heartbeat.model`, so smaller local heartbeat models point users to `isolatedSession` or `lightContext` instead of only compaction-buffer tuning. Fixes #67314. Thanks @Knightmare6890.
- Subagents/models: persist `sessions_spawn.model` and configured subagent models as child-session model overrides before the first turn, so spawned subagents actually run on the requested provider/model instead of reverting to the target agent default. Fixes #73180. Thanks @danielzinhu99.

View File

@@ -64,6 +64,7 @@ Runtime events include:
- `trace.metadata`
- `context.compiled`
- `prompt.submitted`
- `model.fallback_step`, including the source model, next model, failure reason/detail, chain position, and whether fallback advanced, succeeded, or exhausted the chain
- `model.completed`
- `trace.artifacts`
- `session.ended`

View File

@@ -19,6 +19,8 @@ const state = vi.hoisted(() => ({
clearAgentRunContextMock: vi.fn(),
updateSessionStoreAfterAgentRunMock: vi.fn(),
deliverAgentCommandResultMock: vi.fn(),
trajectoryRecordEventMock: vi.fn(),
trajectoryFlushMock: vi.fn(async () => undefined),
clearSessionAuthProfileOverrideMock: vi.fn(),
authProfileStoreMock: { profiles: {} } as { profiles: Record<string, unknown> },
sessionEntryMock: undefined as unknown,
@@ -249,6 +251,15 @@ vi.mock("../terminal/ansi.js", () => ({
sanitizeForLog: (s: string) => s,
}));
// Stub the trajectory runtime so tests can observe recordEvent/flush calls
// via the hoisted state mocks instead of writing a real JSONL file.
vi.mock("../trajectory/runtime.js", () => ({
createTrajectoryRuntimeRecorder: () => ({
enabled: true,
filePath: "/tmp/session.trajectory.jsonl",
recordEvent: (...args: unknown[]) => state.trajectoryRecordEventMock(...args),
flush: () => state.trajectoryFlushMock(),
}),
}));
vi.mock("../utils/message-channel.js", () => ({
resolveMessageChannel: () => "test",
}));
@@ -356,6 +367,7 @@ type FallbackRunnerParams = {
provider: string;
model: string;
run: (provider: string, model: string) => Promise<unknown>;
onFallbackStep?: (step: Record<string, unknown>) => void | Promise<void>;
classifyResult?: (params: {
provider: string;
model: string;
@@ -462,6 +474,7 @@ describe("agentCommand LiveSessionModelSwitchError retry", () => {
state.sessionStoreMock = undefined;
state.deliverAgentCommandResultMock.mockResolvedValue(undefined);
state.updateSessionStoreAfterAgentRunMock.mockResolvedValue(undefined);
state.trajectoryFlushMock.mockResolvedValue(undefined);
});
afterEach(() => {
@@ -493,6 +506,42 @@ describe("agentCommand LiveSessionModelSwitchError retry", () => {
expect(lifecycleEndCalls.length).toBeGreaterThanOrEqual(1);
});
// Verifies that a fallback step reported by the fallback runner is forwarded
// to the session trajectory recorder as a "model.fallback_step" event, and
// that the recorder is flushed before the command completes.
it("records fallback steps to the session trajectory runtime", async () => {
// Simulate the runner emitting one fallback step before the run succeeds
// on the fallback model.
state.runWithModelFallbackMock.mockImplementation(async (params: FallbackRunnerParams) => {
await params.onFallbackStep?.({
fallbackStepType: "fallback_step",
fallbackStepFromModel: "ollama/llama3",
fallbackStepToModel: "openai/gpt-5.4",
fallbackStepFromFailureReason: "overloaded",
fallbackStepChainPosition: 1,
fallbackStepFinalOutcome: "next_fallback",
});
const result = await params.run(params.provider, params.model);
return {
result,
provider: params.provider,
model: params.model,
attempts: [],
};
});
state.runAgentAttemptMock.mockResolvedValue(makeSuccessResult("openai", "gpt-5.4"));
await runBasicAgentCommand();
// The step payload must reach the trajectory recorder unchanged.
expect(state.trajectoryRecordEventMock).toHaveBeenCalledWith(
"model.fallback_step",
expect.objectContaining({
fallbackStepType: "fallback_step",
fallbackStepFromModel: "ollama/llama3",
fallbackStepToModel: "openai/gpt-5.4",
fallbackStepFromFailureReason: "overloaded",
fallbackStepChainPosition: 1,
fallbackStepFinalOutcome: "next_fallback",
}),
);
// Flushing guarantees the event is persisted before the command returns.
expect(state.trajectoryFlushMock).toHaveBeenCalled();
});
it("propagates non-switch errors without retrying and emits lifecycle error", async () => {
state.runWithModelFallbackMock.mockRejectedValueOnce(new Error("provider down"));

View File

@@ -25,6 +25,7 @@ import { applyModelOverrideToSessionEntry } from "../sessions/model-overrides.js
import { resolveSendPolicy } from "../sessions/send-policy.js";
import { normalizeOptionalString } from "../shared/string-coerce.js";
import { sanitizeForLog } from "../terminal/ansi.js";
import { createTrajectoryRuntimeRecorder } from "../trajectory/runtime.js";
import { resolveMessageChannel } from "../utils/message-channel.js";
import { resolveAgentRuntimeConfig } from "./agent-runtime-config.js";
import {
@@ -909,6 +910,16 @@ async function agentCommandInternal(
let fallbackModel = model;
const MAX_LIVE_SWITCH_RETRIES = 5;
let liveSwitchRetries = 0;
const fallbackTrajectoryRecorder = createTrajectoryRuntimeRecorder({
cfg,
runId,
sessionId,
sessionKey,
sessionFile,
provider,
modelId: model,
workspaceDir,
});
for (;;) {
try {
const spawnedBy = normalizedSpawned.spawnedBy ?? sessionEntry?.spawnedBy;
@@ -928,6 +939,9 @@ async function agentCommandInternal(
runId,
agentDir,
fallbacksOverride: effectiveFallbacksOverride,
onFallbackStep: (step) => {
fallbackTrajectoryRecorder?.recordEvent("model.fallback_step", step);
},
classifyResult: ({ provider, model, result }) =>
classifyEmbeddedPiRunResultForModelFallback({
provider,
@@ -1024,6 +1038,7 @@ async function agentCommandInternal(
},
});
}
await fallbackTrajectoryRecorder?.flush();
throw new Error(
`Exceeded maximum live model switch retries (${MAX_LIVE_SWITCH_RETRIES})`,
{ cause: err },
@@ -1048,6 +1063,7 @@ async function agentCommandInternal(
},
});
}
await fallbackTrajectoryRecorder?.flush();
throw new Error(
`Live model switch rejected: ${sanitizeForLog(err.provider)}/${sanitizeForLog(err.model)} is not in the agent allowlist`,
{ cause: err },
@@ -1094,9 +1110,11 @@ async function agentCommandInternal(
},
});
}
await fallbackTrajectoryRecorder?.flush();
throw err;
}
}
await fallbackTrajectoryRecorder?.flush();
// Update token+model fields in the session store.
if (sessionStore && sessionKey) {

View File

@@ -29,6 +29,41 @@ function buildErrorObservationFields(error?: string): {
type FallbackStepOutcome = "next_fallback" | "succeeded" | "chain_exhausted";
// Payload describing one step of a model-fallback chain; used as the body of
// "model.fallback_step" trajectory events.
export type ModelFallbackStepFields = {
// Discriminator; always the literal "fallback_step".
fallbackStepType: "fallback_step";
// "provider/model" reference for the model that failed (or succeeded).
fallbackStepFromModel: string;
// "provider/model" reference for the next candidate, when one exists.
fallbackStepToModel?: string;
// Classified failure reason carried over from the failing attempt.
fallbackStepFromFailureReason?: FailoverReason;
// Free-form detail about the failure, when available.
fallbackStepFromFailureDetail?: string;
// 1-based position of this step within the fallback chain.
fallbackStepChainPosition?: number;
// Whether fallback advanced, succeeded, or exhausted the chain.
fallbackStepFinalOutcome: FallbackStepOutcome;
};
// Input to logModelFallbackDecision: one observed decision point in the
// fallback loop, plus enough context to log it and (when applicable) derive
// ModelFallbackStepFields from it.
export type ModelFallbackDecisionParams = {
// What the fallback loop decided for this candidate.
decision:
| "skip_candidate"
| "probe_cooldown_candidate"
| "candidate_failed"
| "candidate_succeeded";
runId?: string;
// The provider/model originally requested by the caller.
requestedProvider: string;
requestedModel: string;
// The candidate this decision applies to.
candidate: ModelCandidate;
// Attempt counters, when known.
attempt?: number;
total?: number;
// Failure classification and raw error details for failed candidates.
reason?: FailoverReason | null;
status?: number;
code?: string;
error?: string;
// The candidate the loop will try next, if any.
nextCandidate?: ModelCandidate;
isPrimary?: boolean;
requestedModelMatched?: boolean;
fallbackConfigured?: boolean;
allowTransientCooldownProbe?: boolean;
profileCount?: number;
// Attempts recorded before this decision; used to derive step fields.
previousAttempts?: FallbackAttempt[];
};
// Render a candidate as the conventional "provider/model" reference string.
function formatModelRef(candidate: ModelCandidate): string {
  const { provider, model } = candidate;
  return provider + "/" + model;
}
@@ -41,17 +76,7 @@ function buildFallbackStepFields(params: {
nextCandidate?: ModelCandidate;
attempt?: number;
previousAttempts?: FallbackAttempt[];
}):
| {
fallbackStepType: "fallback_step";
fallbackStepFromModel: string;
fallbackStepToModel?: string;
fallbackStepFromFailureReason?: FailoverReason;
fallbackStepFromFailureDetail?: string;
fallbackStepChainPosition?: number;
fallbackStepFinalOutcome: FallbackStepOutcome;
}
| undefined {
}): ModelFallbackStepFields | undefined {
const lastPreviousAttempt = params.previousAttempts?.at(-1);
if (params.decision === "candidate_succeeded") {
if (!lastPreviousAttempt) {
@@ -89,30 +114,9 @@ function buildFallbackStepFields(params: {
};
}
export function logModelFallbackDecision(params: {
decision:
| "skip_candidate"
| "probe_cooldown_candidate"
| "candidate_failed"
| "candidate_succeeded";
runId?: string;
requestedProvider: string;
requestedModel: string;
candidate: ModelCandidate;
attempt?: number;
total?: number;
reason?: FailoverReason | null;
status?: number;
code?: string;
error?: string;
nextCandidate?: ModelCandidate;
isPrimary?: boolean;
requestedModelMatched?: boolean;
fallbackConfigured?: boolean;
allowTransientCooldownProbe?: boolean;
profileCount?: number;
previousAttempts?: FallbackAttempt[];
}): void {
export function logModelFallbackDecision(
params: ModelFallbackDecisionParams,
): ModelFallbackStepFields | undefined {
const nextText = params.nextCandidate
? `${sanitizeForLog(params.nextCandidate.provider)}/${sanitizeForLog(params.nextCandidate.model)}`
: "none";
@@ -172,4 +176,5 @@ export function logModelFallbackDecision(params: {
`model fallback decision: decision=${params.decision} requested=${sanitizeForLog(params.requestedProvider)}/${sanitizeForLog(params.requestedModel)} ` +
`candidate=${sanitizeForLog(params.candidate.provider)}/${sanitizeForLog(params.candidate.model)} reason=${reasonText}${providerErrorTypeSuffix} next=${nextText}${detailSuffix}`,
});
return fallbackStepFields;
}

View File

@@ -306,8 +306,15 @@ describe("runWithModelFallback probe logic", () => {
.fn()
.mockRejectedValueOnce(Object.assign(new Error("rate limited"), { status: 429 }))
.mockResolvedValueOnce("fallback-ok");
const onFallbackStep = vi.fn();
const fallbackResult = await runPrimaryCandidate(fallbackCfg, fallbackRun);
const fallbackResult = await runWithModelFallback({
cfg: fallbackCfg,
provider: "openai",
model: "gpt-4.1-mini",
run: fallbackRun,
onFallbackStep,
});
await logCapture.flush();
expect(fallbackResult.result).toBe("fallback-ok");
@@ -369,6 +376,26 @@ describe("runWithModelFallback probe logic", () => {
}),
]),
);
expect(onFallbackStep).toHaveBeenCalledWith(
expect.objectContaining({
fallbackStepType: "fallback_step",
fallbackStepFromModel: "openai/gpt-4.1-mini",
fallbackStepToModel: "anthropic/claude-haiku-3-5",
fallbackStepFromFailureReason: "rate_limit",
fallbackStepChainPosition: 1,
fallbackStepFinalOutcome: "next_fallback",
}),
);
expect(onFallbackStep).toHaveBeenCalledWith(
expect.objectContaining({
fallbackStepType: "fallback_step",
fallbackStepFromModel: "openai/gpt-4.1-mini",
fallbackStepToModel: "anthropic/claude-haiku-3-5",
fallbackStepFromFailureReason: "rate_limit",
fallbackStepChainPosition: 2,
fallbackStepFinalOutcome: "succeeded",
}),
);
});
it("probes primary model when cooldown already expired", async () => {

View File

@@ -23,7 +23,11 @@ import {
shouldUseTransientCooldownProbeSlot,
} from "./failover-policy.js";
import { LiveSessionModelSwitchError } from "./live-model-switch-error.js";
import { logModelFallbackDecision } from "./model-fallback-observation.js";
import {
logModelFallbackDecision,
type ModelFallbackDecisionParams,
type ModelFallbackStepFields,
} from "./model-fallback-observation.js";
import type { FallbackAttempt, ModelCandidate } from "./model-fallback.types.js";
import { modelKey, normalizeModelRef } from "./model-selection-normalize.js";
import {
@@ -133,6 +137,8 @@ type ModelFallbackErrorHandler = (attempt: {
total: number;
}) => void | Promise<void>;
type ModelFallbackStepHandler = (step: ModelFallbackStepFields) => void | Promise<void>;
export type ModelFallbackResultClassification =
| {
message: string;
@@ -297,7 +303,7 @@ function recordFailedCandidateAttempt(params: {
isPrimary: boolean;
requestedModelMatched: boolean;
fallbackConfigured: boolean;
}) {
}): ModelFallbackStepFields | undefined {
const described = describeFailoverError(params.error);
params.attempts.push({
provider: params.candidate.provider,
@@ -307,7 +313,7 @@ function recordFailedCandidateAttempt(params: {
status: described.status,
code: described.code,
});
logModelFallbackDecision({
return logModelFallbackDecision({
decision: "candidate_failed",
runId: params.runId,
requestedProvider: params.requestedProvider ?? params.candidate.provider,
@@ -737,6 +743,7 @@ export async function runWithModelFallback<T>(params: {
fallbacksOverride?: string[];
run: ModelFallbackRunFn<T>;
onError?: ModelFallbackErrorHandler;
onFallbackStep?: ModelFallbackStepHandler;
classifyResult?: ModelFallbackResultClassifier<T>;
}): Promise<ModelFallbackRunResult<T>> {
const candidates = resolveFallbackCandidates({
@@ -755,6 +762,20 @@ export async function runWithModelFallback<T>(params: {
const attempts: FallbackAttempt[] = [];
let lastError: unknown;
const cooldownProbeUsedProviders = new Set<string>();
// Log a fallback decision and, when it maps to a concrete fallback step,
// notify the caller's onFallbackStep handler (awaited so recording completes
// before the loop continues).
const observeDecision = async (decision: ModelFallbackDecisionParams) => {
const fallbackStep = logModelFallbackDecision(decision);
if (fallbackStep) {
await params.onFallbackStep?.(fallbackStep);
}
};
// Record a failed candidate attempt and forward the derived fallback step
// (if any) to the caller's onFallbackStep handler.
const observeFailedCandidate = async (
failedAttempt: Parameters<typeof recordFailedCandidateAttempt>[0],
) => {
const fallbackStep = recordFailedCandidateAttempt(failedAttempt);
if (fallbackStep) {
await params.onFallbackStep?.(fallbackStep);
}
};
const hasFallbackCandidates = candidates.length > 1;
@@ -799,7 +820,7 @@ export async function runWithModelFallback<T>(params: {
error: decision.error,
reason: decision.reason,
});
logModelFallbackDecision({
await observeDecision({
decision: "skip_candidate",
runId: params.runId,
requestedProvider: params.provider,
@@ -834,7 +855,7 @@ export async function runWithModelFallback<T>(params: {
error,
reason: decision.reason,
});
logModelFallbackDecision({
await observeDecision({
decision: "skip_candidate",
runId: params.runId,
requestedProvider: params.provider,
@@ -858,7 +879,7 @@ export async function runWithModelFallback<T>(params: {
}
}
attemptedDuringCooldown = true;
logModelFallbackDecision({
await observeDecision({
decision: "probe_cooldown_candidate",
runId: params.runId,
requestedProvider: params.provider,
@@ -888,7 +909,7 @@ export async function runWithModelFallback<T>(params: {
});
if ("success" in attemptRun) {
if (i > 0 || attempts.length > 0 || attemptedDuringCooldown) {
logModelFallbackDecision({
await observeDecision({
decision: "candidate_succeeded",
runId: params.runId,
requestedProvider: params.provider,
@@ -956,7 +977,7 @@ export async function runWithModelFallback<T>(params: {
model: candidate.model,
});
lastError = switchNormalized;
recordFailedCandidateAttempt({
await observeFailedCandidate({
attempts,
candidate,
error: switchNormalized,
@@ -982,7 +1003,7 @@ export async function runWithModelFallback<T>(params: {
}
lastError = isKnownFailover ? normalized : err;
recordFailedCandidateAttempt({
await observeFailedCandidate({
attempts,
candidate,
error: normalized,