diff --git a/CHANGELOG.md b/CHANGELOG.md index 99a8f357737..e5348e629a3 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -89,6 +89,7 @@ Docs: https://docs.openclaw.ai - Gateway/models: keep read-only model-list responses on registry-compatible fallbacks and metadata defaults, so empty or minimal persisted model files do not hide built-ins or custom model capabilities. Thanks @Marvinthebored. - CLI/doctor: load the configured memory-slot plugin when resolving memory diagnostics so bundled `memory-core` no longer triggers a false “no active memory plugin” warning on standalone `doctor` / `status` runs. Fixes #76367. Thanks @neeravmakwana. - Gateway: preserve stack diagnostics when `chat.send` or agent attachment parsing/staging fails, improving image-send failure triage. Refs #63432. (#75135) Thanks @keen0206. +- Agents/idle-timeout: add a cost-runaway breaker to the outer embedded-run retry loop that halts further attempts after 5 consecutive idle timeouts without completed model progress, so a wedged provider can no longer fan paid model calls out across the same run; completed text or tool-call progress resets the breaker, but partial tool-argument token dribbles do not. Fixes #76293. Thanks @ThePuma312. - Heartbeats/Codex: stop sending the legacy `HEARTBEAT_OK` prompt instruction when heartbeat turns have the structured `heartbeat_respond` tool, while keeping the text sentinel for legacy automatic heartbeat replies. Thanks @pashpashpash. - Agent runtimes: fail explicit plugin runtime selections honestly when the requested harness is unavailable instead of silently falling back to the embedded PI runtime. Thanks @pashpashpash. - Maintainer workflow: push prepared PR heads through GitHub's verified commit API by default and require an explicit override before git-protocol pushes can publish unsigned commits. Thanks @BunsDev. 
diff --git a/src/agents/pi-embedded-runner/run.ts b/src/agents/pi-embedded-runner/run.ts index dc7ce4b9f34..0a4b9855ca6 100644 --- a/src/agents/pi-embedded-runner/run.ts +++ b/src/agents/pi-embedded-runner/run.ts @@ -120,6 +120,11 @@ import { type RuntimeAuthState, scrubAnthropicRefusalMagic, } from "./run/helpers.js"; +import { + MAX_CONSECUTIVE_IDLE_TIMEOUTS_BEFORE_OUTPUT, + createIdleTimeoutBreakerState, + stepIdleTimeoutBreaker, +} from "./run/idle-timeout-breaker.js"; import { DEFAULT_EMPTY_RESPONSE_RETRY_LIMIT, DEFAULT_REASONING_ONLY_RETRY_LIMIT, @@ -214,6 +219,16 @@ function normalizeEmbeddedRunAttemptResult( }; } +function hasCompletedModelProgressForIdleBreaker(attempt: EmbeddedRunAttemptForRunner): boolean { + return ( + attempt.assistantTexts.some((text) => text.trim().length > 0) || + attempt.toolMetas.length > 0 || + (attempt.clientToolCalls?.length ?? 0) > 0 || + hasMessagingToolDeliveryEvidence(attempt) || + attempt.itemLifecycle.completedCount > 0 + ); +} + function createEmptyAuthProfileStore(): AuthProfileStore { return { version: 1, @@ -755,6 +770,13 @@ export async function runEmbeddedPiAgent( let reasoningOnlyRetryAttempts = 0; let emptyResponseRetryAttempts = 0; let sameModelIdleTimeoutRetries = 0; + // Cost-runaway breaker for #76293. State lives at the run-loop level + // on purpose so it survives across attempt boundaries and across + // profile/auth retries within this embedded run (a wrapper-local + // counter would reset on every iteration). The helper is pure and + // unit-tested in run/idle-timeout-breaker.test.ts; the run loop just + // feeds it the outcome of each attempt. 
+ const idleTimeoutBreakerState = createIdleTimeoutBreakerState(); let lastRetryFailoverReason: FailoverReason | null = null; let planningOnlyRetryInstruction: string | null = null; let reasoningOnlyRetryInstruction: string | null = null; @@ -1162,6 +1184,55 @@ export async function runEmbeddedPiAgent( // reflects current context usage, not accumulated tool-loop usage. lastRunPromptUsage = lastAssistantUsage ?? attemptUsage; lastTurnTotal = lastAssistantUsage?.total ?? attemptUsage?.total; + // Idle-timeout cost-runaway breaker (#76293). Logic lives in the + // pure helper below so it stays unit-testable; the run loop just + // feeds it the latest attempt outcome and bails through the + // existing retry-limit exhaustion path when the cap is hit. + const breakerStep = stepIdleTimeoutBreaker(idleTimeoutBreakerState, { + idleTimedOut, + completedModelProgress: hasCompletedModelProgressForIdleBreaker(attempt), + outputTokens: attemptUsage?.output, + }); + if (breakerStep.tripped) { + const breakerMessage = + `Idle-timeout cost-runaway breaker tripped: ` + + `${breakerStep.consecutive} consecutive idle timeouts ` + + `without completed model progress ` + + `(cap=${MAX_CONSECUTIVE_IDLE_TIMEOUTS_BEFORE_OUTPUT}). ` + + `Halting further attempts to bound paid model calls. ` + + `See issue #76293.`; + log.error( + `[idle-timeout-circuit-breaker-tripped] ` + + `sessionKey=${params.sessionKey ?? 
params.sessionId} ` + + `provider=${provider}/${modelId} ` + + `consecutive=${breakerStep.consecutive} ` + + `cap=${MAX_CONSECUTIVE_IDLE_TIMEOUTS_BEFORE_OUTPUT}`, + ); + const breakerDecision = resolveRunFailoverDecision({ + stage: "retry_limit", + fallbackConfigured, + failoverReason: lastRetryFailoverReason, + }); + return handleRetryLimitExhaustion({ + message: breakerMessage, + decision: breakerDecision, + provider, + model: modelId, + profileId: lastProfileId, + durationMs: Date.now() - started, + agentMeta: buildErrorAgentMeta({ + sessionId: activeSessionId, + provider, + model: model.id, + contextTokens: ctxInfo.tokens, + usageAccumulator, + lastRunPromptUsage, + lastTurnTotal, + }), + replayInvalid: accumulatedReplayState.replayInvalid ? true : undefined, + livenessState: "blocked", + }); + } const attemptCompactionCount = Math.max(0, attempt.compactionCount ?? 0); autoCompactionCount += attemptCompactionCount; if ( diff --git a/src/agents/pi-embedded-runner/run/idle-timeout-breaker.test.ts b/src/agents/pi-embedded-runner/run/idle-timeout-breaker.test.ts new file mode 100644 index 00000000000..d168c3c481b --- /dev/null +++ b/src/agents/pi-embedded-runner/run/idle-timeout-breaker.test.ts @@ -0,0 +1,164 @@ +import { describe, expect, it } from "vitest"; +import { + MAX_CONSECUTIVE_IDLE_TIMEOUTS_BEFORE_OUTPUT, + createIdleTimeoutBreakerState, + stepIdleTimeoutBreaker, +} from "./idle-timeout-breaker.js"; + +// Issue #76293. The wedge: a stalled provider returns from each LLM call +// with idleTimedOut=true and no completed model progress. Without this +// breaker the outer run loop in run.ts can keep starting fresh attempts (a +// new session and a new streamWithIdleTimeout wrapper each time, so any +// wrapper-local counter would reset on every iteration). The breaker state +// has to live at the outer-loop scope to survive across attempts and profile +// failover, which is what stepIdleTimeoutBreaker captures. 
+// +// These tests exercise the helper directly. The integration in run.ts is +// just `if (step.tripped) return handleRetryLimitExhaustion(...)`, so +// proving the helper trips/resets correctly is what matters. +describe("stepIdleTimeoutBreaker (#76293)", () => { + function drive( + inputs: Array<{ + idleTimedOut: boolean; + completedModelProgress: boolean; + outputTokens?: number; + }>, + options?: { cap?: number }, + ) { + const state = createIdleTimeoutBreakerState(); + const steps: Array<{ consecutive: number; tripped: boolean }> = []; + for (const input of inputs) { + steps.push(stepIdleTimeoutBreaker(state, input, options)); + } + return steps; + } + + it("default cap matches the constant the run loop reads from", () => { + expect(MAX_CONSECUTIVE_IDLE_TIMEOUTS_BEFORE_OUTPUT).toBe(5); + }); + + it("does not trip on a single wedged attempt", () => { + const steps = drive([{ idleTimedOut: true, completedModelProgress: false }]); + expect(steps[0]).toEqual({ consecutive: 1, tripped: false }); + }); + + it("trips on the Nth consecutive wedged attempt at the default cap", () => { + const steps = drive([ + { idleTimedOut: true, completedModelProgress: false }, + { idleTimedOut: true, completedModelProgress: false }, + { idleTimedOut: true, completedModelProgress: false }, + { idleTimedOut: true, completedModelProgress: false }, + { idleTimedOut: true, completedModelProgress: false }, + ]); + expect(steps.map((s) => s.tripped)).toEqual([false, false, false, false, true]); + expect(steps.at(-1)?.consecutive).toBe(5); + }); + + it("respects an explicit smaller cap", () => { + const steps = drive( + [ + { idleTimedOut: true, completedModelProgress: false }, + { idleTimedOut: true, completedModelProgress: false }, + { idleTimedOut: true, completedModelProgress: false }, + ], + { cap: 3 }, + ); + expect(steps.map((s) => s.tripped)).toEqual([false, false, true]); + }); + + it("disables the breaker entirely when cap is 0 (escape hatch)", () => { + const steps = drive( + 
[ + { idleTimedOut: true, completedModelProgress: false }, + { idleTimedOut: true, completedModelProgress: false }, + { idleTimedOut: true, completedModelProgress: false }, + { idleTimedOut: true, completedModelProgress: false }, + { idleTimedOut: true, completedModelProgress: false }, + { idleTimedOut: true, completedModelProgress: false }, + { idleTimedOut: true, completedModelProgress: false }, + ], + { cap: 0 }, + ); + expect(steps.every((s) => !s.tripped)).toBe(true); + expect(steps.at(-1)?.consecutive).toBe(7); + }); + + it("does not trip when the model completed progress, even on a timeout (slow but alive)", () => { + // 8 attempts that each timed out but each completed text or tool-call + // progress. The model is slow at the tail of its turn, not wedged. The + // breaker must stay disarmed so legitimate slow streams keep retrying. + const steps = drive( + Array.from({ length: 8 }, () => ({ + idleTimedOut: true, + completedModelProgress: true, + outputTokens: 220, + })), + ); + expect(steps.every((s) => !s.tripped)).toBe(true); + expect(steps.at(-1)?.consecutive).toBe(0); + }); + + it("resets the counter when a productive attempt arrives between wedged attempts", () => { + // 4 wedged + 1 productive (completed progress) + 4 wedged. No run of 5 + // wedged in a row, so the breaker must stay disarmed across the whole + // 9-attempt sequence even though 8 of the attempts were wedged in total. 
+ const steps = drive([ + { idleTimedOut: true, completedModelProgress: false }, + { idleTimedOut: true, completedModelProgress: false }, + { idleTimedOut: true, completedModelProgress: false }, + { idleTimedOut: true, completedModelProgress: false }, + { idleTimedOut: false, completedModelProgress: true, outputTokens: 320 }, + { idleTimedOut: true, completedModelProgress: false }, + { idleTimedOut: true, completedModelProgress: false }, + { idleTimedOut: true, completedModelProgress: false }, + { idleTimedOut: true, completedModelProgress: false }, + ]); + expect(steps.map((s) => s.tripped)).toEqual([ + false, + false, + false, + false, + false, + false, + false, + false, + false, + ]); + expect(steps.map((s) => s.consecutive)).toEqual([1, 2, 3, 4, 0, 1, 2, 3, 4]); + }); + + it("non-timeout error attempts (no output) leave the counter unchanged", () => { + // Sequence: 3 wedged, then 2 non-timeout attempts that produced no + // completed progress (e.g. transport error, prompt overflow), then 2 + // more wedged. The non-timeout attempts must NOT reset the counter + // (they're not evidence the model is alive) and must NOT increment it + // (the breaker is specifically about idle timeouts). So 3+0+0+1+1 = 5, + // trip on the last attempt. 
+ const steps = drive([ + { idleTimedOut: true, completedModelProgress: false }, + { idleTimedOut: true, completedModelProgress: false }, + { idleTimedOut: true, completedModelProgress: false }, + { idleTimedOut: false, completedModelProgress: false }, + { idleTimedOut: false, completedModelProgress: false }, + { idleTimedOut: true, completedModelProgress: false }, + { idleTimedOut: true, completedModelProgress: false }, + ]); + expect(steps.map((s) => s.consecutive)).toEqual([1, 2, 3, 3, 3, 4, 5]); + expect(steps.at(-1)?.tripped).toBe(true); + }); + + it("does not reset for partial tool-argument tokens without completed progress", () => { + const steps = drive([ + { idleTimedOut: true, completedModelProgress: false, outputTokens: 12 }, + { idleTimedOut: true, completedModelProgress: false, outputTokens: 18 }, + { idleTimedOut: true, completedModelProgress: false, outputTokens: 24 }, + { idleTimedOut: true, completedModelProgress: false, outputTokens: 30 }, + { idleTimedOut: true, completedModelProgress: false, outputTokens: 36 }, + ]); + // Raw provider output tokens can come from partial tool-call argument + // deltas before the provider stalls. They are billed, but they are not + // completed progress, so they must not reset the breaker. + expect(steps.map((s) => s.consecutive)).toEqual([1, 2, 3, 4, 5]); + expect(steps.at(-1)?.tripped).toBe(true); + }); +}); diff --git a/src/agents/pi-embedded-runner/run/idle-timeout-breaker.ts b/src/agents/pi-embedded-runner/run/idle-timeout-breaker.ts new file mode 100644 index 00000000000..a1584a0a3d1 --- /dev/null +++ b/src/agents/pi-embedded-runner/run/idle-timeout-breaker.ts @@ -0,0 +1,82 @@ +/** + * Cap on consecutive attempts that ended in an idle timeout without completed + * model progress, before the outer run loop refuses to start another attempt. 
+ * Distinct from MAX_SAME_MODEL_IDLE_TIMEOUT_RETRIES (which gates one extra + * retry on the same model before failover) and the broad MAX_RUN_LOOP_ITERATIONS + * backstop in run.ts. + * + * This one fires across profile/auth retries inside the same embedded run so a + * wedged provider cannot fan out paid model calls across every fallback profile + * in sequence. Resets when an attempt produces completed text or tool-call + * progress, but not merely because the provider billed partial output tokens. + * + * See issue #76293 for the original report (single heartbeat fire generating + * 761-1384 paid Anthropic calls in 60 seconds, costing $20-30 per incident). + */ +export const MAX_CONSECUTIVE_IDLE_TIMEOUTS_BEFORE_OUTPUT = 5; + +export type IdleTimeoutBreakerState = { + consecutiveIdleTimeoutsBeforeOutput: number; +}; + +export function createIdleTimeoutBreakerState(): IdleTimeoutBreakerState { + return { consecutiveIdleTimeoutsBeforeOutput: 0 }; +} + +export type IdleTimeoutBreakerInput = { + idleTimedOut: boolean; + completedModelProgress: boolean; + outputTokens?: number; +}; + +export type IdleTimeoutBreakerStep = { + consecutive: number; + tripped: boolean; +}; + +/** + * Update the breaker counter from the latest attempt's outcome and report + * whether the cap is now tripped. Designed to be called from the outer run + * loop right after an embedded attempt completes. + * + * Pure function modulo the mutable `state.consecutiveIdleTimeoutsBeforeOutput` + * field, so the caller decides where the state lives (typically a `const` in + * the outer loop whose counter field is mutated in place). + * + * Decision table: + * idleTimedOut completedModelProgress action + * ------------ ---------------------- ------ + * true false count += 1 (wedged provider candidate) + * true true count = 0 (model is alive but slow tail) + * false true count = 0 (clean progress, all good) + * false false count unchanged (e.g. 
non-timeout error; + * don't poison or reset) + * + * The "false / false" branch matters: a non-timeout error attempt with no + * completed progress should not reset the breaker (it isn't a sign the + * provider is healthy), but it also shouldn't increment it (the issue at hand + * is idle timeouts, not arbitrary errors). + * + * `outputTokens` is intentionally not part of the reset condition. Some + * transports can accumulate billed output tokens from partial tool-call + * argument deltas before the model stalls; those tokens are cost, not completed + * progress, so they must not keep the breaker disarmed. + */ +export function stepIdleTimeoutBreaker( + state: IdleTimeoutBreakerState, + input: IdleTimeoutBreakerInput, + options?: { cap?: number }, +): IdleTimeoutBreakerStep { + const cap = options?.cap ?? MAX_CONSECUTIVE_IDLE_TIMEOUTS_BEFORE_OUTPUT; + + if (input.idleTimedOut && !input.completedModelProgress) { + state.consecutiveIdleTimeoutsBeforeOutput += 1; + } else if (input.completedModelProgress) { + state.consecutiveIdleTimeoutsBeforeOutput = 0; + } + + return { + consecutive: state.consecutiveIdleTimeoutsBeforeOutput, + tripped: cap > 0 && state.consecutiveIdleTimeoutsBeforeOutput >= cap, + }; +}