fix: cap idle timeouts without completed progress (#76345)

Co-authored-by: adhiraj <vacationadhi@gmail.com>
This commit is contained in:
Peter Steinberger
2026-05-03 14:07:03 +01:00
parent ccce342a24
commit 8f20d03373
4 changed files with 318 additions and 0 deletions

View File

@@ -89,6 +89,7 @@ Docs: https://docs.openclaw.ai
- Gateway/models: keep read-only model-list responses on registry-compatible fallbacks and metadata defaults, so empty or minimal persisted model files do not hide built-ins or custom model capabilities. Thanks @Marvinthebored.
- CLI/doctor: load the configured memory-slot plugin when resolving memory diagnostics so bundled `memory-core` no longer triggers a false “no active memory plugin” warning on standalone `doctor` / `status` runs. Fixes #76367. Thanks @neeravmakwana.
- Gateway: preserve stack diagnostics when `chat.send` or agent attachment parsing/staging fails, improving image-send failure triage. Refs #63432. (#75135) Thanks @keen0206.
- Agents/idle-timeout: add a cost-runaway breaker to the outer embedded-run retry loop that halts further attempts after 5 consecutive idle timeouts without completed model progress, so a wedged provider can no longer fan paid model calls out across the same run; completed text or tool-call progress resets the breaker, but partial tool-argument token dribbles do not. Fixes #76293. Thanks @ThePuma312.
- Heartbeats/Codex: stop sending the legacy `HEARTBEAT_OK` prompt instruction when heartbeat turns have the structured `heartbeat_respond` tool, while keeping the text sentinel for legacy automatic heartbeat replies. Thanks @pashpashpash.
- Agent runtimes: fail explicit plugin runtime selections honestly when the requested harness is unavailable instead of silently falling back to the embedded PI runtime. Thanks @pashpashpash.
- Maintainer workflow: push prepared PR heads through GitHub's verified commit API by default and require an explicit override before git-protocol pushes can publish unsigned commits. Thanks @BunsDev.

View File

@@ -120,6 +120,11 @@ import {
type RuntimeAuthState,
scrubAnthropicRefusalMagic,
} from "./run/helpers.js";
import {
MAX_CONSECUTIVE_IDLE_TIMEOUTS_BEFORE_OUTPUT,
createIdleTimeoutBreakerState,
stepIdleTimeoutBreaker,
} from "./run/idle-timeout-breaker.js";
import {
DEFAULT_EMPTY_RESPONSE_RETRY_LIMIT,
DEFAULT_REASONING_ONLY_RETRY_LIMIT,
@@ -214,6 +219,16 @@ function normalizeEmbeddedRunAttemptResult(
};
}
function hasCompletedModelProgressForIdleBreaker(attempt: EmbeddedRunAttemptForRunner): boolean {
return (
attempt.assistantTexts.some((text) => text.trim().length > 0) ||
attempt.toolMetas.length > 0 ||
(attempt.clientToolCalls?.length ?? 0) > 0 ||
hasMessagingToolDeliveryEvidence(attempt) ||
attempt.itemLifecycle.completedCount > 0
);
}
function createEmptyAuthProfileStore(): AuthProfileStore {
return {
version: 1,
@@ -755,6 +770,13 @@ export async function runEmbeddedPiAgent(
let reasoningOnlyRetryAttempts = 0;
let emptyResponseRetryAttempts = 0;
let sameModelIdleTimeoutRetries = 0;
// Cost-runaway breaker for #76293. State lives at the run-loop level
// on purpose so it survives across attempt boundaries and across
// profile/auth retries within this embedded run (a wrapper-local
// counter would reset on every iteration). The helper is pure and
// unit-tested in run/idle-timeout-breaker.test.ts; the run loop just
// feeds it the outcome of each attempt.
const idleTimeoutBreakerState = createIdleTimeoutBreakerState();
let lastRetryFailoverReason: FailoverReason | null = null;
let planningOnlyRetryInstruction: string | null = null;
let reasoningOnlyRetryInstruction: string | null = null;
@@ -1162,6 +1184,55 @@ export async function runEmbeddedPiAgent(
// reflects current context usage, not accumulated tool-loop usage.
lastRunPromptUsage = lastAssistantUsage ?? attemptUsage;
lastTurnTotal = lastAssistantUsage?.total ?? attemptUsage?.total;
// Idle-timeout cost-runaway breaker (#76293). Logic lives in the
// pure helper below so it stays unit-testable; the run loop just
// feeds it the latest attempt outcome and bails through the
// existing retry-limit exhaustion path when the cap is hit.
const breakerStep = stepIdleTimeoutBreaker(idleTimeoutBreakerState, {
idleTimedOut,
completedModelProgress: hasCompletedModelProgressForIdleBreaker(attempt),
outputTokens: attemptUsage?.output,
});
if (breakerStep.tripped) {
const breakerMessage =
`Idle-timeout cost-runaway breaker tripped: ` +
`${breakerStep.consecutive} consecutive idle timeouts ` +
`without completed model progress ` +
`(cap=${MAX_CONSECUTIVE_IDLE_TIMEOUTS_BEFORE_OUTPUT}). ` +
`Halting further attempts to bound paid model calls. ` +
`See issue #76293.`;
log.error(
`[idle-timeout-circuit-breaker-tripped] ` +
`sessionKey=${params.sessionKey ?? params.sessionId} ` +
`provider=${provider}/${modelId} ` +
`consecutive=${breakerStep.consecutive} ` +
`cap=${MAX_CONSECUTIVE_IDLE_TIMEOUTS_BEFORE_OUTPUT}`,
);
const breakerDecision = resolveRunFailoverDecision({
stage: "retry_limit",
fallbackConfigured,
failoverReason: lastRetryFailoverReason,
});
return handleRetryLimitExhaustion({
message: breakerMessage,
decision: breakerDecision,
provider,
model: modelId,
profileId: lastProfileId,
durationMs: Date.now() - started,
agentMeta: buildErrorAgentMeta({
sessionId: activeSessionId,
provider,
model: model.id,
contextTokens: ctxInfo.tokens,
usageAccumulator,
lastRunPromptUsage,
lastTurnTotal,
}),
replayInvalid: accumulatedReplayState.replayInvalid ? true : undefined,
livenessState: "blocked",
});
}
const attemptCompactionCount = Math.max(0, attempt.compactionCount ?? 0);
autoCompactionCount += attemptCompactionCount;
if (

View File

@@ -0,0 +1,164 @@
import { describe, expect, it } from "vitest";
import {
MAX_CONSECUTIVE_IDLE_TIMEOUTS_BEFORE_OUTPUT,
createIdleTimeoutBreakerState,
stepIdleTimeoutBreaker,
} from "./idle-timeout-breaker.js";
// Issue #76293. The wedge: a stalled provider returns from each LLM call
// with idleTimedOut=true and no completed model progress. Without this
// breaker the outer run loop in run.ts can keep starting fresh attempts (a
// new session and a new streamWithIdleTimeout wrapper each time, so any
// wrapper-local counter would reset on every iteration). The breaker state
// has to live at the outer-loop scope to survive across attempts and profile
// failover, which is what stepIdleTimeoutBreaker captures.
//
// These tests exercise the helper directly. The integration in run.ts is
// just `if (step.tripped) return handleRetryLimitExhaustion(...)`, so
// proving the helper trips/resets correctly is what matters.
describe("stepIdleTimeoutBreaker (#76293)", () => {
function drive(
inputs: Array<{
idleTimedOut: boolean;
completedModelProgress: boolean;
outputTokens?: number;
}>,
options?: { cap?: number },
) {
const state = createIdleTimeoutBreakerState();
const steps: Array<{ consecutive: number; tripped: boolean }> = [];
for (const input of inputs) {
steps.push(stepIdleTimeoutBreaker(state, input, options));
}
return steps;
}
it("default cap matches the constant the run loop reads from", () => {
expect(MAX_CONSECUTIVE_IDLE_TIMEOUTS_BEFORE_OUTPUT).toBe(5);
});
it("does not trip on a single wedged attempt", () => {
const steps = drive([{ idleTimedOut: true, completedModelProgress: false }]);
expect(steps[0]).toEqual({ consecutive: 1, tripped: false });
});
it("trips on the Nth consecutive wedged attempt at the default cap", () => {
const steps = drive([
{ idleTimedOut: true, completedModelProgress: false },
{ idleTimedOut: true, completedModelProgress: false },
{ idleTimedOut: true, completedModelProgress: false },
{ idleTimedOut: true, completedModelProgress: false },
{ idleTimedOut: true, completedModelProgress: false },
]);
expect(steps.map((s) => s.tripped)).toEqual([false, false, false, false, true]);
expect(steps.at(-1)?.consecutive).toBe(5);
});
it("respects an explicit smaller cap", () => {
const steps = drive(
[
{ idleTimedOut: true, completedModelProgress: false },
{ idleTimedOut: true, completedModelProgress: false },
{ idleTimedOut: true, completedModelProgress: false },
],
{ cap: 3 },
);
expect(steps.map((s) => s.tripped)).toEqual([false, false, true]);
});
it("disables the breaker entirely when cap is 0 (escape hatch)", () => {
const steps = drive(
[
{ idleTimedOut: true, completedModelProgress: false },
{ idleTimedOut: true, completedModelProgress: false },
{ idleTimedOut: true, completedModelProgress: false },
{ idleTimedOut: true, completedModelProgress: false },
{ idleTimedOut: true, completedModelProgress: false },
{ idleTimedOut: true, completedModelProgress: false },
{ idleTimedOut: true, completedModelProgress: false },
],
{ cap: 0 },
);
expect(steps.every((s) => !s.tripped)).toBe(true);
expect(steps.at(-1)?.consecutive).toBe(7);
});
it("does not trip when the model completed progress, even on a timeout (slow but alive)", () => {
// 8 attempts that each timed out but each completed text or tool-call
// progress. The model is slow at the tail of its turn, not wedged. The
// breaker must stay disarmed so legitimate slow streams keep retrying.
const steps = drive(
Array.from({ length: 8 }, () => ({
idleTimedOut: true,
completedModelProgress: true,
outputTokens: 220,
})),
);
expect(steps.every((s) => !s.tripped)).toBe(true);
expect(steps.at(-1)?.consecutive).toBe(0);
});
it("resets the counter when a productive attempt arrives between wedged attempts", () => {
// 4 wedged + 1 productive (completed progress) + 4 wedged. No run of 5
// wedged in a row, so the breaker must stay disarmed across the whole
// 9-attempt sequence even though 8 of the attempts were wedged in total.
const steps = drive([
{ idleTimedOut: true, completedModelProgress: false },
{ idleTimedOut: true, completedModelProgress: false },
{ idleTimedOut: true, completedModelProgress: false },
{ idleTimedOut: true, completedModelProgress: false },
{ idleTimedOut: false, completedModelProgress: true, outputTokens: 320 },
{ idleTimedOut: true, completedModelProgress: false },
{ idleTimedOut: true, completedModelProgress: false },
{ idleTimedOut: true, completedModelProgress: false },
{ idleTimedOut: true, completedModelProgress: false },
]);
expect(steps.map((s) => s.tripped)).toEqual([
false,
false,
false,
false,
false,
false,
false,
false,
false,
]);
expect(steps.map((s) => s.consecutive)).toEqual([1, 2, 3, 4, 0, 1, 2, 3, 4]);
});
it("non-timeout error attempts (no output) leave the counter unchanged", () => {
// Sequence: 3 wedged, then 2 non-timeout attempts that produced no
// completed progress (e.g. transport error, prompt overflow), then 2
// more wedged. The non-timeout attempts must NOT reset the counter
// (they're not evidence the model is alive) and must NOT increment it
// (the breaker is specifically about idle timeouts). So 3+0+0+1+1 = 5,
// trip on the last attempt.
const steps = drive([
{ idleTimedOut: true, completedModelProgress: false },
{ idleTimedOut: true, completedModelProgress: false },
{ idleTimedOut: true, completedModelProgress: false },
{ idleTimedOut: false, completedModelProgress: false },
{ idleTimedOut: false, completedModelProgress: false },
{ idleTimedOut: true, completedModelProgress: false },
{ idleTimedOut: true, completedModelProgress: false },
]);
expect(steps.map((s) => s.consecutive)).toEqual([1, 2, 3, 3, 3, 4, 5]);
expect(steps.at(-1)?.tripped).toBe(true);
});
it("does not reset for partial tool-argument tokens without completed progress", () => {
const steps = drive([
{ idleTimedOut: true, completedModelProgress: false, outputTokens: 12 },
{ idleTimedOut: true, completedModelProgress: false, outputTokens: 18 },
{ idleTimedOut: true, completedModelProgress: false, outputTokens: 24 },
{ idleTimedOut: true, completedModelProgress: false, outputTokens: 30 },
{ idleTimedOut: true, completedModelProgress: false, outputTokens: 36 },
]);
// Raw provider output tokens can come from partial tool-call argument
// deltas before the provider stalls. They are billed, but they are not
// completed progress, so they must not reset the breaker.
expect(steps.map((s) => s.consecutive)).toEqual([1, 2, 3, 4, 5]);
expect(steps.at(-1)?.tripped).toBe(true);
});
});

View File

@@ -0,0 +1,82 @@
/**
* Cap on consecutive attempts that ended in an idle timeout without completed
* model progress, before the outer run loop refuses to start another attempt.
* Distinct from MAX_SAME_MODEL_IDLE_TIMEOUT_RETRIES (which gates one extra
* retry on the same model before failover) and the broad MAX_RUN_LOOP_ITERATIONS
* backstop in run.ts.
*
* This one fires across profile/auth retries inside the same embedded run so a
* wedged provider cannot fan out paid model calls across every fallback profile
* in sequence. Resets when an attempt produces completed text or tool-call
* progress, but not merely because the provider billed partial output tokens.
*
* See issue #76293 for the original report (single heartbeat fire generating
* 761-1384 paid Anthropic calls in 60 seconds, costing $20-30 per incident).
*/
export const MAX_CONSECUTIVE_IDLE_TIMEOUTS_BEFORE_OUTPUT = 5;
export type IdleTimeoutBreakerState = {
consecutiveIdleTimeoutsBeforeOutput: number;
};
export function createIdleTimeoutBreakerState(): IdleTimeoutBreakerState {
return { consecutiveIdleTimeoutsBeforeOutput: 0 };
}
export type IdleTimeoutBreakerInput = {
idleTimedOut: boolean;
completedModelProgress: boolean;
outputTokens?: number;
};
export type IdleTimeoutBreakerStep = {
consecutive: number;
tripped: boolean;
};
/**
* Update the breaker counter from the latest attempt's outcome and report
* whether the cap is now tripped. Designed to be called from the outer run
* loop right after an embedded attempt completes.
*
* Pure function modulo the mutable `state.consecutiveIdleTimeoutsBeforeOutput`
* field, so the caller decides where the state lives (typically a `let` in
* the outer loop).
*
* Decision table:
* idleTimedOut completedModelProgress action
* ------------ ---------------------- ------
* true false count += 1 (wedged provider candidate)
* true true count = 0 (model is alive but slow tail)
* false true count = 0 (clean progress, all good)
* false false count unchanged (e.g. non-timeout error;
* don't poison or reset)
*
* The "false / false" branch matters: a non-timeout error attempt with no
* completed progress should not reset the breaker (it isn't a sign the
* provider is healthy), but it also shouldn't increment it (the issue at hand
* is idle timeouts, not arbitrary errors).
*
* `outputTokens` is intentionally not part of the reset condition. Some
* transports can accumulate billed output tokens from partial tool-call
* argument deltas before the model stalls; those tokens are cost, not completed
* progress, so they must not keep the breaker disarmed.
*/
export function stepIdleTimeoutBreaker(
state: IdleTimeoutBreakerState,
input: IdleTimeoutBreakerInput,
options?: { cap?: number },
): IdleTimeoutBreakerStep {
const cap = options?.cap ?? MAX_CONSECUTIVE_IDLE_TIMEOUTS_BEFORE_OUTPUT;
if (input.idleTimedOut && !input.completedModelProgress) {
state.consecutiveIdleTimeoutsBeforeOutput += 1;
} else if (input.completedModelProgress) {
state.consecutiveIdleTimeoutsBeforeOutput = 0;
}
return {
consecutive: state.consecutiveIdleTimeoutsBeforeOutput,
tripped: cap > 0 && state.consecutiveIdleTimeoutsBeforeOutput >= cap,
};
}