fix: cap idle timeouts without completed progress (#76345)

Co-authored-by: adhiraj <vacationadhi@gmail.com>
2026-05-06 09:50:42 +00:00 · 2026-05-03 14:07:03 +01:00
parent ccce342a24
commit 8f20d03373
4 changed files with 318 additions and 0 deletions
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -89,6 +89,7 @@ Docs: https://docs.openclaw.ai
 - Gateway/models: keep read-only model-list responses on registry-compatible fallbacks and metadata defaults, so empty or minimal persisted model files do not hide built-ins or custom model capabilities. Thanks @Marvinthebored.
 - CLI/doctor: load the configured memory-slot plugin when resolving memory diagnostics so bundled `memory-core` no longer triggers a false “no active memory plugin” warning on standalone `doctor` / `status` runs. Fixes #76367. Thanks @neeravmakwana.
 - Gateway: preserve stack diagnostics when `chat.send` or agent attachment parsing/staging fails, improving image-send failure triage. Refs #63432. (#75135) Thanks @keen0206.
+- Agents/idle-timeout: add a cost-runaway breaker to the outer embedded-run retry loop that halts further attempts after 5 consecutive idle timeouts without completed model progress, so a wedged provider can no longer fan paid model calls out across the same run; completed text or tool-call progress resets the breaker, but partial tool-argument token dribbles do not. Fixes #76293. Thanks @ThePuma312.
 - Heartbeats/Codex: stop sending the legacy `HEARTBEAT_OK` prompt instruction when heartbeat turns have the structured `heartbeat_respond` tool, while keeping the text sentinel for legacy automatic heartbeat replies. Thanks @pashpashpash.
 - Agent runtimes: fail explicit plugin runtime selections honestly when the requested harness is unavailable instead of silently falling back to the embedded PI runtime. Thanks @pashpashpash.
 - Maintainer workflow: push prepared PR heads through GitHub's verified commit API by default and require an explicit override before git-protocol pushes can publish unsigned commits. Thanks @BunsDev.
--- a/src/agents/pi-embedded-runner/run.ts
+++ b/src/agents/pi-embedded-runner/run.ts
@@ -120,6 +120,11 @@ import {
  type RuntimeAuthState,
  scrubAnthropicRefusalMagic,
 } from "./run/helpers.js";
+import {
+  MAX_CONSECUTIVE_IDLE_TIMEOUTS_BEFORE_OUTPUT,
+  createIdleTimeoutBreakerState,
+  stepIdleTimeoutBreaker,
+} from "./run/idle-timeout-breaker.js";
 import {
  DEFAULT_EMPTY_RESPONSE_RETRY_LIMIT,
  DEFAULT_REASONING_ONLY_RETRY_LIMIT,
@@ -214,6 +219,16 @@ function normalizeEmbeddedRunAttemptResult(
  };
 }

+function hasCompletedModelProgressForIdleBreaker(attempt: EmbeddedRunAttemptForRunner): boolean {
+  return (
+    attempt.assistantTexts.some((text) => text.trim().length > 0) ||
+    attempt.toolMetas.length > 0 ||
+    (attempt.clientToolCalls?.length ?? 0) > 0 ||
+    hasMessagingToolDeliveryEvidence(attempt) ||
+    attempt.itemLifecycle.completedCount > 0
+  );
+}
+
 function createEmptyAuthProfileStore(): AuthProfileStore {
  return {
    version: 1,
@@ -755,6 +770,13 @@ export async function runEmbeddedPiAgent(
      let reasoningOnlyRetryAttempts = 0;
      let emptyResponseRetryAttempts = 0;
      let sameModelIdleTimeoutRetries = 0;
+      // Cost-runaway breaker for #76293. State lives at the run-loop level
+      // on purpose so it survives across attempt boundaries and across
+      // profile/auth retries within this embedded run (a wrapper-local
+      // counter would reset on every iteration). The helper is pure and
+      // unit-tested in run/idle-timeout-breaker.test.ts; the run loop just
+      // feeds it the outcome of each attempt.
+      const idleTimeoutBreakerState = createIdleTimeoutBreakerState();
      let lastRetryFailoverReason: FailoverReason | null = null;
      let planningOnlyRetryInstruction: string | null = null;
      let reasoningOnlyRetryInstruction: string | null = null;
@@ -1162,6 +1184,55 @@ export async function runEmbeddedPiAgent(
          // reflects current context usage, not accumulated tool-loop usage.
          lastRunPromptUsage = lastAssistantUsage ?? attemptUsage;
          lastTurnTotal = lastAssistantUsage?.total ?? attemptUsage?.total;
+          // Idle-timeout cost-runaway breaker (#76293). Logic lives in the
+          // pure helper below so it stays unit-testable; the run loop just
+          // feeds it the latest attempt outcome and bails through the
+          // existing retry-limit exhaustion path when the cap is hit.
+          const breakerStep = stepIdleTimeoutBreaker(idleTimeoutBreakerState, {
+            idleTimedOut,
+            completedModelProgress: hasCompletedModelProgressForIdleBreaker(attempt),
+            outputTokens: attemptUsage?.output,
+          });
+          if (breakerStep.tripped) {
+            const breakerMessage =
+              `Idle-timeout cost-runaway breaker tripped: ` +
+              `${breakerStep.consecutive} consecutive idle timeouts ` +
+              `without completed model progress ` +
+              `(cap=${MAX_CONSECUTIVE_IDLE_TIMEOUTS_BEFORE_OUTPUT}). ` +
+              `Halting further attempts to bound paid model calls. ` +
+              `See issue #76293.`;
+            log.error(
+              `[idle-timeout-circuit-breaker-tripped] ` +
+                `sessionKey=${params.sessionKey ?? params.sessionId} ` +
+                `provider=${provider}/${modelId} ` +
+                `consecutive=${breakerStep.consecutive} ` +
+                `cap=${MAX_CONSECUTIVE_IDLE_TIMEOUTS_BEFORE_OUTPUT}`,
+            );
+            const breakerDecision = resolveRunFailoverDecision({
+              stage: "retry_limit",
+              fallbackConfigured,
+              failoverReason: lastRetryFailoverReason,
+            });
+            return handleRetryLimitExhaustion({
+              message: breakerMessage,
+              decision: breakerDecision,
+              provider,
+              model: modelId,
+              profileId: lastProfileId,
+              durationMs: Date.now() - started,
+              agentMeta: buildErrorAgentMeta({
+                sessionId: activeSessionId,
+                provider,
+                model: model.id,
+                contextTokens: ctxInfo.tokens,
+                usageAccumulator,
+                lastRunPromptUsage,
+                lastTurnTotal,
+              }),
+              replayInvalid: accumulatedReplayState.replayInvalid ? true : undefined,
+              livenessState: "blocked",
+            });
+          }
          const attemptCompactionCount = Math.max(0, attempt.compactionCount ?? 0);
          autoCompactionCount += attemptCompactionCount;
          if (
--- a/src/agents/pi-embedded-runner/run/idle-timeout-breaker.test.ts
+++ b/src/agents/pi-embedded-runner/run/idle-timeout-breaker.test.ts
@@ -0,0 +1,164 @@
+import { describe, expect, it } from "vitest";
+import {
+  MAX_CONSECUTIVE_IDLE_TIMEOUTS_BEFORE_OUTPUT,
+  createIdleTimeoutBreakerState,
+  stepIdleTimeoutBreaker,
+} from "./idle-timeout-breaker.js";
+
+// Issue #76293. The wedge: a stalled provider returns from each LLM call
+// with idleTimedOut=true and no completed model progress. Without this
+// breaker the outer run loop in run.ts can keep starting fresh attempts (a
+// new session and a new streamWithIdleTimeout wrapper each time, so any
+// wrapper-local counter would reset on every iteration). The breaker state
+// has to live at the outer-loop scope to survive across attempts and profile
+// failover, which is what stepIdleTimeoutBreaker captures.
+//
+// These tests exercise the helper directly. The integration in run.ts is
+// just `if (step.tripped) return handleRetryLimitExhaustion(...)`, so
+// proving the helper trips/resets correctly is what matters.
+describe("stepIdleTimeoutBreaker (#76293)", () => {
+  function drive(
+    inputs: Array<{
+      idleTimedOut: boolean;
+      completedModelProgress: boolean;
+      outputTokens?: number;
+    }>,
+    options?: { cap?: number },
+  ) {
+    const state = createIdleTimeoutBreakerState();
+    const steps: Array<{ consecutive: number; tripped: boolean }> = [];
+    for (const input of inputs) {
+      steps.push(stepIdleTimeoutBreaker(state, input, options));
+    }
+    return steps;
+  }
+
+  it("default cap matches the constant the run loop reads from", () => {
+    expect(MAX_CONSECUTIVE_IDLE_TIMEOUTS_BEFORE_OUTPUT).toBe(5);
+  });
+
+  it("does not trip on a single wedged attempt", () => {
+    const steps = drive([{ idleTimedOut: true, completedModelProgress: false }]);
+    expect(steps[0]).toEqual({ consecutive: 1, tripped: false });
+  });
+
+  it("trips on the Nth consecutive wedged attempt at the default cap", () => {
+    const steps = drive([
+      { idleTimedOut: true, completedModelProgress: false },
+      { idleTimedOut: true, completedModelProgress: false },
+      { idleTimedOut: true, completedModelProgress: false },
+      { idleTimedOut: true, completedModelProgress: false },
+      { idleTimedOut: true, completedModelProgress: false },
+    ]);
+    expect(steps.map((s) => s.tripped)).toEqual([false, false, false, false, true]);
+    expect(steps.at(-1)?.consecutive).toBe(5);
+  });
+
+  it("respects an explicit smaller cap", () => {
+    const steps = drive(
+      [
+        { idleTimedOut: true, completedModelProgress: false },
+        { idleTimedOut: true, completedModelProgress: false },
+        { idleTimedOut: true, completedModelProgress: false },
+      ],
+      { cap: 3 },
+    );
+    expect(steps.map((s) => s.tripped)).toEqual([false, false, true]);
+  });
+
+  it("disables the breaker entirely when cap is 0 (escape hatch)", () => {
+    const steps = drive(
+      [
+        { idleTimedOut: true, completedModelProgress: false },
+        { idleTimedOut: true, completedModelProgress: false },
+        { idleTimedOut: true, completedModelProgress: false },
+        { idleTimedOut: true, completedModelProgress: false },
+        { idleTimedOut: true, completedModelProgress: false },
+        { idleTimedOut: true, completedModelProgress: false },
+        { idleTimedOut: true, completedModelProgress: false },
+      ],
+      { cap: 0 },
+    );
+    expect(steps.every((s) => !s.tripped)).toBe(true);
+    expect(steps.at(-1)?.consecutive).toBe(7);
+  });
+
+  it("does not trip when the model completed progress, even on a timeout (slow but alive)", () => {
+    // 8 attempts that each timed out but each completed text or tool-call
+    // progress. The model is slow at the tail of its turn, not wedged. The
+    // breaker must stay disarmed so legitimate slow streams keep retrying.
+    const steps = drive(
+      Array.from({ length: 8 }, () => ({
+        idleTimedOut: true,
+        completedModelProgress: true,
+        outputTokens: 220,
+      })),
+    );
+    expect(steps.every((s) => !s.tripped)).toBe(true);
+    expect(steps.at(-1)?.consecutive).toBe(0);
+  });
+
+  it("resets the counter when a productive attempt arrives between wedged attempts", () => {
+    // 4 wedged + 1 productive (completed progress) + 4 wedged. No run of 5
+    // wedged in a row, so the breaker must stay disarmed across the whole
+    // 9-attempt sequence even though 8 of the attempts were wedged in total.
+    const steps = drive([
+      { idleTimedOut: true, completedModelProgress: false },
+      { idleTimedOut: true, completedModelProgress: false },
+      { idleTimedOut: true, completedModelProgress: false },
+      { idleTimedOut: true, completedModelProgress: false },
+      { idleTimedOut: false, completedModelProgress: true, outputTokens: 320 },
+      { idleTimedOut: true, completedModelProgress: false },
+      { idleTimedOut: true, completedModelProgress: false },
+      { idleTimedOut: true, completedModelProgress: false },
+      { idleTimedOut: true, completedModelProgress: false },
+    ]);
+    expect(steps.map((s) => s.tripped)).toEqual([
+      false,
+      false,
+      false,
+      false,
+      false,
+      false,
+      false,
+      false,
+      false,
+    ]);
+    expect(steps.map((s) => s.consecutive)).toEqual([1, 2, 3, 4, 0, 1, 2, 3, 4]);
+  });
+
+  it("non-timeout error attempts (no output) leave the counter unchanged", () => {
+    // Sequence: 3 wedged, then 2 non-timeout attempts that produced no
+    // completed progress (e.g. transport error, prompt overflow), then 2
+    // more wedged. The non-timeout attempts must NOT reset the counter
+    // (they're not evidence the model is alive) and must NOT increment it
+    // (the breaker is specifically about idle timeouts). So 3+0+0+1+1 = 5,
+    // trip on the last attempt.
+    const steps = drive([
+      { idleTimedOut: true, completedModelProgress: false },
+      { idleTimedOut: true, completedModelProgress: false },
+      { idleTimedOut: true, completedModelProgress: false },
+      { idleTimedOut: false, completedModelProgress: false },
+      { idleTimedOut: false, completedModelProgress: false },
+      { idleTimedOut: true, completedModelProgress: false },
+      { idleTimedOut: true, completedModelProgress: false },
+    ]);
+    expect(steps.map((s) => s.consecutive)).toEqual([1, 2, 3, 3, 3, 4, 5]);
+    expect(steps.at(-1)?.tripped).toBe(true);
+  });
+
+  it("does not reset for partial tool-argument tokens without completed progress", () => {
+    const steps = drive([
+      { idleTimedOut: true, completedModelProgress: false, outputTokens: 12 },
+      { idleTimedOut: true, completedModelProgress: false, outputTokens: 18 },
+      { idleTimedOut: true, completedModelProgress: false, outputTokens: 24 },
+      { idleTimedOut: true, completedModelProgress: false, outputTokens: 30 },
+      { idleTimedOut: true, completedModelProgress: false, outputTokens: 36 },
+    ]);
+    // Raw provider output tokens can come from partial tool-call argument
+    // deltas before the provider stalls. They are billed, but they are not
+    // completed progress, so they must not reset the breaker.
+    expect(steps.map((s) => s.consecutive)).toEqual([1, 2, 3, 4, 5]);
+    expect(steps.at(-1)?.tripped).toBe(true);
+  });
+});
--- a/src/agents/pi-embedded-runner/run/idle-timeout-breaker.ts
+++ b/src/agents/pi-embedded-runner/run/idle-timeout-breaker.ts
@@ -0,0 +1,82 @@
+/**
+ * Cap on consecutive attempts that ended in an idle timeout without completed
+ * model progress, before the outer run loop refuses to start another attempt.
+ * Distinct from MAX_SAME_MODEL_IDLE_TIMEOUT_RETRIES (which gates one extra
+ * retry on the same model before failover) and the broad MAX_RUN_LOOP_ITERATIONS
+ * backstop in run.ts.
+ *
+ * This one fires across profile/auth retries inside the same embedded run so a
+ * wedged provider cannot fan out paid model calls across every fallback profile
+ * in sequence. Resets when an attempt produces completed text or tool-call
+ * progress, but not merely because the provider billed partial output tokens.
+ *
+ * See issue #76293 for the original report (single heartbeat fire generating
+ * 761-1384 paid Anthropic calls in 60 seconds, costing $20-30 per incident).
+ */
+export const MAX_CONSECUTIVE_IDLE_TIMEOUTS_BEFORE_OUTPUT = 5;
+
+export type IdleTimeoutBreakerState = {
+  consecutiveIdleTimeoutsBeforeOutput: number;
+};
+
+export function createIdleTimeoutBreakerState(): IdleTimeoutBreakerState {
+  return { consecutiveIdleTimeoutsBeforeOutput: 0 };
+}
+
+export type IdleTimeoutBreakerInput = {
+  idleTimedOut: boolean;
+  completedModelProgress: boolean;
+  outputTokens?: number;
+};
+
+export type IdleTimeoutBreakerStep = {
+  consecutive: number;
+  tripped: boolean;
+};
+
+/**
+ * Update the breaker counter from the latest attempt's outcome and report
+ * whether the cap is now tripped. Designed to be called from the outer run
+ * loop right after an embedded attempt completes.
+ *
+ * Pure function modulo the mutable `state.consecutiveIdleTimeoutsBeforeOutput`
+ * field, so the caller decides where the state lives (typically a `let` in
+ * the outer loop).
+ *
+ * Decision table:
+ *   idleTimedOut  completedModelProgress   action
+ *   ------------  ----------------------   ------
+ *   true          false                    count += 1   (wedged provider candidate)
+ *   true          true                     count = 0    (model is alive but slow tail)
+ *   false         true                     count = 0    (clean progress, all good)
+ *   false         false                    count unchanged (e.g. non-timeout error;
+ *                                                          don't poison or reset)
+ *
+ * The "false / false" branch matters: a non-timeout error attempt with no
+ * completed progress should not reset the breaker (it isn't a sign the
+ * provider is healthy), but it also shouldn't increment it (the issue at hand
+ * is idle timeouts, not arbitrary errors).
+ *
+ * `outputTokens` is intentionally not part of the reset condition. Some
+ * transports can accumulate billed output tokens from partial tool-call
+ * argument deltas before the model stalls; those tokens are cost, not completed
+ * progress, so they must not keep the breaker disarmed.
+ */
+export function stepIdleTimeoutBreaker(
+  state: IdleTimeoutBreakerState,
+  input: IdleTimeoutBreakerInput,
+  options?: { cap?: number },
+): IdleTimeoutBreakerStep {
+  const cap = options?.cap ?? MAX_CONSECUTIVE_IDLE_TIMEOUTS_BEFORE_OUTPUT;
+
+  if (input.idleTimedOut && !input.completedModelProgress) {
+    state.consecutiveIdleTimeoutsBeforeOutput += 1;
+  } else if (input.completedModelProgress) {
+    state.consecutiveIdleTimeoutsBeforeOutput = 0;
+  }
+
+  return {
+    consecutive: state.consecutiveIdleTimeoutsBeforeOutput,
+    tripped: cap > 0 && state.consecutiveIdleTimeoutsBeforeOutput >= cap,
+  };
+}