mirror of
https://github.com/openclaw/openclaw.git
synced 2026-05-06 09:50:42 +00:00
fix: cap idle timeouts without completed progress (#76345)
Co-authored-by: adhiraj <vacationadhi@gmail.com>
This commit is contained in:
@@ -89,6 +89,7 @@ Docs: https://docs.openclaw.ai
|
||||
- Gateway/models: keep read-only model-list responses on registry-compatible fallbacks and metadata defaults, so empty or minimal persisted model files do not hide built-ins or custom model capabilities. Thanks @Marvinthebored.
|
||||
- CLI/doctor: load the configured memory-slot plugin when resolving memory diagnostics so bundled `memory-core` no longer triggers a false “no active memory plugin” warning on standalone `doctor` / `status` runs. Fixes #76367. Thanks @neeravmakwana.
|
||||
- Gateway: preserve stack diagnostics when `chat.send` or agent attachment parsing/staging fails, improving image-send failure triage. Refs #63432. (#75135) Thanks @keen0206.
|
||||
- Agents/idle-timeout: add a cost-runaway breaker to the outer embedded-run retry loop that halts further attempts after 5 consecutive idle timeouts without completed model progress, so a wedged provider can no longer fan paid model calls out across the same run; completed text or tool-call progress resets the breaker, but partial tool-argument token dribbles do not. Fixes #76293. Thanks @ThePuma312.
|
||||
- Heartbeats/Codex: stop sending the legacy `HEARTBEAT_OK` prompt instruction when heartbeat turns have the structured `heartbeat_respond` tool, while keeping the text sentinel for legacy automatic heartbeat replies. Thanks @pashpashpash.
|
||||
- Agent runtimes: fail explicit plugin runtime selections honestly when the requested harness is unavailable instead of silently falling back to the embedded PI runtime. Thanks @pashpashpash.
|
||||
- Maintainer workflow: push prepared PR heads through GitHub's verified commit API by default and require an explicit override before git-protocol pushes can publish unsigned commits. Thanks @BunsDev.
|
||||
|
||||
@@ -120,6 +120,11 @@ import {
|
||||
type RuntimeAuthState,
|
||||
scrubAnthropicRefusalMagic,
|
||||
} from "./run/helpers.js";
|
||||
import {
|
||||
MAX_CONSECUTIVE_IDLE_TIMEOUTS_BEFORE_OUTPUT,
|
||||
createIdleTimeoutBreakerState,
|
||||
stepIdleTimeoutBreaker,
|
||||
} from "./run/idle-timeout-breaker.js";
|
||||
import {
|
||||
DEFAULT_EMPTY_RESPONSE_RETRY_LIMIT,
|
||||
DEFAULT_REASONING_ONLY_RETRY_LIMIT,
|
||||
@@ -214,6 +219,16 @@ function normalizeEmbeddedRunAttemptResult(
|
||||
};
|
||||
}
|
||||
|
||||
function hasCompletedModelProgressForIdleBreaker(attempt: EmbeddedRunAttemptForRunner): boolean {
|
||||
return (
|
||||
attempt.assistantTexts.some((text) => text.trim().length > 0) ||
|
||||
attempt.toolMetas.length > 0 ||
|
||||
(attempt.clientToolCalls?.length ?? 0) > 0 ||
|
||||
hasMessagingToolDeliveryEvidence(attempt) ||
|
||||
attempt.itemLifecycle.completedCount > 0
|
||||
);
|
||||
}
|
||||
|
||||
function createEmptyAuthProfileStore(): AuthProfileStore {
|
||||
return {
|
||||
version: 1,
|
||||
@@ -755,6 +770,13 @@ export async function runEmbeddedPiAgent(
|
||||
let reasoningOnlyRetryAttempts = 0;
|
||||
let emptyResponseRetryAttempts = 0;
|
||||
let sameModelIdleTimeoutRetries = 0;
|
||||
// Cost-runaway breaker for #76293. State lives at the run-loop level
|
||||
// on purpose so it survives across attempt boundaries and across
|
||||
// profile/auth retries within this embedded run (a wrapper-local
|
||||
// counter would reset on every iteration). The helper is pure and
|
||||
// unit-tested in run/idle-timeout-breaker.test.ts; the run loop just
|
||||
// feeds it the outcome of each attempt.
|
||||
const idleTimeoutBreakerState = createIdleTimeoutBreakerState();
|
||||
let lastRetryFailoverReason: FailoverReason | null = null;
|
||||
let planningOnlyRetryInstruction: string | null = null;
|
||||
let reasoningOnlyRetryInstruction: string | null = null;
|
||||
@@ -1162,6 +1184,55 @@ export async function runEmbeddedPiAgent(
|
||||
// reflects current context usage, not accumulated tool-loop usage.
|
||||
lastRunPromptUsage = lastAssistantUsage ?? attemptUsage;
|
||||
lastTurnTotal = lastAssistantUsage?.total ?? attemptUsage?.total;
|
||||
// Idle-timeout cost-runaway breaker (#76293). Logic lives in the
|
||||
// pure helper below so it stays unit-testable; the run loop just
|
||||
// feeds it the latest attempt outcome and bails through the
|
||||
// existing retry-limit exhaustion path when the cap is hit.
|
||||
const breakerStep = stepIdleTimeoutBreaker(idleTimeoutBreakerState, {
|
||||
idleTimedOut,
|
||||
completedModelProgress: hasCompletedModelProgressForIdleBreaker(attempt),
|
||||
outputTokens: attemptUsage?.output,
|
||||
});
|
||||
if (breakerStep.tripped) {
|
||||
const breakerMessage =
|
||||
`Idle-timeout cost-runaway breaker tripped: ` +
|
||||
`${breakerStep.consecutive} consecutive idle timeouts ` +
|
||||
`without completed model progress ` +
|
||||
`(cap=${MAX_CONSECUTIVE_IDLE_TIMEOUTS_BEFORE_OUTPUT}). ` +
|
||||
`Halting further attempts to bound paid model calls. ` +
|
||||
`See issue #76293.`;
|
||||
log.error(
|
||||
`[idle-timeout-circuit-breaker-tripped] ` +
|
||||
`sessionKey=${params.sessionKey ?? params.sessionId} ` +
|
||||
`provider=${provider}/${modelId} ` +
|
||||
`consecutive=${breakerStep.consecutive} ` +
|
||||
`cap=${MAX_CONSECUTIVE_IDLE_TIMEOUTS_BEFORE_OUTPUT}`,
|
||||
);
|
||||
const breakerDecision = resolveRunFailoverDecision({
|
||||
stage: "retry_limit",
|
||||
fallbackConfigured,
|
||||
failoverReason: lastRetryFailoverReason,
|
||||
});
|
||||
return handleRetryLimitExhaustion({
|
||||
message: breakerMessage,
|
||||
decision: breakerDecision,
|
||||
provider,
|
||||
model: modelId,
|
||||
profileId: lastProfileId,
|
||||
durationMs: Date.now() - started,
|
||||
agentMeta: buildErrorAgentMeta({
|
||||
sessionId: activeSessionId,
|
||||
provider,
|
||||
model: model.id,
|
||||
contextTokens: ctxInfo.tokens,
|
||||
usageAccumulator,
|
||||
lastRunPromptUsage,
|
||||
lastTurnTotal,
|
||||
}),
|
||||
replayInvalid: accumulatedReplayState.replayInvalid ? true : undefined,
|
||||
livenessState: "blocked",
|
||||
});
|
||||
}
|
||||
const attemptCompactionCount = Math.max(0, attempt.compactionCount ?? 0);
|
||||
autoCompactionCount += attemptCompactionCount;
|
||||
if (
|
||||
|
||||
164
src/agents/pi-embedded-runner/run/idle-timeout-breaker.test.ts
Normal file
164
src/agents/pi-embedded-runner/run/idle-timeout-breaker.test.ts
Normal file
@@ -0,0 +1,164 @@
|
||||
import { describe, expect, it } from "vitest";
|
||||
import {
|
||||
MAX_CONSECUTIVE_IDLE_TIMEOUTS_BEFORE_OUTPUT,
|
||||
createIdleTimeoutBreakerState,
|
||||
stepIdleTimeoutBreaker,
|
||||
} from "./idle-timeout-breaker.js";
|
||||
|
||||
// Issue #76293. The wedge: a stalled provider returns from each LLM call
|
||||
// with idleTimedOut=true and no completed model progress. Without this
|
||||
// breaker the outer run loop in run.ts can keep starting fresh attempts (a
|
||||
// new session and a new streamWithIdleTimeout wrapper each time, so any
|
||||
// wrapper-local counter would reset on every iteration). The breaker state
|
||||
// has to live at the outer-loop scope to survive across attempts and profile
|
||||
// failover, which is what stepIdleTimeoutBreaker captures.
|
||||
//
|
||||
// These tests exercise the helper directly. The integration in run.ts is
|
||||
// just `if (step.tripped) return handleRetryLimitExhaustion(...)`, so
|
||||
// proving the helper trips/resets correctly is what matters.
|
||||
describe("stepIdleTimeoutBreaker (#76293)", () => {
|
||||
function drive(
|
||||
inputs: Array<{
|
||||
idleTimedOut: boolean;
|
||||
completedModelProgress: boolean;
|
||||
outputTokens?: number;
|
||||
}>,
|
||||
options?: { cap?: number },
|
||||
) {
|
||||
const state = createIdleTimeoutBreakerState();
|
||||
const steps: Array<{ consecutive: number; tripped: boolean }> = [];
|
||||
for (const input of inputs) {
|
||||
steps.push(stepIdleTimeoutBreaker(state, input, options));
|
||||
}
|
||||
return steps;
|
||||
}
|
||||
|
||||
it("default cap matches the constant the run loop reads from", () => {
|
||||
expect(MAX_CONSECUTIVE_IDLE_TIMEOUTS_BEFORE_OUTPUT).toBe(5);
|
||||
});
|
||||
|
||||
it("does not trip on a single wedged attempt", () => {
|
||||
const steps = drive([{ idleTimedOut: true, completedModelProgress: false }]);
|
||||
expect(steps[0]).toEqual({ consecutive: 1, tripped: false });
|
||||
});
|
||||
|
||||
it("trips on the Nth consecutive wedged attempt at the default cap", () => {
|
||||
const steps = drive([
|
||||
{ idleTimedOut: true, completedModelProgress: false },
|
||||
{ idleTimedOut: true, completedModelProgress: false },
|
||||
{ idleTimedOut: true, completedModelProgress: false },
|
||||
{ idleTimedOut: true, completedModelProgress: false },
|
||||
{ idleTimedOut: true, completedModelProgress: false },
|
||||
]);
|
||||
expect(steps.map((s) => s.tripped)).toEqual([false, false, false, false, true]);
|
||||
expect(steps.at(-1)?.consecutive).toBe(5);
|
||||
});
|
||||
|
||||
it("respects an explicit smaller cap", () => {
|
||||
const steps = drive(
|
||||
[
|
||||
{ idleTimedOut: true, completedModelProgress: false },
|
||||
{ idleTimedOut: true, completedModelProgress: false },
|
||||
{ idleTimedOut: true, completedModelProgress: false },
|
||||
],
|
||||
{ cap: 3 },
|
||||
);
|
||||
expect(steps.map((s) => s.tripped)).toEqual([false, false, true]);
|
||||
});
|
||||
|
||||
it("disables the breaker entirely when cap is 0 (escape hatch)", () => {
|
||||
const steps = drive(
|
||||
[
|
||||
{ idleTimedOut: true, completedModelProgress: false },
|
||||
{ idleTimedOut: true, completedModelProgress: false },
|
||||
{ idleTimedOut: true, completedModelProgress: false },
|
||||
{ idleTimedOut: true, completedModelProgress: false },
|
||||
{ idleTimedOut: true, completedModelProgress: false },
|
||||
{ idleTimedOut: true, completedModelProgress: false },
|
||||
{ idleTimedOut: true, completedModelProgress: false },
|
||||
],
|
||||
{ cap: 0 },
|
||||
);
|
||||
expect(steps.every((s) => !s.tripped)).toBe(true);
|
||||
expect(steps.at(-1)?.consecutive).toBe(7);
|
||||
});
|
||||
|
||||
it("does not trip when the model completed progress, even on a timeout (slow but alive)", () => {
|
||||
// 8 attempts that each timed out but each completed text or tool-call
|
||||
// progress. The model is slow at the tail of its turn, not wedged. The
|
||||
// breaker must stay disarmed so legitimate slow streams keep retrying.
|
||||
const steps = drive(
|
||||
Array.from({ length: 8 }, () => ({
|
||||
idleTimedOut: true,
|
||||
completedModelProgress: true,
|
||||
outputTokens: 220,
|
||||
})),
|
||||
);
|
||||
expect(steps.every((s) => !s.tripped)).toBe(true);
|
||||
expect(steps.at(-1)?.consecutive).toBe(0);
|
||||
});
|
||||
|
||||
it("resets the counter when a productive attempt arrives between wedged attempts", () => {
|
||||
// 4 wedged + 1 productive (completed progress) + 4 wedged. No run of 5
|
||||
// wedged in a row, so the breaker must stay disarmed across the whole
|
||||
// 9-attempt sequence even though 8 of the attempts were wedged in total.
|
||||
const steps = drive([
|
||||
{ idleTimedOut: true, completedModelProgress: false },
|
||||
{ idleTimedOut: true, completedModelProgress: false },
|
||||
{ idleTimedOut: true, completedModelProgress: false },
|
||||
{ idleTimedOut: true, completedModelProgress: false },
|
||||
{ idleTimedOut: false, completedModelProgress: true, outputTokens: 320 },
|
||||
{ idleTimedOut: true, completedModelProgress: false },
|
||||
{ idleTimedOut: true, completedModelProgress: false },
|
||||
{ idleTimedOut: true, completedModelProgress: false },
|
||||
{ idleTimedOut: true, completedModelProgress: false },
|
||||
]);
|
||||
expect(steps.map((s) => s.tripped)).toEqual([
|
||||
false,
|
||||
false,
|
||||
false,
|
||||
false,
|
||||
false,
|
||||
false,
|
||||
false,
|
||||
false,
|
||||
false,
|
||||
]);
|
||||
expect(steps.map((s) => s.consecutive)).toEqual([1, 2, 3, 4, 0, 1, 2, 3, 4]);
|
||||
});
|
||||
|
||||
it("non-timeout error attempts (no output) leave the counter unchanged", () => {
|
||||
// Sequence: 3 wedged, then 2 non-timeout attempts that produced no
|
||||
// completed progress (e.g. transport error, prompt overflow), then 2
|
||||
// more wedged. The non-timeout attempts must NOT reset the counter
|
||||
// (they're not evidence the model is alive) and must NOT increment it
|
||||
// (the breaker is specifically about idle timeouts). So 3+0+0+1+1 = 5,
|
||||
// trip on the last attempt.
|
||||
const steps = drive([
|
||||
{ idleTimedOut: true, completedModelProgress: false },
|
||||
{ idleTimedOut: true, completedModelProgress: false },
|
||||
{ idleTimedOut: true, completedModelProgress: false },
|
||||
{ idleTimedOut: false, completedModelProgress: false },
|
||||
{ idleTimedOut: false, completedModelProgress: false },
|
||||
{ idleTimedOut: true, completedModelProgress: false },
|
||||
{ idleTimedOut: true, completedModelProgress: false },
|
||||
]);
|
||||
expect(steps.map((s) => s.consecutive)).toEqual([1, 2, 3, 3, 3, 4, 5]);
|
||||
expect(steps.at(-1)?.tripped).toBe(true);
|
||||
});
|
||||
|
||||
it("does not reset for partial tool-argument tokens without completed progress", () => {
|
||||
const steps = drive([
|
||||
{ idleTimedOut: true, completedModelProgress: false, outputTokens: 12 },
|
||||
{ idleTimedOut: true, completedModelProgress: false, outputTokens: 18 },
|
||||
{ idleTimedOut: true, completedModelProgress: false, outputTokens: 24 },
|
||||
{ idleTimedOut: true, completedModelProgress: false, outputTokens: 30 },
|
||||
{ idleTimedOut: true, completedModelProgress: false, outputTokens: 36 },
|
||||
]);
|
||||
// Raw provider output tokens can come from partial tool-call argument
|
||||
// deltas before the provider stalls. They are billed, but they are not
|
||||
// completed progress, so they must not reset the breaker.
|
||||
expect(steps.map((s) => s.consecutive)).toEqual([1, 2, 3, 4, 5]);
|
||||
expect(steps.at(-1)?.tripped).toBe(true);
|
||||
});
|
||||
});
|
||||
82
src/agents/pi-embedded-runner/run/idle-timeout-breaker.ts
Normal file
82
src/agents/pi-embedded-runner/run/idle-timeout-breaker.ts
Normal file
@@ -0,0 +1,82 @@
|
||||
/**
|
||||
* Cap on consecutive attempts that ended in an idle timeout without completed
|
||||
* model progress, before the outer run loop refuses to start another attempt.
|
||||
* Distinct from MAX_SAME_MODEL_IDLE_TIMEOUT_RETRIES (which gates one extra
|
||||
* retry on the same model before failover) and the broad MAX_RUN_LOOP_ITERATIONS
|
||||
* backstop in run.ts.
|
||||
*
|
||||
* This one fires across profile/auth retries inside the same embedded run so a
|
||||
* wedged provider cannot fan out paid model calls across every fallback profile
|
||||
* in sequence. Resets when an attempt produces completed text or tool-call
|
||||
* progress, but not merely because the provider billed partial output tokens.
|
||||
*
|
||||
* See issue #76293 for the original report (single heartbeat fire generating
|
||||
* 761-1384 paid Anthropic calls in 60 seconds, costing $20-30 per incident).
|
||||
*/
|
||||
export const MAX_CONSECUTIVE_IDLE_TIMEOUTS_BEFORE_OUTPUT = 5;
|
||||
|
||||
export type IdleTimeoutBreakerState = {
|
||||
consecutiveIdleTimeoutsBeforeOutput: number;
|
||||
};
|
||||
|
||||
export function createIdleTimeoutBreakerState(): IdleTimeoutBreakerState {
|
||||
return { consecutiveIdleTimeoutsBeforeOutput: 0 };
|
||||
}
|
||||
|
||||
export type IdleTimeoutBreakerInput = {
|
||||
idleTimedOut: boolean;
|
||||
completedModelProgress: boolean;
|
||||
outputTokens?: number;
|
||||
};
|
||||
|
||||
export type IdleTimeoutBreakerStep = {
|
||||
consecutive: number;
|
||||
tripped: boolean;
|
||||
};
|
||||
|
||||
/**
|
||||
* Update the breaker counter from the latest attempt's outcome and report
|
||||
* whether the cap is now tripped. Designed to be called from the outer run
|
||||
* loop right after an embedded attempt completes.
|
||||
*
|
||||
* Pure function modulo the mutable `state.consecutiveIdleTimeoutsBeforeOutput`
|
||||
* field, so the caller decides where the state lives (typically a `let` in
|
||||
* the outer loop).
|
||||
*
|
||||
* Decision table:
|
||||
* idleTimedOut completedModelProgress action
|
||||
* ------------ ---------------------- ------
|
||||
* true false count += 1 (wedged provider candidate)
|
||||
* true true count = 0 (model is alive but slow tail)
|
||||
* false true count = 0 (clean progress, all good)
|
||||
* false false count unchanged (e.g. non-timeout error;
|
||||
* don't poison or reset)
|
||||
*
|
||||
* The "false / false" branch matters: a non-timeout error attempt with no
|
||||
* completed progress should not reset the breaker (it isn't a sign the
|
||||
* provider is healthy), but it also shouldn't increment it (the issue at hand
|
||||
* is idle timeouts, not arbitrary errors).
|
||||
*
|
||||
* `outputTokens` is intentionally not part of the reset condition. Some
|
||||
* transports can accumulate billed output tokens from partial tool-call
|
||||
* argument deltas before the model stalls; those tokens are cost, not completed
|
||||
* progress, so they must not keep the breaker disarmed.
|
||||
*/
|
||||
export function stepIdleTimeoutBreaker(
|
||||
state: IdleTimeoutBreakerState,
|
||||
input: IdleTimeoutBreakerInput,
|
||||
options?: { cap?: number },
|
||||
): IdleTimeoutBreakerStep {
|
||||
const cap = options?.cap ?? MAX_CONSECUTIVE_IDLE_TIMEOUTS_BEFORE_OUTPUT;
|
||||
|
||||
if (input.idleTimedOut && !input.completedModelProgress) {
|
||||
state.consecutiveIdleTimeoutsBeforeOutput += 1;
|
||||
} else if (input.completedModelProgress) {
|
||||
state.consecutiveIdleTimeoutsBeforeOutput = 0;
|
||||
}
|
||||
|
||||
return {
|
||||
consecutive: state.consecutiveIdleTimeoutsBeforeOutput,
|
||||
tripped: cap > 0 && state.consecutiveIdleTimeoutsBeforeOutput >= cap,
|
||||
};
|
||||
}
|
||||
Reference in New Issue
Block a user