fix: recover stalled embedded diagnostic runs

This commit is contained in:
Peter Steinberger
2026-05-03 18:12:50 +01:00
parent 2416bc668c
commit 9a22473916
5 changed files with 96 additions and 5 deletions

View File

@@ -34,6 +34,7 @@ Docs: https://docs.openclaw.ai
- CLI/doctor: trust a ready gateway memory probe when CLI-side active memory backend resolution is unavailable, preventing false "No active memory plugin is registered" warnings for healthy runtime setups. Fixes #76792. Thanks @som-686.
- Memory/status: keep plain `openclaw memory status` and `openclaw memory status --json` on the cheap read-only path by reserving vector and embedding provider probes for `--deep` or `--index`. Fixes #76769. Thanks @daruire.
- Telegram: suppress stale same-session replies when a newer accepted message arrives before an older in-flight Telegram dispatch finalizes. Fixes #76642. Thanks @chinar-amrutkar.
- Gateway/diagnostics: abort-drain embedded runs after an extended no-progress stall so a single dead session no longer leaves queued Discord/channel turns blocked behind repeated `recovery=none` liveness warnings.
- Control UI/Sessions: avoid full `sessions.list` reloads for chat-turn `sessions.changed` payloads, so large session stores no longer add multi-second delays while chat responses are being delivered. (#76676) Thanks @VACInc.
- Gateway/watch: run `doctor --fix --non-interactive` once and retry when the dev Gateway child exits during startup, so stale local plugin install/config state does not leave the tmux watch session disappearing without a repair attempt.
- Doctor/Telegram: warn when selected Telegram quote replies can suppress `streaming.preview.toolProgress`, and document the `replyToMode` trade-off without changing runtime delivery. Fixes #73487. Thanks @GodsBoy.

View File

@@ -165,7 +165,7 @@ surfaces, while Codex native hooks remain a separate lower-level Codex mechanism
- `agent.wait` default: 30s (just the wait). `timeoutMs` param overrides.
- Agent runtime: `agents.defaults.timeoutSeconds` default 172800s (48 hours); enforced in `runEmbeddedPiAgent` abort timer.
- Cron runtime: isolated agent-turn `timeoutSeconds` is owned by cron. The scheduler starts that timer when execution begins, aborts the underlying run at the configured deadline, then runs bounded cleanup before recording the timeout so a stale child session cannot keep the lane stuck.
- Session liveness diagnostics: with diagnostics enabled, `diagnostics.stuckSessionWarnMs` classifies long `processing` sessions that have no observed reply, tool, status, block, or ACP progress. Active embedded runs, model calls, and tool calls report as `session.long_running`; active work with no recent progress reports as `session.stalled`; `session.stuck` is reserved for stale session bookkeeping with no active work, and only that path releases the affected session lane so queued startup work can drain. Repeated `session.stuck` diagnostics back off while the session remains unchanged.
- Session liveness diagnostics: with diagnostics enabled, `diagnostics.stuckSessionWarnMs` classifies long `processing` sessions that have no observed reply, tool, status, block, or ACP progress. Active embedded runs, model calls, and tool calls report as `session.long_running`; active work with no recent progress reports as `session.stalled`; `session.stuck` is reserved for stale session bookkeeping with no active work. Stale session bookkeeping releases the affected session lane immediately; stalled embedded runs are abort-drained only after an extended no-progress window (the longer of 10 minutes or 5x the warning threshold) so queued work can resume without cutting off merely slow runs. Repeated `session.stuck` diagnostics back off while the session remains unchanged.
- Model idle timeout: OpenClaw aborts a model request when no response chunks arrive before the idle window. `models.providers.<id>.timeoutSeconds` extends this idle watchdog for slow local/self-hosted providers; otherwise OpenClaw uses `agents.defaults.timeoutSeconds` when configured, capped at 120s by default. Cron-triggered runs with no explicit model or agent timeout disable the idle watchdog and rely on the cron outer timeout.
- Provider HTTP request timeout: `models.providers.<id>.timeoutSeconds` applies to that provider's model HTTP fetches, including connect, headers, body, SDK request timeout, total guarded-fetch abort handling, and model stream idle watchdog. Use this for slow local/self-hosted providers such as Ollama before raising the whole agent runtime timeout.

View File

@@ -215,9 +215,11 @@ OpenClaw classifies sessions by the work it can still observe:
- `session.long_running`: active embedded work, model calls, or tool calls are
still making progress.
- `session.stalled`: active work exists, but the active run has not reported
recent progress.
- `session.stuck`: stale session bookkeeping with no active work. This is the
only liveness classification that releases the affected session lane.
recent progress. Stalled embedded runs stay observe-only at first, then
abort-drain once they have gone without progress for the longer of 10 minutes
or 5x `diagnostics.stuckSessionWarnMs`, so queued turns behind the lane can resume.
- `session.stuck`: stale session bookkeeping with no active work. This releases
the affected session lane immediately.
Only `session.stuck` emits the `openclaw.session.stuck` counter, the
`openclaw.session.stuck_age_ms` histogram, and the `openclaw.session.stuck`

View File

@@ -363,6 +363,49 @@ describe("stuck session diagnostics threshold", () => {
expect(recoverStuckSession).not.toHaveBeenCalled();
});
// Verifies that an embedded run which never reports progress is eventually
// handed to the recovery hook with `allowActiveAbort: true`, and that this does
// not happen before the extended stall window has elapsed.
it("aborts and drains embedded runs after an extended no-progress stall", () => {
const events: DiagnosticEventPayload[] = [];
const recoverStuckSession = vi.fn();
const unsubscribe = onDiagnosticEvent((event) => {
events.push(event);
});
try {
startDiagnosticHeartbeat(
{
diagnostics: {
enabled: true,
// 5x this warn threshold is 150s, which is below the 10 minute floor, so
// the floor is the effective abort threshold in this test.
stuckSessionWarnMs: 30_000,
},
},
{ recoverStuckSession },
);
// Put the session into `processing` with an active embedded run and then
// report no further progress.
logSessionStateChange({ sessionId: "s1", sessionKey: "main", state: "processing" });
markDiagnosticEmbeddedRunStarted({ sessionId: "s1", sessionKey: "main" });
// 9 minutes in: still under the 10 minute floor, so no recovery yet.
vi.advanceTimersByTime(9 * 60_000);
expect(recoverStuckSession).not.toHaveBeenCalled();
// 11 minutes in total: past the floor, so recovery should have been requested.
vi.advanceTimersByTime(2 * 60_000);
} finally {
// Always detach the listener so later tests do not receive these events.
unsubscribe();
}
// The session must still be reported as stalled (not stuck) throughout.
const stalledEvents = events.filter((event) => event.type === "session.stalled");
expect(stalledEvents.length).toBeGreaterThan(0);
expect(stalledEvents.at(-1)).toMatchObject({
classification: "stalled_agent_run",
reason: "active_work_without_progress",
activeWorkKind: "embedded_run",
});
// Recovery is invoked with the active-abort opt-in set.
expect(recoverStuckSession).toHaveBeenCalledWith({
sessionId: "s1",
sessionKey: "main",
ageMs: expect.any(Number),
queueDepth: 0,
allowActiveAbort: true,
});
});
it("reports long-running sessions separately when active work is making progress", () => {
const events: DiagnosticEventPayload[] = [];
const recoverStuckSession = vi.fn();

View File

@@ -54,6 +54,8 @@ const webhookStats = {
const DEFAULT_STUCK_SESSION_WARN_MS = 120_000;
const MIN_STUCK_SESSION_WARN_MS = 1_000;
const MAX_STUCK_SESSION_WARN_MS = 24 * 60 * 60 * 1000;
// Floor (10 minutes) for how long a stalled embedded run must go without
// progress before it becomes eligible for abort-drain recovery.
const MIN_STALLED_EMBEDDED_RUN_ABORT_MS = 10 * 60_000;
// The abort threshold also scales with the stuck-session warn threshold; the
// larger of this multiple and the floor above is used.
const STALLED_EMBEDDED_RUN_ABORT_WARN_MULTIPLIER = 5;
const RECENT_DIAGNOSTIC_ACTIVITY_MS = 120_000;
const DEFAULT_LIVENESS_EVENT_LOOP_DELAY_WARN_MS = 1_000;
const DEFAULT_LIVENESS_EVENT_LOOP_UTILIZATION_WARN = 0.95;
@@ -82,6 +84,7 @@ type RecoverStuckSession = (params: {
sessionKey?: string;
ageMs: number;
queueDepth?: number;
allowActiveAbort?: boolean;
}) => void | Promise<void>;
type DiagnosticLivenessSample = {
@@ -125,6 +128,7 @@ function recoverStuckSession(params: {
sessionKey?: string;
ageMs: number;
queueDepth?: number;
allowActiveAbort?: boolean;
}) {
stuckSessionRecoveryRuntimePromise ??= import("./diagnostic-stuck-session-recovery.runtime.js");
void stuckSessionRecoveryRuntimePromise
@@ -344,6 +348,26 @@ export function resolveStuckSessionWarnMs(config?: OpenClawConfig): number {
return rounded;
}
/**
 * Resolves how long a stalled embedded run may go without progress before it
 * becomes eligible for abort-drain recovery.
 *
 * @param stuckSessionWarnMs - The resolved stuck-session warn threshold, in milliseconds.
 * @returns The larger of the fixed minimum and the warn threshold scaled by the
 *   abort multiplier, in milliseconds.
 */
function resolveStalledEmbeddedRunAbortMs(stuckSessionWarnMs: number): number {
  const scaledWarnMs = stuckSessionWarnMs * STALLED_EMBEDDED_RUN_ABORT_WARN_MULTIPLIER;
  return Math.max(MIN_STALLED_EMBEDDED_RUN_ABORT_MS, scaledWarnMs);
}
/**
 * Decides whether a session should be handed to abort-drain recovery because
 * its embedded run has stalled for long enough.
 *
 * @param params.classification - The attention classification for the session, if any.
 * @param params.ageMs - The session age compared against the abort threshold, in milliseconds.
 * @param params.stuckSessionWarnMs - The stuck-session warn threshold used to derive the abort threshold.
 * @returns `true` only for a `session.stalled` / `stalled_agent_run` classification
 *   whose active work is an embedded run and whose age has reached the abort threshold.
 */
function isStalledEmbeddedRunRecoveryEligible(params: {
  classification: SessionAttentionClassification | undefined;
  ageMs: number;
  stuckSessionWarnMs: number;
}): boolean {
  const { classification, ageMs, stuckSessionWarnMs } = params;
  // Guard clauses mirror the original short-circuit order, so the threshold is
  // only resolved once every classification check has passed.
  if (classification?.eventType !== "session.stalled") {
    return false;
  }
  if (classification.classification !== "stalled_agent_run") {
    return false;
  }
  if (classification.activeWorkKind !== "embedded_run") {
    return false;
  }
  return ageMs >= resolveStalledEmbeddedRunAbortMs(stuckSessionWarnMs);
}
export function logWebhookReceived(params: {
channel: string;
updateType?: string;
@@ -594,6 +618,13 @@ export function logSessionAttention(
activity,
staleMs: params.thresholdMs,
});
const recoveryEligible =
classification.recoveryEligible ||
isStalledEmbeddedRunRecoveryEligible({
classification,
ageMs: params.ageMs,
stuckSessionWarnMs: params.thresholdMs,
});
if (classification.eventType === "session.stuck") {
const nextWarnAgeMs =
state.lastStuckWarnAgeMs === undefined
@@ -617,7 +648,7 @@ export function logSessionAttention(
state.queueDepth
} reason=${classification.reason} classification=${classification.classification}${
classification.activeWorkKind ? ` activeWorkKind=${classification.activeWorkKind}` : ""
} recovery=${classification.recoveryEligible ? "checking" : "none"}`,
} recovery=${recoveryEligible ? "checking" : "none"}`,
);
const baseEvent = {
sessionId: state.sessionId,
@@ -816,6 +847,20 @@ export function startDiagnosticHeartbeat(
ageMs,
queueDepth: state.queueDepth,
});
} else if (
isStalledEmbeddedRunRecoveryEligible({
classification,
ageMs,
stuckSessionWarnMs,
})
) {
void (opts?.recoverStuckSession ?? recoverStuckSession)({
sessionId: state.sessionId,
sessionKey: state.sessionKey,
ageMs,
queueDepth: state.queueDepth,
allowActiveAbort: true,
});
}
}
}