diff --git a/CHANGELOG.md b/CHANGELOG.md index 412cd89bbc0..6a0dc2b1df4 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -34,6 +34,7 @@ Docs: https://docs.openclaw.ai - CLI/doctor: trust a ready gateway memory probe when CLI-side active memory backend resolution is unavailable, preventing false "No active memory plugin is registered" warnings for healthy runtime setups. Fixes #76792. Thanks @som-686. - Memory/status: keep plain `openclaw memory status` and `openclaw memory status --json` on the cheap read-only path by reserving vector and embedding provider probes for `--deep` or `--index`. Fixes #76769. Thanks @daruire. - Telegram: suppress stale same-session replies when a newer accepted message arrives before an older in-flight Telegram dispatch finalizes. Fixes #76642. Thanks @chinar-amrutkar. +- Gateway/diagnostics: abort-drain embedded runs after an extended no-progress stall so a single dead session no longer leaves queued Discord/channel turns blocked behind repeated `recovery=none` liveness warnings. - Control UI/Sessions: avoid full `sessions.list` reloads for chat-turn `sessions.changed` payloads, so large session stores no longer add multi-second delays while chat responses are being delivered. (#76676) Thanks @VACInc. - Gateway/watch: run `doctor --fix --non-interactive` once and retry when the dev Gateway child exits during startup, so stale local plugin install/config state does not leave the tmux watch session disappearing without a repair attempt. - Doctor/Telegram: warn when selected Telegram quote replies can suppress `streaming.preview.toolProgress`, and document the `replyToMode` trade-off without changing runtime delivery. Fixes #73487. Thanks @GodsBoy. 
diff --git a/docs/concepts/agent-loop.md b/docs/concepts/agent-loop.md index a81a0ba3a28..9c6584f4d8e 100644 --- a/docs/concepts/agent-loop.md +++ b/docs/concepts/agent-loop.md @@ -165,7 +165,7 @@ surfaces, while Codex native hooks remain a separate lower-level Codex mechanism - `agent.wait` default: 30s (just the wait). `timeoutMs` param overrides. - Agent runtime: `agents.defaults.timeoutSeconds` default 172800s (48 hours); enforced in `runEmbeddedPiAgent` abort timer. - Cron runtime: isolated agent-turn `timeoutSeconds` is owned by cron. The scheduler starts that timer when execution begins, aborts the underlying run at the configured deadline, then runs bounded cleanup before recording the timeout so a stale child session cannot keep the lane stuck. -- Session liveness diagnostics: with diagnostics enabled, `diagnostics.stuckSessionWarnMs` classifies long `processing` sessions that have no observed reply, tool, status, block, or ACP progress. Active embedded runs, model calls, and tool calls report as `session.long_running`; active work with no recent progress reports as `session.stalled`; `session.stuck` is reserved for stale session bookkeeping with no active work, and only that path releases the affected session lane so queued startup work can drain. Repeated `session.stuck` diagnostics back off while the session remains unchanged. +- Session liveness diagnostics: with diagnostics enabled, `diagnostics.stuckSessionWarnMs` classifies long `processing` sessions that have no observed reply, tool, status, block, or ACP progress. Active embedded runs, model calls, and tool calls report as `session.long_running`; active work with no recent progress reports as `session.stalled`; `session.stuck` is reserved for stale session bookkeeping with no active work. 
Stale session bookkeeping releases the affected session lane immediately; stalled embedded runs are abort-drained only after an extended no-progress window (the longer of 10 minutes or 5x the warning threshold) so queued work can resume without cutting off merely slow runs. Repeated `session.stuck` diagnostics back off while the session remains unchanged. - Model idle timeout: OpenClaw aborts a model request when no response chunks arrive before the idle window. `models.providers.<id>.timeoutSeconds` extends this idle watchdog for slow local/self-hosted providers; otherwise OpenClaw uses `agents.defaults.timeoutSeconds` when configured, capped at 120s by default. Cron-triggered runs with no explicit model or agent timeout disable the idle watchdog and rely on the cron outer timeout. - Provider HTTP request timeout: `models.providers.<id>.timeoutSeconds` applies to that provider's model HTTP fetches, including connect, headers, body, SDK request timeout, total guarded-fetch abort handling, and model stream idle watchdog. Use this for slow local/self-hosted providers such as Ollama before raising the whole agent runtime timeout. diff --git a/docs/gateway/opentelemetry.md b/docs/gateway/opentelemetry.md index f9887472559..758cbf2cb38 100644 --- a/docs/gateway/opentelemetry.md +++ b/docs/gateway/opentelemetry.md @@ -215,9 +215,11 @@ OpenClaw classifies sessions by the work it can still observe: - `session.long_running`: active embedded work, model calls, or tool calls are still making progress. - `session.stalled`: active work exists, but the active run has not reported - recent progress. -- `session.stuck`: stale session bookkeeping with no active work. This is the - only liveness classification that releases the affected session lane. + recent progress. Stalled embedded runs stay observe-only at first, then + abort-drain once they show no progress for the longer of 10 minutes or 5x + `diagnostics.stuckSessionWarnMs`, so queued turns behind the lane can resume. 
+- `session.stuck`: stale session bookkeeping with no active work. This releases + the affected session lane immediately. Only `session.stuck` emits the `openclaw.session.stuck` counter, the `openclaw.session.stuck_age_ms` histogram, and the `openclaw.session.stuck` diff --git a/src/logging/diagnostic.test.ts b/src/logging/diagnostic.test.ts index 831b8cc6c50..fdbd26e0eb5 100644 --- a/src/logging/diagnostic.test.ts +++ b/src/logging/diagnostic.test.ts @@ -363,6 +363,49 @@ describe("stuck session diagnostics threshold", () => { expect(recoverStuckSession).not.toHaveBeenCalled(); }); + it("aborts and drains embedded runs after an extended no-progress stall", () => { + const events: DiagnosticEventPayload[] = []; + const recoverStuckSession = vi.fn(); + const unsubscribe = onDiagnosticEvent((event) => { + events.push(event); + }); + try { + startDiagnosticHeartbeat( + { + diagnostics: { + enabled: true, + stuckSessionWarnMs: 30_000, + }, + }, + { recoverStuckSession }, + ); + logSessionStateChange({ sessionId: "s1", sessionKey: "main", state: "processing" }); + markDiagnosticEmbeddedRunStarted({ sessionId: "s1", sessionKey: "main" }); + + vi.advanceTimersByTime(9 * 60_000); + expect(recoverStuckSession).not.toHaveBeenCalled(); + + vi.advanceTimersByTime(2 * 60_000); + } finally { + unsubscribe(); + } + + const stalledEvents = events.filter((event) => event.type === "session.stalled"); + expect(stalledEvents.length).toBeGreaterThan(0); + expect(stalledEvents.at(-1)).toMatchObject({ + classification: "stalled_agent_run", + reason: "active_work_without_progress", + activeWorkKind: "embedded_run", + }); + expect(recoverStuckSession).toHaveBeenCalledWith({ + sessionId: "s1", + sessionKey: "main", + ageMs: expect.any(Number), + queueDepth: 0, + allowActiveAbort: true, + }); + }); + it("reports long-running sessions separately when active work is making progress", () => { const events: DiagnosticEventPayload[] = []; const recoverStuckSession = vi.fn(); diff --git 
a/src/logging/diagnostic.ts b/src/logging/diagnostic.ts index 0898dbf294f..623326182fd 100644 --- a/src/logging/diagnostic.ts +++ b/src/logging/diagnostic.ts @@ -54,6 +54,8 @@ const webhookStats = { const DEFAULT_STUCK_SESSION_WARN_MS = 120_000; const MIN_STUCK_SESSION_WARN_MS = 1_000; const MAX_STUCK_SESSION_WARN_MS = 24 * 60 * 60 * 1000; +const MIN_STALLED_EMBEDDED_RUN_ABORT_MS = 10 * 60_000; +const STALLED_EMBEDDED_RUN_ABORT_WARN_MULTIPLIER = 5; const RECENT_DIAGNOSTIC_ACTIVITY_MS = 120_000; const DEFAULT_LIVENESS_EVENT_LOOP_DELAY_WARN_MS = 1_000; const DEFAULT_LIVENESS_EVENT_LOOP_UTILIZATION_WARN = 0.95; @@ -82,6 +84,7 @@ type RecoverStuckSession = (params: { sessionKey?: string; ageMs: number; queueDepth?: number; + allowActiveAbort?: boolean; }) => void | Promise; type DiagnosticLivenessSample = { @@ -125,6 +128,7 @@ function recoverStuckSession(params: { sessionKey?: string; ageMs: number; queueDepth?: number; + allowActiveAbort?: boolean; }) { stuckSessionRecoveryRuntimePromise ??= import("./diagnostic-stuck-session-recovery.runtime.js"); void stuckSessionRecoveryRuntimePromise @@ -344,6 +348,26 @@ export function resolveStuckSessionWarnMs(config?: OpenClawConfig): number { return rounded; } +function resolveStalledEmbeddedRunAbortMs(stuckSessionWarnMs: number): number { + return Math.max( + MIN_STALLED_EMBEDDED_RUN_ABORT_MS, + stuckSessionWarnMs * STALLED_EMBEDDED_RUN_ABORT_WARN_MULTIPLIER, + ); +} + +function isStalledEmbeddedRunRecoveryEligible(params: { + classification: SessionAttentionClassification | undefined; + ageMs: number; + stuckSessionWarnMs: number; +}): boolean { + return ( + params.classification?.eventType === "session.stalled" && + params.classification.classification === "stalled_agent_run" && + params.classification.activeWorkKind === "embedded_run" && + params.ageMs >= resolveStalledEmbeddedRunAbortMs(params.stuckSessionWarnMs) + ); +} + export function logWebhookReceived(params: { channel: string; updateType?: string; @@ -594,6 
+618,13 @@ export function logSessionAttention( activity, staleMs: params.thresholdMs, }); + const recoveryEligible = + classification.recoveryEligible || + isStalledEmbeddedRunRecoveryEligible({ + classification, + ageMs: params.ageMs, + stuckSessionWarnMs: params.thresholdMs, + }); if (classification.eventType === "session.stuck") { const nextWarnAgeMs = state.lastStuckWarnAgeMs === undefined @@ -617,7 +648,7 @@ export function logSessionAttention( state.queueDepth } reason=${classification.reason} classification=${classification.classification}${ classification.activeWorkKind ? ` activeWorkKind=${classification.activeWorkKind}` : "" - } recovery=${classification.recoveryEligible ? "checking" : "none"}`, + } recovery=${recoveryEligible ? "checking" : "none"}`, ); const baseEvent = { sessionId: state.sessionId, @@ -816,6 +847,20 @@ export function startDiagnosticHeartbeat( ageMs, queueDepth: state.queueDepth, }); + } else if ( + isStalledEmbeddedRunRecoveryEligible({ + classification, + ageMs, + stuckSessionWarnMs, + }) + ) { + void (opts?.recoverStuckSession ?? recoverStuckSession)({ + sessionId: state.sessionId, + sessionKey: state.sessionKey, + ageMs, + queueDepth: state.queueDepth, + allowActiveAbort: true, + }); } } }