fix: recover stalled embedded diagnostic runs

2026-05-06 16:30:57 +00:00 · 2026-05-03 18:12:50 +01:00
parent 2416bc668c
commit 9a22473916
5 changed files with 96 additions and 5 deletions
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -34,6 +34,7 @@ Docs: https://docs.openclaw.ai
 - CLI/doctor: trust a ready gateway memory probe when CLI-side active memory backend resolution is unavailable, preventing false "No active memory plugin is registered" warnings for healthy runtime setups. Fixes #76792. Thanks @som-686.
 - Memory/status: keep plain `openclaw memory status` and `openclaw memory status --json` on the cheap read-only path by reserving vector and embedding provider probes for `--deep` or `--index`. Fixes #76769. Thanks @daruire.
 - Telegram: suppress stale same-session replies when a newer accepted message arrives before an older in-flight Telegram dispatch finalizes. Fixes #76642. Thanks @chinar-amrutkar.
+- Gateway/diagnostics: abort-drain embedded runs after an extended no-progress stall so a single dead session no longer leaves queued Discord/channel turns blocked behind repeated `recovery=none` liveness warnings.
 - Control UI/Sessions: avoid full `sessions.list` reloads for chat-turn `sessions.changed` payloads, so large session stores no longer add multi-second delays while chat responses are being delivered. (#76676) Thanks @VACInc.
 - Gateway/watch: run `doctor --fix --non-interactive` once and retry when the dev Gateway child exits during startup, so stale local plugin install/config state does not leave the tmux watch session disappearing without a repair attempt.
 - Doctor/Telegram: warn when selected Telegram quote replies can suppress `streaming.preview.toolProgress`, and document the `replyToMode` trade-off without changing runtime delivery. Fixes #73487. Thanks @GodsBoy.
--- a/docs/concepts/agent-loop.md
+++ b/docs/concepts/agent-loop.md
@@ -165,7 +165,7 @@ surfaces, while Codex native hooks remain a separate lower-level Codex mechanism
 - `agent.wait` default: 30s (just the wait). `timeoutMs` param overrides.
 - Agent runtime: `agents.defaults.timeoutSeconds` default 172800s (48 hours); enforced in `runEmbeddedPiAgent` abort timer.
 - Cron runtime: isolated agent-turn `timeoutSeconds` is owned by cron. The scheduler starts that timer when execution begins, aborts the underlying run at the configured deadline, then runs bounded cleanup before recording the timeout so a stale child session cannot keep the lane stuck.
- Session liveness diagnostics: with diagnostics enabled, `diagnostics.stuckSessionWarnMs` classifies long `processing` sessions that have no observed reply, tool, status, block, or ACP progress. Active embedded runs, model calls, and tool calls report as `session.long_running`; active work with no recent progress reports as `session.stalled`; `session.stuck` is reserved for stale session bookkeeping with no active work, and only that path releases the affected session lane so queued startup work can drain. Repeated `session.stuck` diagnostics back off while the session remains unchanged.
+- Session liveness diagnostics: with diagnostics enabled, `diagnostics.stuckSessionWarnMs` classifies long `processing` sessions that have no observed reply, tool, status, block, or ACP progress. Active embedded runs, model calls, and tool calls report as `session.long_running`; active work with no recent progress reports as `session.stalled`; `session.stuck` is reserved for stale session bookkeeping with no active work. A `session.stuck` classification releases the affected session lane immediately; stalled embedded runs are abort-drained only after an extended no-progress window (the longer of 10 minutes and 5x the warning threshold) so queued work can resume without cutting off merely slow runs. Repeated `session.stuck` diagnostics back off while the session remains unchanged.
 - Model idle timeout: OpenClaw aborts a model request when no response chunks arrive before the idle window. `models.providers.<id>.timeoutSeconds` extends this idle watchdog for slow local/self-hosted providers; otherwise OpenClaw uses `agents.defaults.timeoutSeconds` when configured, capped at 120s by default. Cron-triggered runs with no explicit model or agent timeout disable the idle watchdog and rely on the cron outer timeout.
 - Provider HTTP request timeout: `models.providers.<id>.timeoutSeconds` applies to that provider's model HTTP fetches, including connect, headers, body, SDK request timeout, total guarded-fetch abort handling, and model stream idle watchdog. Use this for slow local/self-hosted providers such as Ollama before raising the whole agent runtime timeout.

--- a/docs/gateway/opentelemetry.md
+++ b/docs/gateway/opentelemetry.md
@@ -215,9 +215,11 @@ OpenClaw classifies sessions by the work it can still observe:
 - `session.long_running`: active embedded work, model calls, or tool calls are
  still making progress.
 - `session.stalled`: active work exists, but the active run has not reported
-  recent progress.
- `session.stuck`: stale session bookkeeping with no active work. This is the
-  only liveness classification that releases the affected session lane.
+  recent progress. Stalled embedded runs stay observe-only at first, then are
+  abort-drained once they have made no progress for the longer of 10 minutes
+  and 5x `diagnostics.stuckSessionWarnMs`, so queued turns in the lane can resume.
+- `session.stuck`: stale session bookkeeping with no active work. This releases
+  the affected session lane immediately.

 Only `session.stuck` emits the `openclaw.session.stuck` counter, the
 `openclaw.session.stuck_age_ms` histogram, and the `openclaw.session.stuck`
--- a/src/logging/diagnostic.test.ts
+++ b/src/logging/diagnostic.test.ts
@@ -363,6 +363,49 @@ describe("stuck session diagnostics threshold", () => {
    expect(recoverStuckSession).not.toHaveBeenCalled();
  });

+  it("aborts and drains embedded runs after an extended no-progress stall", () => {
+    const events: DiagnosticEventPayload[] = [];
+    const recoverStuckSession = vi.fn();
+    const unsubscribe = onDiagnosticEvent((event) => {
+      events.push(event);
+    });
+    try {
+      startDiagnosticHeartbeat(
+        {
+          diagnostics: {
+            enabled: true,
+            stuckSessionWarnMs: 30_000, // 5x this is 150s, so the 10-minute floor governs
+          },
+        },
+        { recoverStuckSession },
+      );
+      logSessionStateChange({ sessionId: "s1", sessionKey: "main", state: "processing" });
+      markDiagnosticEmbeddedRunStarted({ sessionId: "s1", sessionKey: "main" });
+
+      vi.advanceTimersByTime(9 * 60_000); // still short of the 10-minute abort floor
+      expect(recoverStuckSession).not.toHaveBeenCalled();
+
+      vi.advanceTimersByTime(2 * 60_000); // 11 minutes total: past the abort floor
+    } finally {
+      unsubscribe(); // always detach the listener, even if an assertion above throws
+    }
+
+    const stalledEvents = events.filter((event) => event.type === "session.stalled");
+    expect(stalledEvents.length).toBeGreaterThan(0);
+    expect(stalledEvents.at(-1)).toMatchObject({
+      classification: "stalled_agent_run",
+      reason: "active_work_without_progress",
+      activeWorkKind: "embedded_run",
+    });
+    expect(recoverStuckSession).toHaveBeenCalledWith({
+      sessionId: "s1",
+      sessionKey: "main",
+      ageMs: expect.any(Number),
+      queueDepth: 0,
+      allowActiveAbort: true,
+    });
+  });
+
  it("reports long-running sessions separately when active work is making progress", () => {
    const events: DiagnosticEventPayload[] = [];
    const recoverStuckSession = vi.fn();
--- a/src/logging/diagnostic.ts
+++ b/src/logging/diagnostic.ts
@@ -54,6 +54,8 @@ const webhookStats = {
 const DEFAULT_STUCK_SESSION_WARN_MS = 120_000;
 const MIN_STUCK_SESSION_WARN_MS = 1_000;
 const MAX_STUCK_SESSION_WARN_MS = 24 * 60 * 60 * 1000;
+const MIN_STALLED_EMBEDDED_RUN_ABORT_MS = 10 * 60_000; // 10-minute floor for the abort window
+const STALLED_EMBEDDED_RUN_ABORT_WARN_MULTIPLIER = 5; // abort window = warn threshold x 5
 const RECENT_DIAGNOSTIC_ACTIVITY_MS = 120_000;
 const DEFAULT_LIVENESS_EVENT_LOOP_DELAY_WARN_MS = 1_000;
 const DEFAULT_LIVENESS_EVENT_LOOP_UTILIZATION_WARN = 0.95;
@@ -82,6 +84,7 @@ type RecoverStuckSession = (params: {
  sessionKey?: string;
  ageMs: number;
  queueDepth?: number;
+  allowActiveAbort?: boolean;
 }) => void | Promise<void>;

 type DiagnosticLivenessSample = {
@@ -125,6 +128,7 @@ function recoverStuckSession(params: {
  sessionKey?: string;
  ageMs: number;
  queueDepth?: number;
+  allowActiveAbort?: boolean;
 }) {
  stuckSessionRecoveryRuntimePromise ??= import("./diagnostic-stuck-session-recovery.runtime.js");
  void stuckSessionRecoveryRuntimePromise
@@ -344,6 +348,26 @@ export function resolveStuckSessionWarnMs(config?: OpenClawConfig): number {
  return rounded;
 }

+function resolveStalledEmbeddedRunAbortMs(stuckSessionWarnMs: number): number {
+  // The abort window tracks the warn threshold but never drops below the
+  // fixed minimum, so short warn thresholds cannot trigger early aborts.
+  const scaledWarnMs = stuckSessionWarnMs * STALLED_EMBEDDED_RUN_ABORT_WARN_MULTIPLIER;
+  return Math.max(MIN_STALLED_EMBEDDED_RUN_ABORT_MS, scaledWarnMs);
+}
+
+function isStalledEmbeddedRunRecoveryEligible(params: {
+  classification: SessionAttentionClassification | undefined;
+  ageMs: number;
+  stuckSessionWarnMs: number;
+}): boolean {
+  const { classification: verdict, ageMs, stuckSessionWarnMs } = params;
+  const isEmbeddedRunStall =
+    verdict?.eventType === "session.stalled" &&
+    verdict.classification === "stalled_agent_run" &&
+    verdict.activeWorkKind === "embedded_run";
+  return isEmbeddedRunStall && ageMs >= resolveStalledEmbeddedRunAbortMs(stuckSessionWarnMs);
+}
+
 export function logWebhookReceived(params: {
  channel: string;
  updateType?: string;
@@ -594,6 +618,13 @@ export function logSessionAttention(
    activity,
    staleMs: params.thresholdMs,
  });
+  const recoveryEligible =
+    classification.recoveryEligible ||
+    isStalledEmbeddedRunRecoveryEligible({
+      classification,
+      ageMs: params.ageMs,
+      stuckSessionWarnMs: params.thresholdMs,
+    });
  if (classification.eventType === "session.stuck") {
    const nextWarnAgeMs =
      state.lastStuckWarnAgeMs === undefined
@@ -617,7 +648,7 @@ export function logSessionAttention(
      state.queueDepth
    } reason=${classification.reason} classification=${classification.classification}${
      classification.activeWorkKind ? ` activeWorkKind=${classification.activeWorkKind}` : ""
-    } recovery=${classification.recoveryEligible ? "checking" : "none"}`,
+    } recovery=${recoveryEligible ? "checking" : "none"}`,
  );
  const baseEvent = {
    sessionId: state.sessionId,
@@ -816,6 +847,20 @@ export function startDiagnosticHeartbeat(
            ageMs,
            queueDepth: state.queueDepth,
          });
+        } else if (
+          isStalledEmbeddedRunRecoveryEligible({
+            classification,
+            ageMs,
+            stuckSessionWarnMs,
+          })
+        ) {
+          void (opts?.recoverStuckSession ?? recoverStuckSession)({
+            sessionId: state.sessionId,
+            sessionKey: state.sessionKey,
+            ageMs,
+            queueDepth: state.queueDepth,
+            allowActiveAbort: true,
+          });
        }
      }
    }