mirror of
https://github.com/openclaw/openclaw.git
synced 2026-05-06 16:30:57 +00:00
fix: recover stalled embedded diagnostic runs
This commit is contained in:
@@ -34,6 +34,7 @@ Docs: https://docs.openclaw.ai
|
||||
- CLI/doctor: trust a ready gateway memory probe when CLI-side active memory backend resolution is unavailable, preventing false "No active memory plugin is registered" warnings for healthy runtime setups. Fixes #76792. Thanks @som-686.
|
||||
- Memory/status: keep plain `openclaw memory status` and `openclaw memory status --json` on the cheap read-only path by reserving vector and embedding provider probes for `--deep` or `--index`. Fixes #76769. Thanks @daruire.
|
||||
- Telegram: suppress stale same-session replies when a newer accepted message arrives before an older in-flight Telegram dispatch finalizes. Fixes #76642. Thanks @chinar-amrutkar.
|
||||
- Gateway/diagnostics: abort-drain embedded runs after an extended no-progress stall so a single dead session no longer leaves queued Discord/channel turns blocked behind repeated `recovery=none` liveness warnings.
|
||||
- Control UI/Sessions: avoid full `sessions.list` reloads for chat-turn `sessions.changed` payloads, so large session stores no longer add multi-second delays while chat responses are being delivered. (#76676) Thanks @VACInc.
|
||||
- Gateway/watch: run `doctor --fix --non-interactive` once and retry when the dev Gateway child exits during startup, so stale local plugin install/config state does not leave the tmux watch session disappearing without a repair attempt.
|
||||
- Doctor/Telegram: warn when selected Telegram quote replies can suppress `streaming.preview.toolProgress`, and document the `replyToMode` trade-off without changing runtime delivery. Fixes #73487. Thanks @GodsBoy.
|
||||
|
||||
@@ -165,7 +165,7 @@ surfaces, while Codex native hooks remain a separate lower-level Codex mechanism
|
||||
- `agent.wait` default: 30s (just the wait). `timeoutMs` param overrides.
|
||||
- Agent runtime: `agents.defaults.timeoutSeconds` default 172800s (48 hours); enforced in `runEmbeddedPiAgent` abort timer.
|
||||
- Cron runtime: isolated agent-turn `timeoutSeconds` is owned by cron. The scheduler starts that timer when execution begins, aborts the underlying run at the configured deadline, then runs bounded cleanup before recording the timeout so a stale child session cannot keep the lane stuck.
|
||||
- Session liveness diagnostics: with diagnostics enabled, `diagnostics.stuckSessionWarnMs` classifies long `processing` sessions that have no observed reply, tool, status, block, or ACP progress. Active embedded runs, model calls, and tool calls report as `session.long_running`; active work with no recent progress reports as `session.stalled`; `session.stuck` is reserved for stale session bookkeeping with no active work, and only that path releases the affected session lane so queued startup work can drain. Repeated `session.stuck` diagnostics back off while the session remains unchanged.
|
||||
- Session liveness diagnostics: with diagnostics enabled, `diagnostics.stuckSessionWarnMs` classifies long `processing` sessions that have no observed reply, tool, status, block, or ACP progress. Active embedded runs, model calls, and tool calls report as `session.long_running`; active work with no recent progress reports as `session.stalled`; `session.stuck` is reserved for stale session bookkeeping with no active work. Stale session bookkeeping releases the affected session lane immediately; stalled embedded runs are abort-drained only after an extended no-progress window (at least 10 minutes and 5x the warning threshold) so queued work can resume without cutting off merely slow runs. Repeated `session.stuck` diagnostics back off while the session remains unchanged.
|
||||
- Model idle timeout: OpenClaw aborts a model request when no response chunks arrive before the idle window. `models.providers.<id>.timeoutSeconds` extends this idle watchdog for slow local/self-hosted providers; otherwise OpenClaw uses `agents.defaults.timeoutSeconds` when configured, capped at 120s by default. Cron-triggered runs with no explicit model or agent timeout disable the idle watchdog and rely on the cron outer timeout.
|
||||
- Provider HTTP request timeout: `models.providers.<id>.timeoutSeconds` applies to that provider's model HTTP fetches, including connect, headers, body, SDK request timeout, total guarded-fetch abort handling, and model stream idle watchdog. Use this for slow local/self-hosted providers such as Ollama before raising the whole agent runtime timeout.
|
||||
|
||||
|
||||
@@ -215,9 +215,11 @@ OpenClaw classifies sessions by the work it can still observe:
|
||||
- `session.long_running`: active embedded work, model calls, or tool calls are
|
||||
still making progress.
|
||||
- `session.stalled`: active work exists, but the active run has not reported
|
||||
recent progress.
|
||||
- `session.stuck`: stale session bookkeeping with no active work. This is the
|
||||
only liveness classification that releases the affected session lane.
|
||||
recent progress. Stalled embedded runs stay observe-only at first, then
|
||||
abort-drain after at least 10 minutes and 5x `diagnostics.stuckSessionWarnMs`
|
||||
with no progress so queued turns behind the lane can resume.
|
||||
- `session.stuck`: stale session bookkeeping with no active work. This releases
|
||||
the affected session lane immediately.
|
||||
|
||||
Only `session.stuck` emits the `openclaw.session.stuck` counter, the
|
||||
`openclaw.session.stuck_age_ms` histogram, and the `openclaw.session.stuck`
|
||||
|
||||
@@ -363,6 +363,49 @@ describe("stuck session diagnostics threshold", () => {
|
||||
expect(recoverStuckSession).not.toHaveBeenCalled();
|
||||
});
|
||||
|
||||
it("aborts and drains embedded runs after an extended no-progress stall", () => {
  // Session identity shared by the setup calls and the final recovery assertion.
  const session = { sessionId: "s1", sessionKey: "main" };
  const captured: DiagnosticEventPayload[] = [];
  const recoverySpy = vi.fn();
  const stopListening = onDiagnosticEvent((payload) => {
    captured.push(payload);
  });

  try {
    startDiagnosticHeartbeat(
      { diagnostics: { enabled: true, stuckSessionWarnMs: 30_000 } },
      { recoverStuckSession: recoverySpy },
    );
    logSessionStateChange({ ...session, state: "processing" });
    markDiagnosticEmbeddedRunStarted({ ...session });

    // Nine minutes without progress: recovery must not have been requested yet.
    vi.advanceTimersByTime(9 * 60_000);
    expect(recoverySpy).not.toHaveBeenCalled();

    // Two more minutes (eleven in total): recovery is expected after this point.
    vi.advanceTimersByTime(2 * 60_000);
  } finally {
    stopListening();
  }

  const stalled = captured.filter((payload) => payload.type === "session.stalled");
  expect(stalled.length).toBeGreaterThan(0);

  const lastStalled = stalled[stalled.length - 1];
  expect(lastStalled).toMatchObject({
    classification: "stalled_agent_run",
    reason: "active_work_without_progress",
    activeWorkKind: "embedded_run",
  });

  expect(recoverySpy).toHaveBeenCalledWith({
    ...session,
    ageMs: expect.any(Number),
    queueDepth: 0,
    allowActiveAbort: true,
  });
});
|
||||
|
||||
it("reports long-running sessions separately when active work is making progress", () => {
|
||||
const events: DiagnosticEventPayload[] = [];
|
||||
const recoverStuckSession = vi.fn();
|
||||
|
||||
@@ -54,6 +54,8 @@ const webhookStats = {
|
||||
const DEFAULT_STUCK_SESSION_WARN_MS = 120_000;
|
||||
const MIN_STUCK_SESSION_WARN_MS = 1_000;
|
||||
const MAX_STUCK_SESSION_WARN_MS = 24 * 60 * 60 * 1000;
|
||||
// Lower bound, in milliseconds (10 minutes), on the no-progress age required
// before a stalled embedded run becomes eligible for abort-drain recovery.
const MIN_STALLED_EMBEDDED_RUN_ABORT_MS = 10 * 60_000;
|
||||
// Factor applied to the stuck-session warning threshold when computing the
// abort window; the larger of this product and the minimum above is used.
const STALLED_EMBEDDED_RUN_ABORT_WARN_MULTIPLIER = 5;
|
||||
const RECENT_DIAGNOSTIC_ACTIVITY_MS = 120_000;
|
||||
const DEFAULT_LIVENESS_EVENT_LOOP_DELAY_WARN_MS = 1_000;
|
||||
const DEFAULT_LIVENESS_EVENT_LOOP_UTILIZATION_WARN = 0.95;
|
||||
@@ -82,6 +84,7 @@ type RecoverStuckSession = (params: {
|
||||
sessionKey?: string;
|
||||
ageMs: number;
|
||||
queueDepth?: number;
|
||||
allowActiveAbort?: boolean;
|
||||
}) => void | Promise<void>;
|
||||
|
||||
type DiagnosticLivenessSample = {
|
||||
@@ -125,6 +128,7 @@ function recoverStuckSession(params: {
|
||||
sessionKey?: string;
|
||||
ageMs: number;
|
||||
queueDepth?: number;
|
||||
allowActiveAbort?: boolean;
|
||||
}) {
|
||||
stuckSessionRecoveryRuntimePromise ??= import("./diagnostic-stuck-session-recovery.runtime.js");
|
||||
void stuckSessionRecoveryRuntimePromise
|
||||
@@ -344,6 +348,26 @@ export function resolveStuckSessionWarnMs(config?: OpenClawConfig): number {
|
||||
return rounded;
|
||||
}
|
||||
|
||||
/**
 * Computes the no-progress age, in milliseconds, at which a stalled embedded
 * run may be aborted.
 *
 * @param stuckSessionWarnMs - The stuck-session warning threshold in milliseconds.
 * @returns The larger of the fixed minimum abort window and the warning
 *   threshold scaled by the abort multiplier.
 */
function resolveStalledEmbeddedRunAbortMs(stuckSessionWarnMs: number): number {
  const scaledWarnMs = stuckSessionWarnMs * STALLED_EMBEDDED_RUN_ABORT_WARN_MULTIPLIER;
  return Math.max(MIN_STALLED_EMBEDDED_RUN_ABORT_MS, scaledWarnMs);
}
|
||||
|
||||
/**
 * Decides whether a session classification describes a stalled embedded run
 * that has been without progress long enough to be recovered.
 *
 * @param params.classification - The session attention classification, if any.
 * @param params.ageMs - The age to compare against the abort window, in milliseconds.
 * @param params.stuckSessionWarnMs - The warning threshold the abort window is derived from.
 * @returns `true` only for a `session.stalled` / `stalled_agent_run` classification
 *   whose active work is an `embedded_run` and whose age has reached the abort window.
 */
function isStalledEmbeddedRunRecoveryEligible(params: {
  classification: SessionAttentionClassification | undefined;
  ageMs: number;
  stuckSessionWarnMs: number;
}): boolean {
  const { classification, ageMs, stuckSessionWarnMs } = params;

  // A missing classification, or anything other than a stalled session, is never eligible.
  if (classification?.eventType !== "session.stalled") {
    return false;
  }
  if (classification.classification !== "stalled_agent_run") {
    return false;
  }
  if (classification.activeWorkKind !== "embedded_run") {
    return false;
  }

  return ageMs >= resolveStalledEmbeddedRunAbortMs(stuckSessionWarnMs);
}
|
||||
|
||||
export function logWebhookReceived(params: {
|
||||
channel: string;
|
||||
updateType?: string;
|
||||
@@ -594,6 +618,13 @@ export function logSessionAttention(
|
||||
activity,
|
||||
staleMs: params.thresholdMs,
|
||||
});
|
||||
const recoveryEligible =
|
||||
classification.recoveryEligible ||
|
||||
isStalledEmbeddedRunRecoveryEligible({
|
||||
classification,
|
||||
ageMs: params.ageMs,
|
||||
stuckSessionWarnMs: params.thresholdMs,
|
||||
});
|
||||
if (classification.eventType === "session.stuck") {
|
||||
const nextWarnAgeMs =
|
||||
state.lastStuckWarnAgeMs === undefined
|
||||
@@ -617,7 +648,7 @@ export function logSessionAttention(
|
||||
state.queueDepth
|
||||
} reason=${classification.reason} classification=${classification.classification}${
|
||||
classification.activeWorkKind ? ` activeWorkKind=${classification.activeWorkKind}` : ""
|
||||
} recovery=${classification.recoveryEligible ? "checking" : "none"}`,
|
||||
} recovery=${recoveryEligible ? "checking" : "none"}`,
|
||||
);
|
||||
const baseEvent = {
|
||||
sessionId: state.sessionId,
|
||||
@@ -816,6 +847,20 @@ export function startDiagnosticHeartbeat(
|
||||
ageMs,
|
||||
queueDepth: state.queueDepth,
|
||||
});
|
||||
} else if (
|
||||
isStalledEmbeddedRunRecoveryEligible({
|
||||
classification,
|
||||
ageMs,
|
||||
stuckSessionWarnMs,
|
||||
})
|
||||
) {
|
||||
void (opts?.recoverStuckSession ?? recoverStuckSession)({
|
||||
sessionId: state.sessionId,
|
||||
sessionKey: state.sessionKey,
|
||||
ageMs,
|
||||
queueDepth: state.queueDepth,
|
||||
allowActiveAbort: true,
|
||||
});
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user