feat(diagnostics): add configurable stuck-session warning threshold

2026-07-20 20:51:34 +00:00 · 2026-03-02 00:07:02 +00:00
parent d729ab2150
commit 41cc46bbb4
8 changed files with 95 additions and 4 deletions
--- a/src/config/schema.help.ts
+++ b/src/config/schema.help.ts
@@ -418,6 +418,8 @@ export const FIELD_HELP: Record<string, string> = {
    'Enable targeted diagnostics logs by flag (e.g. ["telegram.http"]). Supports wildcards like "telegram.*" or "*".',
  "diagnostics.enabled":
    "Master toggle for diagnostics instrumentation output in logs and telemetry wiring paths. Keep enabled for normal observability, and disable only in tightly constrained environments.",
+  "diagnostics.stuckSessionWarnMs":
+    "Age threshold in milliseconds for emitting stuck-session warnings while a session remains in processing state. Increase for long multi-tool turns to reduce false positives; decrease for faster hang detection.",
  "diagnostics.otel.enabled":
    "Enables OpenTelemetry export pipeline for traces, metrics, and logs based on configured endpoint/protocol settings. Keep disabled unless your collector endpoint and auth are fully configured.",
  "diagnostics.otel.endpoint":
@@ -945,6 +947,8 @@ export const FIELD_HELP: Record<string, string> = {
    "Enables pre-compaction memory flush before the runtime performs stronger history reduction near token limits. Keep enabled unless you intentionally disable memory side effects in constrained environments.",
  "agents.defaults.compaction.memoryFlush.softThresholdTokens":
    "Threshold distance to compaction (in tokens) that triggers pre-compaction memory flush execution. Use earlier thresholds for safer persistence, or tighter thresholds for lower flush frequency.",
+  "agents.defaults.compaction.memoryFlush.forceFlushTranscriptBytes":
+    'Forces pre-compaction memory flush when transcript file size reaches this threshold (bytes or strings like "2mb"). Use this to prevent long-session hangs even when token counters are stale; set to 0 to disable.',
  "agents.defaults.compaction.memoryFlush.prompt":
    "User-prompt template used for the pre-compaction memory flush turn when generating memory candidates. Use this only when you need custom extraction instructions beyond the default memory flush behavior.",
  "agents.defaults.compaction.memoryFlush.systemPrompt":
--- a/src/config/schema.labels.ts
+++ b/src/config/schema.labels.ts
@@ -34,6 +34,7 @@ export const FIELD_LABELS: Record<string, string> = {
  "update.auto.betaCheckIntervalHours": "Auto Update Beta Check Interval (hours)",
  "diagnostics.enabled": "Diagnostics Enabled",
  "diagnostics.flags": "Diagnostics Flags",
+  "diagnostics.stuckSessionWarnMs": "Stuck Session Warning Threshold (ms)",
  "diagnostics.otel.enabled": "OpenTelemetry Enabled",
  "diagnostics.otel.endpoint": "OpenTelemetry Endpoint",
  "diagnostics.otel.protocol": "OpenTelemetry Protocol",
@@ -421,6 +422,8 @@ export const FIELD_LABELS: Record<string, string> = {
  "agents.defaults.compaction.memoryFlush.enabled": "Compaction Memory Flush Enabled",
  "agents.defaults.compaction.memoryFlush.softThresholdTokens":
    "Compaction Memory Flush Soft Threshold",
+  "agents.defaults.compaction.memoryFlush.forceFlushTranscriptBytes":
+    "Compaction Memory Flush Transcript Size Threshold",
  "agents.defaults.compaction.memoryFlush.prompt": "Compaction Memory Flush Prompt",
  "agents.defaults.compaction.memoryFlush.systemPrompt": "Compaction Memory Flush System Prompt",
  "agents.defaults.embeddedPi": "Embedded Pi",
--- a/src/config/types.base.ts
+++ b/src/config/types.base.ts
@@ -205,6 +205,8 @@ export type DiagnosticsConfig = {
  enabled?: boolean;
  /** Optional ad-hoc diagnostics flags (e.g. "telegram.http"). */
  flags?: string[];
+  /** Threshold in ms before a processing session logs "stuck session" diagnostics. */
+  stuckSessionWarnMs?: number;
  otel?: DiagnosticsOtelConfig;
  cacheTrace?: DiagnosticsCacheTraceConfig;
 };
--- a/src/config/zod-schema.ts
+++ b/src/config/zod-schema.ts
@@ -179,6 +179,7 @@ export const OpenClawSchema = z
      .object({
        enabled: z.boolean().optional(),
        flags: z.array(z.string()).optional(),
+        stuckSessionWarnMs: z.number().int().positive().optional(),
        otel: z
          .object({
            enabled: z.boolean().optional(),
--- a/src/gateway/server.impl.ts
+++ b/src/gateway/server.impl.ts
@@ -371,7 +371,7 @@ export async function startGatewayServer(
  ).config;
  const diagnosticsEnabled = isDiagnosticsEnabled(cfgAtStart);
  if (diagnosticsEnabled) {
-    startDiagnosticHeartbeat();
+    startDiagnosticHeartbeat(cfgAtStart);
  }
  setGatewaySigusr1RestartPolicy({ allowExternal: isRestartEnabled(cfgAtStart) });
  setPreRestartDeferralCheck(
--- a/src/logging/diagnostic.test.ts
+++ b/src/logging/diagnostic.test.ts
@@ -1,5 +1,6 @@
 import fs from "node:fs";
 import { afterEach, beforeEach, describe, expect, it, vi } from "vitest";
+import { onDiagnosticEvent, resetDiagnosticEventsForTest } from "../infra/diagnostic-events.js";
 import {
  diagnosticSessionStates,
  getDiagnosticSessionStateCountForTest,
@@ -7,6 +8,12 @@ import {
  pruneDiagnosticSessionStates,
  resetDiagnosticSessionStateForTest,
 } from "./diagnostic-session-state.js";
+import {
+  logSessionStateChange,
+  resetDiagnosticStateForTest,
+  resolveStuckSessionWarnMs,
+  startDiagnosticHeartbeat,
+} from "./diagnostic.js";

 describe("diagnostic session state pruning", () => {
  beforeEach(() => {
@@ -74,3 +81,60 @@ describe("logger import side effects", () => {
    expect(mkdirSpy).not.toHaveBeenCalled();
  });
 });
+
+describe("stuck session diagnostics threshold", () => { // covers how the heartbeat's stuck-session threshold is derived from config
+  beforeEach(() => {
+    vi.useFakeTimers(); // the heartbeat runs on setInterval, so fake timers let the test drive it
+    resetDiagnosticStateForTest(); // NOTE(review): presumably also clears a running heartbeat so each test can start its own — confirm
+    resetDiagnosticEventsForTest();
+  });
+
+  afterEach(() => {
+    resetDiagnosticEventsForTest();
+    resetDiagnosticStateForTest();
+    vi.useRealTimers();
+  });
+
+  it("uses the configured diagnostics.stuckSessionWarnMs threshold", () => {
+    const events: Array<{ type: string }> = []; // only the event type is recorded; payloads are ignored
+    const unsubscribe = onDiagnosticEvent((event) => {
+      events.push({ type: event.type });
+    });
+    try {
+      startDiagnosticHeartbeat({
+        diagnostics: {
+          enabled: true,
+          stuckSessionWarnMs: 30_000, // well below the 120_000 ms default
+        },
+      });
+      logSessionStateChange({ sessionId: "s1", sessionKey: "main", state: "processing" });
+      vi.advanceTimersByTime(61_000); // NOTE(review): "exactly one event" presumably relies on the heartbeat tick period, which is not visible here — confirm
+    } finally {
+      unsubscribe(); // always detach the listener, even if the steps above throw
+    }
+
+    expect(events.filter((event) => event.type === "session.stuck")).toHaveLength(1);
+  });
+
+  it("falls back to default threshold when config is absent", () => {
+    const events: Array<{ type: string }> = [];
+    const unsubscribe = onDiagnosticEvent((event) => {
+      events.push({ type: event.type });
+    });
+    try {
+      startDiagnosticHeartbeat(); // no config → resolveStuckSessionWarnMs() yields the default
+      logSessionStateChange({ sessionId: "s2", sessionKey: "main", state: "processing" });
+      vi.advanceTimersByTime(31_000); // 31s is still under the 120_000 ms default, so no warning is expected
+    } finally {
+      unsubscribe();
+    }
+
+    expect(events.filter((event) => event.type === "session.stuck")).toHaveLength(0);
+  });
+
+  it("uses default threshold for invalid values", () => {
+    expect(resolveStuckSessionWarnMs({ diagnostics: { stuckSessionWarnMs: -1 } })).toBe(120_000); // below the 1_000 ms minimum
+    expect(resolveStuckSessionWarnMs({ diagnostics: { stuckSessionWarnMs: 0 } })).toBe(120_000); // below the 1_000 ms minimum
+    expect(resolveStuckSessionWarnMs()).toBe(120_000); // no config at all
+  });
+});
--- a/src/logging/diagnostic.ts
+++ b/src/logging/diagnostic.ts
@@ -1,3 +1,4 @@
+import type { OpenClawConfig } from "../config/config.js";
 import { emitDiagnosticEvent } from "../infra/diagnostic-events.js";
 import {
  diagnosticSessionStates,
@@ -20,11 +21,26 @@ const webhookStats = {
 };

 let lastActivityAt = 0;
+const DEFAULT_STUCK_SESSION_WARN_MS = 120_000; // 2 minutes; returned when the configured value is missing, non-finite, or out of range
+const MIN_STUCK_SESSION_WARN_MS = 1_000; // 1 second; smallest configured threshold that is accepted
+const MAX_STUCK_SESSION_WARN_MS = 24 * 60 * 60 * 1000; // 24 hours; largest configured threshold that is accepted

 function markActivity() { // records the current time as the latest activity timestamp
  lastActivityAt = Date.now(); // milliseconds since the Unix epoch
 }

+export function resolveStuckSessionWarnMs(config?: OpenClawConfig): number { // resolves the stuck-session warning threshold (ms) from config, or the default
+  const raw = config?.diagnostics?.stuckSessionWarnMs; // undefined when config or its diagnostics section is absent
+  if (typeof raw !== "number" || !Number.isFinite(raw)) { // missing, non-numeric, NaN, or +/-Infinity
+    return DEFAULT_STUCK_SESSION_WARN_MS;
+  }
+  const rounded = Math.floor(raw); // fractional values are rounded down to a whole millisecond
+  if (rounded < MIN_STUCK_SESSION_WARN_MS || rounded > MAX_STUCK_SESSION_WARN_MS) { // out-of-range values fall back to the default; they are not clamped
+    return DEFAULT_STUCK_SESSION_WARN_MS; // NOTE(review): the zod schema accepts any positive int, so e.g. 500 validates but silently resolves to the default here
+  }
+  return rounded;
+}
+
 export function logWebhookReceived(params: {
  channel: string;
  updateType?: string;
@@ -305,10 +321,11 @@ export function logActiveRuns() {

 let heartbeatInterval: NodeJS.Timeout | null = null;

-export function startDiagnosticHeartbeat() {
+export function startDiagnosticHeartbeat(config?: OpenClawConfig) {
  if (heartbeatInterval) {
    return;
  }
+  const stuckSessionWarnMs = resolveStuckSessionWarnMs(config);
  heartbeatInterval = setInterval(() => {
    const now = Date.now();
    pruneDiagnosticSessionStates(now, true);
@@ -362,7 +379,7 @@ export function startDiagnosticHeartbeat() {

    for (const [, state] of diagnosticSessionStates) {
      const ageMs = now - state.lastActivity;
-      if (state.state === "processing" && ageMs > 120_000) {
+      if (state.state === "processing" && ageMs > stuckSessionWarnMs) {
        logSessionStuck({
          sessionId: state.sessionId,
          sessionKey: state.sessionKey,
--- a/src/telegram/webhook.ts
+++ b/src/telegram/webhook.ts
@@ -120,7 +120,7 @@ export async function startTelegramWebhook(opts: {
  });

  if (diagnosticsEnabled) {
-    startDiagnosticHeartbeat();
+    startDiagnosticHeartbeat(opts.config);
  }

  const server = createServer((req, res) => {