diff --git a/src/config/schema.help.ts b/src/config/schema.help.ts index 61eb12c6dc6..94f1766e487 100644 --- a/src/config/schema.help.ts +++ b/src/config/schema.help.ts @@ -418,6 +418,8 @@ export const FIELD_HELP: Record = { 'Enable targeted diagnostics logs by flag (e.g. ["telegram.http"]). Supports wildcards like "telegram.*" or "*".', "diagnostics.enabled": "Master toggle for diagnostics instrumentation output in logs and telemetry wiring paths. Keep enabled for normal observability, and disable only in tightly constrained environments.", + "diagnostics.stuckSessionWarnMs": + "Age threshold in milliseconds for emitting stuck-session warnings while a session remains in processing state. Increase for long multi-tool turns to reduce false positives; decrease for faster hang detection.", "diagnostics.otel.enabled": "Enables OpenTelemetry export pipeline for traces, metrics, and logs based on configured endpoint/protocol settings. Keep disabled unless your collector endpoint and auth are fully configured.", "diagnostics.otel.endpoint": @@ -945,6 +947,8 @@ export const FIELD_HELP: Record = { "Enables pre-compaction memory flush before the runtime performs stronger history reduction near token limits. Keep enabled unless you intentionally disable memory side effects in constrained environments.", "agents.defaults.compaction.memoryFlush.softThresholdTokens": "Threshold distance to compaction (in tokens) that triggers pre-compaction memory flush execution. Use earlier thresholds for safer persistence, or tighter thresholds for lower flush frequency.", + "agents.defaults.compaction.memoryFlush.forceFlushTranscriptBytes": + 'Forces pre-compaction memory flush when transcript file size reaches this threshold (bytes or strings like "2mb"). 
Use this to prevent long-session hangs even when token counters are stale; set to 0 to disable.', "agents.defaults.compaction.memoryFlush.prompt": "User-prompt template used for the pre-compaction memory flush turn when generating memory candidates. Use this only when you need custom extraction instructions beyond the default memory flush behavior.", "agents.defaults.compaction.memoryFlush.systemPrompt": diff --git a/src/config/schema.labels.ts b/src/config/schema.labels.ts index d5a6170c330..de1bcbed3eb 100644 --- a/src/config/schema.labels.ts +++ b/src/config/schema.labels.ts @@ -34,6 +34,7 @@ export const FIELD_LABELS: Record = { "update.auto.betaCheckIntervalHours": "Auto Update Beta Check Interval (hours)", "diagnostics.enabled": "Diagnostics Enabled", "diagnostics.flags": "Diagnostics Flags", + "diagnostics.stuckSessionWarnMs": "Stuck Session Warning Threshold (ms)", "diagnostics.otel.enabled": "OpenTelemetry Enabled", "diagnostics.otel.endpoint": "OpenTelemetry Endpoint", "diagnostics.otel.protocol": "OpenTelemetry Protocol", @@ -421,6 +422,8 @@ export const FIELD_LABELS: Record = { "agents.defaults.compaction.memoryFlush.enabled": "Compaction Memory Flush Enabled", "agents.defaults.compaction.memoryFlush.softThresholdTokens": "Compaction Memory Flush Soft Threshold", + "agents.defaults.compaction.memoryFlush.forceFlushTranscriptBytes": + "Compaction Memory Flush Transcript Size Threshold", "agents.defaults.compaction.memoryFlush.prompt": "Compaction Memory Flush Prompt", "agents.defaults.compaction.memoryFlush.systemPrompt": "Compaction Memory Flush System Prompt", "agents.defaults.embeddedPi": "Embedded Pi", diff --git a/src/config/types.base.ts b/src/config/types.base.ts index bcc3bf6b969..03336561d64 100644 --- a/src/config/types.base.ts +++ b/src/config/types.base.ts @@ -205,6 +205,8 @@ export type DiagnosticsConfig = { enabled?: boolean; /** Optional ad-hoc diagnostics flags (e.g. "telegram.http"). 
*/ flags?: string[]; + /** Threshold in ms before a processing session logs "stuck session" diagnostics. When unset, non-finite, or (after flooring) outside 1000-86400000 ms (1 second to 24 hours), resolveStuckSessionWarnMs falls back to 120000 ms. */ + stuckSessionWarnMs?: number; otel?: DiagnosticsOtelConfig; cacheTrace?: DiagnosticsCacheTraceConfig; }; diff --git a/src/config/zod-schema.ts b/src/config/zod-schema.ts index 73677c8bf34..8034c5b5e42 100644 --- a/src/config/zod-schema.ts +++ b/src/config/zod-schema.ts @@ -179,6 +179,7 @@ export const OpenClawSchema = z .object({ enabled: z.boolean().optional(), flags: z.array(z.string()).optional(), + stuckSessionWarnMs: z.number().int().positive().optional(), otel: z .object({ enabled: z.boolean().optional(), diff --git a/src/gateway/server.impl.ts b/src/gateway/server.impl.ts index b3e6a9b3c15..4ae6016a46a 100644 --- a/src/gateway/server.impl.ts +++ b/src/gateway/server.impl.ts @@ -371,7 +371,7 @@ export async function startGatewayServer( ).config; const diagnosticsEnabled = isDiagnosticsEnabled(cfgAtStart); if (diagnosticsEnabled) { - startDiagnosticHeartbeat(); + startDiagnosticHeartbeat(cfgAtStart); } setGatewaySigusr1RestartPolicy({ allowExternal: isRestartEnabled(cfgAtStart) }); setPreRestartDeferralCheck( diff --git a/src/logging/diagnostic.test.ts b/src/logging/diagnostic.test.ts index 37eecaf0b12..45a57770c3f 100644 --- a/src/logging/diagnostic.test.ts +++ b/src/logging/diagnostic.test.ts @@ -1,5 +1,6 @@ import fs from "node:fs"; import { afterEach, beforeEach, describe, expect, it, vi } from "vitest"; +import { onDiagnosticEvent, resetDiagnosticEventsForTest } from "../infra/diagnostic-events.js"; import { diagnosticSessionStates, getDiagnosticSessionStateCountForTest, @@ -7,6 +8,12 @@ import { pruneDiagnosticSessionStates, resetDiagnosticSessionStateForTest, } from "./diagnostic-session-state.js"; +import { + logSessionStateChange, + resetDiagnosticStateForTest, + resolveStuckSessionWarnMs, + startDiagnosticHeartbeat, +} from "./diagnostic.js"; describe("diagnostic session state pruning", () => { beforeEach(() => { @@ -74,3 +81,60 @@ 
describe("logger import side effects", () => { expect(mkdirSpy).not.toHaveBeenCalled(); }); }); + +describe("stuck session diagnostics threshold", () => { /* Exercises how the heartbeat's stuck-session check picks its age threshold, driven by fake timers. */ + beforeEach(() => { + vi.useFakeTimers(); + resetDiagnosticStateForTest(); + resetDiagnosticEventsForTest(); + }); + + afterEach(() => { + resetDiagnosticEventsForTest(); + resetDiagnosticStateForTest(); + vi.useRealTimers(); + }); + + it("uses the configured diagnostics.stuckSessionWarnMs threshold", () => { + const events: Array<{ type: string }> = []; + const unsubscribe = onDiagnosticEvent((event) => { + events.push({ type: event.type }); + }); + try { + startDiagnosticHeartbeat({ + diagnostics: { + enabled: true, + stuckSessionWarnMs: 30_000, + }, + }); + logSessionStateChange({ sessionId: "s1", sessionKey: "main", state: "processing" }); + vi.advanceTimersByTime(61_000); /* NOTE(review): expecting exactly one event presumably relies on the heartbeat ticking every 30s (the interval is not visible here): the check is a strict ageMs > threshold, so a 30s tick would not fire at age 30_000 and a 60s tick would - confirm */ + } finally { + unsubscribe(); + } + + expect(events.filter((event) => event.type === "session.stuck")).toHaveLength(1); + }); + + it("falls back to default threshold when config is absent", () => { + const events: Array<{ type: string }> = []; + const unsubscribe = onDiagnosticEvent((event) => { + events.push({ type: event.type }); + }); + try { + startDiagnosticHeartbeat(); + logSessionStateChange({ sessionId: "s2", sessionKey: "main", state: "processing" }); + vi.advanceTimersByTime(31_000); /* 31s is far below the 120_000 ms default, so no session.stuck event is expected */ + } finally { + unsubscribe(); + } + + expect(events.filter((event) => event.type === "session.stuck")).toHaveLength(0); + }); + + it("uses default threshold for invalid values", () => { /* -1 and 0 are below the resolver's 1_000 ms minimum, so both resolve to the default */ + expect(resolveStuckSessionWarnMs({ diagnostics: { stuckSessionWarnMs: -1 } })).toBe(120_000); + expect(resolveStuckSessionWarnMs({ diagnostics: { stuckSessionWarnMs: 0 } })).toBe(120_000); + expect(resolveStuckSessionWarnMs()).toBe(120_000); + }); +}); diff --git a/src/logging/diagnostic.ts b/src/logging/diagnostic.ts index 3751416c13a..ffc36cf98ee 100644 --- a/src/logging/diagnostic.ts +++ b/src/logging/diagnostic.ts @@ -1,3 +1,4 @@ +import type { OpenClawConfig 
} from "../config/config.js"; import { emitDiagnosticEvent } from "../infra/diagnostic-events.js"; import { diagnosticSessionStates, @@ -20,11 +21,26 @@ const webhookStats = { }; let lastActivityAt = 0; +const DEFAULT_STUCK_SESSION_WARN_MS = 120_000; /* 2 minutes; same value the heartbeat's stuck check previously hard-coded */ +const MIN_STUCK_SESSION_WARN_MS = 1_000; /* 1 second */ +const MAX_STUCK_SESSION_WARN_MS = 24 * 60 * 60 * 1000; /* 24 hours */ function markActivity() { lastActivityAt = Date.now(); } +/** Returns the stuck-session warning threshold in ms: config.diagnostics.stuckSessionWarnMs floored to an integer, or DEFAULT_STUCK_SESSION_WARN_MS when the value is missing, not a finite number, or (after flooring) outside MIN_STUCK_SESSION_WARN_MS..MAX_STUCK_SESSION_WARN_MS. Out-of-range values are replaced by the default, not clamped to the nearest bound. */ export function resolveStuckSessionWarnMs(config?: OpenClawConfig): number { + const raw = config?.diagnostics?.stuckSessionWarnMs; + if (typeof raw !== "number" || !Number.isFinite(raw)) { + return DEFAULT_STUCK_SESSION_WARN_MS; + } + const rounded = Math.floor(raw); + if (rounded < MIN_STUCK_SESSION_WARN_MS || rounded > MAX_STUCK_SESSION_WARN_MS) { + return DEFAULT_STUCK_SESSION_WARN_MS; + } + return rounded; +} + export function logWebhookReceived(params: { channel: string; updateType?: string; @@ -305,10 +321,11 @@ export function logActiveRuns() { let heartbeatInterval: NodeJS.Timeout | null = null; -export function startDiagnosticHeartbeat() { +export function startDiagnosticHeartbeat(config?: OpenClawConfig) { if (heartbeatInterval) { return; } + const stuckSessionWarnMs = resolveStuckSessionWarnMs(config); /* resolved once here and captured by the interval callback; because of the early return above, a call made while a heartbeat is already running does not apply its config */ heartbeatInterval = setInterval(() => { const now = Date.now(); pruneDiagnosticSessionStates(now, true); @@ -362,7 +379,7 @@ export function startDiagnosticHeartbeat() { for (const [, state] of diagnosticSessionStates) { const ageMs = now - state.lastActivity; - if (state.state === "processing" && ageMs > 120_000) { + if (state.state === "processing" && ageMs > stuckSessionWarnMs) { logSessionStuck({ sessionId: state.sessionId, sessionKey: state.sessionKey, diff --git a/src/telegram/webhook.ts b/src/telegram/webhook.ts index a55720102dd..8333a6a1ebe 100644 --- a/src/telegram/webhook.ts +++ b/src/telegram/webhook.ts @@ -120,7 +120,7 @@ export async function startTelegramWebhook(opts: { }); if (diagnosticsEnabled) { - startDiagnosticHeartbeat(); + 
startDiagnosticHeartbeat(opts.config); } const server = createServer((req, res) => {