fix(diagnostics): keep recovery scheduling out of the stuck-session warning backoff (#92752)

Summary:
- The branch changes diagnostic stuck/long-running warning backoff so recovery-eligible classifications are still returned during throttled warning ticks and updates the diagnostic tests.
- PR surface: Source +17, Tests +48. Total +65 across 2 files.
- Reproducibility: yes. Current main source shows logSessionAttention can return undefined during stuck or long-running warning backoff before the heartbeat reaches requestStuckSessionRecovery; I did not run a live QQ gateway replay.

Automerge notes:
- PR branch already contained follow-up commit before automerge: fix(diagnostics): keep recovery scheduling out of the stuck-session warning backoff

Validation:
- ClawSweeper review passed for head f61ec3a33f.
- Required merge gates passed before the squash merge.

Prepared head SHA: f61ec3a33f
Review: https://github.com/openclaw/openclaw/pull/92752#issuecomment-4699298908

Co-authored-by: Gnanam <gnanasekaran.sekareee@gmail.com>
Co-authored-by: clawsweeper <274271284+clawsweeper[bot]@users.noreply.github.com>
Co-authored-by: clawsweeper[bot] <274271284+clawsweeper[bot]@users.noreply.github.com>
Approved-by: takhoffman
Co-authored-by: takhoffman <781889+takhoffman@users.noreply.github.com>
This commit is contained in:
KRATOS
2026-06-14 01:35:33 +05:30
committed by GitHub
parent b2da129e51
commit 4e4ea1c16b
2 changed files with 70 additions and 5 deletions

View File

@@ -468,8 +468,9 @@ describe("stuck session diagnostics threshold", () => {
);
logSessionStateChange({ sessionId: "s1", sessionKey: "main", state: "processing" });
vi.advanceTimersByTime(91_000);
// One warning emitted (60s); the 90s tick is throttled but still recovers.
expect(events).toHaveLength(1);
expect(recoverStuckSession).toHaveBeenCalledTimes(1);
expect(recoverStuckSession).toHaveBeenCalledTimes(2);
vi.advanceTimersByTime(31_000);
} finally {
@@ -477,7 +478,54 @@ describe("stuck session diagnostics threshold", () => {
}
expect(events.map((event) => event.ageMs)).toEqual([60_000, 120_000]);
// Recovery is requested on every heartbeat tick the session stays stuck,
// including the throttled tick at 90s, so it must outpace the warn backoff.
expect(recoverStuckSession).toHaveBeenCalledTimes(3);
});
// Regression test: starts the diagnostic heartbeat with a recoverStuckSession
// hook, marks session "s1" as processing, then advances the mocked timers past
// the first warning and one further 30s step. It asserts that the second step
// emits no additional "session.stuck" event while both the recovery hook and
// the "session.recovery.requested" event still fire for it.
// NOTE(review): stuckSessionWarnMs is 30_000 here, yet the first warning is
// asserted at ageMs 60_000 — presumably a minimum threshold or the heartbeat
// cadence applies; confirm against startDiagnosticHeartbeat.
it("keeps scheduling recovery for a recovery-eligible stuck session while warnings are throttled", () => {
// Captured diagnostic events, split by type so each stream is asserted separately.
const stuckEvents: Array<{ ageMs?: number }> = [];
const recoveryRequests: Array<{ ageMs?: number }> = [];
const recoverStuckSession = vi.fn();
const unsubscribe = onDiagnosticEvent((event) => {
if (event.type === "session.stuck") {
stuckEvents.push(event);
} else if (event.type === "session.recovery.requested") {
recoveryRequests.push(event);
}
});
try {
startDiagnosticHeartbeat(
{
diagnostics: {
enabled: true,
stuckSessionWarnMs: 30_000,
},
},
{ recoverStuckSession },
);
logSessionStateChange({ sessionId: "s1", sessionKey: "main", state: "processing" });
// First warn tick (60s): emit the stuck warning and request recovery once.
vi.advanceTimersByTime(61_000);
expect(stuckEvents).toHaveLength(1);
expect(recoverStuckSession).toHaveBeenCalledTimes(1);
// Backoff tick (90s): the next warn age is 120s, so the warning is
// throttled. Recovery must still be scheduled because the session is
// recovery-eligible — the warning backoff must not gate recovery.
vi.advanceTimersByTime(30_000);
} finally {
// Always detach the listener, even if an assertion above throws.
unsubscribe();
}
// Warning stays throttled: still only the single 60s warning.
expect(stuckEvents).toHaveLength(1);
expect(stuckEvents.map((event) => event.ageMs)).toEqual([60_000]);
// Recovery was not suppressed by the warning backoff on the 90s tick.
expect(recoverStuckSession).toHaveBeenCalledTimes(2);
expect(recoveryRequests).toHaveLength(2);
expect(recoveryRequests.map((event) => event.ageMs)).toEqual([60_000, 90_000]);
});
it("reports active sessions as stalled instead of stuck when active work stops progressing", () => {

View File

@@ -1022,15 +1022,23 @@ export function logSessionAttention(
stuckSessionAbortMs:
params.abortThresholdMs ?? resolveStalledEmbeddedRunAbortMs(params.thresholdMs),
});
// The warning backoff throttles repeated log lines/events only. It must never
// gate recovery: a recovery-eligible session has to return its classification
// so the heartbeat can still schedule recovery on every tick.
let suppressWarning = false;
if (classification.eventType === "session.stuck") {
const nextWarnAgeMs =
state.lastStuckWarnAgeMs === undefined
? params.thresholdMs
: Math.max(state.lastStuckWarnAgeMs + params.thresholdMs, state.lastStuckWarnAgeMs * 2);
if (params.ageMs < nextWarnAgeMs) {
return undefined;
if (!recoveryEligible) {
return undefined;
}
suppressWarning = true;
} else {
state.lastStuckWarnAgeMs = params.ageMs;
}
state.lastStuckWarnAgeMs = params.ageMs;
}
if (classification.eventType === "session.long_running") {
const nextWarnAgeMs =
@@ -1041,9 +1049,18 @@ export function logSessionAttention(
state.lastLongRunningWarnAgeMs * 2,
);
if (params.ageMs < nextWarnAgeMs) {
return undefined;
if (!recoveryEligible) {
return undefined;
}
suppressWarning = true;
} else {
state.lastLongRunningWarnAgeMs = params.ageMs;
}
state.lastLongRunningWarnAgeMs = params.ageMs;
}
if (suppressWarning) {
// Throttled warning, but recovery-eligible: skip the log/event and return
// the classification so the heartbeat can drive recovery.
return classification;
}
const label =
classification.eventType === "session.stuck"