mirror of
https://github.com/openclaw/openclaw.git
synced 2026-06-23 04:58:06 +00:00
fix(diagnostics): keep recovery scheduling out of the stuck-session warning backoff (#92752)
Summary: - The branch changes diagnostic stuck/long-running warning backoff so recovery-eligible classifications are still returned during throttled warning ticks and updates the diagnostic tests. - PR surface: Source +17, Tests +48. Total +65 across 2 files. - Reproducibility: yes. Current main source shows logSessionAttention can return undefined during stuck or lon ... g backoff before the heartbeat reaches requestStuckSessionRecovery; I did not run a live QQ gateway replay. Automerge notes: - PR branch already contained a follow-up commit before automerge: fix(diagnostics): keep recovery scheduling out of the stuck-session w… Validation: - ClawSweeper review passed for head f61ec3a33f. - Required merge gates passed before the squash merge. Prepared head SHA: f61ec3a33f Review: https://github.com/openclaw/openclaw/pull/92752#issuecomment-4699298908 Co-authored-by: Gnanam <gnanasekaran.sekareee@gmail.com> Co-authored-by: clawsweeper <274271284+clawsweeper[bot]@users.noreply.github.com> Co-authored-by: clawsweeper[bot] <274271284+clawsweeper[bot]@users.noreply.github.com> Approved-by: takhoffman Co-authored-by: takhoffman <781889+takhoffman@users.noreply.github.com>
This commit is contained in:
@@ -468,8 +468,9 @@ describe("stuck session diagnostics threshold", () => {
|
||||
);
|
||||
logSessionStateChange({ sessionId: "s1", sessionKey: "main", state: "processing" });
|
||||
vi.advanceTimersByTime(91_000);
|
||||
// One warning emitted (60s); the 90s tick is throttled but still recovers.
|
||||
expect(events).toHaveLength(1);
|
||||
expect(recoverStuckSession).toHaveBeenCalledTimes(1);
|
||||
expect(recoverStuckSession).toHaveBeenCalledTimes(2);
|
||||
|
||||
vi.advanceTimersByTime(31_000);
|
||||
} finally {
|
||||
@@ -477,7 +478,54 @@ describe("stuck session diagnostics threshold", () => {
|
||||
}
|
||||
|
||||
expect(events.map((event) => event.ageMs)).toEqual([60_000, 120_000]);
|
||||
// Recovery is requested on every heartbeat tick the session stays stuck,
|
||||
// including the throttled tick at 90s, so it must outpace the warn backoff.
|
||||
expect(recoverStuckSession).toHaveBeenCalledTimes(3);
|
||||
});
|
||||
|
||||
it("keeps scheduling recovery for a recovery-eligible stuck session while warnings are throttled", () => {
|
||||
const stuckEvents: Array<{ ageMs?: number }> = [];
|
||||
const recoveryRequests: Array<{ ageMs?: number }> = [];
|
||||
const recoverStuckSession = vi.fn();
|
||||
const unsubscribe = onDiagnosticEvent((event) => {
|
||||
if (event.type === "session.stuck") {
|
||||
stuckEvents.push(event);
|
||||
} else if (event.type === "session.recovery.requested") {
|
||||
recoveryRequests.push(event);
|
||||
}
|
||||
});
|
||||
try {
|
||||
startDiagnosticHeartbeat(
|
||||
{
|
||||
diagnostics: {
|
||||
enabled: true,
|
||||
stuckSessionWarnMs: 30_000,
|
||||
},
|
||||
},
|
||||
{ recoverStuckSession },
|
||||
);
|
||||
logSessionStateChange({ sessionId: "s1", sessionKey: "main", state: "processing" });
|
||||
|
||||
// First warn tick (60s): emit the stuck warning and request recovery once.
|
||||
vi.advanceTimersByTime(61_000);
|
||||
expect(stuckEvents).toHaveLength(1);
|
||||
expect(recoverStuckSession).toHaveBeenCalledTimes(1);
|
||||
|
||||
// Backoff tick (90s): the next warn age is 120s, so the warning is
|
||||
// throttled. Recovery must still be scheduled because the session is
|
||||
// recovery-eligible — the warning backoff must not gate recovery.
|
||||
vi.advanceTimersByTime(30_000);
|
||||
} finally {
|
||||
unsubscribe();
|
||||
}
|
||||
|
||||
// Warning stays throttled: still only the single 60s warning.
|
||||
expect(stuckEvents).toHaveLength(1);
|
||||
expect(stuckEvents.map((event) => event.ageMs)).toEqual([60_000]);
|
||||
// Recovery was not suppressed by the warning backoff on the 90s tick.
|
||||
expect(recoverStuckSession).toHaveBeenCalledTimes(2);
|
||||
expect(recoveryRequests).toHaveLength(2);
|
||||
expect(recoveryRequests.map((event) => event.ageMs)).toEqual([60_000, 90_000]);
|
||||
});
|
||||
|
||||
it("reports active sessions as stalled instead of stuck when active work stops progressing", () => {
|
||||
|
||||
@@ -1022,15 +1022,23 @@ export function logSessionAttention(
|
||||
stuckSessionAbortMs:
|
||||
params.abortThresholdMs ?? resolveStalledEmbeddedRunAbortMs(params.thresholdMs),
|
||||
});
|
||||
// The warning backoff throttles repeated log lines/events only. It must never
|
||||
// gate recovery: a recovery-eligible session has to return its classification
|
||||
// so the heartbeat can still schedule recovery on every tick.
|
||||
let suppressWarning = false;
|
||||
if (classification.eventType === "session.stuck") {
|
||||
const nextWarnAgeMs =
|
||||
state.lastStuckWarnAgeMs === undefined
|
||||
? params.thresholdMs
|
||||
: Math.max(state.lastStuckWarnAgeMs + params.thresholdMs, state.lastStuckWarnAgeMs * 2);
|
||||
if (params.ageMs < nextWarnAgeMs) {
|
||||
return undefined;
|
||||
if (!recoveryEligible) {
|
||||
return undefined;
|
||||
}
|
||||
suppressWarning = true;
|
||||
} else {
|
||||
state.lastStuckWarnAgeMs = params.ageMs;
|
||||
}
|
||||
state.lastStuckWarnAgeMs = params.ageMs;
|
||||
}
|
||||
if (classification.eventType === "session.long_running") {
|
||||
const nextWarnAgeMs =
|
||||
@@ -1041,9 +1049,18 @@ export function logSessionAttention(
|
||||
state.lastLongRunningWarnAgeMs * 2,
|
||||
);
|
||||
if (params.ageMs < nextWarnAgeMs) {
|
||||
return undefined;
|
||||
if (!recoveryEligible) {
|
||||
return undefined;
|
||||
}
|
||||
suppressWarning = true;
|
||||
} else {
|
||||
state.lastLongRunningWarnAgeMs = params.ageMs;
|
||||
}
|
||||
state.lastLongRunningWarnAgeMs = params.ageMs;
|
||||
}
|
||||
if (suppressWarning) {
|
||||
// Throttled warning, but recovery-eligible: skip the log/event and return
|
||||
// the classification so the heartbeat can drive recovery.
|
||||
return classification;
|
||||
}
|
||||
const label =
|
||||
classification.eventType === "session.stuck"
|
||||
|
||||
Reference in New Issue
Block a user