mirror of
https://github.com/openclaw/openclaw.git
synced 2026-03-14 11:30:41 +00:00
fix(gateway): drain active turns before restart to prevent message loss (#13931)
* fix(gateway): drain active turns before restart to prevent message loss On SIGUSR1 restart, the gateway now waits up to 30s for in-flight agent turns to complete before tearing down the server. This prevents buffered messages from being dropped when config.patch or update triggers a restart while agents are mid-turn. Changes: - command-queue.ts: add getActiveTaskCount() and waitForActiveTasks() helpers to track and wait on active lane tasks - run-loop.ts: on restart signal, drain active tasks before server.close() with a 30s timeout; extend force-exit timer accordingly - command-queue.test.ts: update imports for new exports Fixes #13883 * fix(queue): snapshot active tasks for restart drain --------- Co-authored-by: Elonito <0xRaini@users.noreply.github.com> Co-authored-by: Tak Hoffman <781889+Takhoffman@users.noreply.github.com>
This commit is contained in:
@@ -19,11 +19,13 @@ type LaneState = {
|
||||
lane: string;
|
||||
queue: QueueEntry[];
|
||||
active: number;
|
||||
activeTaskIds: Set<number>;
|
||||
maxConcurrent: number;
|
||||
draining: boolean;
|
||||
};
|
||||
|
||||
const lanes = new Map<string, LaneState>();
|
||||
let nextTaskId = 1;
|
||||
|
||||
function getLaneState(lane: string): LaneState {
|
||||
const existing = lanes.get(lane);
|
||||
@@ -34,6 +36,7 @@ function getLaneState(lane: string): LaneState {
|
||||
lane,
|
||||
queue: [],
|
||||
active: 0,
|
||||
activeTaskIds: new Set(),
|
||||
maxConcurrent: 1,
|
||||
draining: false,
|
||||
};
|
||||
@@ -59,12 +62,15 @@ function drainLane(lane: string) {
|
||||
);
|
||||
}
|
||||
logLaneDequeue(lane, waitedMs, state.queue.length);
|
||||
const taskId = nextTaskId++;
|
||||
state.active += 1;
|
||||
state.activeTaskIds.add(taskId);
|
||||
void (async () => {
|
||||
const startTime = Date.now();
|
||||
try {
|
||||
const result = await entry.task();
|
||||
state.active -= 1;
|
||||
state.activeTaskIds.delete(taskId);
|
||||
diag.debug(
|
||||
`lane task done: lane=${lane} durationMs=${Date.now() - startTime} active=${state.active} queued=${state.queue.length}`,
|
||||
);
|
||||
@@ -72,6 +78,7 @@ function drainLane(lane: string) {
|
||||
entry.resolve(result);
|
||||
} catch (err) {
|
||||
state.active -= 1;
|
||||
state.activeTaskIds.delete(taskId);
|
||||
const isProbeLane = lane.startsWith("auth-probe:") || lane.startsWith("session:probe-");
|
||||
if (!isProbeLane) {
|
||||
diag.error(
|
||||
@@ -158,3 +165,67 @@ export function clearCommandLane(lane: string = CommandLane.Main) {
|
||||
state.queue.length = 0;
|
||||
return removed;
|
||||
}
|
||||
|
||||
/**
 * Count the tasks that are executing right now, summed over every lane.
 *
 * Only each lane's `active` counter contributes; entries that are still
 * waiting in a lane's queue are not part of the result.
 *
 * @returns The number of currently running tasks across all lanes.
 */
export function getActiveTaskCount(): number {
  let running = 0;
  lanes.forEach((laneState) => {
    running += laneState.active;
  });
  return running;
}
|
||||
|
||||
/**
 * Wait for the tasks that are executing at the moment of the call to finish.
 *
 * The ids of every currently active task (across all lanes) are snapshotted
 * up front. Tasks that start after this call are ignored, so newly started
 * work cannot keep the wait from completing.
 *
 * The lanes are polled rather than notified; each poll delay is capped at the
 * time left until the deadline so the promise does not resolve noticeably
 * later than `timeoutMs`.
 *
 * @param timeoutMs Maximum time to wait, in milliseconds.
 * @returns A promise that resolves to `{ drained: true }` once none of the
 *   snapshotted tasks is still active, or to `{ drained: false }` if the
 *   deadline passes first. The promise never rejects.
 */
export function waitForActiveTasks(timeoutMs: number): Promise<{ drained: boolean }> {
  const POLL_INTERVAL_MS = 250;
  const deadline = Date.now() + timeoutMs;

  // Snapshot the ids of tasks that are running right now; only these are awaited.
  const activeAtStart = new Set<number>();
  for (const state of lanes.values()) {
    for (const taskId of state.activeTaskIds) {
      activeAtStart.add(taskId);
    }
  }

  // True while at least one snapshotted task is still registered as active.
  const hasPendingSnapshotTask = (): boolean => {
    for (const state of lanes.values()) {
      for (const taskId of state.activeTaskIds) {
        if (activeAtStart.has(taskId)) {
          return true;
        }
      }
    }
    return false;
  };

  return new Promise((resolve) => {
    const check = () => {
      // Also covers the case where nothing was active when the call was made.
      if (!hasPendingSnapshotTask()) {
        resolve({ drained: true });
        return;
      }

      const remainingMs = deadline - Date.now();
      if (remainingMs <= 0) {
        resolve({ drained: false });
        return;
      }

      // Never sleep past the deadline: a full interval here could overshoot
      // `timeoutMs` by up to POLL_INTERVAL_MS. The comparison form (instead of
      // Math.min) keeps the regular interval if `remainingMs` is NaN.
      const delayMs = remainingMs < POLL_INTERVAL_MS ? remainingMs : POLL_INTERVAL_MS;
      setTimeout(check, delayMs);
    };
    check();
  });
}
|
||||
|
||||
Reference in New Issue
Block a user