mirror of
https://github.com/openclaw/openclaw.git
synced 2026-04-27 00:52:05 +00:00
fix(agents): bound compaction retry wait and drain embedded runs on restart (#40324)
Merged via squash.
Prepared head SHA: cfd99562d6
Co-authored-by: cgdusek <38732970+cgdusek@users.noreply.github.com>
Co-authored-by: jalehman <550978+jalehman@users.noreply.github.com>
Reviewed-by: @jalehman
This commit is contained in:
@@ -15,6 +15,11 @@ const resetAllLanes = vi.fn();
|
||||
const restartGatewayProcessWithFreshPid = vi.fn<
|
||||
() => { mode: "spawned" | "supervised" | "disabled" | "failed"; pid?: number; detail?: string }
|
||||
>(() => ({ mode: "disabled" }));
|
||||
const abortEmbeddedPiRun = vi.fn(
|
||||
(_sessionId?: string, _opts?: { mode?: "all" | "compacting" }) => false,
|
||||
);
|
||||
const getActiveEmbeddedRunCount = vi.fn(() => 0);
|
||||
const waitForActiveEmbeddedRuns = vi.fn(async (_timeoutMs: number) => ({ drained: true }));
|
||||
const DRAIN_TIMEOUT_LOG = "drain timeout reached; proceeding with restart";
|
||||
const gatewayLog = {
|
||||
info: vi.fn(),
|
||||
@@ -43,6 +48,13 @@ vi.mock("../../process/command-queue.js", () => ({
|
||||
resetAllLanes: () => resetAllLanes(),
|
||||
}));
|
||||
|
||||
vi.mock("../../agents/pi-embedded-runner/runs.js", () => ({
|
||||
abortEmbeddedPiRun: (sessionId?: string, opts?: { mode?: "all" | "compacting" }) =>
|
||||
abortEmbeddedPiRun(sessionId, opts),
|
||||
getActiveEmbeddedRunCount: () => getActiveEmbeddedRunCount(),
|
||||
waitForActiveEmbeddedRuns: (timeoutMs: number) => waitForActiveEmbeddedRuns(timeoutMs),
|
||||
}));
|
||||
|
||||
vi.mock("../../logging/subsystem.js", () => ({
|
||||
createSubsystemLogger: () => gatewayLog,
|
||||
}));
|
||||
@@ -186,7 +198,9 @@ describe("runGatewayLoop", () => {
|
||||
|
||||
await withIsolatedSignals(async ({ captureSignal }) => {
|
||||
getActiveTaskCount.mockReturnValueOnce(2).mockReturnValueOnce(0);
|
||||
getActiveEmbeddedRunCount.mockReturnValueOnce(1).mockReturnValueOnce(0);
|
||||
waitForActiveTasks.mockResolvedValueOnce({ drained: false });
|
||||
waitForActiveEmbeddedRuns.mockResolvedValueOnce({ drained: true });
|
||||
|
||||
type StartServer = () => Promise<{
|
||||
close: (opts: { reason: string; restartExpectedMs: number | null }) => Promise<void>;
|
||||
@@ -243,7 +257,10 @@ describe("runGatewayLoop", () => {
|
||||
expect(start).toHaveBeenCalledTimes(2);
|
||||
await new Promise<void>((resolve) => setImmediate(resolve));
|
||||
|
||||
expect(waitForActiveTasks).toHaveBeenCalledWith(30_000);
|
||||
expect(abortEmbeddedPiRun).toHaveBeenCalledWith(undefined, { mode: "compacting" });
|
||||
expect(waitForActiveTasks).toHaveBeenCalledWith(90_000);
|
||||
expect(waitForActiveEmbeddedRuns).toHaveBeenCalledWith(90_000);
|
||||
expect(abortEmbeddedPiRun).toHaveBeenCalledWith(undefined, { mode: "all" });
|
||||
expect(markGatewayDraining).toHaveBeenCalledTimes(1);
|
||||
expect(gatewayLog.warn).toHaveBeenCalledWith(DRAIN_TIMEOUT_LOG);
|
||||
expect(closeFirst).toHaveBeenCalledWith({
|
||||
|
||||
@@ -1,3 +1,8 @@
|
||||
import {
|
||||
abortEmbeddedPiRun,
|
||||
getActiveEmbeddedRunCount,
|
||||
waitForActiveEmbeddedRuns,
|
||||
} from "../../agents/pi-embedded-runner/runs.js";
|
||||
import type { startGatewayServer } from "../../gateway/server.js";
|
||||
import { acquireGatewayLock } from "../../infra/gateway-lock.js";
|
||||
import { restartGatewayProcessWithFreshPid } from "../../infra/process-respawn.js";
|
||||
@@ -90,7 +95,7 @@ export async function runGatewayLoop(params: {
|
||||
exitProcess(0);
|
||||
};
|
||||
|
||||
const DRAIN_TIMEOUT_MS = 30_000;
|
||||
const DRAIN_TIMEOUT_MS = 90_000;
|
||||
const SHUTDOWN_TIMEOUT_MS = 5_000;
|
||||
|
||||
const request = (action: GatewayRunSignalAction, signal: string) => {
|
||||
@@ -121,15 +126,33 @@ export async function runGatewayLoop(params: {
|
||||
// sessions get an explicit restart error instead of silent task loss.
|
||||
markGatewayDraining();
|
||||
const activeTasks = getActiveTaskCount();
|
||||
if (activeTasks > 0) {
|
||||
const activeRuns = getActiveEmbeddedRunCount();
|
||||
|
||||
// Best-effort abort for compacting runs so long compaction operations
|
||||
// don't hold session write locks across restart boundaries.
|
||||
if (activeRuns > 0) {
|
||||
abortEmbeddedPiRun(undefined, { mode: "compacting" });
|
||||
}
|
||||
|
||||
if (activeTasks > 0 || activeRuns > 0) {
|
||||
gatewayLog.info(
|
||||
`draining ${activeTasks} active task(s) before restart (timeout ${DRAIN_TIMEOUT_MS}ms)`,
|
||||
`draining ${activeTasks} active task(s) and ${activeRuns} active embedded run(s) before restart (timeout ${DRAIN_TIMEOUT_MS}ms)`,
|
||||
);
|
||||
const { drained } = await waitForActiveTasks(DRAIN_TIMEOUT_MS);
|
||||
if (drained) {
|
||||
gatewayLog.info("all active tasks drained");
|
||||
const [tasksDrain, runsDrain] = await Promise.all([
|
||||
activeTasks > 0
|
||||
? waitForActiveTasks(DRAIN_TIMEOUT_MS)
|
||||
: Promise.resolve({ drained: true }),
|
||||
activeRuns > 0
|
||||
? waitForActiveEmbeddedRuns(DRAIN_TIMEOUT_MS)
|
||||
: Promise.resolve({ drained: true }),
|
||||
]);
|
||||
if (tasksDrain.drained && runsDrain.drained) {
|
||||
gatewayLog.info("all active work drained");
|
||||
} else {
|
||||
gatewayLog.warn("drain timeout reached; proceeding with restart");
|
||||
// Final best-effort abort to avoid carrying active runs into the
|
||||
// next lifecycle when drain time budget is exhausted.
|
||||
abortEmbeddedPiRun(undefined, { mode: "all" });
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user