fix(agents): bound compaction retry wait and drain embedded runs on restart (#40324)

Merged via squash.

Prepared head SHA: cfd99562d6
Co-authored-by: cgdusek <38732970+cgdusek@users.noreply.github.com>
Co-authored-by: jalehman <550978+jalehman@users.noreply.github.com>
Reviewed-by: @jalehman
This commit is contained in:
Charles Dusek
2026-03-09 10:27:29 -05:00
committed by GitHub
parent fbf5d56366
commit 54be30ef89
10 changed files with 478 additions and 19 deletions

View File

@@ -15,6 +15,11 @@ const resetAllLanes = vi.fn();
const restartGatewayProcessWithFreshPid = vi.fn<
() => { mode: "spawned" | "supervised" | "disabled" | "failed"; pid?: number; detail?: string }
>(() => ({ mode: "disabled" }));
const abortEmbeddedPiRun = vi.fn(
(_sessionId?: string, _opts?: { mode?: "all" | "compacting" }) => false,
);
const getActiveEmbeddedRunCount = vi.fn(() => 0);
const waitForActiveEmbeddedRuns = vi.fn(async (_timeoutMs: number) => ({ drained: true }));
const DRAIN_TIMEOUT_LOG = "drain timeout reached; proceeding with restart";
const gatewayLog = {
info: vi.fn(),
@@ -43,6 +48,13 @@ vi.mock("../../process/command-queue.js", () => ({
resetAllLanes: () => resetAllLanes(),
}));
vi.mock("../../agents/pi-embedded-runner/runs.js", () => ({
abortEmbeddedPiRun: (sessionId?: string, opts?: { mode?: "all" | "compacting" }) =>
abortEmbeddedPiRun(sessionId, opts),
getActiveEmbeddedRunCount: () => getActiveEmbeddedRunCount(),
waitForActiveEmbeddedRuns: (timeoutMs: number) => waitForActiveEmbeddedRuns(timeoutMs),
}));
vi.mock("../../logging/subsystem.js", () => ({
createSubsystemLogger: () => gatewayLog,
}));
@@ -186,7 +198,9 @@ describe("runGatewayLoop", () => {
await withIsolatedSignals(async ({ captureSignal }) => {
getActiveTaskCount.mockReturnValueOnce(2).mockReturnValueOnce(0);
getActiveEmbeddedRunCount.mockReturnValueOnce(1).mockReturnValueOnce(0);
waitForActiveTasks.mockResolvedValueOnce({ drained: false });
waitForActiveEmbeddedRuns.mockResolvedValueOnce({ drained: true });
type StartServer = () => Promise<{
close: (opts: { reason: string; restartExpectedMs: number | null }) => Promise<void>;
@@ -243,7 +257,10 @@ describe("runGatewayLoop", () => {
expect(start).toHaveBeenCalledTimes(2);
await new Promise<void>((resolve) => setImmediate(resolve));
expect(waitForActiveTasks).toHaveBeenCalledWith(30_000);
expect(abortEmbeddedPiRun).toHaveBeenCalledWith(undefined, { mode: "compacting" });
expect(waitForActiveTasks).toHaveBeenCalledWith(90_000);
expect(waitForActiveEmbeddedRuns).toHaveBeenCalledWith(90_000);
expect(abortEmbeddedPiRun).toHaveBeenCalledWith(undefined, { mode: "all" });
expect(markGatewayDraining).toHaveBeenCalledTimes(1);
expect(gatewayLog.warn).toHaveBeenCalledWith(DRAIN_TIMEOUT_LOG);
expect(closeFirst).toHaveBeenCalledWith({

View File

@@ -1,3 +1,8 @@
import {
abortEmbeddedPiRun,
getActiveEmbeddedRunCount,
waitForActiveEmbeddedRuns,
} from "../../agents/pi-embedded-runner/runs.js";
import type { startGatewayServer } from "../../gateway/server.js";
import { acquireGatewayLock } from "../../infra/gateway-lock.js";
import { restartGatewayProcessWithFreshPid } from "../../infra/process-respawn.js";
@@ -90,7 +95,7 @@ export async function runGatewayLoop(params: {
exitProcess(0);
};
const DRAIN_TIMEOUT_MS = 30_000;
const DRAIN_TIMEOUT_MS = 90_000;
const SHUTDOWN_TIMEOUT_MS = 5_000;
const request = (action: GatewayRunSignalAction, signal: string) => {
@@ -121,15 +126,33 @@ export async function runGatewayLoop(params: {
// sessions get an explicit restart error instead of silent task loss.
markGatewayDraining();
const activeTasks = getActiveTaskCount();
if (activeTasks > 0) {
const activeRuns = getActiveEmbeddedRunCount();
// Best-effort abort for compacting runs so long compaction operations
// don't hold session write locks across restart boundaries.
if (activeRuns > 0) {
abortEmbeddedPiRun(undefined, { mode: "compacting" });
}
if (activeTasks > 0 || activeRuns > 0) {
gatewayLog.info(
`draining ${activeTasks} active task(s) before restart (timeout ${DRAIN_TIMEOUT_MS}ms)`,
`draining ${activeTasks} active task(s) and ${activeRuns} active embedded run(s) before restart (timeout ${DRAIN_TIMEOUT_MS}ms)`,
);
const { drained } = await waitForActiveTasks(DRAIN_TIMEOUT_MS);
if (drained) {
gatewayLog.info("all active tasks drained");
const [tasksDrain, runsDrain] = await Promise.all([
activeTasks > 0
? waitForActiveTasks(DRAIN_TIMEOUT_MS)
: Promise.resolve({ drained: true }),
activeRuns > 0
? waitForActiveEmbeddedRuns(DRAIN_TIMEOUT_MS)
: Promise.resolve({ drained: true }),
]);
if (tasksDrain.drained && runsDrain.drained) {
gatewayLog.info("all active work drained");
} else {
gatewayLog.warn("drain timeout reached; proceeding with restart");
// Final best-effort abort to avoid carrying active runs into the
// next lifecycle when drain time budget is exhausted.
abortEmbeddedPiRun(undefined, { mode: "all" });
}
}
}