fix(perf): harden gateway restart bench exits

This commit is contained in:
Vincent Koc
2026-05-25 10:31:23 +02:00
parent c791e4242b
commit 82bbcf60b0
4 changed files with 142 additions and 18 deletions

View File

@@ -18,6 +18,8 @@ Docs: https://docs.openclaw.ai
- Install/update: bypass npm `min-release-age` policies with `--min-release-age=0` instead of `--before` so hosted installers keep working on npm versions that reject the combined config. (#84749) Thanks @TeodoroRodrigo.
- WebChat: keep message-tool replies visible in the chat while still summarizing internal tool results for the model. Fixes #86347. Thanks @shakkernerd.
- Gateway/perf: fail startup benchmark samples when the Gateway process exits before benchmark teardown, including signal deaths after readiness probes.
- Gateway/perf: fail restart benchmark samples when the Gateway exits before benchmark teardown, including clean exits and signal deaths after successful restart probes.
- Agents/tests: keep model catalog visibility on static selection helpers so catalog visibility checks avoid the broad model-selection barrel import.
- Agents/commitments: serialize commitment store load-modify-save writes so concurrent heartbeat and CLI updates no longer lose dismissal, sent, or attempt state. (#81153) Thanks @ai-hpc.
- xAI/LM Studio: promote plain-text tool-call fallbacks into structured tool calls and strip leaked internal tool syntax before user-facing delivery. (#86222) Thanks @fuller-stack-dev.
- CLI: suppress benign self-update version-skew warnings during package post-update finalization.

View File

@@ -68,6 +68,15 @@ type GatewayRestartFailureCode =
| "child_nonzero_exit"
| "cleanup_failed";
type ChildExit = {
exitCode: number | null;
signal: string | null;
};
type StopChildResult = ChildExit & {
exitedBeforeTeardown: boolean;
};
type RestartIteration = {
cpuCoreRatio: number | null;
cpuMs: number | null;
@@ -98,6 +107,7 @@ type GatewayRestartSample = {
childExitCode: number | null;
childSignal: string | null;
events: BenchmarkEvent[];
exitedBeforeTeardown: boolean;
failureCode: GatewayRestartFailureCode | null;
firstOutputMs: number | null;
initialGatewayReadyLogLine: string | null;
@@ -869,36 +879,52 @@ function writeRestartIntent(env: NodeJS.ProcessEnv, targetPid: number, reason: s
}
}
async function stopChild(child: ChildProcessWithoutNullStreams): Promise<{
exitCode: number | null;
signal: string | null;
}> {
if (child.exitCode != null || child.signalCode != null) {
return { exitCode: child.exitCode, signal: child.signalCode };
async function stopChild(child: ChildProcessWithoutNullStreams): Promise<StopChildResult> {
const currentExit = (): ChildExit | null =>
child.exitCode != null || child.signalCode != null
? { exitCode: child.exitCode, signal: child.signalCode }
: null;
const existingExit = currentExit();
if (existingExit != null) {
return { ...existingExit, exitedBeforeTeardown: true };
}
const exited = new Promise<{ exitCode: number | null; signal: string | null }>((resolve) => {
child.once("exit", (exitCode, signal) => resolve({ exitCode, signal }));
let observedExit: ChildExit | null = null;
const exited = new Promise<ChildExit>((resolve) => {
child.once("exit", (exitCode, signal) => {
observedExit = { exitCode, signal };
resolve(observedExit);
});
});
killProcessTree(child, "SIGTERM");
await new Promise<void>((resolve) => setImmediate(resolve));
const queuedExit = observedExit ?? currentExit();
if (queuedExit != null) {
return { ...queuedExit, exitedBeforeTeardown: true };
}
const sentTeardownSignal = killProcessTree(child, "SIGTERM");
const timeout = delay(2000).then(() => {
if (child.exitCode == null && child.signalCode == null) {
killProcessTree(child, "SIGKILL");
}
return exited;
});
return Promise.race([exited, timeout]);
const exit = await Promise.race([exited, timeout]);
return { ...exit, exitedBeforeTeardown: !sentTeardownSignal };
}
function killProcessTree(child: ChildProcessWithoutNullStreams, signal: NodeJS.Signals): void {
function killProcessTree(child: ChildProcessWithoutNullStreams, signal: NodeJS.Signals): boolean {
if (process.platform !== "win32" && child.pid !== undefined) {
try {
process.kill(-child.pid, signal);
return;
return true;
} catch {
// Fall back to the direct child below.
}
}
child.kill(signal);
return child.kill(signal);
}
function readProcessRssMb(pid: number | undefined): number | null {
@@ -1197,6 +1223,15 @@ function resolveRestartDeadlineFailure(childExited: boolean): GatewayRestartFail
return childExited ? "restart_child_exited" : "restart_deadline_timeout";
}
function resolveSampleExitFailure(exit: StopChildResult): GatewayRestartFailureCode | null {
if (!exit.exitedBeforeTeardown) {
return null;
}
return exit.exitCode !== null && exit.exitCode !== 0
? "child_nonzero_exit"
: "restart_child_exited";
}
function computeResourceSlope(iterations: RestartIteration[]): ResourceSlope {
return {
activeHandlesCountPerRestart: slope(
@@ -1528,9 +1563,7 @@ async function runGatewaySample(options: {
flushOutputLineBuffers(outputBuffers, onLine, performance.now() - sampleStartAt, {
flushPartial: true,
});
if (exit.exitCode !== null && exit.exitCode !== 0 && failureCode === null) {
failureCode = "child_nonzero_exit";
}
failureCode ??= resolveSampleExitFailure(exit);
try {
rmSync(root, { force: true, maxRetries: 3, recursive: true, retryDelay: 100 });
} catch {
@@ -1541,6 +1574,7 @@ async function runGatewaySample(options: {
childExitCode: exit.exitCode,
childSignal: exit.signal,
events,
exitedBeforeTeardown: exit.exitedBeforeTeardown,
failureCode,
firstOutputMs,
initialGatewayReadyLogLine,
@@ -1693,8 +1727,10 @@ export const testing = {
resolveRestartDeadlineFailure,
resolveEntry,
resolvePhaseDeadlineAt,
resolveSampleExitFailure,
sanitizedEnv,
shouldFailBenchmark,
stopChild,
summarizeCase,
waitForRestartProbe,
writeConfig,

View File

@@ -1,7 +1,8 @@
import type { OpenClawConfig } from "../config/types.openclaw.js";
import type { ModelCatalogEntry } from "./model-catalog.js";
import { createProviderAuthChecker } from "./model-provider-auth.js";
import { buildConfiguredModelCatalog, modelKey } from "./model-selection.js";
import { modelKey } from "./model-selection-normalize.js";
import { buildConfiguredModelCatalog } from "./model-selection-shared.js";
import { createModelVisibilityPolicy } from "./model-visibility-policy.js";
type ModelCatalogVisibilityView = "default" | "configured" | "all";

View File

@@ -1,10 +1,11 @@
import { spawnSync } from "node:child_process";
import { EventEmitter } from "node:events";
import fs from "node:fs";
import { createServer } from "node:http";
import os from "node:os";
import path from "node:path";
import { performance } from "node:perf_hooks";
import { describe, expect, it } from "vitest";
import { describe, expect, it, vi } from "vitest";
import { testing } from "../../scripts/bench-gateway-restart.ts";
describe("gateway restart benchmark script", () => {
@@ -210,6 +211,87 @@ node 1234 user 12u IPv4 0t0 TCP localhost:1234
expect(testing.resolveRestartDeadlineFailure(true)).toBe("restart_child_exited");
});
it("classifies queued child exits before sending teardown signals", async () => {
const child = new EventEmitter() as EventEmitter & {
exitCode: number | null;
kill: ReturnType<typeof vi.fn>;
signalCode: NodeJS.Signals | null;
};
child.exitCode = null;
child.signalCode = null;
child.kill = vi.fn(() => true);
const stopped = testing.stopChild(child as unknown as Parameters<typeof testing.stopChild>[0]);
queueMicrotask(() => {
child.exitCode = 0;
child.emit("exit", 0, null);
});
await expect(stopped).resolves.toEqual({
exitedBeforeTeardown: true,
exitCode: 0,
signal: null,
});
expect(child.kill).not.toHaveBeenCalled();
});
it("classifies failed teardown signaling as a pre-teardown child exit", async () => {
const child = new EventEmitter() as EventEmitter & {
exitCode: number | null;
kill: ReturnType<typeof vi.fn>;
signalCode: NodeJS.Signals | null;
};
child.exitCode = null;
child.signalCode = null;
child.kill = vi.fn(() => {
setImmediate(() => {
child.exitCode = 8;
child.emit("exit", 8, null);
});
return false;
});
await expect(
testing.stopChild(child as unknown as Parameters<typeof testing.stopChild>[0]),
).resolves.toEqual({
exitedBeforeTeardown: true,
exitCode: 8,
signal: null,
});
expect(child.kill).toHaveBeenCalledWith("SIGTERM");
});
it("marks clean and signaled pre-teardown child exits as benchmark failures", () => {
expect(
testing.resolveSampleExitFailure({
exitedBeforeTeardown: true,
exitCode: 0,
signal: null,
}),
).toBe("restart_child_exited");
expect(
testing.resolveSampleExitFailure({
exitedBeforeTeardown: true,
exitCode: null,
signal: "SIGSEGV",
}),
).toBe("restart_child_exited");
expect(
testing.resolveSampleExitFailure({
exitedBeforeTeardown: true,
exitCode: 9,
signal: null,
}),
).toBe("child_nonzero_exit");
expect(
testing.resolveSampleExitFailure({
exitedBeforeTeardown: false,
exitCode: null,
signal: "SIGTERM",
}),
).toBeNull();
});
it("budgets timeout per restart instead of against the whole sample", () => {
const sampleStartAt = 1_000;
const timeoutMs = 30_000;
@@ -252,6 +334,7 @@ node 1234 user 12u IPv4 0t0 TCP localhost:1234
childExitCode: null,
childSignal: "SIGTERM",
events: [],
exitedBeforeTeardown: false,
failureCode: null,
firstOutputMs: 1,
initialGatewayReadyLogLine: "[gateway] ready",
@@ -377,6 +460,7 @@ node 1234 user 12u IPv4 0t0 TCP localhost:1234
childExitCode: null,
childSignal: null,
events: [],
exitedBeforeTeardown: true,
failureCode: "initial_readyz_timeout",
firstOutputMs: 1,
initialGatewayReadyLogLine: "[gateway] ready",
@@ -429,6 +513,7 @@ node 1234 user 12u IPv4 0t0 TCP localhost:1234
childExitCode: 0,
childSignal: null,
events: [],
exitedBeforeTeardown: false,
failureCode: null,
firstOutputMs: 1,
initialGatewayReadyLogLine: "[gateway] ready",