mirror of
https://github.com/openclaw/openclaw.git
synced 2026-05-28 06:59:56 +00:00
fix(perf): harden gateway restart bench exits
This commit is contained in:
@@ -18,6 +18,8 @@ Docs: https://docs.openclaw.ai
|
||||
- Install/update: bypass npm `min-release-age` policies with `--min-release-age=0` instead of `--before` so hosted installers keep working on npm versions that reject the combined config. (#84749) Thanks @TeodoroRodrigo.
|
||||
- WebChat: keep message-tool replies visible in the chat while still summarizing internal tool results for the model. Fixes #86347. Thanks @shakkernerd.
|
||||
- Gateway/perf: fail startup benchmark samples when the Gateway process exits before benchmark teardown, including signal deaths after readiness probes.
|
||||
- Gateway/perf: fail restart benchmark samples when the Gateway exits before benchmark teardown, including clean exits and signal deaths after successful restart probes.
|
||||
- Agents/tests: keep model catalog visibility on static selection helpers so catalog visibility checks avoid the broad model-selection barrel import.
|
||||
- Agents/commitments: serialize commitment store load-modify-save writes so concurrent heartbeat and CLI updates no longer lose dismissal, sent, or attempt state. (#81153) Thanks @ai-hpc.
|
||||
- xAI/LM Studio: promote plain-text tool-call fallbacks into structured tool calls and strip leaked internal tool syntax before user-facing delivery. (#86222) Thanks @fuller-stack-dev.
|
||||
- CLI: suppress benign self-update version-skew warnings during package post-update finalization.
|
||||
|
||||
@@ -68,6 +68,15 @@ type GatewayRestartFailureCode =
|
||||
| "child_nonzero_exit"
|
||||
| "cleanup_failed";
|
||||
|
||||
type ChildExit = {
|
||||
exitCode: number | null;
|
||||
signal: string | null;
|
||||
};
|
||||
|
||||
type StopChildResult = ChildExit & {
|
||||
exitedBeforeTeardown: boolean;
|
||||
};
|
||||
|
||||
type RestartIteration = {
|
||||
cpuCoreRatio: number | null;
|
||||
cpuMs: number | null;
|
||||
@@ -98,6 +107,7 @@ type GatewayRestartSample = {
|
||||
childExitCode: number | null;
|
||||
childSignal: string | null;
|
||||
events: BenchmarkEvent[];
|
||||
exitedBeforeTeardown: boolean;
|
||||
failureCode: GatewayRestartFailureCode | null;
|
||||
firstOutputMs: number | null;
|
||||
initialGatewayReadyLogLine: string | null;
|
||||
@@ -869,36 +879,52 @@ function writeRestartIntent(env: NodeJS.ProcessEnv, targetPid: number, reason: s
|
||||
}
|
||||
}
|
||||
|
||||
async function stopChild(child: ChildProcessWithoutNullStreams): Promise<{
|
||||
exitCode: number | null;
|
||||
signal: string | null;
|
||||
}> {
|
||||
if (child.exitCode != null || child.signalCode != null) {
|
||||
return { exitCode: child.exitCode, signal: child.signalCode };
|
||||
async function stopChild(child: ChildProcessWithoutNullStreams): Promise<StopChildResult> {
|
||||
const currentExit = (): ChildExit | null =>
|
||||
child.exitCode != null || child.signalCode != null
|
||||
? { exitCode: child.exitCode, signal: child.signalCode }
|
||||
: null;
|
||||
|
||||
const existingExit = currentExit();
|
||||
if (existingExit != null) {
|
||||
return { ...existingExit, exitedBeforeTeardown: true };
|
||||
}
|
||||
const exited = new Promise<{ exitCode: number | null; signal: string | null }>((resolve) => {
|
||||
child.once("exit", (exitCode, signal) => resolve({ exitCode, signal }));
|
||||
|
||||
let observedExit: ChildExit | null = null;
|
||||
const exited = new Promise<ChildExit>((resolve) => {
|
||||
child.once("exit", (exitCode, signal) => {
|
||||
observedExit = { exitCode, signal };
|
||||
resolve(observedExit);
|
||||
});
|
||||
});
|
||||
killProcessTree(child, "SIGTERM");
|
||||
|
||||
await new Promise<void>((resolve) => setImmediate(resolve));
|
||||
const queuedExit = observedExit ?? currentExit();
|
||||
if (queuedExit != null) {
|
||||
return { ...queuedExit, exitedBeforeTeardown: true };
|
||||
}
|
||||
|
||||
const sentTeardownSignal = killProcessTree(child, "SIGTERM");
|
||||
const timeout = delay(2000).then(() => {
|
||||
if (child.exitCode == null && child.signalCode == null) {
|
||||
killProcessTree(child, "SIGKILL");
|
||||
}
|
||||
return exited;
|
||||
});
|
||||
return Promise.race([exited, timeout]);
|
||||
const exit = await Promise.race([exited, timeout]);
|
||||
return { ...exit, exitedBeforeTeardown: !sentTeardownSignal };
|
||||
}
|
||||
|
||||
function killProcessTree(child: ChildProcessWithoutNullStreams, signal: NodeJS.Signals): void {
|
||||
function killProcessTree(child: ChildProcessWithoutNullStreams, signal: NodeJS.Signals): boolean {
|
||||
if (process.platform !== "win32" && child.pid !== undefined) {
|
||||
try {
|
||||
process.kill(-child.pid, signal);
|
||||
return;
|
||||
return true;
|
||||
} catch {
|
||||
// Fall back to the direct child below.
|
||||
}
|
||||
}
|
||||
child.kill(signal);
|
||||
return child.kill(signal);
|
||||
}
|
||||
|
||||
function readProcessRssMb(pid: number | undefined): number | null {
|
||||
@@ -1197,6 +1223,15 @@ function resolveRestartDeadlineFailure(childExited: boolean): GatewayRestartFail
|
||||
return childExited ? "restart_child_exited" : "restart_deadline_timeout";
|
||||
}
|
||||
|
||||
function resolveSampleExitFailure(exit: StopChildResult): GatewayRestartFailureCode | null {
|
||||
if (!exit.exitedBeforeTeardown) {
|
||||
return null;
|
||||
}
|
||||
return exit.exitCode !== null && exit.exitCode !== 0
|
||||
? "child_nonzero_exit"
|
||||
: "restart_child_exited";
|
||||
}
|
||||
|
||||
function computeResourceSlope(iterations: RestartIteration[]): ResourceSlope {
|
||||
return {
|
||||
activeHandlesCountPerRestart: slope(
|
||||
@@ -1528,9 +1563,7 @@ async function runGatewaySample(options: {
|
||||
flushOutputLineBuffers(outputBuffers, onLine, performance.now() - sampleStartAt, {
|
||||
flushPartial: true,
|
||||
});
|
||||
if (exit.exitCode !== null && exit.exitCode !== 0 && failureCode === null) {
|
||||
failureCode = "child_nonzero_exit";
|
||||
}
|
||||
failureCode ??= resolveSampleExitFailure(exit);
|
||||
try {
|
||||
rmSync(root, { force: true, maxRetries: 3, recursive: true, retryDelay: 100 });
|
||||
} catch {
|
||||
@@ -1541,6 +1574,7 @@ async function runGatewaySample(options: {
|
||||
childExitCode: exit.exitCode,
|
||||
childSignal: exit.signal,
|
||||
events,
|
||||
exitedBeforeTeardown: exit.exitedBeforeTeardown,
|
||||
failureCode,
|
||||
firstOutputMs,
|
||||
initialGatewayReadyLogLine,
|
||||
@@ -1693,8 +1727,10 @@ export const testing = {
|
||||
resolveRestartDeadlineFailure,
|
||||
resolveEntry,
|
||||
resolvePhaseDeadlineAt,
|
||||
resolveSampleExitFailure,
|
||||
sanitizedEnv,
|
||||
shouldFailBenchmark,
|
||||
stopChild,
|
||||
summarizeCase,
|
||||
waitForRestartProbe,
|
||||
writeConfig,
|
||||
|
||||
@@ -1,7 +1,8 @@
|
||||
import type { OpenClawConfig } from "../config/types.openclaw.js";
|
||||
import type { ModelCatalogEntry } from "./model-catalog.js";
|
||||
import { createProviderAuthChecker } from "./model-provider-auth.js";
|
||||
import { buildConfiguredModelCatalog, modelKey } from "./model-selection.js";
|
||||
import { modelKey } from "./model-selection-normalize.js";
|
||||
import { buildConfiguredModelCatalog } from "./model-selection-shared.js";
|
||||
import { createModelVisibilityPolicy } from "./model-visibility-policy.js";
|
||||
|
||||
type ModelCatalogVisibilityView = "default" | "configured" | "all";
|
||||
|
||||
@@ -1,10 +1,11 @@
|
||||
import { spawnSync } from "node:child_process";
|
||||
import { EventEmitter } from "node:events";
|
||||
import fs from "node:fs";
|
||||
import { createServer } from "node:http";
|
||||
import os from "node:os";
|
||||
import path from "node:path";
|
||||
import { performance } from "node:perf_hooks";
|
||||
import { describe, expect, it } from "vitest";
|
||||
import { describe, expect, it, vi } from "vitest";
|
||||
import { testing } from "../../scripts/bench-gateway-restart.ts";
|
||||
|
||||
describe("gateway restart benchmark script", () => {
|
||||
@@ -210,6 +211,87 @@ node 1234 user 12u IPv4 0t0 TCP localhost:1234
|
||||
expect(testing.resolveRestartDeadlineFailure(true)).toBe("restart_child_exited");
|
||||
});
|
||||
|
||||
it("classifies queued child exits before sending teardown signals", async () => {
|
||||
const child = new EventEmitter() as EventEmitter & {
|
||||
exitCode: number | null;
|
||||
kill: ReturnType<typeof vi.fn>;
|
||||
signalCode: NodeJS.Signals | null;
|
||||
};
|
||||
child.exitCode = null;
|
||||
child.signalCode = null;
|
||||
child.kill = vi.fn(() => true);
|
||||
|
||||
const stopped = testing.stopChild(child as unknown as Parameters<typeof testing.stopChild>[0]);
|
||||
queueMicrotask(() => {
|
||||
child.exitCode = 0;
|
||||
child.emit("exit", 0, null);
|
||||
});
|
||||
|
||||
await expect(stopped).resolves.toEqual({
|
||||
exitedBeforeTeardown: true,
|
||||
exitCode: 0,
|
||||
signal: null,
|
||||
});
|
||||
expect(child.kill).not.toHaveBeenCalled();
|
||||
});
|
||||
|
||||
it("classifies failed teardown signaling as a pre-teardown child exit", async () => {
|
||||
const child = new EventEmitter() as EventEmitter & {
|
||||
exitCode: number | null;
|
||||
kill: ReturnType<typeof vi.fn>;
|
||||
signalCode: NodeJS.Signals | null;
|
||||
};
|
||||
child.exitCode = null;
|
||||
child.signalCode = null;
|
||||
child.kill = vi.fn(() => {
|
||||
setImmediate(() => {
|
||||
child.exitCode = 8;
|
||||
child.emit("exit", 8, null);
|
||||
});
|
||||
return false;
|
||||
});
|
||||
|
||||
await expect(
|
||||
testing.stopChild(child as unknown as Parameters<typeof testing.stopChild>[0]),
|
||||
).resolves.toEqual({
|
||||
exitedBeforeTeardown: true,
|
||||
exitCode: 8,
|
||||
signal: null,
|
||||
});
|
||||
expect(child.kill).toHaveBeenCalledWith("SIGTERM");
|
||||
});
|
||||
|
||||
it("marks clean and signaled pre-teardown child exits as benchmark failures", () => {
|
||||
expect(
|
||||
testing.resolveSampleExitFailure({
|
||||
exitedBeforeTeardown: true,
|
||||
exitCode: 0,
|
||||
signal: null,
|
||||
}),
|
||||
).toBe("restart_child_exited");
|
||||
expect(
|
||||
testing.resolveSampleExitFailure({
|
||||
exitedBeforeTeardown: true,
|
||||
exitCode: null,
|
||||
signal: "SIGSEGV",
|
||||
}),
|
||||
).toBe("restart_child_exited");
|
||||
expect(
|
||||
testing.resolveSampleExitFailure({
|
||||
exitedBeforeTeardown: true,
|
||||
exitCode: 9,
|
||||
signal: null,
|
||||
}),
|
||||
).toBe("child_nonzero_exit");
|
||||
expect(
|
||||
testing.resolveSampleExitFailure({
|
||||
exitedBeforeTeardown: false,
|
||||
exitCode: null,
|
||||
signal: "SIGTERM",
|
||||
}),
|
||||
).toBeNull();
|
||||
});
|
||||
|
||||
it("budgets timeout per restart instead of against the whole sample", () => {
|
||||
const sampleStartAt = 1_000;
|
||||
const timeoutMs = 30_000;
|
||||
@@ -252,6 +334,7 @@ node 1234 user 12u IPv4 0t0 TCP localhost:1234
|
||||
childExitCode: null,
|
||||
childSignal: "SIGTERM",
|
||||
events: [],
|
||||
exitedBeforeTeardown: false,
|
||||
failureCode: null,
|
||||
firstOutputMs: 1,
|
||||
initialGatewayReadyLogLine: "[gateway] ready",
|
||||
@@ -377,6 +460,7 @@ node 1234 user 12u IPv4 0t0 TCP localhost:1234
|
||||
childExitCode: null,
|
||||
childSignal: null,
|
||||
events: [],
|
||||
exitedBeforeTeardown: true,
|
||||
failureCode: "initial_readyz_timeout",
|
||||
firstOutputMs: 1,
|
||||
initialGatewayReadyLogLine: "[gateway] ready",
|
||||
@@ -429,6 +513,7 @@ node 1234 user 12u IPv4 0t0 TCP localhost:1234
|
||||
childExitCode: 0,
|
||||
childSignal: null,
|
||||
events: [],
|
||||
exitedBeforeTeardown: false,
|
||||
failureCode: null,
|
||||
firstOutputMs: 1,
|
||||
initialGatewayReadyLogLine: "[gateway] ready",
|
||||
|
||||
Reference in New Issue
Block a user