diff --git a/CHANGELOG.md b/CHANGELOG.md index f4c2424e406..096da42f182 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -18,6 +18,8 @@ Docs: https://docs.openclaw.ai - Install/update: bypass npm `min-release-age` policies with `--min-release-age=0` instead of `--before` so hosted installers keep working on npm versions that reject the combined config. (#84749) Thanks @TeodoroRodrigo. - WebChat: keep message-tool replies visible in the chat while still summarizing internal tool results for the model. Fixes #86347. Thanks @shakkernerd. - Gateway/perf: fail startup benchmark samples when the Gateway process exits before benchmark teardown, including signal deaths after readiness probes. +- Gateway/perf: fail restart benchmark samples when the Gateway exits before benchmark teardown, including clean exits and signal deaths after successful restart probes. +- Agents/tests: keep model catalog visibility on static selection helpers so catalog visibility checks avoid the broad model-selection barrel import. - Agents/commitments: serialize commitment store load-modify-save writes so concurrent heartbeat and CLI updates no longer lose dismissal, sent, or attempt state. (#81153) Thanks @ai-hpc. - xAI/LM Studio: promote plain-text tool-call fallbacks into structured tool calls and strip leaked internal tool syntax before user-facing delivery. (#86222) Thanks @fuller-stack-dev. - CLI: suppress benign self-update version-skew warnings during package post-update finalization. diff --git a/scripts/bench-gateway-restart.ts b/scripts/bench-gateway-restart.ts index 0ca5973bc6b..5a80e1ecd13 100644 --- a/scripts/bench-gateway-restart.ts +++ b/scripts/bench-gateway-restart.ts @@ -68,6 +68,15 @@ type GatewayRestartFailureCode = | "child_nonzero_exit" | "cleanup_failed"; +type ChildExit = { + exitCode: number | null; + signal: string | null; +}; + +type StopChildResult = ChildExit & { + exitedBeforeTeardown: boolean; +}; + type RestartIteration = { cpuCoreRatio: number | null; cpuMs: number | null; @@ -98,6 +107,7 @@ type GatewayRestartSample = { childExitCode: number | null; childSignal: string | null; events: BenchmarkEvent[]; + exitedBeforeTeardown: boolean; failureCode: GatewayRestartFailureCode | null; firstOutputMs: number | null; initialGatewayReadyLogLine: string | null; @@ -869,36 +879,52 @@ function writeRestartIntent(env: NodeJS.ProcessEnv, targetPid: number, reason: s } } -async function stopChild(child: ChildProcessWithoutNullStreams): Promise<{ - exitCode: number | null; - signal: string | null; -}> { - if (child.exitCode != null || child.signalCode != null) { - return { exitCode: child.exitCode, signal: child.signalCode }; +async function stopChild(child: ChildProcessWithoutNullStreams): Promise { + const currentExit = (): ChildExit | null => + child.exitCode != null || child.signalCode != null + ? { exitCode: child.exitCode, signal: child.signalCode } + : null; + + const existingExit = currentExit(); + if (existingExit != null) { + return { ...existingExit, exitedBeforeTeardown: true }; } - const exited = new Promise<{ exitCode: number | null; signal: string | null }>((resolve) => { - child.once("exit", (exitCode, signal) => resolve({ exitCode, signal })); + + let observedExit: ChildExit | null = null; + const exited = new Promise((resolve) => { + child.once("exit", (exitCode, signal) => { + observedExit = { exitCode, signal }; + resolve(observedExit); + }); }); - killProcessTree(child, "SIGTERM"); + + await new Promise((resolve) => setImmediate(resolve)); + const queuedExit = observedExit ?? currentExit(); + if (queuedExit != null) { + return { ...queuedExit, exitedBeforeTeardown: true }; + } + + const sentTeardownSignal = killProcessTree(child, "SIGTERM"); const timeout = delay(2000).then(() => { if (child.exitCode == null && child.signalCode == null) { killProcessTree(child, "SIGKILL"); } return exited; }); - return Promise.race([exited, timeout]); + const exit = await Promise.race([exited, timeout]); + return { ...exit, exitedBeforeTeardown: !sentTeardownSignal }; } -function killProcessTree(child: ChildProcessWithoutNullStreams, signal: NodeJS.Signals): void { +function killProcessTree(child: ChildProcessWithoutNullStreams, signal: NodeJS.Signals): boolean { if (process.platform !== "win32" && child.pid !== undefined) { try { process.kill(-child.pid, signal); - return; + return true; } catch { // Fall back to the direct child below. } } - child.kill(signal); + return child.kill(signal); } function readProcessRssMb(pid: number | undefined): number | null { @@ -1197,6 +1223,15 @@ function resolveRestartDeadlineFailure(childExited: boolean): GatewayRestartFail return childExited ? "restart_child_exited" : "restart_deadline_timeout"; } +function resolveSampleExitFailure(exit: StopChildResult): GatewayRestartFailureCode | null { + if (!exit.exitedBeforeTeardown) { + return null; + } + return exit.exitCode !== null && exit.exitCode !== 0 + ? "child_nonzero_exit" + : "restart_child_exited"; +} + function computeResourceSlope(iterations: RestartIteration[]): ResourceSlope { return { activeHandlesCountPerRestart: slope( @@ -1528,9 +1563,7 @@ async function runGatewaySample(options: { flushOutputLineBuffers(outputBuffers, onLine, performance.now() - sampleStartAt, { flushPartial: true, }); - if (exit.exitCode !== null && exit.exitCode !== 0 && failureCode === null) { - failureCode = "child_nonzero_exit"; - } + failureCode ??= resolveSampleExitFailure(exit); try { rmSync(root, { force: true, maxRetries: 3, recursive: true, retryDelay: 100 }); } catch { @@ -1541,6 +1574,7 @@ async function runGatewaySample(options: { childExitCode: exit.exitCode, childSignal: exit.signal, events, + exitedBeforeTeardown: exit.exitedBeforeTeardown, failureCode, firstOutputMs, initialGatewayReadyLogLine, @@ -1693,8 +1727,10 @@ export const testing = { resolveRestartDeadlineFailure, resolveEntry, resolvePhaseDeadlineAt, + resolveSampleExitFailure, sanitizedEnv, shouldFailBenchmark, + stopChild, summarizeCase, waitForRestartProbe, writeConfig, diff --git a/src/agents/model-catalog-visibility.ts b/src/agents/model-catalog-visibility.ts index b76fbd89fa5..3061003caa1 100644 --- a/src/agents/model-catalog-visibility.ts +++ b/src/agents/model-catalog-visibility.ts @@ -1,7 +1,8 @@ import type { OpenClawConfig } from "../config/types.openclaw.js"; import type { ModelCatalogEntry } from "./model-catalog.js"; import { createProviderAuthChecker } from "./model-provider-auth.js"; -import { buildConfiguredModelCatalog, modelKey } from "./model-selection.js"; +import { modelKey } from "./model-selection-normalize.js"; +import { buildConfiguredModelCatalog } from "./model-selection-shared.js"; import { createModelVisibilityPolicy } from "./model-visibility-policy.js"; type ModelCatalogVisibilityView = "default" | "configured" | "all"; diff --git a/test/scripts/bench-gateway-restart.test.ts b/test/scripts/bench-gateway-restart.test.ts index c9893b934c8..0d3822f51b5 100644 --- a/test/scripts/bench-gateway-restart.test.ts +++ b/test/scripts/bench-gateway-restart.test.ts @@ -1,10 +1,11 @@ import { spawnSync } from "node:child_process"; +import { EventEmitter } from "node:events"; import fs from "node:fs"; import { createServer } from "node:http"; import os from "node:os"; import path from "node:path"; import { performance } from "node:perf_hooks"; -import { describe, expect, it } from "vitest"; +import { describe, expect, it, vi } from "vitest"; import { testing } from "../../scripts/bench-gateway-restart.ts"; describe("gateway restart benchmark script", () => { @@ -210,6 +211,87 @@ node 1234 user 12u IPv4 0t0 TCP localhost:1234 expect(testing.resolveRestartDeadlineFailure(true)).toBe("restart_child_exited"); }); + it("classifies queued child exits before sending teardown signals", async () => { + const child = new EventEmitter() as EventEmitter & { + exitCode: number | null; + kill: ReturnType; + signalCode: NodeJS.Signals | null; + }; + child.exitCode = null; + child.signalCode = null; + child.kill = vi.fn(() => true); + + const stopped = testing.stopChild(child as unknown as Parameters[0]); + queueMicrotask(() => { + child.exitCode = 0; + child.emit("exit", 0, null); + }); + + await expect(stopped).resolves.toEqual({ + exitedBeforeTeardown: true, + exitCode: 0, + signal: null, + }); + expect(child.kill).not.toHaveBeenCalled(); + }); + + it("classifies failed teardown signaling as a pre-teardown child exit", async () => { + const child = new EventEmitter() as EventEmitter & { + exitCode: number | null; + kill: ReturnType; + signalCode: NodeJS.Signals | null; + }; + child.exitCode = null; + child.signalCode = null; + child.kill = vi.fn(() => { + setImmediate(() => { + child.exitCode = 8; + child.emit("exit", 8, null); + }); + return false; + }); + + await expect( + testing.stopChild(child as unknown as Parameters[0]), + ).resolves.toEqual({ + exitedBeforeTeardown: true, + exitCode: 8, + signal: null, + }); + expect(child.kill).toHaveBeenCalledWith("SIGTERM"); + }); + + it("marks clean and signaled pre-teardown child exits as benchmark failures", () => { + expect( + testing.resolveSampleExitFailure({ + exitedBeforeTeardown: true, + exitCode: 0, + signal: null, + }), + ).toBe("restart_child_exited"); + expect( + testing.resolveSampleExitFailure({ + exitedBeforeTeardown: true, + exitCode: null, + signal: "SIGSEGV", + }), + ).toBe("restart_child_exited"); + expect( + testing.resolveSampleExitFailure({ + exitedBeforeTeardown: true, + exitCode: 9, + signal: null, + }), + ).toBe("child_nonzero_exit"); + expect( + testing.resolveSampleExitFailure({ + exitedBeforeTeardown: false, + exitCode: null, + signal: "SIGTERM", + }), + ).toBeNull(); + }); + it("budgets timeout per restart instead of against the whole sample", () => { const sampleStartAt = 1_000; const timeoutMs = 30_000; @@ -252,6 +334,7 @@ node 1234 user 12u IPv4 0t0 TCP localhost:1234 childExitCode: null, childSignal: "SIGTERM", events: [], + exitedBeforeTeardown: false, failureCode: null, firstOutputMs: 1, initialGatewayReadyLogLine: "[gateway] ready", @@ -377,6 +460,7 @@ node 1234 user 12u IPv4 0t0 TCP localhost:1234 childExitCode: null, childSignal: null, events: [], + exitedBeforeTeardown: true, failureCode: "initial_readyz_timeout", firstOutputMs: 1, initialGatewayReadyLogLine: "[gateway] ready", @@ -429,6 +513,7 @@ node 1234 user 12u IPv4 0t0 TCP localhost:1234 childExitCode: 0, childSignal: null, events: [], + exitedBeforeTeardown: false, failureCode: null, firstOutputMs: 1, initialGatewayReadyLogLine: "[gateway] ready",