diff --git a/CHANGELOG.md b/CHANGELOG.md index b34a7e31c4c..4da6c15f7e3 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -37,6 +37,7 @@ Docs: https://docs.openclaw.ai - TUI/chat: skip full provider model normalization during context-window warmup while preserving provider-owned context metadata, avoiding cold-start stalls with large model registries. Thanks @547895019. - Memory Wiki: accept relative Markdown links that include the `.md` suffix during broken-wikilink validation, avoiding false positives for native render-mode links. Thanks @Kenneth8128. - OpenAI Codex: show the device-pairing code in the interactive SSH/headless prompt while keeping the short-lived code out of persistent runtime logs. Fixes #74212. Thanks @da22le123. +- QA Lab: stop gateway children when the suite parent disappears, so interrupted local QA runs cannot leave hot orphaned gateways behind. - Plugins/CLI: cache plugin CLI registration entries per command program so completion state generation does not repeat the full plugin sweep in one invocation. Thanks @ScientificProgrammer. - Plugins: reuse gateway-bindable plugin loader cache entries for later default-mode loads without serving default-built registries to gateway-bound requests, reducing repeated plugin registration during dispatch. Refs #61756. Thanks @DmitryPogodaev. - Gateway/secrets: include the caught error message in `secrets.reload` and `secrets.resolve` warning logs while keeping RPC errors generic, so operators can diagnose reload and permission failures. Thanks @davidangularme. 
diff --git a/extensions/qa-lab/src/gateway-child.test.ts b/extensions/qa-lab/src/gateway-child.test.ts index 7d6aa13c501..e250ec5970f 100644 --- a/extensions/qa-lab/src/gateway-child.test.ts +++ b/extensions/qa-lab/src/gateway-child.test.ts @@ -85,6 +85,7 @@ describe("buildQaRuntimeEnv", () => { }); expect(env.OPENCLAW_TEST_FAST).toBe("1"); + expect(env.OPENCLAW_QA_PARENT_PID).toBe(String(process.pid)); expect(env.OPENCLAW_QA_ALLOW_LOCAL_IMAGE_PROVIDER).toBe("1"); expect(env.OPENCLAW_ALLOW_SLOW_REPLY_TESTS).toBe("1"); expect(env.OPENCLAW_SKIP_STARTUP_MODEL_PREWARM).toBe("1"); diff --git a/extensions/qa-lab/src/gateway-child.ts b/extensions/qa-lab/src/gateway-child.ts index 09c68a501c3..62fdae18243 100644 --- a/extensions/qa-lab/src/gateway-child.ts +++ b/extensions/qa-lab/src/gateway-child.ts @@ -216,6 +216,7 @@ export function buildQaRuntimeEnv(params: { OPENCLAW_SKIP_STARTUP_MODEL_PREWARM: "1", OPENCLAW_NO_RESPAWN: "1", OPENCLAW_TEST_FAST: "1", + OPENCLAW_QA_PARENT_PID: String(process.pid), OPENCLAW_QA_ALLOW_LOCAL_IMAGE_PROVIDER: "1", // QA uses the fast runtime envelope for speed, but it still exercises // normal config-driven heartbeats and runtime config writes. 
diff --git a/src/cli/gateway-cli/qa-parent-watchdog.test.ts b/src/cli/gateway-cli/qa-parent-watchdog.test.ts new file mode 100644 index 00000000000..599ec995578 --- /dev/null +++ b/src/cli/gateway-cli/qa-parent-watchdog.test.ts @@ -0,0 +1,50 @@ +import { describe, expect, it, vi } from "vitest"; +import { installQaParentWatchdog, QA_PARENT_PID_ENV } from "./qa-parent-watchdog.js"; + +/** Unit tests for installQaParentWatchdog. Every side effect (env, timers, kill, logger, exit) is injected as a test double, so no real process is signalled and no real interval is scheduled. */ describe("installQaParentWatchdog", () => { + /* Covers the three inputs for which the installer is expected to return null. */ it("does not install without a QA parent pid", () => { + /* Case 1: the parent pid variable is not set at all. */ expect(installQaParentWatchdog({ env: {}, ownPid: 10 })).toBeNull(); + /* Case 2: the configured parent pid equals ownPid. */ expect(installQaParentWatchdog({ env: { [QA_PARENT_PID_ENV]: "10" }, ownPid: 10 })).toBeNull(); + /* Case 3: the configured value is not numeric. */ expect( + installQaParentWatchdog({ env: { [QA_PARENT_PID_ENV]: "not-a-pid" }, ownPid: 10 }), + ).toBeNull(); + }); + + /* Drives a single watchdog tick by hand and checks the warn, clearInterval and exit calls that follow a failed probe. */ it("exits when the QA parent process disappears", () => { + /* tick starts as a guard that throws; the fake setInterval below replaces it with the real watchdog callback. */ let tick: () => void = () => { + throw new Error("watchdog interval was not installed"); + }; + const timer = { unref: vi.fn() }; + const clearIntervalMock = vi.fn(); + const exit = vi.fn(); + const logger = { warn: vi.fn() }; + /* The kill double always throws an error whose code is ESRCH, which is the condition the watchdog treats as parent gone. */ const kill = vi.fn(() => { + const error = new Error("missing") as NodeJS.ErrnoException; + error.code = "ESRCH"; + throw error; + }); + + const handle = installQaParentWatchdog({ + clearInterval: clearIntervalMock, + env: { [QA_PARENT_PID_ENV]: "12345" }, + exit, + kill, + logger, + ownPid: 10, + /* Fake setInterval: captures the callback instead of scheduling it and returns the timer double. */ setInterval: (callback) => { + tick = callback; + return timer; + }, + }); + + expect(handle?.parentPid).toBe(12345); + /* The installer is expected to unref the returned timer object exactly once. */ expect(timer.unref).toHaveBeenCalledTimes(1); + /* Simulate one interval firing. */ tick(); + expect(kill).toHaveBeenCalledWith(12345, 0); + expect(logger.warn).toHaveBeenCalledWith( + "QA gateway parent pid 12345 exited; shutting down orphaned QA gateway", + ); + expect(clearIntervalMock).toHaveBeenCalledWith(timer); + expect(exit).toHaveBeenCalledWith(0); + }); +}); diff --git a/src/cli/gateway-cli/qa-parent-watchdog.ts b/src/cli/gateway-cli/qa-parent-watchdog.ts new file mode 100644 index 00000000000..c5f93ab1e6d --- /dev/null +++ 
b/src/cli/gateway-cli/qa-parent-watchdog.ts @@ -0,0 +1,96 @@ +import { createSubsystemLogger } from "../../logging/subsystem.js"; + +/** Name of the environment variable that resolveQaParentPid reads the QA parent pid from. */ export const QA_PARENT_PID_ENV = "OPENCLAW_QA_PARENT_PID"; + +/** Polling interval in milliseconds used when deps.intervalMs is not supplied. */ const DEFAULT_QA_PARENT_WATCHDOG_INTERVAL_MS = 1000; + +/** Timer handle accepted by the watchdog: a plain number, or an object that may expose unref(). */ type QaParentWatchdogTimer = + | number + | { + unref?: () => unknown; + }; + +/** Optional overrides for the side-effecting dependencies of the watchdog. installQaParentWatchdog substitutes a process or global default for each member that is omitted. */ type QaParentWatchdogDeps = { + clearInterval?: (timer: QaParentWatchdogTimer) => void; + env?: NodeJS.ProcessEnv; + exit?: (code?: number) => never | void; + intervalMs?: number; + kill?: (pid: number, signal?: NodeJS.Signals | 0) => boolean; + /* NOTE(review): the type of the next member does not parse as written. Its generic argument looks stripped in this copy (presumably the logger type returned by createSubsystemLogger, narrowed to warn) - confirm against the real file. */ logger?: Pick, "warn">; + ownPid?: number; + setInterval?: (callback: () => void, ms: number) => QaParentWatchdogTimer; +}; + +/** Handle returned by a successful install: the pid being watched plus a stop() that cancels the polling interval. */ export type QaParentWatchdogHandle = { + parentPid: number; + stop: () => void; +}; + +/** Resolves the pid this process should watch. Reads env[QA_PARENT_PID_ENV], trims it and converts it with Number(). Returns null when the variable is missing or empty, when the result is not a safe integer, when it is <= 0, or when it equals ownPid. Otherwise returns the parsed pid. NOTE(review): Number() also accepts non-decimal spellings such as hex or exponent notation - confirm that is acceptable. */ function resolveQaParentPid(env: NodeJS.ProcessEnv, ownPid: number): number | null { + const raw = env[QA_PARENT_PID_ENV]?.trim(); + if (!raw) { + return null; + } + const parentPid = Number(raw); + if (!Number.isSafeInteger(parentPid) || parentPid <= 0 || parentPid === ownPid) { + return null; + } + return parentPid; +} + +/** Starts a polling watchdog that calls exit(0) once the QA parent pid can no longer be found. Every tick probes the pid with kill(pid, 0). An error whose code is ESRCH logs a warning, stops the interval and exits. Any other outcome leaves the watchdog running. @param deps Optional dependency overrides (env, own pid, timers, kill, exit, logger, interval). @returns null when no usable parent pid is configured, otherwise a handle exposing parentPid and stop(). */ export function installQaParentWatchdog( + deps: QaParentWatchdogDeps = {}, +): QaParentWatchdogHandle | null { + const env = deps.env ?? process.env; + const ownPid = deps.ownPid ?? process.pid; + const parentPid = resolveQaParentPid(env, ownPid); + if (parentPid === null) { + return null; + } + + const clearIntervalFn = + deps.clearInterval ?? + ((activeTimer: QaParentWatchdogTimer) => { + /* NOTE(review): ReturnType without a type argument looks truncated in this copy as well - TODO confirm against the real file. */ clearInterval(activeTimer as ReturnType); + }); + const exit = deps.exit ?? ((code?: number) => process.exit(code)); + const kill = + deps.kill ?? ((pid: number, signal?: NodeJS.Signals | 0) => process.kill(pid, signal)); + const logger = deps.logger ?? createSubsystemLogger("gateway"); + const setIntervalFn = + deps.setInterval ?? 
+ ((callback: () => void, ms: number) => setInterval(callback, ms) as QaParentWatchdogTimer); + let stopped = false; + let timer: QaParentWatchdogTimer; + + /* stop() is idempotent: the first call clears the interval, later calls return immediately. */ const stop = () => { + if (stopped) { + return; + } + stopped = true; + clearIntervalFn(timer); + }; + + timer = setIntervalFn(() => { + if (stopped) { + return; + } + try { + /* Signal 0 delivers nothing. Per the Node.js process.kill documentation it only tests whether the pid exists. */ kill(parentPid, 0); + } catch (error) { + /* Only ESRCH is treated as parent gone. Errors with any other code are ignored and the next tick probes again. */ if ((error as NodeJS.ErrnoException).code === "ESRCH") { + logger.warn(`QA gateway parent pid ${parentPid} exited; shutting down orphaned QA gateway`); + stop(); + exit(0); + } + } + }, deps.intervalMs ?? DEFAULT_QA_PARENT_WATCHDOG_INTERVAL_MS); + /* When the handle is an object exposing unref(), call it. In Node.js this keeps the interval from holding the event loop open - presumably so the watchdog never delays a normal shutdown. */ if (typeof timer === "object") { + timer.unref?.(); + } + + return { + parentPid, + stop, + }; +} diff --git a/src/cli/gateway-cli/run.ts b/src/cli/gateway-cli/run.ts index df9a7f5a3dc..fa4e8303c6e 100644 --- a/src/cli/gateway-cli/run.ts +++ b/src/cli/gateway-cli/run.ts @@ -35,6 +35,7 @@ import { formatCliCommand } from "../command-format.js"; import { inheritOptionFromParent } from "../command-options.js"; import { withProgress } from "../progress.js"; import { parsePort } from "../shared/parse-port.js"; +import { installQaParentWatchdog } from "./qa-parent-watchdog.js"; import { runGatewayLoop } from "./run-loop.js"; type GatewayRunOpts = { @@ -504,6 +505,7 @@ async function maybeWriteGatewayStartupFailureBundle(err: unknown): Promise