fix: stop orphaned QA gateway children

This commit is contained in:
Peter Steinberger
2026-05-02 11:14:50 +01:00
parent 7686136419
commit aebac43d97
6 changed files with 151 additions and 0 deletions

View File

@@ -37,6 +37,7 @@ Docs: https://docs.openclaw.ai
- TUI/chat: skip full provider model normalization during context-window warmup while preserving provider-owned context metadata, avoiding cold-start stalls with large model registries. Thanks @547895019.
- Memory Wiki: accept relative Markdown links that include the `.md` suffix during broken-wikilink validation, avoiding false positives for native render-mode links. Thanks @Kenneth8128.
- OpenAI Codex: show the device-pairing code in the interactive SSH/headless prompt while keeping the short-lived code out of persistent runtime logs. Fixes #74212. Thanks @da22le123.
- QA Lab: stop gateway children when the suite parent disappears, so interrupted local QA runs cannot leave hot orphaned gateways behind.
- Plugins/CLI: cache plugin CLI registration entries per command program so completion state generation does not repeat the full plugin sweep in one invocation. Thanks @ScientificProgrammer.
- Plugins: reuse gateway-bindable plugin loader cache entries for later default-mode loads without serving default-built registries to gateway-bound requests, reducing repeated plugin registration during dispatch. Refs #61756. Thanks @DmitryPogodaev.
- Gateway/secrets: include the caught error message in `secrets.reload` and `secrets.resolve` warning logs while keeping RPC errors generic, so operators can diagnose reload and permission failures. Thanks @davidangularme.

View File

@@ -85,6 +85,7 @@ describe("buildQaRuntimeEnv", () => {
});
expect(env.OPENCLAW_TEST_FAST).toBe("1");
expect(env.OPENCLAW_QA_PARENT_PID).toBe(String(process.pid));
expect(env.OPENCLAW_QA_ALLOW_LOCAL_IMAGE_PROVIDER).toBe("1");
expect(env.OPENCLAW_ALLOW_SLOW_REPLY_TESTS).toBe("1");
expect(env.OPENCLAW_SKIP_STARTUP_MODEL_PREWARM).toBe("1");

View File

@@ -216,6 +216,7 @@ export function buildQaRuntimeEnv(params: {
OPENCLAW_SKIP_STARTUP_MODEL_PREWARM: "1",
OPENCLAW_NO_RESPAWN: "1",
OPENCLAW_TEST_FAST: "1",
OPENCLAW_QA_PARENT_PID: String(process.pid),
OPENCLAW_QA_ALLOW_LOCAL_IMAGE_PROVIDER: "1",
// QA uses the fast runtime envelope for speed, but it still exercises
// normal config-driven heartbeats and runtime config writes.

View File

@@ -0,0 +1,50 @@
import { describe, expect, it, vi } from "vitest";
import { installQaParentWatchdog, QA_PARENT_PID_ENV } from "./qa-parent-watchdog.js";
// Unit tests for the QA parent watchdog. Every process/timer primitive is injected
// through the deps object, so no real timers, signals, or exits are involved.
describe("installQaParentWatchdog", () => {
  it("does not install without a QA parent pid", () => {
    // Missing variable, a pid equal to our own pid, and a non-numeric value.
    const rejectedEnvs: NodeJS.ProcessEnv[] = [
      {},
      { [QA_PARENT_PID_ENV]: "10" },
      { [QA_PARENT_PID_ENV]: "not-a-pid" },
    ];
    for (const env of rejectedEnvs) {
      expect(installQaParentWatchdog({ env, ownPid: 10 })).toBeNull();
    }
  });

  it("exits when the QA parent process disappears", () => {
    const fakeTimer = { unref: vi.fn() };
    const clearIntervalSpy = vi.fn();
    const exitSpy = vi.fn();
    const warnSpy = vi.fn();
    // Simulate a vanished parent: probing the pid fails with ESRCH.
    const killSpy = vi.fn((): boolean => {
      throw Object.assign(new Error("missing"), { code: "ESRCH" });
    });
    // Replaced by the callback the watchdog registers; fails loudly if none is registered.
    let runTick = (): void => {
      throw new Error("watchdog interval was not installed");
    };

    const handle = installQaParentWatchdog({
      env: { [QA_PARENT_PID_ENV]: "12345" },
      ownPid: 10,
      kill: killSpy,
      exit: exitSpy,
      logger: { warn: warnSpy },
      clearInterval: clearIntervalSpy,
      setInterval: (callback) => {
        runTick = callback;
        return fakeTimer;
      },
    });

    expect(handle?.parentPid).toBe(12345);
    expect(fakeTimer.unref).toHaveBeenCalledTimes(1);

    runTick();

    expect(killSpy).toHaveBeenCalledWith(12345, 0);
    expect(warnSpy).toHaveBeenCalledWith(
      "QA gateway parent pid 12345 exited; shutting down orphaned QA gateway",
    );
    expect(clearIntervalSpy).toHaveBeenCalledWith(fakeTimer);
    expect(exitSpy).toHaveBeenCalledWith(0);
  });
});

View File

@@ -0,0 +1,96 @@
import { createSubsystemLogger } from "../../logging/subsystem.js";
/** Name of the environment variable from which the watchdog reads the QA parent's pid. */
export const QA_PARENT_PID_ENV = "OPENCLAW_QA_PARENT_PID";
/** Polling interval in milliseconds used when no `intervalMs` override is supplied. */
const DEFAULT_QA_PARENT_WATCHDOG_INTERVAL_MS = 1000;
/**
 * Timer handle returned by the (injected or default) `setInterval`: either a numeric id
 * or an object that may expose `unref()`.
 */
type QaParentWatchdogTimer =
| number
| {
unref?: () => unknown;
};
/**
 * Optional overrides for everything the watchdog touches outside its own state.
 * Each omitted field falls back to a default chosen inside `installQaParentWatchdog`.
 */
type QaParentWatchdogDeps = {
// Cancels the polling timer; defaults to the global `clearInterval`.
clearInterval?: (timer: QaParentWatchdogTimer) => void;
// Environment to read the parent pid from; defaults to `process.env`.
env?: NodeJS.ProcessEnv;
// Invoked with 0 once the parent is found missing; defaults to `process.exit`.
exit?: (code?: number) => never | void;
// Polling period in milliseconds; defaults to DEFAULT_QA_PARENT_WATCHDOG_INTERVAL_MS.
intervalMs?: number;
// Liveness probe, called as `kill(parentPid, 0)`; defaults to `process.kill`.
kill?: (pid: number, signal?: NodeJS.Signals | 0) => boolean;
// Receives the shutdown warning; defaults to `createSubsystemLogger("gateway")`.
logger?: Pick<ReturnType<typeof createSubsystemLogger>, "warn">;
// Pid of the current process; a parent pid equal to it is rejected. Defaults to `process.pid`.
ownPid?: number;
// Schedules the polling callback; defaults to the global `setInterval`.
setInterval?: (callback: () => void, ms: number) => QaParentWatchdogTimer;
};
/** Handle returned by `installQaParentWatchdog` when a watchdog was actually installed. */
export type QaParentWatchdogHandle = {
// The parent pid being polled.
parentPid: number;
// Cancels the polling timer; calling it more than once has no further effect.
stop: () => void;
};
/**
 * Extracts the QA parent pid from `env[QA_PARENT_PID_ENV]`.
 *
 * @param env - Environment to read the variable from.
 * @param ownPid - Pid of the current process; a parent pid equal to it is rejected.
 * @returns The parent pid, or `null` when the variable is unset, blank, not a plain
 *   decimal integer, not a positive safe integer, or equal to `ownPid`.
 */
function resolveQaParentPid(env: NodeJS.ProcessEnv, ownPid: number): number | null {
  const raw = env[QA_PARENT_PID_ENV]?.trim();
  // Accept decimal digits only. `Number()` alone would also coerce spellings such as
  // "0x10", "1e3" or "12.0" into an integer, i.e. into some unrelated pid.
  if (!raw || !/^\d+$/.test(raw)) {
    return null;
  }
  const parentPid = Number(raw);
  // Very long digit strings exceed the safe-integer range and are rejected here.
  if (!Number.isSafeInteger(parentPid) || parentPid <= 0 || parentPid === ownPid) {
    return null;
  }
  return parentPid;
}
/**
 * Installs a polling watchdog that ends this process once the QA parent process is gone.
 *
 * The parent pid comes from the `QA_PARENT_PID_ENV` environment variable. When no usable
 * pid is found, nothing is installed and `null` is returned. Otherwise, on every interval
 * tick the parent is probed with signal 0; an `ESRCH` failure is taken to mean the parent
 * no longer exists, upon which a warning is logged, the timer is cancelled and `exit(0)`
 * is called. Any other probe failure is ignored and polling continues.
 *
 * @param deps - Optional overrides for the environment, timers, probe, logger and exit.
 * @returns A handle exposing the watched pid and a `stop()` function, or `null`.
 */
export function installQaParentWatchdog(
  deps: QaParentWatchdogDeps = {},
): QaParentWatchdogHandle | null {
  const parentPid = resolveQaParentPid(deps.env ?? process.env, deps.ownPid ?? process.pid);
  if (parentPid === null) {
    return null;
  }

  // Resolve every collaborator up front, falling back to the real process/timer APIs.
  const scheduleInterval =
    deps.setInterval ??
    ((callback: () => void, ms: number) => setInterval(callback, ms) as QaParentWatchdogTimer);
  const cancelInterval =
    deps.clearInterval ??
    ((activeTimer: QaParentWatchdogTimer) => {
      clearInterval(activeTimer as ReturnType<typeof setInterval>);
    });
  const probe =
    deps.kill ?? ((pid: number, signal?: NodeJS.Signals | 0) => process.kill(pid, signal));
  const terminate = deps.exit ?? ((code?: number) => process.exit(code));
  const log = deps.logger ?? createSubsystemLogger("gateway");
  const pollMs = deps.intervalMs ?? DEFAULT_QA_PARENT_WATCHDOG_INTERVAL_MS;

  let halted = false;
  let timer: QaParentWatchdogTimer;

  // Idempotent: only the first call cancels the timer.
  const stop = (): void => {
    if (!halted) {
      halted = true;
      cancelInterval(timer);
    }
  };

  // True only when the probe fails with ESRCH; a successful probe or any other
  // failure code leaves the watchdog running.
  const parentIsGone = (): boolean => {
    try {
      probe(parentPid, 0);
      return false;
    } catch (error) {
      return (error as NodeJS.ErrnoException).code === "ESRCH";
    }
  };

  timer = scheduleInterval(() => {
    if (halted || !parentIsGone()) {
      return;
    }
    log.warn(`QA gateway parent pid ${parentPid} exited; shutting down orphaned QA gateway`);
    stop();
    terminate(0);
  }, pollMs);

  // Object-style timer handles may offer unref(); call it when present.
  if (typeof timer === "object") {
    timer.unref?.();
  }

  return { parentPid, stop };
}

View File

@@ -35,6 +35,7 @@ import { formatCliCommand } from "../command-format.js";
import { inheritOptionFromParent } from "../command-options.js";
import { withProgress } from "../progress.js";
import { parsePort } from "../shared/parse-port.js";
import { installQaParentWatchdog } from "./qa-parent-watchdog.js";
import { runGatewayLoop } from "./run-loop.js";
type GatewayRunOpts = {
@@ -504,6 +505,7 @@ async function maybeWriteGatewayStartupFailureBundle(err: unknown): Promise<void
}
async function runGatewayCommand(opts: GatewayRunOpts) {
installQaParentWatchdog();
const isDevProfile = normalizeOptionalLowercaseString(process.env.OPENCLAW_PROFILE) === "dev";
const devMode = Boolean(opts.dev) || isDevProfile;
if (opts.reset && !devMode) {