fix(qa-lab): clean orphaned gateway runtimes

This commit is contained in:
Vincent Koc
2026-05-17 19:10:21 +08:00
parent 045d7aae50
commit 7d6e45ef7c
9 changed files with 329 additions and 10 deletions

View File

@@ -30,6 +30,7 @@ Docs: https://docs.openclaw.ai
- Codex app-server: rotate oversized native Codex threads before resume and cap dynamic tool-result text entering native Codex sessions, preventing stale oversized context from surviving OpenClaw compaction. (#82981) Thanks @hansolo949.
- Gateway/restart: drain pending replies and active chat runs during restart shutdown before sockets and channels close, aborting timed-out chat runs through the normal cleanup path. (#69121) Thanks @alexlomt.
- Agents/Codex: use the Codex runtime context window for OpenAI-model preflight compaction and memory flush checks, so GPT-5.5 Codex sessions compact before hitting the smaller native context limit. Fixes #82982. Thanks @vliuyt.
- QA-Lab: clean orphaned gateway temp roots when a suite parent exits and wait on gateway plus transport readiness after config restarts, reducing stale `qa-channel` noise from interrupted runs. Fixes #65506. Thanks @100yenadmin.
- QA-Lab: wake qa-bus long polls that arrive with stale future cursors after a bus restart, preserving reconnect readiness for harness clients. (#67142) Thanks @hxy91819.
- QA-Lab: stage Multipass transfer scripts under OpenClaw's preferred temp root instead of raw OS temp paths, keeping the VM runner inside temp-path guardrails. (#64098) Thanks @ImLukeF.
- Agents/replies: keep surviving reply media and append a warning when other media references fail, so partial media normalization no longer drops failures silently. Thanks @Jerry-Xin.

View File

@@ -46,10 +46,12 @@ function createParams(baseEnv?: NodeJS.ProcessEnv) {
gatewayToken: "qa-token",
homeDir: "/tmp/openclaw-qa/home",
stateDir: "/tmp/openclaw-qa/state",
tempRoot: "/tmp/openclaw-qa",
xdgConfigHome: "/tmp/openclaw-qa/xdg-config",
xdgDataHome: "/tmp/openclaw-qa/xdg-data",
xdgCacheHome: "/tmp/openclaw-qa/xdg-cache",
bundledPluginsDir: "/tmp/openclaw-qa/bundled-plugins",
stagedBundledPluginsRoot: "/repo/.artifacts/qa-runtime/openclaw-qa-suite-test",
compatibilityHostVersion: "2026.4.8",
baseEnv,
};
@@ -139,6 +141,10 @@ describe("buildQaRuntimeEnv", () => {
expect(env.OPENCLAW_TEST_FAST).toBe("1");
expect(env.OPENCLAW_QA_PARENT_PID).toBe(String(process.pid));
expect(env.OPENCLAW_QA_TEMP_ROOT).toBe("/tmp/openclaw-qa");
expect(env.OPENCLAW_QA_STAGED_RUNTIME_ROOT).toBe(
"/repo/.artifacts/qa-runtime/openclaw-qa-suite-test",
);
expect(env.OPENCLAW_QA_ALLOW_LOCAL_IMAGE_PROVIDER).toBe("1");
expect(env.OPENCLAW_ALLOW_SLOW_REPLY_TESTS).toBe("1");
expect(env.OPENCLAW_SKIP_STARTUP_MODEL_PREWARM).toBe("1");

View File

@@ -183,10 +183,12 @@ export function buildQaRuntimeEnv(params: {
homeDir: string;
forwardHostHome?: boolean;
stateDir: string;
tempRoot: string;
xdgConfigHome: string;
xdgDataHome: string;
xdgCacheHome: string;
bundledPluginsDir?: string;
stagedBundledPluginsRoot?: string | null;
compatibilityHostVersion?: string;
providerMode?: QaProviderMode;
baseEnv?: NodeJS.ProcessEnv;
@@ -219,6 +221,10 @@ export function buildQaRuntimeEnv(params: {
OPENCLAW_NO_RESPAWN: "1",
OPENCLAW_TEST_FAST: "1",
OPENCLAW_QA_PARENT_PID: String(process.pid),
OPENCLAW_QA_TEMP_ROOT: params.tempRoot,
...(params.stagedBundledPluginsRoot
? { OPENCLAW_QA_STAGED_RUNTIME_ROOT: params.stagedBundledPluginsRoot }
: {}),
OPENCLAW_QA_ALLOW_LOCAL_IMAGE_PROVIDER: "1",
// QA uses the fast runtime envelope for speed, but it still exercises
// normal config-driven heartbeats and runtime config writes.
@@ -666,10 +672,12 @@ export async function startQaGatewayChild(params: {
homeDir,
forwardHostHome: params.forwardHostHome,
stateDir,
tempRoot,
xdgConfigHome,
xdgDataHome,
xdgCacheHome,
bundledPluginsDir: stagedPluginRuntime.bundledPluginsDir,
stagedBundledPluginsRoot,
compatibilityHostVersion: stagedPluginRuntime.runtimeHostVersion,
providerMode,
forwardHostHomeForClaudeCli: liveProviderIds.includes("claude-cli"),

View File

@@ -1,10 +1,30 @@
import { describe, expect, it } from "vitest";
import { afterEach, describe, expect, it, vi } from "vitest";
import {
getGatewayRetryAfterMs,
isConfigApplyNoopForSnapshot,
isConfigHashConflict,
isConfigPatchNoopForSnapshot,
waitForConfigRestartSettle,
} from "./suite-runtime-gateway.js";
import type { QaSuiteRuntimeEnv } from "./suite-runtime-types.js";
const fetchWithSsrFGuardMock = vi.hoisted(() => vi.fn());
vi.mock("openclaw/plugin-sdk/ssrf-runtime", () => ({
fetchWithSsrFGuard: fetchWithSsrFGuardMock,
}));
afterEach(() => {
fetchWithSsrFGuardMock.mockReset();
vi.useRealTimers();
});
/**
 * Builds the minimal env stub used by the restart-settle tests: a gateway
 * pinned to a fixed loopback base URL and a transport that exposes only the
 * supplied `waitReady` callback.
 *
 * @param waitReady - Callback installed as `transport.waitReady`.
 * @returns The stub, cast to the `gateway`/`transport` slice of the suite env.
 */
function createRestartSettleEnv(waitReady: (params: unknown) => Promise<void>) {
  const gateway = { baseUrl: "http://127.0.0.1:43123" };
  const transport = { waitReady };
  // The stub is intentionally partial, so it goes through `unknown` before
  // being viewed as the runtime env slice.
  const stub: unknown = { gateway, transport };
  return stub as Pick<QaSuiteRuntimeEnv, "gateway" | "transport">;
}
describe("qa suite gateway helpers", () => {
it("reads retry-after from the primary gateway error before appended logs", () => {
@@ -113,4 +133,45 @@ describe("qa suite gateway helpers", () => {
),
).toBe(false);
});
it("waits for transport readiness after gateway restart health", async () => {
const release = vi.fn(async () => {});
fetchWithSsrFGuardMock.mockResolvedValue({
response: { ok: true },
release,
});
const waitReady = vi.fn(async () => {});
await waitForConfigRestartSettle(createRestartSettleEnv(waitReady), 0, 1_000);
expect(fetchWithSsrFGuardMock).toHaveBeenCalledWith(
expect.objectContaining({
url: "http://127.0.0.1:43123/readyz",
auditContext: "qa-lab-suite-wait-for-gateway-healthy",
}),
);
expect(waitReady).toHaveBeenCalledWith({
gateway: { baseUrl: "http://127.0.0.1:43123" },
timeoutMs: expect.any(Number),
});
expect(release).toHaveBeenCalled();
});
it("keeps polling gateway health instead of sleeping blindly through restart settle", async () => {
vi.useFakeTimers();
const release = vi.fn(async () => {});
fetchWithSsrFGuardMock.mockRejectedValueOnce(new Error("restart boundary")).mockResolvedValue({
response: { ok: true },
release,
});
const waitReady = vi.fn(async () => {});
const settling = waitForConfigRestartSettle(createRestartSettleEnv(waitReady), 500, 5_000);
await vi.advanceTimersByTimeAsync(1_250);
await settling;
expect(fetchWithSsrFGuardMock).toHaveBeenCalledTimes(2);
expect(waitReady).toHaveBeenCalledTimes(1);
});
});

View File

@@ -65,8 +65,30 @@ async function waitForConfigRestartSettle(
restartDelayMs = 1_000,
timeoutMs = 60_000,
) {
await sleep(restartDelayMs + 750);
await waitForGatewayHealthy(env, timeoutMs);
const startedAt = Date.now();
const deadline = startedAt + timeoutMs;
const readyAfterMs = restartDelayMs + 750;
let lastHealthError: unknown = null;
while (Date.now() < deadline) {
try {
await waitForGatewayHealthy(env, Math.max(1, Math.min(1_000, deadline - Date.now())));
if (Date.now() - startedAt >= readyAfterMs) {
const remainingMs = Math.max(1, deadline - Date.now());
await waitForTransportReady(env, remainingMs);
return;
}
} catch (error) {
lastHealthError = error;
}
await sleep(Math.min(250, Math.max(1, deadline - Date.now())));
}
throw new Error(
`timed out after ${timeoutMs}ms waiting for config restart readiness${
lastHealthError ? `: ${formatErrorMessage(lastHealthError)}` : ""
}`,
);
}
function formatGatewayPrimaryErrorText(error: unknown) {

View File

@@ -1,6 +1,17 @@
import { describe, expect, it, vi } from "vitest";
import { afterEach, describe, expect, it, vi } from "vitest";
import { qaSuiteProgressTesting, runQaSuite } from "./suite.js";
const fetchWithSsrFGuardMock = vi.hoisted(() => vi.fn());
vi.mock("openclaw/plugin-sdk/ssrf-runtime", () => ({
fetchWithSsrFGuard: fetchWithSsrFGuardMock,
}));
afterEach(() => {
fetchWithSsrFGuardMock.mockReset();
vi.useRealTimers();
});
describe("qa suite", () => {
it("rejects unsupported transport ids before starting the lab", async () => {
const startLab = vi.fn();
@@ -23,6 +34,46 @@ describe("qa suite", () => {
expect(qaSuiteProgressTesting.parseQaSuiteBooleanEnv("maybe")).toBeUndefined();
});
it("stops an owned lab when readiness never becomes healthy", async () => {
const stop = vi.fn(async () => {});
fetchWithSsrFGuardMock.mockResolvedValue({
response: { ok: false },
release: vi.fn(async () => {}),
});
await expect(
qaSuiteProgressTesting.waitForQaLabReadyOrStopOwned({
lab: {
listenUrl: "http://127.0.0.1:43123",
stop,
},
ownsLab: true,
timeoutMs: 1,
}),
).rejects.toThrow("timed out after 1ms waiting for qa-lab ready");
expect(stop).toHaveBeenCalledTimes(1);
});
it("leaves caller-owned labs running when readiness never becomes healthy", async () => {
const stop = vi.fn(async () => {});
fetchWithSsrFGuardMock.mockResolvedValue({
response: { ok: false },
release: vi.fn(async () => {}),
});
await expect(
qaSuiteProgressTesting.waitForQaLabReadyOrStopOwned({
lab: {
listenUrl: "http://127.0.0.1:43123",
stop,
},
ownsLab: false,
timeoutMs: 1,
}),
).rejects.toThrow("timed out after 1ms waiting for qa-lab ready");
expect(stop).not.toHaveBeenCalled();
});
it("defaults progress logging from CI when no override is set", () => {
expect(qaSuiteProgressTesting.shouldLogQaSuiteProgress({ CI: "true" })).toBe(true);
expect(qaSuiteProgressTesting.shouldLogQaSuiteProgress({ CI: "false" })).toBe(false);

View File

@@ -4,6 +4,7 @@ import { setTimeout as sleep } from "node:timers/promises";
import { disposeRegisteredAgentHarnesses } from "openclaw/plugin-sdk/agent-harness";
import type { OpenClawConfig } from "openclaw/plugin-sdk/config-contracts";
import { formatErrorMessage } from "openclaw/plugin-sdk/error-runtime";
import { fetchWithSsrFGuard } from "openclaw/plugin-sdk/ssrf-runtime";
import { startQaGatewayChild, type QaCliBackendAuthMode } from "./gateway-child.js";
import type {
QaLabLatestReport,
@@ -151,6 +152,45 @@ function writeQaSuiteProgress(enabled: boolean, message: string) {
process.stderr.write(`[qa-suite] ${message}\n`);
}
/**
 * Polls the qa-lab `/readyz` endpoint until it answers with an OK response.
 *
 * @param baseUrl - Base URL of the lab server; `/readyz` is appended to it.
 * @param timeoutMs - Overall polling budget in milliseconds.
 * @throws Error when no OK response is observed within `timeoutMs`. The
 *   message keeps the `timed out after <n>ms waiting for qa-lab ready` prefix
 *   and appends the last recorded failure, if any, for diagnosis.
 */
async function waitForQaLabReady(baseUrl: string, timeoutMs = 10_000) {
  const deadline = Date.now() + timeoutMs;
  let lastFailure: string | null = null;
  while (Date.now() < deadline) {
    try {
      const { response, release } = await fetchWithSsrFGuard({
        url: `${baseUrl}/readyz`,
        policy: { allowPrivateNetwork: true },
        auditContext: "qa-lab-suite-wait-for-lab-ready",
      });
      try {
        if (response.ok) {
          return;
        }
        lastFailure = "last /readyz response was not ok";
      } finally {
        // Always hand the guarded response back, even on the success return.
        await release();
      }
    } catch (error) {
      // Connection errors are expected while the lab is still starting; keep
      // the most recent one so a timeout can explain what was failing.
      lastFailure = formatErrorMessage(error);
    }
    const remainingMs = deadline - Date.now();
    if (remainingMs <= 0) {
      break;
    }
    // Never sleep past the deadline; short timeouts should fail promptly.
    await sleep(Math.min(100, remainingMs));
  }
  throw new Error(
    `timed out after ${timeoutMs}ms waiting for qa-lab ready${
      lastFailure ? `: ${lastFailure}` : ""
    }`,
  );
}
/**
 * Waits for the lab to report ready and, when that fails, stops the lab only
 * if this run owns it. Caller-owned labs are left running.
 *
 * @param params.lab - Lab handle providing `listenUrl` and `stop`.
 * @param params.ownsLab - Whether this run started the lab and may stop it.
 * @param params.timeoutMs - Optional readiness budget forwarded to the wait.
 * @throws The readiness error. If stopping an owned lab also fails, a new
 *   Error is thrown whose message starts with the readiness failure and
 *   appends the stop failure, so the root cause is never masked.
 */
async function waitForQaLabReadyOrStopOwned(params: {
  lab: Pick<QaLabServerHandle, "listenUrl" | "stop">;
  ownsLab: boolean;
  timeoutMs?: number;
}) {
  try {
    await waitForQaLabReady(params.lab.listenUrl, params.timeoutMs);
  } catch (error) {
    if (params.ownsLab) {
      try {
        await params.lab.stop();
      } catch (stopError) {
        // A failing stop must not replace the readiness error that triggered
        // it; report both, readiness first.
        throw new Error(
          `${formatErrorMessage(error)}; additionally failed to stop owned qa-lab: ${formatErrorMessage(
            stopError,
          )}`,
        );
      }
    }
    throw error;
  }
}
function sanitizeQaSuiteProgressValue(value: string): string {
let normalized = "";
for (const char of value) {
@@ -1068,6 +1108,7 @@ export async function runQaSuite(params?: QaSuiteRunParams): Promise<QaSuiteResu
embeddedGateway: "disabled",
}));
writeQaSuiteProgress(progressEnabled, `lab ready: ${sanitizeQaSuiteProgressValue(lab.baseUrl)}`);
await waitForQaLabReadyOrStopOwned({ lab, ownsLab });
const transport = createQaTransportAdapter({
id: transportId,
state: lab.state,
@@ -1302,4 +1343,5 @@ export const qaSuiteProgressTesting = {
resolveQaSuiteTransportReadyTimeoutMs,
sanitizeQaSuiteProgressValue,
shouldLogQaSuiteProgress,
waitForQaLabReadyOrStopOwned,
};

View File

@@ -1,5 +1,10 @@
import { describe, expect, it, vi } from "vitest";
import { installQaParentWatchdog, QA_PARENT_PID_ENV } from "./qa-parent-watchdog.js";
import {
installQaParentWatchdog,
QA_PARENT_PID_ENV,
QA_STAGED_RUNTIME_ROOT_ENV,
QA_TEMP_ROOT_ENV,
} from "./qa-parent-watchdog.js";
describe("installQaParentWatchdog", () => {
it("does not install without a QA parent pid", () => {
@@ -10,13 +15,15 @@ describe("installQaParentWatchdog", () => {
).toBeNull();
});
it("exits when the QA parent process disappears", () => {
it("exits when the QA parent process disappears", async () => {
let tick: () => void = () => {
throw new Error("watchdog interval was not installed");
};
const timer = { unref: vi.fn() };
const chdir = vi.fn();
const clearIntervalMock = vi.fn();
const exit = vi.fn();
const rm = vi.fn(async () => {});
const logger = { warn: vi.fn() };
const kill = vi.fn(() => {
const error = new Error("missing") as NodeJS.ErrnoException;
@@ -25,12 +32,19 @@ describe("installQaParentWatchdog", () => {
});
const handle = installQaParentWatchdog({
chdir,
clearInterval: clearIntervalMock,
env: { [QA_PARENT_PID_ENV]: "12345" },
cwd: () => "/tmp/openclaw-qa-suite-test",
env: {
[QA_PARENT_PID_ENV]: "12345",
[QA_STAGED_RUNTIME_ROOT_ENV]: "/repo/.artifacts/qa-runtime/openclaw-qa-suite-test",
[QA_TEMP_ROOT_ENV]: "/tmp/openclaw-qa-suite-test",
},
exit,
kill,
logger,
ownPid: 10,
rm,
setInterval: (callback) => {
tick = callback;
return timer;
@@ -45,6 +59,46 @@ describe("installQaParentWatchdog", () => {
"QA gateway parent pid 12345 exited; shutting down orphaned QA gateway",
);
expect(clearIntervalMock).toHaveBeenCalledWith(timer);
expect(exit).toHaveBeenCalledWith(0);
await vi.waitFor(() => {
expect(chdir).toHaveBeenCalledWith("/tmp");
expect(rm).toHaveBeenCalledWith("/tmp/openclaw-qa-suite-test");
expect(rm).toHaveBeenCalledWith("/repo/.artifacts/qa-runtime/openclaw-qa-suite-test");
expect(exit).toHaveBeenCalledWith(0);
});
});
it("ignores unsafe QA temp root cleanup paths", async () => {
let tick: () => void = () => {
throw new Error("watchdog interval was not installed");
};
const exit = vi.fn();
const rm = vi.fn(async () => {});
const kill = vi.fn(() => {
const error = new Error("missing") as NodeJS.ErrnoException;
error.code = "ESRCH";
throw error;
});
installQaParentWatchdog({
env: {
[QA_PARENT_PID_ENV]: "12345",
[QA_STAGED_RUNTIME_ROOT_ENV]: "/repo/.artifacts/qa-runtime/not-qa-suite",
[QA_TEMP_ROOT_ENV]: "/tmp/not-qa-suite",
},
exit,
kill,
logger: { warn: vi.fn() },
ownPid: 10,
rm,
setInterval: (callback) => {
tick = callback;
return { unref: vi.fn() };
},
});
tick();
await vi.waitFor(() => expect(exit).toHaveBeenCalledWith(0));
expect(rm).not.toHaveBeenCalled();
});
});

View File

@@ -1,8 +1,13 @@
import fs from "node:fs/promises";
import path from "node:path";
import { createSubsystemLogger } from "../../logging/subsystem.js";
export const QA_PARENT_PID_ENV = "OPENCLAW_QA_PARENT_PID";
export const QA_TEMP_ROOT_ENV = "OPENCLAW_QA_TEMP_ROOT";
export const QA_STAGED_RUNTIME_ROOT_ENV = "OPENCLAW_QA_STAGED_RUNTIME_ROOT";
const DEFAULT_QA_PARENT_WATCHDOG_INTERVAL_MS = 1000;
const QA_TEMP_ROOT_PREFIX = "openclaw-qa-suite-";
type QaParentWatchdogTimer =
| number
@@ -11,13 +16,16 @@ type QaParentWatchdogTimer =
};
type QaParentWatchdogDeps = {
chdir?: (directory: string) => void;
clearInterval?: (timer: QaParentWatchdogTimer) => void;
cwd?: () => string;
env?: NodeJS.ProcessEnv;
exit?: (code?: number) => never | void;
intervalMs?: number;
kill?: (pid: number, signal?: NodeJS.Signals | 0) => boolean;
logger?: Pick<ReturnType<typeof createSubsystemLogger>, "warn">;
ownPid?: number;
rm?: (target: string) => Promise<void>;
setInterval?: (callback: () => void, ms: number) => QaParentWatchdogTimer;
};
@@ -38,6 +46,35 @@ function resolveQaParentPid(env: NodeJS.ProcessEnv, ownPid: number): number | nu
return parentPid;
}
/**
 * Turns a raw env value into an absolute cleanup root, or rejects it.
 *
 * A value is accepted only when it is non-blank and the final path segment of
 * its resolved form starts with `QA_TEMP_ROOT_PREFIX`.
 *
 * @param rawValue - Raw environment variable value, possibly undefined.
 * @returns The resolved absolute path, or `null` when the value is rejected.
 */
function resolveQaCleanupRoot(rawValue: string | undefined): string | null {
  const trimmed = rawValue?.trim() ?? "";
  if (trimmed === "") {
    return null;
  }
  const resolved = path.resolve(trimmed);
  const leaf = path.basename(resolved);
  return leaf.startsWith(QA_TEMP_ROOT_PREFIX) ? resolved : null;
}
/**
 * Collects the distinct cleanup roots named by the QA temp-root and
 * staged-runtime-root env variables, in that order.
 *
 * @param env - Environment to read the two root variables from.
 * @returns Accepted roots with rejected values and duplicates removed.
 */
function resolveQaCleanupRoots(env: NodeJS.ProcessEnv): string[] {
  // A Set keeps first-seen insertion order, matching a first-index dedupe.
  const uniqueRoots = new Set<string>();
  for (const envName of [QA_TEMP_ROOT_ENV, QA_STAGED_RUNTIME_ROOT_ENV]) {
    const root = resolveQaCleanupRoot(env[envName]);
    if (root !== null) {
      uniqueRoots.add(root);
    }
  }
  return Array.from(uniqueRoots);
}
/**
 * Reports whether `candidate` is `root` itself or lies somewhere beneath it.
 *
 * The check is purely lexical (via `path.relative`); it does not touch the
 * filesystem.
 *
 * @param root - Directory that may contain the candidate.
 * @param candidate - Path to test.
 * @returns `true` when the candidate equals the root or is nested inside it.
 */
function pathContains(root: string, candidate: string): boolean {
  const relative = path.relative(root, candidate);
  if (relative === "") {
    return true;
  }
  // An absolute result means no relative route exists between the two paths.
  if (path.isAbsolute(relative)) {
    return false;
  }
  // Only a leading ".." *segment* leaves the root. A name that merely begins
  // with two dots (e.g. "..cache") is still inside it.
  const escapesRoot = relative === ".." || relative.startsWith(`..${path.sep}`);
  return !escapesRoot;
}
export function installQaParentWatchdog(
deps: QaParentWatchdogDeps = {},
): QaParentWatchdogHandle | null {
@@ -57,10 +94,19 @@ export function installQaParentWatchdog(
const kill =
deps.kill ?? ((pid: number, signal?: NodeJS.Signals | 0) => process.kill(pid, signal));
const logger = deps.logger ?? createSubsystemLogger("gateway");
const qaCleanupRoots = resolveQaCleanupRoots(env);
const chdir = deps.chdir ?? ((directory: string) => process.chdir(directory));
const cwd = deps.cwd ?? (() => process.cwd());
const rm =
deps.rm ??
(async (target: string) => {
await fs.rm(target, { recursive: true, force: true });
});
const setIntervalFn =
deps.setInterval ??
((callback: () => void, ms: number) => setInterval(callback, ms) as QaParentWatchdogTimer);
let stopped = false;
let exiting = false;
let timer: QaParentWatchdogTimer;
const stop = () => {
@@ -72,7 +118,7 @@ export function installQaParentWatchdog(
};
timer = setIntervalFn(() => {
if (stopped) {
if (stopped || exiting) {
return;
}
try {
@@ -80,8 +126,36 @@ export function installQaParentWatchdog(
} catch (error) {
if ((error as NodeJS.ErrnoException).code === "ESRCH") {
logger.warn(`QA gateway parent pid ${parentPid} exited; shutting down orphaned QA gateway`);
exiting = true;
stop();
exit(0);
void (async () => {
const currentCwd = path.resolve(cwd());
const activeCwdRoot = qaCleanupRoots.find((cleanupRoot) =>
pathContains(cleanupRoot, currentCwd),
);
if (activeCwdRoot) {
const safeCwd = path.dirname(activeCwdRoot);
try {
chdir(safeCwd);
} catch (chdirError) {
logger.warn(
`QA gateway parent pid ${parentPid} exited; failed to leave runtime root ${activeCwdRoot}: ${
chdirError instanceof Error ? chdirError.message : String(chdirError)
}`,
);
}
}
for (const cleanupRoot of qaCleanupRoots) {
await rm(cleanupRoot).catch((cleanupError) => {
logger.warn(
`QA gateway parent pid ${parentPid} exited; failed to clean runtime root ${cleanupRoot}: ${
cleanupError instanceof Error ? cleanupError.message : String(cleanupError)
}`,
);
});
}
exit(0);
})();
}
}
}, deps.intervalMs ?? DEFAULT_QA_PARENT_WATCHDOG_INTERVAL_MS);