diff --git a/CHANGELOG.md b/CHANGELOG.md index ac12a7983ce..dced06a676d 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -30,6 +30,7 @@ Docs: https://docs.openclaw.ai - Codex app-server: rotate oversized native Codex threads before resume and cap dynamic tool-result text entering native Codex sessions, preventing stale oversized context from surviving OpenClaw compaction. (#82981) Thanks @hansolo949. - Gateway/restart: drain pending replies and active chat runs during restart shutdown before sockets and channels close, aborting timed-out chat runs through the normal cleanup path. (#69121) Thanks @alexlomt. - Agents/Codex: use the Codex runtime context window for OpenAI-model preflight compaction and memory flush checks, so GPT-5.5 Codex sessions compact before hitting the smaller native context limit. Fixes #82982. Thanks @vliuyt. +- QA-Lab: clean orphaned gateway temp roots when a suite parent exits and wait on gateway plus transport readiness after config restarts, reducing stale `qa-channel` noise from interrupted runs. Fixes #65506. Thanks @100yenadmin. - QA-Lab: wake qa-bus long polls that arrive with stale future cursors after a bus restart, preserving reconnect readiness for harness clients. (#67142) Thanks @hxy91819. - QA-Lab: stage Multipass transfer scripts under OpenClaw's preferred temp root instead of raw OS temp paths, keeping the VM runner inside temp-path guardrails. (#64098) Thanks @ImLukeF. - Agents/replies: keep surviving reply media and append a warning when other media references fail, so partial media normalization no longer drops failures silently. Thanks @Jerry-Xin. diff --git a/extensions/qa-lab/src/gateway-child.test.ts b/extensions/qa-lab/src/gateway-child.test.ts index 7c771437727..e63b95e3cb7 100644 --- a/extensions/qa-lab/src/gateway-child.test.ts +++ b/extensions/qa-lab/src/gateway-child.test.ts @@ -46,10 +46,12 @@ function createParams(baseEnv?: NodeJS.ProcessEnv) { gatewayToken: "qa-token", homeDir: "/tmp/openclaw-qa/home", stateDir: "/tmp/openclaw-qa/state", + tempRoot: "/tmp/openclaw-qa", xdgConfigHome: "/tmp/openclaw-qa/xdg-config", xdgDataHome: "/tmp/openclaw-qa/xdg-data", xdgCacheHome: "/tmp/openclaw-qa/xdg-cache", bundledPluginsDir: "/tmp/openclaw-qa/bundled-plugins", + stagedBundledPluginsRoot: "/repo/.artifacts/qa-runtime/openclaw-qa-suite-test", compatibilityHostVersion: "2026.4.8", baseEnv, }; @@ -139,6 +141,10 @@ describe("buildQaRuntimeEnv", () => { expect(env.OPENCLAW_TEST_FAST).toBe("1"); expect(env.OPENCLAW_QA_PARENT_PID).toBe(String(process.pid)); + expect(env.OPENCLAW_QA_TEMP_ROOT).toBe("/tmp/openclaw-qa"); + expect(env.OPENCLAW_QA_STAGED_RUNTIME_ROOT).toBe( + "/repo/.artifacts/qa-runtime/openclaw-qa-suite-test", + ); expect(env.OPENCLAW_QA_ALLOW_LOCAL_IMAGE_PROVIDER).toBe("1"); expect(env.OPENCLAW_ALLOW_SLOW_REPLY_TESTS).toBe("1"); expect(env.OPENCLAW_SKIP_STARTUP_MODEL_PREWARM).toBe("1"); diff --git a/extensions/qa-lab/src/gateway-child.ts b/extensions/qa-lab/src/gateway-child.ts index af60f45ff68..b0f9eeecd9e 100644 --- a/extensions/qa-lab/src/gateway-child.ts +++ b/extensions/qa-lab/src/gateway-child.ts @@ -183,10 +183,12 @@ export function buildQaRuntimeEnv(params: { homeDir: string; forwardHostHome?: boolean; stateDir: string; + tempRoot: string; xdgConfigHome: string; xdgDataHome: string; xdgCacheHome: string; bundledPluginsDir?: string; + stagedBundledPluginsRoot?: string | null; compatibilityHostVersion?: string; providerMode?: QaProviderMode; baseEnv?: NodeJS.ProcessEnv; @@ -219,6 +221,10 @@ export function buildQaRuntimeEnv(params: { OPENCLAW_NO_RESPAWN: "1", OPENCLAW_TEST_FAST: "1", OPENCLAW_QA_PARENT_PID: String(process.pid), + OPENCLAW_QA_TEMP_ROOT: params.tempRoot, + ...(params.stagedBundledPluginsRoot + ? { OPENCLAW_QA_STAGED_RUNTIME_ROOT: params.stagedBundledPluginsRoot } + : {}), OPENCLAW_QA_ALLOW_LOCAL_IMAGE_PROVIDER: "1", // QA uses the fast runtime envelope for speed, but it still exercises // normal config-driven heartbeats and runtime config writes. @@ -666,10 +672,12 @@ export async function startQaGatewayChild(params: { homeDir, forwardHostHome: params.forwardHostHome, stateDir, + tempRoot, xdgConfigHome, xdgDataHome, xdgCacheHome, bundledPluginsDir: stagedPluginRuntime.bundledPluginsDir, + stagedBundledPluginsRoot, compatibilityHostVersion: stagedPluginRuntime.runtimeHostVersion, providerMode, forwardHostHomeForClaudeCli: liveProviderIds.includes("claude-cli"), diff --git a/extensions/qa-lab/src/suite-runtime-gateway.test.ts b/extensions/qa-lab/src/suite-runtime-gateway.test.ts index 0d88e12c764..7ad23c44cd8 100644 --- a/extensions/qa-lab/src/suite-runtime-gateway.test.ts +++ b/extensions/qa-lab/src/suite-runtime-gateway.test.ts @@ -1,10 +1,30 @@ -import { describe, expect, it } from "vitest"; +import { afterEach, describe, expect, it, vi } from "vitest"; import { getGatewayRetryAfterMs, isConfigApplyNoopForSnapshot, isConfigHashConflict, isConfigPatchNoopForSnapshot, + waitForConfigRestartSettle, } from "./suite-runtime-gateway.js"; +import type { QaSuiteRuntimeEnv } from "./suite-runtime-types.js"; + +const fetchWithSsrFGuardMock = vi.hoisted(() => vi.fn()); + +vi.mock("openclaw/plugin-sdk/ssrf-runtime", () => ({ + fetchWithSsrFGuard: fetchWithSsrFGuardMock, +})); + +afterEach(() => { + fetchWithSsrFGuardMock.mockReset(); + vi.useRealTimers(); +}); + +function createRestartSettleEnv(waitReady: (params: unknown) => Promise) { + return { + gateway: { baseUrl: "http://127.0.0.1:43123" }, + transport: { waitReady }, + } as unknown as Pick; +} describe("qa suite gateway helpers", () => { it("reads retry-after from the primary gateway error before appended logs", () => { @@ -113,4 +133,45 @@ describe("qa suite gateway helpers", () => { ), ).toBe(false); }); + + it("waits for transport readiness after gateway restart health", async () => { + const release = vi.fn(async () => {}); + fetchWithSsrFGuardMock.mockResolvedValue({ + response: { ok: true }, + release, + }); + const waitReady = vi.fn(async () => {}); + + await waitForConfigRestartSettle(createRestartSettleEnv(waitReady), 0, 1_000); + + expect(fetchWithSsrFGuardMock).toHaveBeenCalledWith( + expect.objectContaining({ + url: "http://127.0.0.1:43123/readyz", + auditContext: "qa-lab-suite-wait-for-gateway-healthy", + }), + ); + expect(waitReady).toHaveBeenCalledWith({ + gateway: { baseUrl: "http://127.0.0.1:43123" }, + timeoutMs: expect.any(Number), + }); + expect(release).toHaveBeenCalled(); + }); + + it("keeps polling gateway health instead of sleeping blindly through restart settle", async () => { + vi.useFakeTimers(); + const release = vi.fn(async () => {}); + fetchWithSsrFGuardMock.mockRejectedValueOnce(new Error("restart boundary")).mockResolvedValue({ + response: { ok: true }, + release, + }); + const waitReady = vi.fn(async () => {}); + + const settling = waitForConfigRestartSettle(createRestartSettleEnv(waitReady), 500, 5_000); + + await vi.advanceTimersByTimeAsync(1_250); + await settling; + + expect(fetchWithSsrFGuardMock).toHaveBeenCalledTimes(2); + expect(waitReady).toHaveBeenCalledTimes(1); + }); }); diff --git a/extensions/qa-lab/src/suite-runtime-gateway.ts b/extensions/qa-lab/src/suite-runtime-gateway.ts index 2a936f87afe..fb694998579 100644 --- a/extensions/qa-lab/src/suite-runtime-gateway.ts +++ b/extensions/qa-lab/src/suite-runtime-gateway.ts @@ -65,8 +65,30 @@ async function waitForConfigRestartSettle( restartDelayMs = 1_000, timeoutMs = 60_000, ) { - await sleep(restartDelayMs + 750); - await waitForGatewayHealthy(env, timeoutMs); + const startedAt = Date.now(); + const deadline = startedAt + timeoutMs; + const readyAfterMs = restartDelayMs + 750; + let lastHealthError: unknown = null; + + while (Date.now() < deadline) { + try { + await waitForGatewayHealthy(env, Math.max(1, Math.min(1_000, deadline - Date.now()))); + if (Date.now() - startedAt >= readyAfterMs) { + const remainingMs = Math.max(1, deadline - Date.now()); + await waitForTransportReady(env, remainingMs); + return; + } + } catch (error) { + lastHealthError = error; + } + await sleep(Math.min(250, Math.max(1, deadline - Date.now()))); + } + + throw new Error( + `timed out after ${timeoutMs}ms waiting for config restart readiness${ + lastHealthError ? `: ${formatErrorMessage(lastHealthError)}` : "" + }`, + ); } function formatGatewayPrimaryErrorText(error: unknown) { diff --git a/extensions/qa-lab/src/suite.test.ts b/extensions/qa-lab/src/suite.test.ts index 70223851313..f332b4c5c82 100644 --- a/extensions/qa-lab/src/suite.test.ts +++ b/extensions/qa-lab/src/suite.test.ts @@ -1,6 +1,17 @@ -import { describe, expect, it, vi } from "vitest"; +import { afterEach, describe, expect, it, vi } from "vitest"; import { qaSuiteProgressTesting, runQaSuite } from "./suite.js"; +const fetchWithSsrFGuardMock = vi.hoisted(() => vi.fn()); + +vi.mock("openclaw/plugin-sdk/ssrf-runtime", () => ({ + fetchWithSsrFGuard: fetchWithSsrFGuardMock, +})); + +afterEach(() => { + fetchWithSsrFGuardMock.mockReset(); + vi.useRealTimers(); +}); + describe("qa suite", () => { it("rejects unsupported transport ids before starting the lab", async () => { const startLab = vi.fn(); @@ -23,6 +34,46 @@ describe("qa suite", () => { expect(qaSuiteProgressTesting.parseQaSuiteBooleanEnv("maybe")).toBeUndefined(); }); + it("stops an owned lab when readiness never becomes healthy", async () => { + const stop = vi.fn(async () => {}); + fetchWithSsrFGuardMock.mockResolvedValue({ + response: { ok: false }, + release: vi.fn(async () => {}), + }); + + await expect( + qaSuiteProgressTesting.waitForQaLabReadyOrStopOwned({ + lab: { + listenUrl: "http://127.0.0.1:43123", + stop, + }, + ownsLab: true, + timeoutMs: 1, + }), + ).rejects.toThrow("timed out after 1ms waiting for qa-lab ready"); + expect(stop).toHaveBeenCalledTimes(1); + }); + + it("leaves caller-owned labs running when readiness never becomes healthy", async () => { + const stop = vi.fn(async () => {}); + fetchWithSsrFGuardMock.mockResolvedValue({ + response: { ok: false }, + release: vi.fn(async () => {}), + }); + + await expect( + qaSuiteProgressTesting.waitForQaLabReadyOrStopOwned({ + lab: { + listenUrl: "http://127.0.0.1:43123", + stop, + }, + ownsLab: false, + timeoutMs: 1, + }), + ).rejects.toThrow("timed out after 1ms waiting for qa-lab ready"); + expect(stop).not.toHaveBeenCalled(); + }); + it("defaults progress logging from CI when no override is set", () => { expect(qaSuiteProgressTesting.shouldLogQaSuiteProgress({ CI: "true" })).toBe(true); expect(qaSuiteProgressTesting.shouldLogQaSuiteProgress({ CI: "false" })).toBe(false); diff --git a/extensions/qa-lab/src/suite.ts b/extensions/qa-lab/src/suite.ts index 7aa3964d8ca..fdd009c1791 100644 --- a/extensions/qa-lab/src/suite.ts +++ b/extensions/qa-lab/src/suite.ts @@ -4,6 +4,7 @@ import { setTimeout as sleep } from "node:timers/promises"; import { disposeRegisteredAgentHarnesses } from "openclaw/plugin-sdk/agent-harness"; import type { OpenClawConfig } from "openclaw/plugin-sdk/config-contracts"; import { formatErrorMessage } from "openclaw/plugin-sdk/error-runtime"; +import { fetchWithSsrFGuard } from "openclaw/plugin-sdk/ssrf-runtime"; import { startQaGatewayChild, type QaCliBackendAuthMode } from "./gateway-child.js"; import type { QaLabLatestReport, @@ -151,6 +152,45 @@ function writeQaSuiteProgress(enabled: boolean, message: string) { process.stderr.write(`[qa-suite] ${message}\n`); } +async function waitForQaLabReady(baseUrl: string, timeoutMs = 10_000) { + const startedAt = Date.now(); + while (Date.now() - startedAt < timeoutMs) { + try { + const { response, release } = await fetchWithSsrFGuard({ + url: `${baseUrl}/readyz`, + policy: { allowPrivateNetwork: true }, + auditContext: "qa-lab-suite-wait-for-lab-ready", + }); + try { + if (response.ok) { + return; + } + } finally { + await release(); + } + } catch { + // retry + } + await sleep(100); + } + throw new Error(`timed out after ${timeoutMs}ms waiting for qa-lab ready`); +} + +async function waitForQaLabReadyOrStopOwned(params: { + lab: Pick; + ownsLab: boolean; + timeoutMs?: number; +}) { + try { + await waitForQaLabReady(params.lab.listenUrl, params.timeoutMs); + } catch (error) { + if (params.ownsLab) { + await params.lab.stop(); + } + throw error; + } +} + function sanitizeQaSuiteProgressValue(value: string): string { let normalized = ""; for (const char of value) { @@ -1068,6 +1108,7 @@ export async function runQaSuite(params?: QaSuiteRunParams): Promise { it("does not install without a QA parent pid", () => { @@ -10,13 +15,15 @@ describe("installQaParentWatchdog", () => { ).toBeNull(); }); - it("exits when the QA parent process disappears", () => { + it("exits when the QA parent process disappears", async () => { let tick: () => void = () => { throw new Error("watchdog interval was not installed"); }; const timer = { unref: vi.fn() }; + const chdir = vi.fn(); const clearIntervalMock = vi.fn(); const exit = vi.fn(); + const rm = vi.fn(async () => {}); const logger = { warn: vi.fn() }; const kill = vi.fn(() => { const error = new Error("missing") as NodeJS.ErrnoException; @@ -25,12 +32,19 @@ describe("installQaParentWatchdog", () => { }); const handle = installQaParentWatchdog({ + chdir, clearInterval: clearIntervalMock, - env: { [QA_PARENT_PID_ENV]: "12345" }, + cwd: () => "/tmp/openclaw-qa-suite-test", + env: { + [QA_PARENT_PID_ENV]: "12345", + [QA_STAGED_RUNTIME_ROOT_ENV]: "/repo/.artifacts/qa-runtime/openclaw-qa-suite-test", + [QA_TEMP_ROOT_ENV]: "/tmp/openclaw-qa-suite-test", + }, exit, kill, logger, ownPid: 10, + rm, setInterval: (callback) => { tick = callback; return timer; @@ -45,6 +59,46 @@ describe("installQaParentWatchdog", () => { "QA gateway parent pid 12345 exited; shutting down orphaned QA gateway", ); expect(clearIntervalMock).toHaveBeenCalledWith(timer); - expect(exit).toHaveBeenCalledWith(0); + await vi.waitFor(() => { + expect(chdir).toHaveBeenCalledWith("/tmp"); + expect(rm).toHaveBeenCalledWith("/tmp/openclaw-qa-suite-test"); + expect(rm).toHaveBeenCalledWith("/repo/.artifacts/qa-runtime/openclaw-qa-suite-test"); + expect(exit).toHaveBeenCalledWith(0); + }); + }); + + it("ignores unsafe QA temp root cleanup paths", async () => { + let tick: () => void = () => { + throw new Error("watchdog interval was not installed"); + }; + const exit = vi.fn(); + const rm = vi.fn(async () => {}); + const kill = vi.fn(() => { + const error = new Error("missing") as NodeJS.ErrnoException; + error.code = "ESRCH"; + throw error; + }); + + installQaParentWatchdog({ + env: { + [QA_PARENT_PID_ENV]: "12345", + [QA_STAGED_RUNTIME_ROOT_ENV]: "/repo/.artifacts/qa-runtime/not-qa-suite", + [QA_TEMP_ROOT_ENV]: "/tmp/not-qa-suite", + }, + exit, + kill, + logger: { warn: vi.fn() }, + ownPid: 10, + rm, + setInterval: (callback) => { + tick = callback; + return { unref: vi.fn() }; + }, + }); + + tick(); + + await vi.waitFor(() => expect(exit).toHaveBeenCalledWith(0)); + expect(rm).not.toHaveBeenCalled(); }); }); diff --git a/src/cli/gateway-cli/qa-parent-watchdog.ts b/src/cli/gateway-cli/qa-parent-watchdog.ts index c5f93ab1e6d..63b444a8ec3 100644 --- a/src/cli/gateway-cli/qa-parent-watchdog.ts +++ b/src/cli/gateway-cli/qa-parent-watchdog.ts @@ -1,8 +1,13 @@ +import fs from "node:fs/promises"; +import path from "node:path"; import { createSubsystemLogger } from "../../logging/subsystem.js"; export const QA_PARENT_PID_ENV = "OPENCLAW_QA_PARENT_PID"; +export const QA_TEMP_ROOT_ENV = "OPENCLAW_QA_TEMP_ROOT"; +export const QA_STAGED_RUNTIME_ROOT_ENV = "OPENCLAW_QA_STAGED_RUNTIME_ROOT"; const DEFAULT_QA_PARENT_WATCHDOG_INTERVAL_MS = 1000; +const QA_TEMP_ROOT_PREFIX = "openclaw-qa-suite-"; type QaParentWatchdogTimer = | number @@ -11,13 +16,16 @@ type QaParentWatchdogTimer = }; type QaParentWatchdogDeps = { + chdir?: (directory: string) => void; clearInterval?: (timer: QaParentWatchdogTimer) => void; + cwd?: () => string; env?: NodeJS.ProcessEnv; exit?: (code?: number) => never | void; intervalMs?: number; kill?: (pid: number, signal?: NodeJS.Signals | 0) => boolean; logger?: Pick, "warn">; ownPid?: number; + rm?: (target: string) => Promise; setInterval?: (callback: () => void, ms: number) => QaParentWatchdogTimer; }; @@ -38,6 +46,35 @@ function resolveQaParentPid(env: NodeJS.ProcessEnv, ownPid: number): number | nu return parentPid; } +function resolveQaCleanupRoot(rawValue: string | undefined): string | null { + const raw = rawValue?.trim(); + if (!raw) { + return null; + } + const cleanupRoot = path.resolve(raw); + if (!path.basename(cleanupRoot).startsWith(QA_TEMP_ROOT_PREFIX)) { + return null; + } + return cleanupRoot; +} + +function resolveQaCleanupRoots(env: NodeJS.ProcessEnv): string[] { + return [ + resolveQaCleanupRoot(env[QA_TEMP_ROOT_ENV]), + resolveQaCleanupRoot(env[QA_STAGED_RUNTIME_ROOT_ENV]), + ].filter((target, index, array): target is string => { + return target !== null && array.indexOf(target) === index; + }); +} + +function pathContains(root: string, candidate: string): boolean { + const relative = path.relative(root, candidate); + return ( + relative === "" || + (relative.length > 0 && !relative.startsWith("..") && !path.isAbsolute(relative)) + ); +} + export function installQaParentWatchdog( deps: QaParentWatchdogDeps = {}, ): QaParentWatchdogHandle | null { @@ -57,10 +94,19 @@ export function installQaParentWatchdog( const kill = deps.kill ?? ((pid: number, signal?: NodeJS.Signals | 0) => process.kill(pid, signal)); const logger = deps.logger ?? createSubsystemLogger("gateway"); + const qaCleanupRoots = resolveQaCleanupRoots(env); + const chdir = deps.chdir ?? ((directory: string) => process.chdir(directory)); + const cwd = deps.cwd ?? (() => process.cwd()); + const rm = + deps.rm ?? + (async (target: string) => { + await fs.rm(target, { recursive: true, force: true }); + }); const setIntervalFn = deps.setInterval ?? ((callback: () => void, ms: number) => setInterval(callback, ms) as QaParentWatchdogTimer); let stopped = false; + let exiting = false; let timer: QaParentWatchdogTimer; const stop = () => { @@ -72,7 +118,7 @@ export function installQaParentWatchdog( }; timer = setIntervalFn(() => { - if (stopped) { + if (stopped || exiting) { return; } try { @@ -80,8 +126,36 @@ export function installQaParentWatchdog( } catch (error) { if ((error as NodeJS.ErrnoException).code === "ESRCH") { logger.warn(`QA gateway parent pid ${parentPid} exited; shutting down orphaned QA gateway`); + exiting = true; stop(); - exit(0); + void (async () => { + const currentCwd = path.resolve(cwd()); + const activeCwdRoot = qaCleanupRoots.find((cleanupRoot) => + pathContains(cleanupRoot, currentCwd), + ); + if (activeCwdRoot) { + const safeCwd = path.dirname(activeCwdRoot); + try { + chdir(safeCwd); + } catch (chdirError) { + logger.warn( + `QA gateway parent pid ${parentPid} exited; failed to leave runtime root ${activeCwdRoot}: ${ + chdirError instanceof Error ? chdirError.message : String(chdirError) + }`, + ); + } + } + for (const cleanupRoot of qaCleanupRoots) { + await rm(cleanupRoot).catch((cleanupError) => { + logger.warn( + `QA gateway parent pid ${parentPid} exited; failed to clean runtime root ${cleanupRoot}: ${ + cleanupError instanceof Error ? cleanupError.message : String(cleanupError) + }`, + ); + }); + } + exit(0); + })(); } } }, deps.intervalMs ?? DEFAULT_QA_PARENT_WATCHDOG_INTERVAL_MS);