fix(qa): kill timed out Matrix CLI runs

This commit is contained in:
Vincent Koc
2026-05-27 12:59:12 +02:00
parent 545ad7f256
commit b8fc2f6587
2 changed files with 203 additions and 29 deletions

View File

@@ -1,5 +1,6 @@
import { mkdir, mkdtemp, rm, writeFile } from "node:fs/promises";
import { mkdir, mkdtemp, readFile, rm, writeFile } from "node:fs/promises";
import path from "node:path";
import { setTimeout as sleep } from "node:timers/promises";
import { resolvePreferredOpenClawTmpDir } from "openclaw/plugin-sdk/temp-path";
import { describe, expect, it } from "vitest";
import {
@@ -10,6 +11,15 @@ import {
startMatrixQaOpenClawCli,
} from "./scenario-runtime-cli.js";
function isProcessRunning(pid: number): boolean {
try {
process.kill(pid, 0);
return true;
} catch {
return false;
}
}
describe("Matrix QA CLI runtime", () => {
it("redacts secret CLI arguments in diagnostic command text", () => {
expect(
@@ -176,4 +186,131 @@ describe("Matrix QA CLI runtime", () => {
await rm(root, { force: true, recursive: true });
}
});
it("kills CLI commands that ignore graceful timeout termination", async () => {
const root = await mkdtemp(
path.join(resolvePreferredOpenClawTmpDir(), "matrix-qa-cli-timeout-kill-"),
);
const pidPath = path.join(root, "cli.pid");
let childPid: number | undefined;
try {
await mkdir(path.join(root, "dist"));
await writeFile(
path.join(root, "dist", "index.mjs"),
[
"import { writeFileSync } from 'node:fs';",
`writeFileSync(${JSON.stringify(pidPath)}, String(process.pid));`,
"process.stdout.write('waiting despite graceful shutdown\\n');",
"process.on('SIGTERM', () => { process.stdout.write('ignored sigterm\\n'); });",
"setInterval(() => {}, 1000);",
].join("\n"),
);
await expect(
runMatrixQaOpenClawCli({
args: ["matrix", "verify", "self"],
cwd: root,
env: process.env,
timeoutMs: 500,
}),
).rejects.toThrow(/timed out after 500ms/u);
childPid = Number(await readFile(pidPath, "utf8"));
expect(isProcessRunning(childPid)).toBe(false);
} finally {
if (childPid && isProcessRunning(childPid)) {
process.kill(childPid, "SIGKILL");
}
await rm(root, { force: true, recursive: true });
}
});
it("preserves timeout diagnostics when wait attaches after timeout", async () => {
const root = await mkdtemp(
path.join(resolvePreferredOpenClawTmpDir(), "matrix-qa-cli-late-wait-timeout-"),
);
const pidPath = path.join(root, "cli.pid");
let childPid: number | undefined;
try {
await mkdir(path.join(root, "dist"));
await writeFile(
path.join(root, "dist", "index.mjs"),
[
"import { writeFileSync } from 'node:fs';",
`writeFileSync(${JSON.stringify(pidPath)}, String(process.pid));`,
"process.stdout.write('late wait timeout marker\\n');",
"process.on('SIGTERM', () => {});",
"setInterval(() => {}, 1000);",
].join("\n"),
);
const session = startMatrixQaOpenClawCli({
args: ["matrix", "verify", "self"],
cwd: root,
env: process.env,
timeoutMs: 500,
});
await sleep(850);
await expect(session.wait()).rejects.toThrow(/timed out after 500ms/u);
await expect(session.wait()).rejects.toThrow(/late wait timeout marker/u);
childPid = Number(await readFile(pidPath, "utf8"));
expect(isProcessRunning(childPid)).toBe(false);
} finally {
if (childPid && isProcessRunning(childPid)) {
process.kill(childPid, "SIGKILL");
}
await rm(root, { force: true, recursive: true });
}
});
it("settles and kills descendants that keep timed-out CLI stdio open", async () => {
const root = await mkdtemp(
path.join(resolvePreferredOpenClawTmpDir(), "matrix-qa-cli-timeout-tree-"),
);
const childPidPath = path.join(root, "child.pid");
const grandchildPidPath = path.join(root, "grandchild.pid");
let childPid: number | undefined;
let grandchildPid: number | undefined;
try {
await mkdir(path.join(root, "dist"));
await writeFile(
path.join(root, "dist", "index.mjs"),
[
"import { spawn } from 'node:child_process';",
"import { writeFileSync } from 'node:fs';",
`writeFileSync(${JSON.stringify(childPidPath)}, String(process.pid));`,
"const grandchild = spawn(process.execPath, ['-e', 'setInterval(() => {}, 1000);'], { stdio: ['ignore', 'inherit', 'inherit'] });",
`writeFileSync(${JSON.stringify(grandchildPidPath)}, String(grandchild.pid));`,
"process.stdout.write('spawned persistent descendant\\n');",
"process.on('SIGTERM', () => {});",
"setInterval(() => {}, 1000);",
].join("\n"),
);
await expect(
runMatrixQaOpenClawCli({
args: ["matrix", "verify", "self"],
cwd: root,
env: process.env,
timeoutMs: 500,
}),
).rejects.toThrow(/timed out after 500ms/u);
childPid = Number(await readFile(childPidPath, "utf8"));
grandchildPid = Number(await readFile(grandchildPidPath, "utf8"));
expect(isProcessRunning(childPid)).toBe(false);
if (process.platform !== "win32") {
expect(isProcessRunning(grandchildPid)).toBe(false);
}
} finally {
for (const pid of [grandchildPid, childPid]) {
if (pid && isProcessRunning(pid)) {
process.kill(pid, "SIGKILL");
}
}
await rm(root, { force: true, recursive: true });
}
});
});

View File

@@ -30,6 +30,8 @@ export type MatrixQaCliSession = {
};
const MATRIX_QA_CLI_SECRET_ARG_FLAGS = new Set(["--access-token", "--password", "--recovery-key"]);
const MATRIX_QA_CLI_TIMEOUT_KILL_GRACE_MS = 250;
const MATRIX_QA_CLI_TIMEOUT_FORCE_SETTLE_MS = 100;
function isMatrixQaCliSecretPositionalArg(args: string[], index: number): boolean {
return args[0] === "matrix" && args[1] === "verify" && args[2] === "device" && index === 3;
@@ -91,6 +93,31 @@ function formatMatrixQaCliExitError(result: MatrixQaCliRunResult) {
.join("\n");
}
function formatMatrixQaCliTimeoutError(result: MatrixQaCliRunResult, timeoutMs: number) {
return [
`${formatMatrixQaCliCommand(result.args)} timed out after ${timeoutMs}ms`,
result.stderr.trim() ? `stderr:\n${redactMatrixQaCliOutput(result.stderr.trim())}` : null,
result.stdout.trim() ? `stdout:\n${redactMatrixQaCliOutput(result.stdout.trim())}` : null,
]
.filter(Boolean)
.join("\n");
}
function killMatrixQaCliChild(
child: ReturnType<typeof startOpenClawCliProcess>,
signal: NodeJS.Signals,
): void {
if (process.platform !== "win32" && child.pid) {
try {
process.kill(-child.pid, signal);
return;
} catch {
// Fall back to the direct child if process-group signaling is unavailable.
}
}
child.kill(signal);
}
export function startMatrixQaOpenClawCli(params: {
allowNonZero?: boolean;
args: string[];
@@ -104,8 +131,11 @@ export function startMatrixQaOpenClawCli(params: {
const stdout: Buffer[] = [];
const stderr: Buffer[] = [];
let closed = false;
let closeError: Error | undefined;
let closeResult: MatrixQaCliRunResult | undefined;
let timedOut = false;
let forceKillTimeout: NodeJS.Timeout | undefined;
let forceSettleTimeout: NodeJS.Timeout | undefined;
let settleWait:
| {
reject: (error: Error) => void;
@@ -115,6 +145,7 @@ export function startMatrixQaOpenClawCli(params: {
const child = startOpenClawCliProcess(process.execPath, [distEntryPath, ...params.args], {
cwd,
detached: process.platform !== "win32",
env: params.env,
stdio: ["pipe", "pipe", "pipe"],
});
@@ -127,6 +158,7 @@ export function startMatrixQaOpenClawCli(params: {
return;
}
closed = true;
closeError = error;
closeResult = result;
if (!settleWait) {
return;
@@ -140,30 +172,20 @@ export function startMatrixQaOpenClawCli(params: {
const timeout = setTimeout(() => {
timedOut = true;
child.kill("SIGTERM");
setTimeout(() => {
const result = buildMatrixQaCliResult({
args: params.args,
exitCode: 1,
output: readOutput(),
});
finish(
result,
new Error(
[
`${formatMatrixQaCliCommand(params.args)} timed out after ${params.timeoutMs}ms`,
result.stderr.trim()
? `stderr:\n${redactMatrixQaCliOutput(result.stderr.trim())}`
: null,
result.stdout.trim()
? `stdout:\n${redactMatrixQaCliOutput(result.stdout.trim())}`
: null,
]
.filter(Boolean)
.join("\n"),
),
);
}, 25);
killMatrixQaCliChild(child, "SIGTERM");
forceKillTimeout = setTimeout(() => {
if (!closed) {
killMatrixQaCliChild(child, "SIGKILL");
forceSettleTimeout = setTimeout(() => {
const result = buildMatrixQaCliResult({
args: params.args,
exitCode: 1,
output: readOutput(),
});
finish(result, new Error(formatMatrixQaCliTimeoutError(result, params.timeoutMs)));
}, MATRIX_QA_CLI_TIMEOUT_FORCE_SETTLE_MS);
}
}, MATRIX_QA_CLI_TIMEOUT_KILL_GRACE_MS);
}, params.timeoutMs);
child.stdout.on("data", (chunk) => stdout.push(Buffer.from(chunk)));
@@ -173,6 +195,12 @@ export function startMatrixQaOpenClawCli(params: {
}
child.on("error", (error) => {
clearTimeout(timeout);
if (forceKillTimeout) {
clearTimeout(forceKillTimeout);
}
if (forceSettleTimeout) {
clearTimeout(forceSettleTimeout);
}
finish(
buildMatrixQaCliResult({
args: params.args,
@@ -184,14 +212,21 @@ export function startMatrixQaOpenClawCli(params: {
});
child.on("close", (exitCode) => {
clearTimeout(timeout);
if (timedOut) {
return;
if (forceKillTimeout) {
clearTimeout(forceKillTimeout);
}
if (forceSettleTimeout) {
clearTimeout(forceSettleTimeout);
}
const result = buildMatrixQaCliResult({
args: params.args,
exitCode: exitCode ?? 1,
output: readOutput(),
});
if (timedOut) {
finish(result, new Error(formatMatrixQaCliTimeoutError(result, params.timeoutMs)));
return;
}
if (result.exitCode !== 0 && params.allowNonZero !== true) {
finish(result, new Error(formatMatrixQaCliExitError(result)));
return;
@@ -210,7 +245,9 @@ export function startMatrixQaOpenClawCli(params: {
wait: async () =>
await new Promise<MatrixQaCliRunResult>((resolve, reject) => {
if (closed && closeResult) {
if (closeResult.exitCode === 0 || params.allowNonZero === true) {
if (closeError) {
reject(closeError);
} else if (closeResult.exitCode === 0 || params.allowNonZero === true) {
resolve(closeResult);
} else {
reject(new Error(formatMatrixQaCliExitError(closeResult)));
@@ -248,7 +285,7 @@ export function startMatrixQaOpenClawCli(params: {
},
kill: () => {
if (!closed) {
child.kill("SIGTERM");
killMatrixQaCliChild(child, "SIGTERM");
}
},
};