fix: stop test-parallel from waiting forever on child close

2026-05-14 00:40:43 +00:00 · 2026-03-27 12:20:51 +00:00
parent f625a0b106
commit ac2c2ac954
2 changed files with 135 additions and 46 deletions
--- a/scripts/test-planner/executor.mjs
+++ b/scripts/test-planner/executor.mjs
@@ -280,6 +280,7 @@ export async function executePlan(plan, options = {}) {
  const heapSnapshotEnabled =
    process.platform !== "win32" && heapSnapshotIntervalMs >= heapSnapshotMinIntervalMs;
  const heapSnapshotSignal = env.OPENCLAW_TEST_HEAPSNAPSHOT_SIGNAL?.trim() || "SIGUSR2";
+  const closeGraceMs = Math.max(100, parseEnvNumber("OPENCLAW_TEST_CLOSE_GRACE_MS", 2000));
  const heapSnapshotBaseDir = heapSnapshotEnabled
    ? path.resolve(
        env.OPENCLAW_TEST_HEAPSNAPSHOT_DIR?.trim() ||
@@ -382,11 +383,14 @@ export async function executePlan(plan, options = {}) {
      let pendingLine = "";
      let memoryPollTimer = null;
      let heapSnapshotTimer = null;
+      let closeFallbackTimer = null;
      const memoryFileRecords = [];
      let initialTreeSample = null;
      let latestTreeSample = null;
      let peakTreeSample = null;
      let heapSnapshotSequence = 0;
+      let childExitState = null;
+      let settled = false;
      const updatePeakTreeSample = (sample, reason) => {
        if (!sample) {
          return;
@@ -505,6 +509,72 @@ export async function executePlan(plan, options = {}) {
          } top=${topGrowthFiles.length > 0 ? topGrowthFiles.join(", ") : "none"}`,
        );
      };
+      const clearChildTimers = () => {
+        if (memoryPollTimer) {
+          clearInterval(memoryPollTimer);
+          memoryPollTimer = null;
+        }
+        if (heapSnapshotTimer) {
+          clearInterval(heapSnapshotTimer);
+          heapSnapshotTimer = null;
+        }
+        if (closeFallbackTimer) {
+          clearTimeout(closeFallbackTimer);
+          closeFallbackTimer = null;
+        }
+      };
+      const finalizeRun = (code, signal, source = "close") => {
+        if (settled) {
+          return;
+        }
+        settled = true;
+        clearChildTimers();
+        children.delete(child);
+        const resolvedCode = resolveTestRunExitCode({
+          code,
+          signal,
+          output,
+          fatalSeen,
+          childError,
+        });
+        const elapsedMs = Date.now() - startedAt;
+        logMemoryTraceSummary();
+        if (resolvedCode !== 0) {
+          const failureTail = formatCapturedOutputTail(output);
+          const failureArtifactPath = artifacts.writeTempJsonArtifact(`${artifactStem}-failure`, {
+            entry: unit.id,
+            command: [pnpmInvocation.command, ...spawnArgs],
+            elapsedMs,
+            error: childError ? String(childError) : null,
+            exitCode: resolvedCode,
+            fatalSeen,
+            logPath: laneLogPath,
+            outputTail: failureTail,
+            signal: signal ?? null,
+          });
+          if (failureTail) {
+            console.error(`[test-parallel] failure tail ${unit.id}\n${failureTail}`);
+          }
+          console.error(
+            `[test-parallel] failure artifacts ${unit.id} log=${laneLogPath} meta=${failureArtifactPath}`,
+          );
+        }
+        if (source !== "close") {
+          laneLogStream.write(
+            `\n[test-parallel] finalize source=${source} after child exit without close\n`,
+          );
+        }
+        laneLogStream.write(
+          `\n[test-parallel] done ${unit.id} code=${String(resolvedCode)} signal=${
+            signal ?? "none"
+          } elapsed=${formatElapsedMs(elapsedMs)}\n`,
+        );
+        laneLogStream.end();
+        console.log(
+          `[test-parallel] done ${unit.id} code=${String(resolvedCode)} elapsed=${formatElapsedMs(elapsedMs)}`,
+        );
+        resolve(resolvedCode);
+      };
      try {
        const childEnv = {
          ...env,
@@ -565,53 +635,20 @@ export async function executePlan(plan, options = {}) {
        laneLogStream.write(`\n[test-parallel] child error: ${String(err)}\n`);
        console.error(`[test-parallel] child error: ${String(err)}`);
      });
+      child.on("exit", (code, signal) => {
+        childExitState = { code, signal };
+        if (settled || closeFallbackTimer) {
+          return;
+        }
+        closeFallbackTimer = setTimeout(() => {
+          child.stdout?.destroy();
+          child.stderr?.destroy();
+          finalizeRun(code, signal, "exit-timeout");
+        }, closeGraceMs);
+        closeFallbackTimer.unref?.();
+      });
      child.on("close", (code, signal) => {
-        if (memoryPollTimer) {
-          clearInterval(memoryPollTimer);
-        }
-        if (heapSnapshotTimer) {
-          clearInterval(heapSnapshotTimer);
-        }
-        children.delete(child);
-        const resolvedCode = resolveTestRunExitCode({
-          code,
-          signal,
-          output,
-          fatalSeen,
-          childError,
-        });
-        const elapsedMs = Date.now() - startedAt;
-        logMemoryTraceSummary();
-        if (resolvedCode !== 0) {
-          const failureTail = formatCapturedOutputTail(output);
-          const failureArtifactPath = artifacts.writeTempJsonArtifact(`${artifactStem}-failure`, {
-            entry: unit.id,
-            command: [pnpmInvocation.command, ...spawnArgs],
-            elapsedMs,
-            error: childError ? String(childError) : null,
-            exitCode: resolvedCode,
-            fatalSeen,
-            logPath: laneLogPath,
-            outputTail: failureTail,
-            signal: signal ?? null,
-          });
-          if (failureTail) {
-            console.error(`[test-parallel] failure tail ${unit.id}\n${failureTail}`);
-          }
-          console.error(
-            `[test-parallel] failure artifacts ${unit.id} log=${laneLogPath} meta=${failureArtifactPath}`,
-          );
-        }
-        laneLogStream.write(
-          `\n[test-parallel] done ${unit.id} code=${String(resolvedCode)} signal=${
-            signal ?? "none"
-          } elapsed=${formatElapsedMs(elapsedMs)}\n`,
-        );
-        laneLogStream.end();
-        console.log(
-          `[test-parallel] done ${unit.id} code=${String(resolvedCode)} elapsed=${formatElapsedMs(elapsedMs)}`,
-        );
-        resolve(resolvedCode);
+        finalizeRun(childExitState?.code ?? code, childExitState?.signal ?? signal, "close");
      });
    });

--- a/test/scripts/test-planner.executor-fallback.test.ts
+++ b/test/scripts/test-planner.executor-fallback.test.ts
@@ -0,0 +1,52 @@
+import { EventEmitter } from "node:events";
+import { PassThrough } from "node:stream";
+import { afterEach, describe, expect, it, vi } from "vitest";
+import { importFreshModule } from "../helpers/import-fresh.js";
+
+afterEach(() => {
+  vi.restoreAllMocks();
+  vi.resetModules();
+});
+
+describe("test planner executor", () => {
+  it("falls back to child exit when close never arrives", async () => {
+    const stdout = new PassThrough();
+    const stderr = new PassThrough();
+    const fakeChild = Object.assign(new EventEmitter(), {
+      stdout,
+      stderr,
+      pid: 12345,
+      kill: vi.fn(),
+    });
+    const spawnMock = vi.fn(() => {
+      setTimeout(() => {
+        fakeChild.emit("exit", 0, null);
+      }, 0);
+      return fakeChild;
+    });
+    vi.doMock("node:child_process", () => ({
+      spawn: spawnMock,
+    }));
+
+    const { executePlan, createExecutionArtifacts } = await importFreshModule<
+      typeof import("../../scripts/test-planner/executor.mjs")
+    >(import.meta.url, "../../scripts/test-planner/executor.mjs?scope=exit-fallback");
+    const artifacts = createExecutionArtifacts({ OPENCLAW_TEST_CLOSE_GRACE_MS: "10" });
+    const executePromise = executePlan(
+      {
+        passthroughMetadataOnly: true,
+        passthroughOptionArgs: [],
+        runtimeCapabilities: { isWindowsCi: false, isCI: false, isWindows: false },
+      },
+      {
+        env: { OPENCLAW_TEST_CLOSE_GRACE_MS: "10" },
+        artifacts,
+      },
+    );
+
+    await expect(executePromise).resolves.toBe(0);
+    expect(spawnMock).toHaveBeenCalledTimes(1);
+
+    artifacts.cleanupTempArtifacts();
+  });
+});