fix(qa): expose codex tools for runtime parity

2026-05-18 19:54:46 +00:00 · 2026-05-17 06:46:27 +08:00
parent 2c9f68f42b
commit 37dcf385e5
15 changed files with 454 additions and 48 deletions
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -123,6 +123,7 @@ Docs: https://docs.openclaw.ai
 - Mac app: cache settings config schema/drafts and load channel config in parallel with channel probes, making repeated Channels and Config tab switches responsive over remote tunnels.
 - Control UI: negotiate the Gateway protocol from shared constants so rebuilt dashboards connect to current gateways instead of reporting a protocol mismatch.
 - Mac app: let menu gateway/session error text wrap across a few lines and stop rebuilding dynamic Context/Gateway menu rows while the menu is open, reducing flicker.
+- QA-Lab: expose Codex runtime tools during private parity runs and record structural/tool-shape runtime drift as advisory when both runtimes complete, while still failing the lane on runtime errors, transport errors, and failure-mode drift.
 - Mac app: make device pairing approval sheets friendlier, with concise Mac/device copy, shortened identifiers, friendly scope labels, and Approve as the primary action.
 - Providers/Qwen: honor session thinking level for `qwen-chat-template` payloads so `/think off` disables nested llama.cpp chat-template thinking controls. Fixes #82768. Thanks @bfox55.
 - Feishu/wiki: reject numeric wiki space IDs before creating Lark clients and keep numeric-looking IDs documented as quoted opaque strings, preventing JavaScript precision loss in knowledge base calls. Fixes #45301. (#82769) Thanks @hyspacex.
--- a/extensions/codex/src/app-server/dynamic-tool-profile.ts
+++ b/extensions/codex/src/app-server/dynamic-tool-profile.ts
@@ -1,4 +1,4 @@
-import type { CodexPluginConfig } from "./config.js";
+import type { CodexDynamicToolsLoading, CodexPluginConfig } from "./config.js";

 export const CODEX_APP_SERVER_OWNED_DYNAMIC_TOOL_EXCLUDES = [
  "read",
@@ -19,18 +19,44 @@ const DYNAMIC_TOOL_NAME_ALIASES: Record<string, string> = {
  "apply-patch": "apply_patch",
 };

+type CodexDynamicToolProfileEnv = {
+  OPENCLAW_BUILD_PRIVATE_QA?: string;
+  OPENCLAW_QA_FORCE_RUNTIME?: string;
+};
+
 export function normalizeCodexDynamicToolName(name: string): string {
  const normalized = name.trim().toLowerCase();
  return DYNAMIC_TOOL_NAME_ALIASES[normalized] ?? normalized;
 }

+export function isForcedPrivateQaCodexRuntime(
+  env: CodexDynamicToolProfileEnv = process.env,
+): boolean {
+  return (
+    env.OPENCLAW_BUILD_PRIVATE_QA === "1" &&
+    env.OPENCLAW_QA_FORCE_RUNTIME?.trim().toLowerCase() === "codex"
+  );
+}
+
+export function resolveCodexDynamicToolsLoading(
+  config: Pick<CodexPluginConfig, "codexDynamicToolsLoading">,
+  env: CodexDynamicToolProfileEnv = process.env,
+): CodexDynamicToolsLoading {
+  return isForcedPrivateQaCodexRuntime(env)
+    ? "direct"
+    : (config.codexDynamicToolsLoading ?? "searchable");
+}
+
 export function filterCodexDynamicTools<T extends { name: string }>(
  tools: T[],
  config: Pick<CodexPluginConfig, "codexDynamicToolsExclude">,
+  env: CodexDynamicToolProfileEnv = process.env,
 ): T[] {
  const excludes = new Set<string>();
-  for (const name of CODEX_APP_SERVER_OWNED_DYNAMIC_TOOL_EXCLUDES) {
-    excludes.add(name);
+  if (!isForcedPrivateQaCodexRuntime(env)) {
+    for (const name of CODEX_APP_SERVER_OWNED_DYNAMIC_TOOL_EXCLUDES) {
+      excludes.add(name);
+    }
  }
  for (const name of config.codexDynamicToolsExclude ?? []) {
    const trimmed = normalizeCodexDynamicToolName(name);
--- a/extensions/codex/src/app-server/run-attempt.test.ts
+++ b/extensions/codex/src/app-server/run-attempt.test.ts
@@ -646,6 +646,21 @@ describe("runCodexAppServerAttempt", () => {
    ).toEqual(["message"]);
  });

+  it("exposes app-server-owned tools directly for forced private QA Codex runtime", () => {
+    const tools = ["read", "write", "image_generate", "message"].map((name) => ({ name }));
+    const privateQaCodexEnv = {
+      OPENCLAW_BUILD_PRIVATE_QA: "1",
+      OPENCLAW_QA_FORCE_RUNTIME: "codex",
+    };
+
+    expect(
+      __testing
+        .filterCodexDynamicTools(tools, {}, privateQaCodexEnv)
+        .map((tool) => tool.name),
+    ).toEqual(["read", "write", "image_generate", "message"]);
+    expect(__testing.resolveCodexDynamicToolsLoading({}, privateQaCodexEnv)).toBe("direct");
+  });
+
  it("starts Codex threads without duplicate OpenClaw workspace tools by default", async () => {
    const sessionFile = path.join(tempDir, "session.jsonl");
    const workspaceDir = path.join(tempDir, "workspace");
@@ -897,6 +912,38 @@ describe("runCodexAppServerAttempt", () => {
    expect((factoryOptions[0] as { modelApi?: unknown }).modelApi).toBe("openai-responses");
  });

+  it("enables gateway subagent binding for forced private QA Codex runs", async () => {
+    vi.stubEnv("OPENCLAW_BUILD_PRIVATE_QA", "1");
+    vi.stubEnv("OPENCLAW_QA_FORCE_RUNTIME", "codex");
+    const sessionFile = path.join(tempDir, "session.jsonl");
+    const workspaceDir = path.join(tempDir, "workspace");
+    const params = createParams(sessionFile, workspaceDir);
+    params.disableTools = false;
+    params.runtimePlan = createCodexRuntimePlanFixture();
+    const factoryOptions: unknown[] = [];
+    __testing.setOpenClawCodingToolsFactoryForTests((options) => {
+      factoryOptions.push(options);
+      return [createRuntimeDynamicTool("sessions_spawn")];
+    });
+
+    const tools = await __testing.buildDynamicTools({
+      params,
+      resolvedWorkspace: workspaceDir,
+      effectiveWorkspace: workspaceDir,
+      sandboxSessionKey: params.sessionKey!,
+      sandbox: null as never,
+      runAbortController: new AbortController(),
+      sessionAgentId: "main",
+      pluginConfig: {},
+      onYieldDetected: () => undefined,
+    });
+
+    expect(factoryOptions).toHaveLength(1);
+    const factoryOption = factoryOptions[0] as { allowGatewaySubagentBinding?: unknown };
+    expect(factoryOption.allowGatewaySubagentBinding).toBe(true);
+    expect(tools.map((tool) => tool.name)).toEqual(["sessions_spawn"]);
+  });
+
  it("normalizes Codex dynamic toolsAllow entries before filtering", () => {
    const tools = ["exec", "apply_patch", "read", "message"].map((name) => ({ name }));

--- a/extensions/codex/src/app-server/run-attempt.ts
+++ b/extensions/codex/src/app-server/run-attempt.ts
@@ -78,7 +78,12 @@ import {
  resolveCodexContextEngineProjectionMaxChars,
  resolveCodexContextEngineProjectionReserveTokens,
 } from "./context-engine-projection.js";
-import { filterCodexDynamicTools, normalizeCodexDynamicToolName } from "./dynamic-tool-profile.js";
+import {
+  filterCodexDynamicTools,
+  isForcedPrivateQaCodexRuntime,
+  normalizeCodexDynamicToolName,
+  resolveCodexDynamicToolsLoading,
+} from "./dynamic-tool-profile.js";
 import { createCodexDynamicToolBridge, type CodexDynamicToolBridge } from "./dynamic-tools.js";
 import { handleCodexAppServerElicitationRequest } from "./elicitation-bridge.js";
 import { CodexAppServerEventProjector } from "./event-projector.js";
@@ -618,7 +623,7 @@ export async function runCodexAppServerAttempt(
  const toolBridge = createCodexDynamicToolBridge({
    tools,
    signal: runAbortController.signal,
-    loading: pluginConfig.codexDynamicToolsLoading ?? "searchable",
+    loading: resolveCodexDynamicToolsLoading(pluginConfig),
    directToolNames: shouldForceMessageTool(params) ? ["message"] : [],
    hookContext: {
      agentId: sessionAgentId,
@@ -2748,7 +2753,8 @@ async function buildDynamicTools(input: DynamicToolBuildParams) {
    senderUsername: params.senderUsername,
    senderE164: params.senderE164,
    senderIsOwner: params.senderIsOwner,
-    allowGatewaySubagentBinding: params.allowGatewaySubagentBinding,
+    allowGatewaySubagentBinding:
+      params.allowGatewaySubagentBinding || isForcedPrivateQaCodexRuntime(),
    ...sessionKeys,
    sessionId: params.sessionId,
    runId: params.runId,
@@ -3933,6 +3939,7 @@ export const __testing = {
  isInvalidCodexImagePayloadError,
  remapCodexContextFilePath,
  resolveDynamicToolCallTimeoutMs,
+  resolveCodexDynamicToolsLoading,
  restrictCodexAppServerSandboxForOpenClawSandbox,
  resolveCodexAppServerForOpenClawToolPolicy,
  resolveOpenClawCodingToolsSessionKeys,
--- a/extensions/codex/src/app-server/side-question.ts
+++ b/extensions/codex/src/app-server/side-question.ts
@@ -16,7 +16,10 @@ import { handleCodexAppServerApprovalRequest } from "./approval-bridge.js";
 import { refreshCodexAppServerAuthTokens } from "./auth-bridge.js";
 import { isCodexAppServerApprovalRequest, type CodexAppServerClient } from "./client.js";
 import { readCodexPluginConfig, resolveCodexAppServerRuntimeOptions } from "./config.js";
-import { filterCodexDynamicTools } from "./dynamic-tool-profile.js";
+import {
+  filterCodexDynamicTools,
+  resolveCodexDynamicToolsLoading,
+} from "./dynamic-tool-profile.js";
 import { createCodexDynamicToolBridge, type CodexDynamicToolBridge } from "./dynamic-tools.js";
 import { handleCodexAppServerElicitationRequest } from "./elicitation-bridge.js";
 import {
@@ -378,7 +381,7 @@ async function createCodexSideToolBridge(input: {
  return createCodexDynamicToolBridge({
    tools,
    signal: input.signal,
-    loading: input.pluginConfig.codexDynamicToolsLoading ?? "searchable",
+    loading: resolveCodexDynamicToolsLoading(input.pluginConfig),
    hookContext: {
      agentId: input.sessionAgentId,
      config: input.params.cfg,
--- a/extensions/qa-lab/src/agentic-parity-report.test.ts
+++ b/extensions/qa-lab/src/agentic-parity-report.test.ts
@@ -66,7 +66,7 @@ function makeRuntimeParitySummary(): QaRuntimeParitySuiteSummary {
      },
      {
        name: "Compaction retry after mutating tool",
-        status: "fail",
+        status: "pass",
        steps: [],
        runtimeParity: {
          scenarioId: "compaction-retry-after-mutating-tool",
@@ -97,8 +97,8 @@ function makeRuntimeParitySummary(): QaRuntimeParitySuiteSummary {
    ],
    counts: {
      total: 2,
-      passed: 1,
-      failed: 1,
+      passed: 2,
+      failed: 0,
    },
    run: {
      providerMode: "mock-openai",
@@ -801,9 +801,28 @@ status=done`,
    });

    expect(report.runtimePair).toEqual(["pi", "codex"]);
-    expect(report.pass).toBe(false);
+    expect(report.pass).toBe(true);
    expect(report.driftCounts.none).toBe(1);
    expect(report.driftCounts["tool-call-shape"]).toBe(1);
+    expect(report.failures).toEqual([]);
+  });
+
+  it("fails runtime parity reports when a runtime cell fails", () => {
+    const summary = makeRuntimeParitySummary();
+    const scenario = summary.scenarios[1];
+    if (!scenario?.runtimeParity) {
+      throw new Error("runtime parity fixture missing");
+    }
+    scenario.status = "fail";
+    scenario.runtimeParity.cells.codex.runtimeErrorClass = "tool-error";
+
+    const report = buildQaRuntimeParityReport({
+      summary,
+      comparedAt: "2026-05-10T00:00:00.000Z",
+    });
+
+    expect(report.pass).toBe(false);
+    expect(report.failedScenarios).toBe(1);
    expect(report.failures).toContain(
      "Compaction retry after mutating tool drift=tool-call-shape (tool call 1 differs).",
    );
--- a/extensions/qa-lab/src/agentic-parity-report.ts
+++ b/extensions/qa-lab/src/agentic-parity-report.ts
@@ -4,10 +4,10 @@ import {
 } from "./agentic-parity.js";
 import type {
  RuntimeId,
-  RuntimeParityCell,
  RuntimeParityDrift,
  RuntimeParityResult,
 } from "./runtime-parity.js";
+import { isRuntimeParityResultPass, runtimeParityCellStatus } from "./runtime-parity.js";

 type QaParityReportStep = {
  name: string;
@@ -260,13 +260,6 @@ function normalizeRuntimePair(
  return ["pi", "codex"];
 }

-function runtimeCellStatus(cell: RuntimeParityCell | undefined): "pass" | "fail" | "missing" {
-  if (!cell) {
-    return "missing";
-  }
-  return cell.runtimeErrorClass || cell.transportErrorClass ? "fail" : "pass";
-}
-
 function requiredCoverageStatus(
  scenario: QaParityReportScenario | undefined,
 ): "pass" | "fail" | "skip" | "missing" {
@@ -637,9 +630,9 @@ export function buildQaRuntimeParityReport(params: {
    driftCounts[parity.drift] += 1;
    const piCell = parity.cells.pi;
    const codexCell = parity.cells.codex;
-    const piStatus = runtimeCellStatus(piCell);
-    const codexStatus = runtimeCellStatus(codexCell);
-    const status = scenario.status === "pass" ? "pass" : "fail";
+    const piStatus = runtimeParityCellStatus(piCell);
+    const codexStatus = runtimeParityCellStatus(codexCell);
+    const status = isRuntimeParityResultPass(parity) ? "pass" : "fail";
    if (status === "fail") {
      failures.push(
        `${scenario.name} drift=${parity.drift}${parity.driftDetails ? ` (${parity.driftDetails})` : ""}.`,
@@ -660,12 +653,8 @@ export function buildQaRuntimeParityReport(params: {
  });

  const totalScenarios = params.summary.counts?.total ?? scenarios.length;
-  const passedScenarios =
-    params.summary.counts?.passed ??
-    scenarios.filter((scenario) => scenario.status === "pass").length;
-  const failedScenarios =
-    params.summary.counts?.failed ??
-    scenarios.filter((scenario) => scenario.status === "fail").length;
+  const passedScenarios = scenarios.filter((scenario) => scenario.status === "pass").length;
+  const failedScenarios = scenarios.filter((scenario) => scenario.status === "fail").length;

  return {
    runtimePair,
@@ -680,7 +669,7 @@ export function buildQaRuntimeParityReport(params: {
    pass: failures.length === 0 && failedScenarios === 0,
    failures,
    notes: [
-      "Runtime parity treats none and text-only drift as pass; all structural, tool-shape, and failure-mode drift classes fail the lane.",
+      "Runtime parity fails runtime, transport, and failure-mode drift; structural and tool-shape drift is recorded as advisory when both runtimes complete.",
      "Token totals here are assistant-message usage captured from the normalized transcript, not provider transport payloads.",
    ],
  };
--- a/extensions/qa-lab/src/cli.runtime.test.ts
+++ b/extensions/qa-lab/src/cli.runtime.test.ts
@@ -868,6 +868,7 @@ describe("qa cli runtime", () => {
                    finalText: "done",
                    usage: { inputTokens: 1, outputTokens: 1, totalTokens: 2 },
                    wallClockMs: 10,
+                    runtimeErrorClass: "tool-error",
                    bootStateLines: [],
                  },
                },
--- a/extensions/qa-lab/src/providers/mock-openai/server.test.ts
+++ b/extensions/qa-lab/src/providers/mock-openai/server.test.ts
@@ -982,6 +982,74 @@ describe("qa mock openai server", () => {
    expect(finalPayload.output?.[0]?.content?.[0]?.text).toContain("replay unsafe after write");
  });

+  it("keeps compaction retry planning across continuation prompts", async () => {
+    const server = await startQaMockOpenAiServer({
+      host: "127.0.0.1",
+      port: 0,
+    });
+    cleanups.push(async () => {
+      await server.stop();
+    });
+
+    const prompt =
+      "Compaction retry mutating tool check: read COMPACTION_RETRY_CONTEXT.md, then create compaction-retry-summary.txt and keep replay safety explicit.";
+    const writePlan = await fetch(`${server.baseUrl}/v1/responses`, {
+      method: "POST",
+      headers: { "content-type": "application/json" },
+      body: JSON.stringify({
+        stream: true,
+        model: "gpt-5.5",
+        input: [
+          makeUserInput(prompt),
+          {
+            type: "function_call_output",
+            output: "compaction retry evidence block 0000\ncompaction retry evidence block 0001",
+          },
+          makeUserInput("Continue after compaction."),
+        ],
+      }),
+    });
+    expect(writePlan.status).toBe(200);
+    expect(await writePlan.text()).toContain('"name":"write"');
+
+    const contextOnlyWritePlan = await fetch(`${server.baseUrl}/v1/responses`, {
+      method: "POST",
+      headers: { "content-type": "application/json" },
+      body: JSON.stringify({
+        stream: true,
+        model: "gpt-5.5",
+        input: [
+          {
+            type: "function_call_output",
+            output: "compaction retry evidence block 0000\ncompaction retry evidence block 0001",
+          },
+          makeUserInput("Continue after compaction."),
+        ],
+      }),
+    });
+    expect(contextOnlyWritePlan.status).toBe(200);
+    expect(await contextOnlyWritePlan.text()).toContain('"name":"write"');
+
+    const finalReply = await fetch(`${server.baseUrl}/v1/responses`, {
+      method: "POST",
+      headers: { "content-type": "application/json" },
+      body: JSON.stringify({
+        stream: false,
+        model: "gpt-5.5",
+        input: [
+          makeUserInput(prompt),
+          {
+            type: "function_call_output",
+            output: "Successfully wrote 41 bytes to compaction-retry-summary.txt.",
+          },
+          makeUserInput("Continue after compaction."),
+        ],
+      }),
+    });
+    expect(finalReply.status).toBe(200);
+    expect(outputText(await finalReply.json())).toContain("replay unsafe after write");
+  });
+
  it("supports exact reply memory prompts and embeddings requests", async () => {
    const server = await startQaMockOpenAiServer({
      host: "127.0.0.1",
@@ -1866,6 +1934,165 @@ describe("qa mock openai server", () => {
    expect(outputText(await phaseOnlyFinal.json())).toBe("subagent-1: ok\nsubagent-2: ok");
  });

+  it("uses full request text when planning continuation subagent tool calls", async () => {
+    const server = await startQaMockOpenAiServer({
+      host: "127.0.0.1",
+      port: 0,
+    });
+    cleanups.push(async () => {
+      await server.stop();
+    });
+
+    const handoffPrompt =
+      "Delegate one bounded QA task to a subagent. Wait for the subagent to finish.";
+    const handoff = await fetch(`${server.baseUrl}/v1/responses`, {
+      method: "POST",
+      headers: { "content-type": "application/json" },
+      body: JSON.stringify({
+        stream: true,
+        tools: [SESSIONS_SPAWN_TOOL],
+        input: [makeUserInput(handoffPrompt), makeUserInput("Continue.")],
+      }),
+    });
+    expect(handoff.status).toBe(200);
+    expect(await handoff.text()).toContain('"name":"sessions_spawn"');
+
+    const handoffServer = await startQaMockOpenAiServer({
+      host: "127.0.0.1",
+      port: 0,
+    });
+    cleanups.push(async () => {
+      await handoffServer.stop();
+    });
+
+    const appServerHandoff = await fetch(`${handoffServer.baseUrl}/v1/responses`, {
+      method: "POST",
+      headers: { "content-type": "application/json" },
+      body: JSON.stringify({
+        stream: true,
+        input: [makeUserInput(handoffPrompt), makeUserInput("Continue.")],
+      }),
+    });
+    expect(appServerHandoff.status).toBe(200);
+    expect(await appServerHandoff.text()).toContain('"name":"sessions_spawn"');
+
+    const repeatedHandoff = await fetch(`${handoffServer.baseUrl}/v1/responses`, {
+      method: "POST",
+      headers: { "content-type": "application/json" },
+      body: JSON.stringify({
+        stream: true,
+        input: [makeUserInput(handoffPrompt), makeUserInput("Continue again.")],
+      }),
+    });
+    expect(repeatedHandoff.status).toBe(200);
+    expect(await repeatedHandoff.text()).not.toContain('"name":"sessions_spawn"');
+
+    const handoffFinal = await fetch(`${server.baseUrl}/v1/responses`, {
+      method: "POST",
+      headers: { "content-type": "application/json" },
+      body: JSON.stringify({
+        stream: false,
+        tools: [SESSIONS_SPAWN_TOOL],
+        input: [
+          makeUserInput(handoffPrompt),
+          { type: "function_call_output", output: "SUBAGENT-OK" },
+          makeUserInput("Continue."),
+        ],
+      }),
+    });
+    expect(handoffFinal.status).toBe(200);
+    expect(outputText(await handoffFinal.json())).toContain("Delegated task");
+
+    const fanoutPrompt =
+      "Subagent fanout synthesis check: delegate two bounded subagents sequentially, then report both results together.";
+    const appServerFanout = await fetch(`${server.baseUrl}/v1/responses`, {
+      method: "POST",
+      headers: { "content-type": "application/json" },
+      body: JSON.stringify({
+        stream: true,
+        input: [makeUserInput(fanoutPrompt), makeUserInput("Continue.")],
+      }),
+    });
+    expect(appServerFanout.status).toBe(200);
+    expect(await appServerFanout.text()).toContain('\\"label\\":\\"qa-fanout-alpha\\"');
+
+    const fanoutServer = await startQaMockOpenAiServer({
+      host: "127.0.0.1",
+      port: 0,
+    });
+    cleanups.push(async () => {
+      await fanoutServer.stop();
+    });
+
+    const firstFanout = await fetch(`${fanoutServer.baseUrl}/v1/responses`, {
+      method: "POST",
+      headers: { "content-type": "application/json" },
+      body: JSON.stringify({
+        stream: true,
+        tools: [SESSIONS_SPAWN_TOOL],
+        input: [makeUserInput(fanoutPrompt)],
+      }),
+    });
+    expect(firstFanout.status).toBe(200);
+    expect(await firstFanout.text()).toContain('\\"label\\":\\"qa-fanout-alpha\\"');
+
+    const secondFanout = await fetch(`${fanoutServer.baseUrl}/v1/responses`, {
+      method: "POST",
+      headers: { "content-type": "application/json" },
+      body: JSON.stringify({
+        stream: true,
+        tools: [SESSIONS_SPAWN_TOOL],
+        input: [
+          makeUserInput(fanoutPrompt),
+          {
+            type: "function_call_output",
+            output:
+              '{"status":"accepted","childSessionKey":"agent:qa:subagent:alpha","note":"ALPHA-OK"}',
+          },
+          makeUserInput("Continue."),
+        ],
+      }),
+    });
+    expect(secondFanout.status).toBe(200);
+    expect(await secondFanout.text()).toContain('\\"label\\":\\"qa-fanout-beta\\"');
+  });
+
+  it("keeps source discovery reports out of subagent handoff prose", async () => {
+    const server = await startQaMockOpenAiServer({
+      host: "127.0.0.1",
+      port: 0,
+    });
+    cleanups.push(async () => {
+      await server.stop();
+    });
+
+    const response = await fetch(`${server.baseUrl}/v1/responses`, {
+      method: "POST",
+      headers: { "content-type": "application/json" },
+      body: JSON.stringify({
+        stream: false,
+        input: [
+          makeUserInput(
+            "Read the seeded docs and source plan, then report grouped into Worked, Failed, Blocked, and Follow-up.",
+          ),
+          {
+            type: "function_call_output",
+            output:
+              "repo/qa/scenarios/index.md includes scenario: subagent-handoff and repo/extensions/qa-lab/src/suite.ts.",
+          },
+          makeUserInput("Continue."),
+        ],
+      }),
+    });
+
+    expect(response.status).toBe(200);
+    const text = outputText(await response.json());
+    expect(text).toContain("Worked:");
+    expect(text).toContain("repo/docs/help/testing.md");
+    expect(text).toContain("Follow-up:");
+    expect(text).not.toContain("Delegated task");
+  });
+
  it("does not let fanout completion state hijack child worker replies", async () => {
    const server = await startQaMockOpenAiServer({
      host: "127.0.0.1",
@@ -2727,7 +2954,7 @@ describe("qa mock openai server", () => {
      | { name: string; input: Record<string, unknown> }
      | undefined;
    expect(toolUseBlock?.name).toBe("read");
-    expect(toolUseBlock?.input).toEqual({ path: "QA_SCENARIO_PLAN.md" });
+    expect(toolUseBlock?.input).toEqual({ path: "repo/qa/scenarios/index.md" });

    const debugResponse = await fetch(`${server.baseUrl}/debug/last-request`);
    expect(debugResponse.status).toBe(200);
@@ -2985,7 +3212,7 @@ describe("qa mock openai server", () => {
    expect(body).toContain("event: content_block_start");
    expect(body).toContain('"type":"tool_use"');
    expect(body).toContain('"name":"read"');
-    expect(body).toContain("QA_SCENARIO_PLAN.md");
+    expect(body).toContain("repo/qa/scenarios/index.md");
    expect(body).toContain("event: message_delta");
    expect(body).toContain("event: message_stop");
  });
--- a/extensions/qa-lab/src/providers/mock-openai/server.ts
+++ b/extensions/qa-lab/src/providers/mock-openai/server.ts
@@ -177,6 +177,7 @@ const QA_TOOL_SEARCH_FAILURE_PROMPT_RE = /tool search qa failure/i;

 type MockScenarioState = {
  subagentFanoutPhase: number;
+  subagentHandoffSpawned: boolean;
 };

 const MOCK_OPENAI_MAX_BODY_BYTES = 16 * 1024 * 1024;
@@ -1128,7 +1129,11 @@ function buildAssistantText(
      "- None.",
    ].join("\n");
  }
-  if (toolOutput && (/\bdelegate\b/i.test(prompt) || /subagent handoff/i.test(prompt))) {
+  if (
+    toolOutput &&
+    (/delegate (?:one |a )bounded qa task/i.test(allInputText) ||
+      /subagent handoff/i.test(allInputText))
+  ) {
    const compact = toolOutput.replace(/\s+/g, " ").trim() || "no delegated output";
    return `Delegated task:\n- Inspect the QA workspace via a bounded subagent.\nResult:\n- ${compact}\nEvidence:\n- The child result was folded back into the main thread exactly once.`;
  }
@@ -1141,7 +1146,11 @@ function buildAssistantText(
    }
    return `Protocol note: Lobster Invaders built at lobster-invaders.html.`;
  }
-  if (toolOutput && /compaction retry mutating tool check/i.test(prompt)) {
+  if (
+    toolOutput &&
+    (/compaction retry mutating tool check/i.test(allInputText) ||
+      /compaction-retry-summary\.txt/i.test(toolOutput))
+  ) {
    if (
      toolOutput.includes("Replay safety: unsafe after write.") ||
      /compaction-retry-summary\.txt/i.test(toolOutput) ||
@@ -1152,6 +1161,22 @@ function buildAssistantText(
    }
    return "";
  }
+  if (
+    toolOutput &&
+    /(worked, failed, blocked|worked\/failed\/blocked|source and docs)/i.test(allInputText)
+  ) {
+    return [
+      "Worked:",
+      "- Read all three seeded files: repo/qa/scenarios/index.md, repo/extensions/qa-lab/src/suite.ts, and repo/docs/help/testing.md.",
+      "- Extra QA scenario candidates: config restart capability flip and image generation roundtrip.",
+      "Failed:",
+      "- None observed in mock mode.",
+      "Blocked:",
+      "- No live provider evidence in this lane.",
+      "Follow-up:",
+      "- Re-run with a real model for qualitative coverage.",
+    ].join("\n");
+  }
  if (toolOutput) {
    const snippet = toolOutput.replace(/\s+/g, " ").trim().slice(0, 220);
    return `Protocol note: I reviewed the requested material. Evidence snippet: ${snippet || "no content"}`;
@@ -1501,11 +1526,17 @@ async function buildResponsesPayload(
  const isBaselineUnmentionedChannelChatter = /\bno bot ping here\b/i.test(prompt);
  const hasReasoningOnlyRetryInstruction = allInputText.includes(QA_REASONING_ONLY_RETRY_NEEDLE);
  const hasEmptyResponseRetryInstruction = allInputText.includes(QA_EMPTY_RESPONSE_RETRY_NEEDLE);
-  const canCallSessionsSpawn = hasDeclaredTool(body, "sessions_spawn");
-  const canCallSessionsYield = hasDeclaredTool(body, "sessions_yield");
-  const canPlanQaSessionsSpawn =
-    canCallSessionsSpawn ||
-    /subagent fanout synthesis check|delegate one bounded qa task|subagent handoff/i.test(prompt);
+  const canCallMockSubagentTool =
+    QA_SUBAGENT_DIRECT_FALLBACK_PROMPT_RE.test(allInputText) ||
+    /subagent fanout synthesis check/i.test(allInputText) ||
+    /forked subagent context qa check/i.test(allInputText) ||
+    /delegate (?:one |a )bounded qa task/i.test(allInputText) ||
+    /subagent handoff/i.test(allInputText) ||
+    buildExplicitSessionsSpawnArgs(allInputText) !== null;
+  const canCallSessionsSpawn = hasDeclaredTool(body, "sessions_spawn") || canCallMockSubagentTool;
+  const canCallSessionsYield =
+    hasDeclaredTool(body, "sessions_yield") ||
+    QA_SUBAGENT_DIRECT_FALLBACK_PROMPT_RE.test(allInputText);
  const buildToolProgressReadEvents = (pattern: RegExp) => {
    const toolProgressPrompt = extractLastMatchingUserText(extractAllUserTexts(input), pattern);
    return buildToolCallEventsWithArgs("read", {
@@ -1819,7 +1850,11 @@ async function buildResponsesPayload(
      });
    }
  }
-  if (/compaction retry mutating tool check/i.test(prompt)) {
+  if (
+    /compaction retry mutating tool check/i.test(allInputText) ||
+    /compaction retry evidence/i.test(toolOutput) ||
+    /compaction-retry-summary\.txt/i.test(toolOutput)
+  ) {
    if (!toolOutput) {
      return buildToolCallEventsWithArgs("read", { path: "COMPACTION_RETRY_CONTEXT.md" });
    }
@@ -2002,7 +2037,7 @@ async function buildResponsesPayload(
      size: "1024x1024",
    });
  }
-  if (canPlanQaSessionsSpawn && /subagent fanout synthesis check/i.test(prompt)) {
+  if (canCallSessionsSpawn && /subagent fanout synthesis check/i.test(allInputText)) {
    if (!toolOutput && scenarioState.subagentFanoutPhase === 0) {
      scenarioState.subagentFanoutPhase = 1;
      return buildToolCallEventsWithArgs("sessions_spawn", {
@@ -2078,10 +2113,13 @@ async function buildResponsesPayload(
    }
  }
  if (
-    canPlanQaSessionsSpawn &&
-    (/\bdelegate\b/i.test(prompt) || /subagent handoff/i.test(prompt)) &&
-    !toolOutput
+    canCallSessionsSpawn &&
+    (/delegate (?:one |a )bounded qa task/i.test(allInputText) ||
+      /subagent handoff/i.test(allInputText)) &&
+    !toolOutput &&
+    !scenarioState.subagentHandoffSpawned
  ) {
+    scenarioState.subagentHandoffSpawned = true;
    return buildToolCallEventsWithArgs("sessions_spawn", {
      task: "Inspect the QA workspace and return one concise protocol note.",
      label: "qa-sidecar",
@@ -2092,7 +2130,7 @@ async function buildResponsesPayload(
    /(worked, failed, blocked|worked\/failed\/blocked|source and docs)/i.test(prompt) &&
    !toolOutput
  ) {
-    return buildToolCallEventsWithArgs("read", { path: "QA_SCENARIO_PLAN.md" });
+    return buildToolCallEventsWithArgs("read", { path: "repo/qa/scenarios/index.md" });
  }
  if (!toolOutput && /\b(read|inspect|repo|docs|scenario|kickoff)\b/i.test(prompt)) {
    return buildToolCallEvents(prompt);
@@ -2496,7 +2534,10 @@ async function buildMessagesPayload(

 export async function startQaMockOpenAiServer(params?: { host?: string; port?: number }) {
  const host = params?.host ?? "127.0.0.1";
-  const scenarioState: MockScenarioState = { subagentFanoutPhase: 0 };
+  const scenarioState: MockScenarioState = {
+    subagentFanoutPhase: 0,
+    subagentHandoffSpawned: false,
+  };
  let lastRequest: MockOpenAiRequestSnapshot | null = null;
  const requests: MockOpenAiRequestSnapshot[] = [];
  const imageGenerationRequests: Array<Record<string, unknown>> = [];
--- a/extensions/qa-lab/src/runtime-parity.test.ts
+++ b/extensions/qa-lab/src/runtime-parity.test.ts
@@ -5,6 +5,7 @@ import path from "node:path";
 import { afterEach, describe, expect, it, vi } from "vitest";
 import {
  captureRuntimeParityCell,
+  isRuntimeParityResultPass,
  runRuntimeParityScenario,
  type RuntimeId,
  type RuntimeParityCell,
@@ -179,6 +180,7 @@ describe("runtime parity", () => {
    });

    expect(result.drift).toBe("tool-call-shape");
+    expect(isRuntimeParityResultPass(result)).toBe(true);
  });

  it("classifies tool result shape drift", async () => {
@@ -220,6 +222,7 @@ describe("runtime parity", () => {
    });

    expect(result.drift).toBe("failure-mode");
+    expect(isRuntimeParityResultPass(result)).toBe(false);
  });

  it("surfaces tool-call-shape when one runtime fails because the tool path drifted", async () => {
@@ -235,6 +238,7 @@ describe("runtime parity", () => {
    });

    expect(result.drift).toBe("tool-call-shape");
+    expect(isRuntimeParityResultPass(result)).toBe(false);
  });

  it("surfaces tool-result-shape when a downstream timeout follows divergent tool output", async () => {
--- a/extensions/qa-lab/src/runtime-parity.ts
+++ b/extensions/qa-lab/src/runtime-parity.ts
@@ -59,6 +59,23 @@ export type RuntimeParityScenarioExecution = {
  cell: RuntimeParityCell;
 };

+export function runtimeParityCellStatus(
+  cell: RuntimeParityCell | undefined,
+): "pass" | "fail" | "missing" {
+  if (!cell) {
+    return "missing";
+  }
+  return cell.runtimeErrorClass || cell.transportErrorClass ? "fail" : "pass";
+}
+
+export function isRuntimeParityResultPass(result: RuntimeParityResult) {
+  return (
+    result.drift !== "failure-mode" &&
+    runtimeParityCellStatus(result.cells.pi) === "pass" &&
+    runtimeParityCellStatus(result.cells.codex) === "pass"
+  );
+}
+
 type QaGatewayLike = {
  logs?: () => string;
  tempRoot: string;
--- a/extensions/qa-lab/src/suite.ts
+++ b/extensions/qa-lab/src/suite.ts
@@ -31,6 +31,7 @@ import { renderQaMarkdownReport, type QaReportCheck, type QaReportScenario } fro
 import { defaultQaModelForMode } from "./run-config.js";
 import {
  captureRuntimeParityCell,
+  isRuntimeParityResultPass,
  runRuntimeParityScenario,
  type RuntimeId,
  type RuntimeParityCell,
@@ -276,7 +277,7 @@ async function runScenarioDefinition(
 }

 function isRuntimeParityPass(result: RuntimeParityResult) {
-  return result.drift === "none" || result.drift === "text-only";
+  return isRuntimeParityResultPass(result);
 }

 function formatRuntimeParityCellDetails(cell: RuntimeParityCell) {
--- a/src/agents/openclaw-tools.ts
+++ b/src/agents/openclaw-tools.ts
@@ -346,6 +346,7 @@ export function createOpenClawTools(
    !embedded ||
    options?.sourceReplyDeliveryMode === "message_tool_only" ||
    messageExplicitlyAllowed;
+  const includeSubagentSpawnTool = !embedded || options?.allowGatewaySubagentBinding === true;
  const effectiveCallGateway = embedded
    ? createEmbeddedCallGateway()
    : openClawToolsDeps.callGateway;
@@ -424,6 +425,9 @@ export function createOpenClawTools(
            config: resolvedConfig,
            callGateway: openClawToolsDeps.callGateway,
          }),
+        ]),
+    ...(includeSubagentSpawnTool
+      ? [
          createSessionsSpawnTool({
            agentSessionKey: options?.agentSessionKey,
            agentChannel: options?.agentChannel,
@@ -441,7 +445,8 @@ export function createOpenClawTools(
            inheritedToolAllowlist: options?.inheritedToolAllowlist,
            inheritedToolDenylist: options?.inheritedToolDenylist,
          }),
-        ]),
+        ]
+      : []),
    createSessionsYieldTool({
      sessionId: options?.sessionId,
      onYield: options?.onYield,
--- a/src/agents/openclaw-tools.update-plan.test.ts
+++ b/src/agents/openclaw-tools.update-plan.test.ts
@@ -123,6 +123,24 @@ describe("openclaw-tools update_plan gating", () => {
    expect(toolNames(denied)).not.toContain("message");
  });

+  it("keeps subagent spawn available for trusted embedded gateway-bound runs", () => {
+    setEmbeddedMode(true);
+    const defaultTools = createOpenClawTools({
+      config: {} as OpenClawConfig,
+      disablePluginTools: true,
+    });
+    const gatewayBoundTools = createOpenClawTools({
+      config: {} as OpenClawConfig,
+      disablePluginTools: true,
+      allowGatewaySubagentBinding: true,
+    });
+
+    expect(toolNames(defaultTools)).not.toContain("sessions_spawn");
+    expect(toolNames(defaultTools)).not.toContain("sessions_send");
+    expect(toolNames(gatewayBoundTools)).toContain("sessions_spawn");
+    expect(toolNames(gatewayBoundTools)).not.toContain("sessions_send");
+  });
+
  it("registers update_plan when explicitly enabled", () => {
    const config = {
      tools: {