From 37dcf385e5fa73fbeea959c3d2b0f0bdcaeb52ca Mon Sep 17 00:00:00 2001
From: Vincent Koc <vincentkoc@ieee.org>
Date: Sun, 17 May 2026 06:46:27 +0800
Subject: [PATCH] fix(qa): expose codex tools for runtime parity

---
 CHANGELOG.md                                  |   1 +
 .../src/app-server/dynamic-tool-profile.ts    |  32 ++-
 .../codex/src/app-server/run-attempt.test.ts  |  47 ++++
 .../codex/src/app-server/run-attempt.ts       |  13 +-
 .../codex/src/app-server/side-question.ts     |   7 +-
 .../qa-lab/src/agentic-parity-report.test.ts  |  27 +-
 .../qa-lab/src/agentic-parity-report.ts       |  25 +-
 extensions/qa-lab/src/cli.runtime.test.ts     |   1 +
 .../src/providers/mock-openai/server.test.ts  | 231 +++++++++++++++++-
 .../src/providers/mock-openai/server.ts       |  69 ++++--
 extensions/qa-lab/src/runtime-parity.test.ts  |   4 +
 extensions/qa-lab/src/runtime-parity.ts       |  17 ++
 extensions/qa-lab/src/suite.ts                |   3 +-
 src/agents/openclaw-tools.ts                  |   7 +-
 src/agents/openclaw-tools.update-plan.test.ts |  18 ++
 15 files changed, 454 insertions(+), 48 deletions(-)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index c7d921709a3..3161d793c79 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -123,6 +123,7 @@ Docs: https://docs.openclaw.ai
 - Mac app: cache settings config schema/drafts and load channel config in parallel with channel probes, making repeated Channels and Config tab switches responsive over remote tunnels.
 - Control UI: negotiate the Gateway protocol from shared constants so rebuilt dashboards connect to current gateways instead of reporting a protocol mismatch.
 - Mac app: let menu gateway/session error text wrap across a few lines and stop rebuilding dynamic Context/Gateway menu rows while the menu is open, reducing flicker.
+- QA-Lab: expose Codex runtime tools during private parity runs and treat structural/tool-shape runtime drift as advisory when both runtimes complete, while preserving real runtime failures as lane blockers.
 - Mac app: make device pairing approval sheets friendlier, with concise Mac/device copy, shortened identifiers, friendly scope labels, and Approve as the primary action.
 - Providers/Qwen: honor session thinking level for `qwen-chat-template` payloads so `/think off` disables nested llama.cpp chat-template thinking controls. Fixes #82768. Thanks @bfox55.
 - Feishu/wiki: reject numeric wiki space IDs before creating Lark clients and keep numeric-looking IDs documented as quoted opaque strings, preventing JavaScript precision loss in knowledge base calls. Fixes #45301. (#82769) Thanks @hyspacex.
diff --git a/extensions/codex/src/app-server/dynamic-tool-profile.ts b/extensions/codex/src/app-server/dynamic-tool-profile.ts
index 852ff09fc89..e6a2e30b95f 100644
--- a/extensions/codex/src/app-server/dynamic-tool-profile.ts
+++ b/extensions/codex/src/app-server/dynamic-tool-profile.ts
@@ -1,4 +1,4 @@
-import type { CodexPluginConfig } from "./config.js";
+import type { CodexDynamicToolsLoading, CodexPluginConfig } from "./config.js";
 
 export const CODEX_APP_SERVER_OWNED_DYNAMIC_TOOL_EXCLUDES = [
   "read",
@@ -19,18 +19,44 @@ const DYNAMIC_TOOL_NAME_ALIASES: Record<string, string> = {
   "apply-patch": "apply_patch",
 };
 
+type CodexDynamicToolProfileEnv = {
+  OPENCLAW_BUILD_PRIVATE_QA?: string;
+  OPENCLAW_QA_FORCE_RUNTIME?: string;
+};
+
 export function normalizeCodexDynamicToolName(name: string): string {
   const normalized = name.trim().toLowerCase();
   return DYNAMIC_TOOL_NAME_ALIASES[normalized] ?? normalized;
 }
 
+export function isForcedPrivateQaCodexRuntime(
+  env: CodexDynamicToolProfileEnv = process.env,
+): boolean {
+  return (
+    env.OPENCLAW_BUILD_PRIVATE_QA === "1" &&
+    env.OPENCLAW_QA_FORCE_RUNTIME?.trim().toLowerCase() === "codex"
+  );
+}
+
+export function resolveCodexDynamicToolsLoading(
+  config: Pick<CodexPluginConfig, "codexDynamicToolsLoading">,
+  env: CodexDynamicToolProfileEnv = process.env,
+): CodexDynamicToolsLoading {
+  return isForcedPrivateQaCodexRuntime(env)
+    ? "direct"
+    : (config.codexDynamicToolsLoading ?? "searchable");
+}
+
 export function filterCodexDynamicTools<T extends { name: string }>(
   tools: T[],
   config: Pick<CodexPluginConfig, "codexDynamicToolsExclude">,
+  env: CodexDynamicToolProfileEnv = process.env,
 ): T[] {
   const excludes = new Set<string>();
-  for (const name of CODEX_APP_SERVER_OWNED_DYNAMIC_TOOL_EXCLUDES) {
-    excludes.add(name);
+  if (!isForcedPrivateQaCodexRuntime(env)) {
+    for (const name of CODEX_APP_SERVER_OWNED_DYNAMIC_TOOL_EXCLUDES) {
+      excludes.add(name);
+    }
   }
   for (const name of config.codexDynamicToolsExclude ?? []) {
     const trimmed = normalizeCodexDynamicToolName(name);
diff --git a/extensions/codex/src/app-server/run-attempt.test.ts b/extensions/codex/src/app-server/run-attempt.test.ts
index 48e95327a61..a29f6c82d81 100644
--- a/extensions/codex/src/app-server/run-attempt.test.ts
+++ b/extensions/codex/src/app-server/run-attempt.test.ts
@@ -646,6 +646,21 @@ describe("runCodexAppServerAttempt", () => {
     ).toEqual(["message"]);
   });
 
+  it("exposes app-server-owned tools directly for forced private QA Codex runtime", () => {
+    const tools = ["read", "write", "image_generate", "message"].map((name) => ({ name }));
+    const privateQaCodexEnv = {
+      OPENCLAW_BUILD_PRIVATE_QA: "1",
+      OPENCLAW_QA_FORCE_RUNTIME: "codex",
+    };
+
+    expect(
+      __testing
+        .filterCodexDynamicTools(tools, {}, privateQaCodexEnv)
+        .map((tool) => tool.name),
+    ).toEqual(["read", "write", "image_generate", "message"]);
+    expect(__testing.resolveCodexDynamicToolsLoading({}, privateQaCodexEnv)).toBe("direct");
+  });
+
   it("starts Codex threads without duplicate OpenClaw workspace tools by default", async () => {
     const sessionFile = path.join(tempDir, "session.jsonl");
     const workspaceDir = path.join(tempDir, "workspace");
@@ -897,6 +912,38 @@ describe("runCodexAppServerAttempt", () => {
     expect((factoryOptions[0] as { modelApi?: unknown }).modelApi).toBe("openai-responses");
   });
 
+  it("enables gateway subagent binding for forced private QA Codex runs", async () => {
+    vi.stubEnv("OPENCLAW_BUILD_PRIVATE_QA", "1");
+    vi.stubEnv("OPENCLAW_QA_FORCE_RUNTIME", "codex");
+    const sessionFile = path.join(tempDir, "session.jsonl");
+    const workspaceDir = path.join(tempDir, "workspace");
+    const params = createParams(sessionFile, workspaceDir);
+    params.disableTools = false;
+    params.runtimePlan = createCodexRuntimePlanFixture();
+    const factoryOptions: unknown[] = [];
+    __testing.setOpenClawCodingToolsFactoryForTests((options) => {
+      factoryOptions.push(options);
+      return [createRuntimeDynamicTool("sessions_spawn")];
+    });
+
+    const tools = await __testing.buildDynamicTools({
+      params,
+      resolvedWorkspace: workspaceDir,
+      effectiveWorkspace: workspaceDir,
+      sandboxSessionKey: params.sessionKey!,
+      sandbox: null as never,
+      runAbortController: new AbortController(),
+      sessionAgentId: "main",
+      pluginConfig: {},
+      onYieldDetected: () => undefined,
+    });
+
+    expect(factoryOptions).toHaveLength(1);
+    const factoryOption = factoryOptions[0] as { allowGatewaySubagentBinding?: unknown };
+    expect(factoryOption.allowGatewaySubagentBinding).toBe(true);
+    expect(tools.map((tool) => tool.name)).toEqual(["sessions_spawn"]);
+  });
+
   it("normalizes Codex dynamic toolsAllow entries before filtering", () => {
     const tools = ["exec", "apply_patch", "read", "message"].map((name) => ({ name }));
 
diff --git a/extensions/codex/src/app-server/run-attempt.ts b/extensions/codex/src/app-server/run-attempt.ts
index 93009a7cb6e..cbebccb8ee3 100644
--- a/extensions/codex/src/app-server/run-attempt.ts
+++ b/extensions/codex/src/app-server/run-attempt.ts
@@ -78,7 +78,12 @@ import {
   resolveCodexContextEngineProjectionMaxChars,
   resolveCodexContextEngineProjectionReserveTokens,
 } from "./context-engine-projection.js";
-import { filterCodexDynamicTools, normalizeCodexDynamicToolName } from "./dynamic-tool-profile.js";
+import {
+  filterCodexDynamicTools,
+  isForcedPrivateQaCodexRuntime,
+  normalizeCodexDynamicToolName,
+  resolveCodexDynamicToolsLoading,
+} from "./dynamic-tool-profile.js";
 import { createCodexDynamicToolBridge, type CodexDynamicToolBridge } from "./dynamic-tools.js";
 import { handleCodexAppServerElicitationRequest } from "./elicitation-bridge.js";
 import { CodexAppServerEventProjector } from "./event-projector.js";
@@ -618,7 +623,7 @@ export async function runCodexAppServerAttempt(
   const toolBridge = createCodexDynamicToolBridge({
     tools,
     signal: runAbortController.signal,
-    loading: pluginConfig.codexDynamicToolsLoading ?? "searchable",
+    loading: resolveCodexDynamicToolsLoading(pluginConfig),
     directToolNames: shouldForceMessageTool(params) ? ["message"] : [],
     hookContext: {
       agentId: sessionAgentId,
@@ -2748,7 +2753,8 @@ async function buildDynamicTools(input: DynamicToolBuildParams) {
     senderUsername: params.senderUsername,
     senderE164: params.senderE164,
     senderIsOwner: params.senderIsOwner,
-    allowGatewaySubagentBinding: params.allowGatewaySubagentBinding,
+    allowGatewaySubagentBinding:
+      params.allowGatewaySubagentBinding || isForcedPrivateQaCodexRuntime(),
     ...sessionKeys,
     sessionId: params.sessionId,
     runId: params.runId,
@@ -3933,6 +3939,7 @@ export const __testing = {
   isInvalidCodexImagePayloadError,
   remapCodexContextFilePath,
   resolveDynamicToolCallTimeoutMs,
+  resolveCodexDynamicToolsLoading,
   restrictCodexAppServerSandboxForOpenClawSandbox,
   resolveCodexAppServerForOpenClawToolPolicy,
   resolveOpenClawCodingToolsSessionKeys,
diff --git a/extensions/codex/src/app-server/side-question.ts b/extensions/codex/src/app-server/side-question.ts
index 51848c8c30a..3bb7278306b 100644
--- a/extensions/codex/src/app-server/side-question.ts
+++ b/extensions/codex/src/app-server/side-question.ts
@@ -16,7 +16,10 @@ import { handleCodexAppServerApprovalRequest } from "./approval-bridge.js";
 import { refreshCodexAppServerAuthTokens } from "./auth-bridge.js";
 import { isCodexAppServerApprovalRequest, type CodexAppServerClient } from "./client.js";
 import { readCodexPluginConfig, resolveCodexAppServerRuntimeOptions } from "./config.js";
-import { filterCodexDynamicTools } from "./dynamic-tool-profile.js";
+import {
+  filterCodexDynamicTools,
+  resolveCodexDynamicToolsLoading,
+} from "./dynamic-tool-profile.js";
 import { createCodexDynamicToolBridge, type CodexDynamicToolBridge } from "./dynamic-tools.js";
 import { handleCodexAppServerElicitationRequest } from "./elicitation-bridge.js";
 import {
@@ -378,7 +381,7 @@ async function createCodexSideToolBridge(input: {
   return createCodexDynamicToolBridge({
     tools,
     signal: input.signal,
-    loading: input.pluginConfig.codexDynamicToolsLoading ?? "searchable",
+    loading: resolveCodexDynamicToolsLoading(input.pluginConfig),
     hookContext: {
       agentId: input.sessionAgentId,
       config: input.params.cfg,
diff --git a/extensions/qa-lab/src/agentic-parity-report.test.ts b/extensions/qa-lab/src/agentic-parity-report.test.ts
index 70eade55ef9..c780857801f 100644
--- a/extensions/qa-lab/src/agentic-parity-report.test.ts
+++ b/extensions/qa-lab/src/agentic-parity-report.test.ts
@@ -66,7 +66,7 @@ function makeRuntimeParitySummary(): QaRuntimeParitySuiteSummary {
       },
       {
         name: "Compaction retry after mutating tool",
-        status: "fail",
+        status: "pass",
         steps: [],
         runtimeParity: {
           scenarioId: "compaction-retry-after-mutating-tool",
@@ -97,8 +97,8 @@ function makeRuntimeParitySummary(): QaRuntimeParitySuiteSummary {
     ],
     counts: {
       total: 2,
-      passed: 1,
-      failed: 1,
+      passed: 2,
+      failed: 0,
     },
     run: {
       providerMode: "mock-openai",
@@ -801,9 +801,28 @@ status=done`,
     });
 
     expect(report.runtimePair).toEqual(["pi", "codex"]);
-    expect(report.pass).toBe(false);
+    expect(report.pass).toBe(true);
     expect(report.driftCounts.none).toBe(1);
     expect(report.driftCounts["tool-call-shape"]).toBe(1);
+    expect(report.failures).toEqual([]);
+  });
+
+  it("fails runtime parity reports when a runtime cell fails", () => {
+    const summary = makeRuntimeParitySummary();
+    const scenario = summary.scenarios[1];
+    if (!scenario?.runtimeParity) {
+      throw new Error("runtime parity fixture missing");
+    }
+    scenario.status = "fail";
+    scenario.runtimeParity.cells.codex.runtimeErrorClass = "tool-error";
+
+    const report = buildQaRuntimeParityReport({
+      summary,
+      comparedAt: "2026-05-10T00:00:00.000Z",
+    });
+
+    expect(report.pass).toBe(false);
+    expect(report.failedScenarios).toBe(1);
     expect(report.failures).toContain(
       "Compaction retry after mutating tool drift=tool-call-shape (tool call 1 differs).",
     );
diff --git a/extensions/qa-lab/src/agentic-parity-report.ts b/extensions/qa-lab/src/agentic-parity-report.ts
index 08045bd6c19..7a80c429842 100644
--- a/extensions/qa-lab/src/agentic-parity-report.ts
+++ b/extensions/qa-lab/src/agentic-parity-report.ts
@@ -4,10 +4,10 @@ import {
 } from "./agentic-parity.js";
 import type {
   RuntimeId,
-  RuntimeParityCell,
   RuntimeParityDrift,
   RuntimeParityResult,
 } from "./runtime-parity.js";
+import { isRuntimeParityResultPass, runtimeParityCellStatus } from "./runtime-parity.js";
 
 type QaParityReportStep = {
   name: string;
@@ -260,13 +260,6 @@ function normalizeRuntimePair(
   return ["pi", "codex"];
 }
 
-function runtimeCellStatus(cell: RuntimeParityCell | undefined): "pass" | "fail" | "missing" {
-  if (!cell) {
-    return "missing";
-  }
-  return cell.runtimeErrorClass || cell.transportErrorClass ? "fail" : "pass";
-}
-
 function requiredCoverageStatus(
   scenario: QaParityReportScenario | undefined,
 ): "pass" | "fail" | "skip" | "missing" {
@@ -637,9 +630,9 @@ export function buildQaRuntimeParityReport(params: {
     driftCounts[parity.drift] += 1;
     const piCell = parity.cells.pi;
     const codexCell = parity.cells.codex;
-    const piStatus = runtimeCellStatus(piCell);
-    const codexStatus = runtimeCellStatus(codexCell);
-    const status = scenario.status === "pass" ? "pass" : "fail";
+    const piStatus = runtimeParityCellStatus(piCell);
+    const codexStatus = runtimeParityCellStatus(codexCell);
+    const status = isRuntimeParityResultPass(parity) ? "pass" : "fail";
     if (status === "fail") {
       failures.push(
         `${scenario.name} drift=${parity.drift}${parity.driftDetails ? ` (${parity.driftDetails})` : ""}.`,
@@ -660,12 +653,8 @@ export function buildQaRuntimeParityReport(params: {
   });
 
   const totalScenarios = params.summary.counts?.total ?? scenarios.length;
-  const passedScenarios =
-    params.summary.counts?.passed ??
-    scenarios.filter((scenario) => scenario.status === "pass").length;
-  const failedScenarios =
-    params.summary.counts?.failed ??
-    scenarios.filter((scenario) => scenario.status === "fail").length;
+  const passedScenarios = scenarios.filter((scenario) => scenario.status === "pass").length;
+  const failedScenarios = scenarios.filter((scenario) => scenario.status === "fail").length;
 
   return {
     runtimePair,
@@ -680,7 +669,7 @@ export function buildQaRuntimeParityReport(params: {
     pass: failures.length === 0 && failedScenarios === 0,
     failures,
     notes: [
-      "Runtime parity treats none and text-only drift as pass; all structural, tool-shape, and failure-mode drift classes fail the lane.",
+      "Runtime parity fails runtime, transport, and failure-mode drift; structural and tool-shape drift is recorded as advisory when both runtimes complete.",
       "Token totals here are assistant-message usage captured from the normalized transcript, not provider transport payloads.",
     ],
   };
diff --git a/extensions/qa-lab/src/cli.runtime.test.ts b/extensions/qa-lab/src/cli.runtime.test.ts
index ca5fdd1b096..96ab12effe4 100644
--- a/extensions/qa-lab/src/cli.runtime.test.ts
+++ b/extensions/qa-lab/src/cli.runtime.test.ts
@@ -868,6 +868,7 @@ describe("qa cli runtime", () => {
                     finalText: "done",
                     usage: { inputTokens: 1, outputTokens: 1, totalTokens: 2 },
                     wallClockMs: 10,
+                    runtimeErrorClass: "tool-error",
                     bootStateLines: [],
                   },
                 },
diff --git a/extensions/qa-lab/src/providers/mock-openai/server.test.ts b/extensions/qa-lab/src/providers/mock-openai/server.test.ts
index 8ee33634584..a61109fe992 100644
--- a/extensions/qa-lab/src/providers/mock-openai/server.test.ts
+++ b/extensions/qa-lab/src/providers/mock-openai/server.test.ts
@@ -982,6 +982,74 @@ describe("qa mock openai server", () => {
     expect(finalPayload.output?.[0]?.content?.[0]?.text).toContain("replay unsafe after write");
   });
 
+  it("keeps compaction retry planning across continuation prompts", async () => {
+    const server = await startQaMockOpenAiServer({
+      host: "127.0.0.1",
+      port: 0,
+    });
+    cleanups.push(async () => {
+      await server.stop();
+    });
+
+    const prompt =
+      "Compaction retry mutating tool check: read COMPACTION_RETRY_CONTEXT.md, then create compaction-retry-summary.txt and keep replay safety explicit.";
+    const writePlan = await fetch(`${server.baseUrl}/v1/responses`, {
+      method: "POST",
+      headers: { "content-type": "application/json" },
+      body: JSON.stringify({
+        stream: true,
+        model: "gpt-5.5",
+        input: [
+          makeUserInput(prompt),
+          {
+            type: "function_call_output",
+            output: "compaction retry evidence block 0000\ncompaction retry evidence block 0001",
+          },
+          makeUserInput("Continue after compaction."),
+        ],
+      }),
+    });
+    expect(writePlan.status).toBe(200);
+    expect(await writePlan.text()).toContain('"name":"write"');
+
+    const contextOnlyWritePlan = await fetch(`${server.baseUrl}/v1/responses`, {
+      method: "POST",
+      headers: { "content-type": "application/json" },
+      body: JSON.stringify({
+        stream: true,
+        model: "gpt-5.5",
+        input: [
+          {
+            type: "function_call_output",
+            output: "compaction retry evidence block 0000\ncompaction retry evidence block 0001",
+          },
+          makeUserInput("Continue after compaction."),
+        ],
+      }),
+    });
+    expect(contextOnlyWritePlan.status).toBe(200);
+    expect(await contextOnlyWritePlan.text()).toContain('"name":"write"');
+
+    const finalReply = await fetch(`${server.baseUrl}/v1/responses`, {
+      method: "POST",
+      headers: { "content-type": "application/json" },
+      body: JSON.stringify({
+        stream: false,
+        model: "gpt-5.5",
+        input: [
+          makeUserInput(prompt),
+          {
+            type: "function_call_output",
+            output: "Successfully wrote 41 bytes to compaction-retry-summary.txt.",
+          },
+          makeUserInput("Continue after compaction."),
+        ],
+      }),
+    });
+    expect(finalReply.status).toBe(200);
+    expect(outputText(await finalReply.json())).toContain("replay unsafe after write");
+  });
+
   it("supports exact reply memory prompts and embeddings requests", async () => {
     const server = await startQaMockOpenAiServer({
       host: "127.0.0.1",
@@ -1866,6 +1934,165 @@ describe("qa mock openai server", () => {
     expect(outputText(await phaseOnlyFinal.json())).toBe("subagent-1: ok\nsubagent-2: ok");
   });
 
+  it("uses full request text when planning continuation subagent tool calls", async () => {
+    const server = await startQaMockOpenAiServer({
+      host: "127.0.0.1",
+      port: 0,
+    });
+    cleanups.push(async () => {
+      await server.stop();
+    });
+
+    const handoffPrompt =
+      "Delegate one bounded QA task to a subagent. Wait for the subagent to finish.";
+    const handoff = await fetch(`${server.baseUrl}/v1/responses`, {
+      method: "POST",
+      headers: { "content-type": "application/json" },
+      body: JSON.stringify({
+        stream: true,
+        tools: [SESSIONS_SPAWN_TOOL],
+        input: [makeUserInput(handoffPrompt), makeUserInput("Continue.")],
+      }),
+    });
+    expect(handoff.status).toBe(200);
+    expect(await handoff.text()).toContain('"name":"sessions_spawn"');
+
+    const handoffServer = await startQaMockOpenAiServer({
+      host: "127.0.0.1",
+      port: 0,
+    });
+    cleanups.push(async () => {
+      await handoffServer.stop();
+    });
+
+    const appServerHandoff = await fetch(`${handoffServer.baseUrl}/v1/responses`, {
+      method: "POST",
+      headers: { "content-type": "application/json" },
+      body: JSON.stringify({
+        stream: true,
+        input: [makeUserInput(handoffPrompt), makeUserInput("Continue.")],
+      }),
+    });
+    expect(appServerHandoff.status).toBe(200);
+    expect(await appServerHandoff.text()).toContain('"name":"sessions_spawn"');
+
+    const repeatedHandoff = await fetch(`${handoffServer.baseUrl}/v1/responses`, {
+      method: "POST",
+      headers: { "content-type": "application/json" },
+      body: JSON.stringify({
+        stream: true,
+        input: [makeUserInput(handoffPrompt), makeUserInput("Continue again.")],
+      }),
+    });
+    expect(repeatedHandoff.status).toBe(200);
+    expect(await repeatedHandoff.text()).not.toContain('"name":"sessions_spawn"');
+
+    const handoffFinal = await fetch(`${server.baseUrl}/v1/responses`, {
+      method: "POST",
+      headers: { "content-type": "application/json" },
+      body: JSON.stringify({
+        stream: false,
+        tools: [SESSIONS_SPAWN_TOOL],
+        input: [
+          makeUserInput(handoffPrompt),
+          { type: "function_call_output", output: "SUBAGENT-OK" },
+          makeUserInput("Continue."),
+        ],
+      }),
+    });
+    expect(handoffFinal.status).toBe(200);
+    expect(outputText(await handoffFinal.json())).toContain("Delegated task");
+
+    const fanoutPrompt =
+      "Subagent fanout synthesis check: delegate two bounded subagents sequentially, then report both results together.";
+    const appServerFanout = await fetch(`${server.baseUrl}/v1/responses`, {
+      method: "POST",
+      headers: { "content-type": "application/json" },
+      body: JSON.stringify({
+        stream: true,
+        input: [makeUserInput(fanoutPrompt), makeUserInput("Continue.")],
+      }),
+    });
+    expect(appServerFanout.status).toBe(200);
+    expect(await appServerFanout.text()).toContain('\\"label\\":\\"qa-fanout-alpha\\"');
+
+    const fanoutServer = await startQaMockOpenAiServer({
+      host: "127.0.0.1",
+      port: 0,
+    });
+    cleanups.push(async () => {
+      await fanoutServer.stop();
+    });
+
+    const firstFanout = await fetch(`${fanoutServer.baseUrl}/v1/responses`, {
+      method: "POST",
+      headers: { "content-type": "application/json" },
+      body: JSON.stringify({
+        stream: true,
+        tools: [SESSIONS_SPAWN_TOOL],
+        input: [makeUserInput(fanoutPrompt)],
+      }),
+    });
+    expect(firstFanout.status).toBe(200);
+    expect(await firstFanout.text()).toContain('\\"label\\":\\"qa-fanout-alpha\\"');
+
+    const secondFanout = await fetch(`${fanoutServer.baseUrl}/v1/responses`, {
+      method: "POST",
+      headers: { "content-type": "application/json" },
+      body: JSON.stringify({
+        stream: true,
+        tools: [SESSIONS_SPAWN_TOOL],
+        input: [
+          makeUserInput(fanoutPrompt),
+          {
+            type: "function_call_output",
+            output:
+              '{"status":"accepted","childSessionKey":"agent:qa:subagent:alpha","note":"ALPHA-OK"}',
+          },
+          makeUserInput("Continue."),
+        ],
+      }),
+    });
+    expect(secondFanout.status).toBe(200);
+    expect(await secondFanout.text()).toContain('\\"label\\":\\"qa-fanout-beta\\"');
+  });
+
+  it("keeps source discovery reports out of subagent handoff prose", async () => {
+    const server = await startQaMockOpenAiServer({
+      host: "127.0.0.1",
+      port: 0,
+    });
+    cleanups.push(async () => {
+      await server.stop();
+    });
+
+    const response = await fetch(`${server.baseUrl}/v1/responses`, {
+      method: "POST",
+      headers: { "content-type": "application/json" },
+      body: JSON.stringify({
+        stream: false,
+        input: [
+          makeUserInput(
+            "Read the seeded docs and source plan, then report grouped into Worked, Failed, Blocked, and Follow-up.",
+          ),
+          {
+            type: "function_call_output",
+            output:
+              "repo/qa/scenarios/index.md includes scenario: subagent-handoff and repo/extensions/qa-lab/src/suite.ts.",
+          },
+          makeUserInput("Continue."),
+        ],
+      }),
+    });
+
+    expect(response.status).toBe(200);
+    const text = outputText(await response.json());
+    expect(text).toContain("Worked:");
+    expect(text).toContain("repo/docs/help/testing.md");
+    expect(text).toContain("Follow-up:");
+    expect(text).not.toContain("Delegated task");
+  });
+
   it("does not let fanout completion state hijack child worker replies", async () => {
     const server = await startQaMockOpenAiServer({
       host: "127.0.0.1",
@@ -2727,7 +2954,7 @@ describe("qa mock openai server", () => {
       | { name: string; input: Record<string, unknown> }
       | undefined;
     expect(toolUseBlock?.name).toBe("read");
-    expect(toolUseBlock?.input).toEqual({ path: "QA_SCENARIO_PLAN.md" });
+    expect(toolUseBlock?.input).toEqual({ path: "repo/qa/scenarios/index.md" });
 
     const debugResponse = await fetch(`${server.baseUrl}/debug/last-request`);
     expect(debugResponse.status).toBe(200);
@@ -2985,7 +3212,7 @@ describe("qa mock openai server", () => {
     expect(body).toContain("event: content_block_start");
     expect(body).toContain('"type":"tool_use"');
     expect(body).toContain('"name":"read"');
-    expect(body).toContain("QA_SCENARIO_PLAN.md");
+    expect(body).toContain("repo/qa/scenarios/index.md");
     expect(body).toContain("event: message_delta");
     expect(body).toContain("event: message_stop");
   });
diff --git a/extensions/qa-lab/src/providers/mock-openai/server.ts b/extensions/qa-lab/src/providers/mock-openai/server.ts
index 57fd40139de..85b6fc6c0e0 100644
--- a/extensions/qa-lab/src/providers/mock-openai/server.ts
+++ b/extensions/qa-lab/src/providers/mock-openai/server.ts
@@ -177,6 +177,7 @@ const QA_TOOL_SEARCH_FAILURE_PROMPT_RE = /tool search qa failure/i;
 
 type MockScenarioState = {
   subagentFanoutPhase: number;
+  subagentHandoffSpawned: boolean;
 };
 
 const MOCK_OPENAI_MAX_BODY_BYTES = 16 * 1024 * 1024;
@@ -1128,7 +1129,11 @@ function buildAssistantText(
       "- None.",
     ].join("\n");
   }
-  if (toolOutput && (/\bdelegate\b/i.test(prompt) || /subagent handoff/i.test(prompt))) {
+  if (
+    toolOutput &&
+    (/delegate (?:one |a )bounded qa task/i.test(allInputText) ||
+      /subagent handoff/i.test(allInputText))
+  ) {
     const compact = toolOutput.replace(/\s+/g, " ").trim() || "no delegated output";
     return `Delegated task:\n- Inspect the QA workspace via a bounded subagent.\nResult:\n- ${compact}\nEvidence:\n- The child result was folded back into the main thread exactly once.`;
   }
@@ -1141,7 +1146,11 @@ function buildAssistantText(
     }
     return `Protocol note: Lobster Invaders built at lobster-invaders.html.`;
   }
-  if (toolOutput && /compaction retry mutating tool check/i.test(prompt)) {
+  if (
+    toolOutput &&
+    (/compaction retry mutating tool check/i.test(allInputText) ||
+      /compaction-retry-summary\.txt/i.test(toolOutput))
+  ) {
     if (
       toolOutput.includes("Replay safety: unsafe after write.") ||
       /compaction-retry-summary\.txt/i.test(toolOutput) ||
@@ -1152,6 +1161,22 @@ function buildAssistantText(
     }
     return "";
   }
+  if (
+    toolOutput &&
+    /(worked, failed, blocked|worked\/failed\/blocked|source and docs)/i.test(allInputText)
+  ) {
+    return [
+      "Worked:",
+      "- Read all three seeded files: repo/qa/scenarios/index.md, repo/extensions/qa-lab/src/suite.ts, and repo/docs/help/testing.md.",
+      "- Extra QA scenario candidates: config restart capability flip and image generation roundtrip.",
+      "Failed:",
+      "- None observed in mock mode.",
+      "Blocked:",
+      "- No live provider evidence in this lane.",
+      "Follow-up:",
+      "- Re-run with a real model for qualitative coverage.",
+    ].join("\n");
+  }
   if (toolOutput) {
     const snippet = toolOutput.replace(/\s+/g, " ").trim().slice(0, 220);
     return `Protocol note: I reviewed the requested material. Evidence snippet: ${snippet || "no content"}`;
@@ -1501,11 +1526,17 @@ async function buildResponsesPayload(
   const isBaselineUnmentionedChannelChatter = /\bno bot ping here\b/i.test(prompt);
   const hasReasoningOnlyRetryInstruction = allInputText.includes(QA_REASONING_ONLY_RETRY_NEEDLE);
   const hasEmptyResponseRetryInstruction = allInputText.includes(QA_EMPTY_RESPONSE_RETRY_NEEDLE);
-  const canCallSessionsSpawn = hasDeclaredTool(body, "sessions_spawn");
-  const canCallSessionsYield = hasDeclaredTool(body, "sessions_yield");
-  const canPlanQaSessionsSpawn =
-    canCallSessionsSpawn ||
-    /subagent fanout synthesis check|delegate one bounded qa task|subagent handoff/i.test(prompt);
+  const canCallMockSubagentTool =
+    QA_SUBAGENT_DIRECT_FALLBACK_PROMPT_RE.test(allInputText) ||
+    /subagent fanout synthesis check/i.test(allInputText) ||
+    /forked subagent context qa check/i.test(allInputText) ||
+    /delegate (?:one |a )bounded qa task/i.test(allInputText) ||
+    /subagent handoff/i.test(allInputText) ||
+    buildExplicitSessionsSpawnArgs(allInputText) !== null;
+  const canCallSessionsSpawn = hasDeclaredTool(body, "sessions_spawn") || canCallMockSubagentTool;
+  const canCallSessionsYield =
+    hasDeclaredTool(body, "sessions_yield") ||
+    QA_SUBAGENT_DIRECT_FALLBACK_PROMPT_RE.test(allInputText);
   const buildToolProgressReadEvents = (pattern: RegExp) => {
     const toolProgressPrompt = extractLastMatchingUserText(extractAllUserTexts(input), pattern);
     return buildToolCallEventsWithArgs("read", {
@@ -1819,7 +1850,11 @@ async function buildResponsesPayload(
       });
     }
   }
-  if (/compaction retry mutating tool check/i.test(prompt)) {
+  if (
+    /compaction retry mutating tool check/i.test(allInputText) ||
+    /compaction retry evidence/i.test(toolOutput) ||
+    /compaction-retry-summary\.txt/i.test(toolOutput)
+  ) {
     if (!toolOutput) {
       return buildToolCallEventsWithArgs("read", { path: "COMPACTION_RETRY_CONTEXT.md" });
     }
@@ -2002,7 +2037,7 @@ async function buildResponsesPayload(
       size: "1024x1024",
     });
   }
-  if (canPlanQaSessionsSpawn && /subagent fanout synthesis check/i.test(prompt)) {
+  if (canCallSessionsSpawn && /subagent fanout synthesis check/i.test(allInputText)) {
     if (!toolOutput && scenarioState.subagentFanoutPhase === 0) {
       scenarioState.subagentFanoutPhase = 1;
       return buildToolCallEventsWithArgs("sessions_spawn", {
@@ -2078,10 +2113,13 @@ async function buildResponsesPayload(
     }
   }
   if (
-    canPlanQaSessionsSpawn &&
-    (/\bdelegate\b/i.test(prompt) || /subagent handoff/i.test(prompt)) &&
-    !toolOutput
+    canCallSessionsSpawn &&
+    (/delegate (?:one |a )bounded qa task/i.test(allInputText) ||
+      /subagent handoff/i.test(allInputText)) &&
+    !toolOutput &&
+    !scenarioState.subagentHandoffSpawned
   ) {
+    scenarioState.subagentHandoffSpawned = true;
     return buildToolCallEventsWithArgs("sessions_spawn", {
       task: "Inspect the QA workspace and return one concise protocol note.",
       label: "qa-sidecar",
@@ -2092,7 +2130,7 @@ async function buildResponsesPayload(
     /(worked, failed, blocked|worked\/failed\/blocked|source and docs)/i.test(prompt) &&
     !toolOutput
   ) {
-    return buildToolCallEventsWithArgs("read", { path: "QA_SCENARIO_PLAN.md" });
+    return buildToolCallEventsWithArgs("read", { path: "repo/qa/scenarios/index.md" });
   }
   if (!toolOutput && /\b(read|inspect|repo|docs|scenario|kickoff)\b/i.test(prompt)) {
     return buildToolCallEvents(prompt);
@@ -2496,7 +2534,10 @@ async function buildMessagesPayload(
 
 export async function startQaMockOpenAiServer(params?: { host?: string; port?: number }) {
   const host = params?.host ?? "127.0.0.1";
-  const scenarioState: MockScenarioState = { subagentFanoutPhase: 0 };
+  const scenarioState: MockScenarioState = {
+    subagentFanoutPhase: 0,
+    subagentHandoffSpawned: false,
+  };
   let lastRequest: MockOpenAiRequestSnapshot | null = null;
   const requests: MockOpenAiRequestSnapshot[] = [];
   const imageGenerationRequests: Array<Record<string, unknown>> = [];
diff --git a/extensions/qa-lab/src/runtime-parity.test.ts b/extensions/qa-lab/src/runtime-parity.test.ts
index 6a804bc56f6..5d1ec6afe91 100644
--- a/extensions/qa-lab/src/runtime-parity.test.ts
+++ b/extensions/qa-lab/src/runtime-parity.test.ts
@@ -5,6 +5,7 @@ import path from "node:path";
 import { afterEach, describe, expect, it, vi } from "vitest";
 import {
   captureRuntimeParityCell,
+  isRuntimeParityResultPass,
   runRuntimeParityScenario,
   type RuntimeId,
   type RuntimeParityCell,
@@ -179,6 +180,7 @@ describe("runtime parity", () => {
     });
 
     expect(result.drift).toBe("tool-call-shape");
+    expect(isRuntimeParityResultPass(result)).toBe(true);
   });
 
   it("classifies tool result shape drift", async () => {
@@ -220,6 +222,7 @@ describe("runtime parity", () => {
     });
 
     expect(result.drift).toBe("failure-mode");
+    expect(isRuntimeParityResultPass(result)).toBe(false);
   });
 
   it("surfaces tool-call-shape when one runtime fails because the tool path drifted", async () => {
@@ -235,6 +238,7 @@ describe("runtime parity", () => {
     });
 
     expect(result.drift).toBe("tool-call-shape");
+    expect(isRuntimeParityResultPass(result)).toBe(false);
   });
 
   it("surfaces tool-result-shape when a downstream timeout follows divergent tool output", async () => {
diff --git a/extensions/qa-lab/src/runtime-parity.ts b/extensions/qa-lab/src/runtime-parity.ts
index 6879ab2f873..e1ff8464cc0 100644
--- a/extensions/qa-lab/src/runtime-parity.ts
+++ b/extensions/qa-lab/src/runtime-parity.ts
@@ -59,6 +59,23 @@ export type RuntimeParityScenarioExecution = {
   cell: RuntimeParityCell;
 };
 
+export function runtimeParityCellStatus(
+  cell: RuntimeParityCell | undefined,
+): "pass" | "fail" | "missing" {
+  if (!cell) {
+    return "missing";
+  }
+  return cell.runtimeErrorClass || cell.transportErrorClass ? "fail" : "pass";
+}
+
+export function isRuntimeParityResultPass(result: RuntimeParityResult) {
+  return (
+    result.drift !== "failure-mode" &&
+    runtimeParityCellStatus(result.cells.pi) === "pass" &&
+    runtimeParityCellStatus(result.cells.codex) === "pass"
+  );
+}
+
 type QaGatewayLike = {
   logs?: () => string;
   tempRoot: string;
diff --git a/extensions/qa-lab/src/suite.ts b/extensions/qa-lab/src/suite.ts
index 1b73d133c00..7aa3964d8ca 100644
--- a/extensions/qa-lab/src/suite.ts
+++ b/extensions/qa-lab/src/suite.ts
@@ -31,6 +31,7 @@ import { renderQaMarkdownReport, type QaReportCheck, type QaReportScenario } fro
 import { defaultQaModelForMode } from "./run-config.js";
 import {
   captureRuntimeParityCell,
+  isRuntimeParityResultPass,
   runRuntimeParityScenario,
   type RuntimeId,
   type RuntimeParityCell,
@@ -276,7 +277,7 @@ async function runScenarioDefinition(
 }
 
 function isRuntimeParityPass(result: RuntimeParityResult) {
-  return result.drift === "none" || result.drift === "text-only";
+  return isRuntimeParityResultPass(result);
 }
 
 function formatRuntimeParityCellDetails(cell: RuntimeParityCell) {
diff --git a/src/agents/openclaw-tools.ts b/src/agents/openclaw-tools.ts
index 7d9684397c0..74324cdfbd2 100644
--- a/src/agents/openclaw-tools.ts
+++ b/src/agents/openclaw-tools.ts
@@ -346,6 +346,7 @@ export function createOpenClawTools(
     !embedded ||
     options?.sourceReplyDeliveryMode === "message_tool_only" ||
     messageExplicitlyAllowed;
+  const includeSubagentSpawnTool = !embedded || options?.allowGatewaySubagentBinding === true;
   const effectiveCallGateway = embedded
     ? createEmbeddedCallGateway()
     : openClawToolsDeps.callGateway;
@@ -424,6 +425,9 @@ export function createOpenClawTools(
             config: resolvedConfig,
             callGateway: openClawToolsDeps.callGateway,
           }),
+        ]),
+    ...(includeSubagentSpawnTool
+      ? [
           createSessionsSpawnTool({
             agentSessionKey: options?.agentSessionKey,
             agentChannel: options?.agentChannel,
@@ -441,7 +445,8 @@ export function createOpenClawTools(
             inheritedToolAllowlist: options?.inheritedToolAllowlist,
             inheritedToolDenylist: options?.inheritedToolDenylist,
           }),
-        ]),
+        ]
+      : []),
     createSessionsYieldTool({
       sessionId: options?.sessionId,
       onYield: options?.onYield,
diff --git a/src/agents/openclaw-tools.update-plan.test.ts b/src/agents/openclaw-tools.update-plan.test.ts
index ffd676c32c6..6ae4c853b2b 100644
--- a/src/agents/openclaw-tools.update-plan.test.ts
+++ b/src/agents/openclaw-tools.update-plan.test.ts
@@ -123,6 +123,24 @@ describe("openclaw-tools update_plan gating", () => {
     expect(toolNames(denied)).not.toContain("message");
   });
 
+  it("keeps subagent spawn available for trusted embedded gateway-bound runs", () => {
+    setEmbeddedMode(true);
+    const defaultTools = createOpenClawTools({
+      config: {} as OpenClawConfig,
+      disablePluginTools: true,
+    });
+    const gatewayBoundTools = createOpenClawTools({
+      config: {} as OpenClawConfig,
+      disablePluginTools: true,
+      allowGatewaySubagentBinding: true,
+    });
+
+    expect(toolNames(defaultTools)).not.toContain("sessions_spawn");
+    expect(toolNames(defaultTools)).not.toContain("sessions_send");
+    expect(toolNames(gatewayBoundTools)).toContain("sessions_spawn");
+    expect(toolNames(gatewayBoundTools)).not.toContain("sessions_send");
+  });
+
   it("registers update_plan when explicitly enabled", () => {
     const config = {
       tools: {