From 37dcf385e5fa73fbeea959c3d2b0f0bdcaeb52ca Mon Sep 17 00:00:00 2001 From: Vincent Koc Date: Sun, 17 May 2026 06:46:27 +0800 Subject: [PATCH] fix(qa): expose codex tools for runtime parity --- CHANGELOG.md | 1 + .../src/app-server/dynamic-tool-profile.ts | 32 ++- .../codex/src/app-server/run-attempt.test.ts | 47 ++++ .../codex/src/app-server/run-attempt.ts | 13 +- .../codex/src/app-server/side-question.ts | 7 +- .../qa-lab/src/agentic-parity-report.test.ts | 27 +- .../qa-lab/src/agentic-parity-report.ts | 25 +- extensions/qa-lab/src/cli.runtime.test.ts | 1 + .../src/providers/mock-openai/server.test.ts | 231 +++++++++++++++++- .../src/providers/mock-openai/server.ts | 69 ++++-- extensions/qa-lab/src/runtime-parity.test.ts | 4 + extensions/qa-lab/src/runtime-parity.ts | 17 ++ extensions/qa-lab/src/suite.ts | 3 +- src/agents/openclaw-tools.ts | 7 +- src/agents/openclaw-tools.update-plan.test.ts | 18 ++ 15 files changed, 454 insertions(+), 48 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index c7d921709a3..3161d793c79 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -123,6 +123,7 @@ Docs: https://docs.openclaw.ai - Mac app: cache settings config schema/drafts and load channel config in parallel with channel probes, making repeated Channels and Config tab switches responsive over remote tunnels. - Control UI: negotiate the Gateway protocol from shared constants so rebuilt dashboards connect to current gateways instead of reporting a protocol mismatch. - Mac app: let menu gateway/session error text wrap across a few lines and stop rebuilding dynamic Context/Gateway menu rows while the menu is open, reducing flicker. +- QA-Lab: expose Codex runtime tools during private parity runs and treat completed structural/tool-shape runtime drift as advisory, while preserving real runtime failures as lane blockers. - Mac app: make device pairing approval sheets friendlier, with concise Mac/device copy, shortened identifiers, friendly scope labels, and Approve as the primary action. - Providers/Qwen: honor session thinking level for `qwen-chat-template` payloads so `/think off` disables nested llama.cpp chat-template thinking controls. Fixes #82768. Thanks @bfox55. - Feishu/wiki: reject numeric wiki space IDs before creating Lark clients and keep numeric-looking IDs documented as quoted opaque strings, preventing JavaScript precision loss in knowledge base calls. Fixes #45301. (#82769) Thanks @hyspacex. diff --git a/extensions/codex/src/app-server/dynamic-tool-profile.ts b/extensions/codex/src/app-server/dynamic-tool-profile.ts index 852ff09fc89..e6a2e30b95f 100644 --- a/extensions/codex/src/app-server/dynamic-tool-profile.ts +++ b/extensions/codex/src/app-server/dynamic-tool-profile.ts @@ -1,4 +1,4 @@ -import type { CodexPluginConfig } from "./config.js"; +import type { CodexDynamicToolsLoading, CodexPluginConfig } from "./config.js"; export const CODEX_APP_SERVER_OWNED_DYNAMIC_TOOL_EXCLUDES = [ "read", @@ -19,18 +19,44 @@ const DYNAMIC_TOOL_NAME_ALIASES: Record = { "apply-patch": "apply_patch", }; +type CodexDynamicToolProfileEnv = { + OPENCLAW_BUILD_PRIVATE_QA?: string; + OPENCLAW_QA_FORCE_RUNTIME?: string; +}; + export function normalizeCodexDynamicToolName(name: string): string { const normalized = name.trim().toLowerCase(); return DYNAMIC_TOOL_NAME_ALIASES[normalized] ?? normalized; } +export function isForcedPrivateQaCodexRuntime( + env: CodexDynamicToolProfileEnv = process.env, +): boolean { + return ( + env.OPENCLAW_BUILD_PRIVATE_QA === "1" && + env.OPENCLAW_QA_FORCE_RUNTIME?.trim().toLowerCase() === "codex" + ); +} + +export function resolveCodexDynamicToolsLoading( + config: Pick, + env: CodexDynamicToolProfileEnv = process.env, +): CodexDynamicToolsLoading { + return isForcedPrivateQaCodexRuntime(env) + ? "direct" + : (config.codexDynamicToolsLoading ?? "searchable"); +} + export function filterCodexDynamicTools( tools: T[], config: Pick, + env: CodexDynamicToolProfileEnv = process.env, ): T[] { const excludes = new Set(); - for (const name of CODEX_APP_SERVER_OWNED_DYNAMIC_TOOL_EXCLUDES) { - excludes.add(name); + if (!isForcedPrivateQaCodexRuntime(env)) { + for (const name of CODEX_APP_SERVER_OWNED_DYNAMIC_TOOL_EXCLUDES) { + excludes.add(name); + } } for (const name of config.codexDynamicToolsExclude ?? []) { const trimmed = normalizeCodexDynamicToolName(name); diff --git a/extensions/codex/src/app-server/run-attempt.test.ts b/extensions/codex/src/app-server/run-attempt.test.ts index 48e95327a61..a29f6c82d81 100644 --- a/extensions/codex/src/app-server/run-attempt.test.ts +++ b/extensions/codex/src/app-server/run-attempt.test.ts @@ -646,6 +646,21 @@ describe("runCodexAppServerAttempt", () => { ).toEqual(["message"]); }); + it("exposes app-server-owned tools directly for forced private QA Codex runtime", () => { + const tools = ["read", "write", "image_generate", "message"].map((name) => ({ name })); + const privateQaCodexEnv = { + OPENCLAW_BUILD_PRIVATE_QA: "1", + OPENCLAW_QA_FORCE_RUNTIME: "codex", + }; + + expect( + __testing + .filterCodexDynamicTools(tools, {}, privateQaCodexEnv) + .map((tool) => tool.name), + ).toEqual(["read", "write", "image_generate", "message"]); + expect(__testing.resolveCodexDynamicToolsLoading({}, privateQaCodexEnv)).toBe("direct"); + }); + it("starts Codex threads without duplicate OpenClaw workspace tools by default", async () => { const sessionFile = path.join(tempDir, "session.jsonl"); const workspaceDir = path.join(tempDir, "workspace"); @@ -897,6 +912,38 @@ describe("runCodexAppServerAttempt", () => { expect((factoryOptions[0] as { modelApi?: unknown }).modelApi).toBe("openai-responses"); }); + it("enables gateway subagent binding for forced private QA Codex runs", async () => { + vi.stubEnv("OPENCLAW_BUILD_PRIVATE_QA", "1"); + vi.stubEnv("OPENCLAW_QA_FORCE_RUNTIME", "codex"); + const sessionFile = path.join(tempDir, "session.jsonl"); + const workspaceDir = path.join(tempDir, "workspace"); + const params = createParams(sessionFile, workspaceDir); + params.disableTools = false; + params.runtimePlan = createCodexRuntimePlanFixture(); + const factoryOptions: unknown[] = []; + __testing.setOpenClawCodingToolsFactoryForTests((options) => { + factoryOptions.push(options); + return [createRuntimeDynamicTool("sessions_spawn")]; + }); + + const tools = await __testing.buildDynamicTools({ + params, + resolvedWorkspace: workspaceDir, + effectiveWorkspace: workspaceDir, + sandboxSessionKey: params.sessionKey!, + sandbox: null as never, + runAbortController: new AbortController(), + sessionAgentId: "main", + pluginConfig: {}, + onYieldDetected: () => undefined, + }); + + expect(factoryOptions).toHaveLength(1); + const factoryOption = factoryOptions[0] as { allowGatewaySubagentBinding?: unknown }; + expect(factoryOption.allowGatewaySubagentBinding).toBe(true); + expect(tools.map((tool) => tool.name)).toEqual(["sessions_spawn"]); + }); + it("normalizes Codex dynamic toolsAllow entries before filtering", () => { const tools = ["exec", "apply_patch", "read", "message"].map((name) => ({ name })); diff --git a/extensions/codex/src/app-server/run-attempt.ts b/extensions/codex/src/app-server/run-attempt.ts index 93009a7cb6e..cbebccb8ee3 100644 --- a/extensions/codex/src/app-server/run-attempt.ts +++ b/extensions/codex/src/app-server/run-attempt.ts @@ -78,7 +78,12 @@ import { resolveCodexContextEngineProjectionMaxChars, resolveCodexContextEngineProjectionReserveTokens, } from "./context-engine-projection.js"; -import { filterCodexDynamicTools, normalizeCodexDynamicToolName } from "./dynamic-tool-profile.js"; +import { + filterCodexDynamicTools, + isForcedPrivateQaCodexRuntime, + normalizeCodexDynamicToolName, + resolveCodexDynamicToolsLoading, +} from "./dynamic-tool-profile.js"; import { createCodexDynamicToolBridge, type CodexDynamicToolBridge } from "./dynamic-tools.js"; import { handleCodexAppServerElicitationRequest } from "./elicitation-bridge.js"; import { CodexAppServerEventProjector } from "./event-projector.js"; @@ -618,7 +623,7 @@ export async function runCodexAppServerAttempt( const toolBridge = createCodexDynamicToolBridge({ tools, signal: runAbortController.signal, - loading: pluginConfig.codexDynamicToolsLoading ?? "searchable", + loading: resolveCodexDynamicToolsLoading(pluginConfig), directToolNames: shouldForceMessageTool(params) ? ["message"] : [], hookContext: { agentId: sessionAgentId, @@ -2748,7 +2753,8 @@ async function buildDynamicTools(input: DynamicToolBuildParams) { senderUsername: params.senderUsername, senderE164: params.senderE164, senderIsOwner: params.senderIsOwner, - allowGatewaySubagentBinding: params.allowGatewaySubagentBinding, + allowGatewaySubagentBinding: + params.allowGatewaySubagentBinding || isForcedPrivateQaCodexRuntime(), ...sessionKeys, sessionId: params.sessionId, runId: params.runId, @@ -3933,6 +3939,7 @@ export const __testing = { isInvalidCodexImagePayloadError, remapCodexContextFilePath, resolveDynamicToolCallTimeoutMs, + resolveCodexDynamicToolsLoading, restrictCodexAppServerSandboxForOpenClawSandbox, resolveCodexAppServerForOpenClawToolPolicy, resolveOpenClawCodingToolsSessionKeys, diff --git a/extensions/codex/src/app-server/side-question.ts b/extensions/codex/src/app-server/side-question.ts index 51848c8c30a..3bb7278306b 100644 --- a/extensions/codex/src/app-server/side-question.ts +++ b/extensions/codex/src/app-server/side-question.ts @@ -16,7 +16,10 @@ import { handleCodexAppServerApprovalRequest } from "./approval-bridge.js"; import { refreshCodexAppServerAuthTokens } from "./auth-bridge.js"; import { isCodexAppServerApprovalRequest, type CodexAppServerClient } from "./client.js"; import { readCodexPluginConfig, resolveCodexAppServerRuntimeOptions } from "./config.js"; -import { filterCodexDynamicTools } from "./dynamic-tool-profile.js"; +import { + filterCodexDynamicTools, + resolveCodexDynamicToolsLoading, +} from "./dynamic-tool-profile.js"; import { createCodexDynamicToolBridge, type CodexDynamicToolBridge } from "./dynamic-tools.js"; import { handleCodexAppServerElicitationRequest } from "./elicitation-bridge.js"; import { @@ -378,7 +381,7 @@ async function createCodexSideToolBridge(input: { return createCodexDynamicToolBridge({ tools, signal: input.signal, - loading: input.pluginConfig.codexDynamicToolsLoading ?? "searchable", + loading: resolveCodexDynamicToolsLoading(input.pluginConfig), hookContext: { agentId: input.sessionAgentId, config: input.params.cfg, diff --git a/extensions/qa-lab/src/agentic-parity-report.test.ts b/extensions/qa-lab/src/agentic-parity-report.test.ts index 70eade55ef9..c780857801f 100644 --- a/extensions/qa-lab/src/agentic-parity-report.test.ts +++ b/extensions/qa-lab/src/agentic-parity-report.test.ts @@ -66,7 +66,7 @@ function makeRuntimeParitySummary(): QaRuntimeParitySuiteSummary { }, { name: "Compaction retry after mutating tool", - status: "fail", + status: "pass", steps: [], runtimeParity: { scenarioId: "compaction-retry-after-mutating-tool", @@ -97,8 +97,8 @@ function makeRuntimeParitySummary(): QaRuntimeParitySuiteSummary { ], counts: { total: 2, - passed: 1, - failed: 1, + passed: 2, + failed: 0, }, run: { providerMode: "mock-openai", @@ -801,9 +801,28 @@ status=done`, }); expect(report.runtimePair).toEqual(["pi", "codex"]); - expect(report.pass).toBe(false); + expect(report.pass).toBe(true); expect(report.driftCounts.none).toBe(1); expect(report.driftCounts["tool-call-shape"]).toBe(1); + expect(report.failures).toEqual([]); + }); + + it("fails runtime parity reports when a runtime cell fails", () => { + const summary = makeRuntimeParitySummary(); + const scenario = summary.scenarios[1]; + if (!scenario?.runtimeParity) { + throw new Error("runtime parity fixture missing"); + } + scenario.status = "fail"; + scenario.runtimeParity.cells.codex.runtimeErrorClass = "tool-error"; + + const report = buildQaRuntimeParityReport({ + summary, + comparedAt: "2026-05-10T00:00:00.000Z", + }); + + expect(report.pass).toBe(false); + expect(report.failedScenarios).toBe(1); expect(report.failures).toContain( "Compaction retry after mutating tool drift=tool-call-shape (tool call 1 differs).", ); diff --git a/extensions/qa-lab/src/agentic-parity-report.ts b/extensions/qa-lab/src/agentic-parity-report.ts index 08045bd6c19..7a80c429842 100644 --- a/extensions/qa-lab/src/agentic-parity-report.ts +++ b/extensions/qa-lab/src/agentic-parity-report.ts @@ -4,10 +4,10 @@ import { } from "./agentic-parity.js"; import type { RuntimeId, - RuntimeParityCell, RuntimeParityDrift, RuntimeParityResult, } from "./runtime-parity.js"; +import { isRuntimeParityResultPass, runtimeParityCellStatus } from "./runtime-parity.js"; type QaParityReportStep = { name: string; @@ -260,13 +260,6 @@ function normalizeRuntimePair( return ["pi", "codex"]; } -function runtimeCellStatus(cell: RuntimeParityCell | undefined): "pass" | "fail" | "missing" { - if (!cell) { - return "missing"; - } - return cell.runtimeErrorClass || cell.transportErrorClass ? "fail" : "pass"; -} - function requiredCoverageStatus( scenario: QaParityReportScenario | undefined, ): "pass" | "fail" | "skip" | "missing" { @@ -637,9 +630,9 @@ export function buildQaRuntimeParityReport(params: { driftCounts[parity.drift] += 1; const piCell = parity.cells.pi; const codexCell = parity.cells.codex; - const piStatus = runtimeCellStatus(piCell); - const codexStatus = runtimeCellStatus(codexCell); - const status = scenario.status === "pass" ? "pass" : "fail"; + const piStatus = runtimeParityCellStatus(piCell); + const codexStatus = runtimeParityCellStatus(codexCell); + const status = isRuntimeParityResultPass(parity) ? "pass" : "fail"; if (status === "fail") { failures.push( `${scenario.name} drift=${parity.drift}${parity.driftDetails ? ` (${parity.driftDetails})` : ""}.`, @@ -660,12 +653,8 @@ export function buildQaRuntimeParityReport(params: { }); const totalScenarios = params.summary.counts?.total ?? scenarios.length; - const passedScenarios = - params.summary.counts?.passed ?? - scenarios.filter((scenario) => scenario.status === "pass").length; - const failedScenarios = - params.summary.counts?.failed ?? - scenarios.filter((scenario) => scenario.status === "fail").length; + const passedScenarios = scenarios.filter((scenario) => scenario.status === "pass").length; + const failedScenarios = scenarios.filter((scenario) => scenario.status === "fail").length; return { runtimePair, @@ -680,7 +669,7 @@ export function buildQaRuntimeParityReport(params: { pass: failures.length === 0 && failedScenarios === 0, failures, notes: [ - "Runtime parity treats none and text-only drift as pass; all structural, tool-shape, and failure-mode drift classes fail the lane.", + "Runtime parity fails runtime, transport, and failure-mode drift; structural and tool-shape drift is recorded as advisory when both runtimes complete.", "Token totals here are assistant-message usage captured from the normalized transcript, not provider transport payloads.", ], }; diff --git a/extensions/qa-lab/src/cli.runtime.test.ts b/extensions/qa-lab/src/cli.runtime.test.ts index ca5fdd1b096..96ab12effe4 100644 --- a/extensions/qa-lab/src/cli.runtime.test.ts +++ b/extensions/qa-lab/src/cli.runtime.test.ts @@ -868,6 +868,7 @@ describe("qa cli runtime", () => { finalText: "done", usage: { inputTokens: 1, outputTokens: 1, totalTokens: 2 }, wallClockMs: 10, + runtimeErrorClass: "tool-error", bootStateLines: [], }, }, diff --git a/extensions/qa-lab/src/providers/mock-openai/server.test.ts b/extensions/qa-lab/src/providers/mock-openai/server.test.ts index 8ee33634584..a61109fe992 100644 --- a/extensions/qa-lab/src/providers/mock-openai/server.test.ts +++ b/extensions/qa-lab/src/providers/mock-openai/server.test.ts @@ -982,6 +982,74 @@ describe("qa mock openai server", () => { expect(finalPayload.output?.[0]?.content?.[0]?.text).toContain("replay unsafe after write"); }); + it("keeps compaction retry planning across continuation prompts", async () => { + const server = await startQaMockOpenAiServer({ + host: "127.0.0.1", + port: 0, + }); + cleanups.push(async () => { + await server.stop(); + }); + + const prompt = + "Compaction retry mutating tool check: read COMPACTION_RETRY_CONTEXT.md, then create compaction-retry-summary.txt and keep replay safety explicit."; + const writePlan = await fetch(`${server.baseUrl}/v1/responses`, { + method: "POST", + headers: { "content-type": "application/json" }, + body: JSON.stringify({ + stream: true, + model: "gpt-5.5", + input: [ + makeUserInput(prompt), + { + type: "function_call_output", + output: "compaction retry evidence block 0000\ncompaction retry evidence block 0001", + }, + makeUserInput("Continue after compaction."), + ], + }), + }); + expect(writePlan.status).toBe(200); + expect(await writePlan.text()).toContain('"name":"write"'); + + const contextOnlyWritePlan = await fetch(`${server.baseUrl}/v1/responses`, { + method: "POST", + headers: { "content-type": "application/json" }, + body: JSON.stringify({ + stream: true, + model: "gpt-5.5", + input: [ + { + type: "function_call_output", + output: "compaction retry evidence block 0000\ncompaction retry evidence block 0001", + }, + makeUserInput("Continue after compaction."), + ], + }), + }); + expect(contextOnlyWritePlan.status).toBe(200); + expect(await contextOnlyWritePlan.text()).toContain('"name":"write"'); + + const finalReply = await fetch(`${server.baseUrl}/v1/responses`, { + method: "POST", + headers: { "content-type": "application/json" }, + body: JSON.stringify({ + stream: false, + model: "gpt-5.5", + input: [ + makeUserInput(prompt), + { + type: "function_call_output", + output: "Successfully wrote 41 bytes to compaction-retry-summary.txt.", + }, + makeUserInput("Continue after compaction."), + ], + }), + }); + expect(finalReply.status).toBe(200); + expect(outputText(await finalReply.json())).toContain("replay unsafe after write"); + }); + it("supports exact reply memory prompts and embeddings requests", async () => { const server = await startQaMockOpenAiServer({ host: "127.0.0.1", @@ -1866,6 +1934,165 @@ describe("qa mock openai server", () => { expect(outputText(await phaseOnlyFinal.json())).toBe("subagent-1: ok\nsubagent-2: ok"); }); + it("uses full request text when planning continuation subagent tool calls", async () => { + const server = await startQaMockOpenAiServer({ + host: "127.0.0.1", + port: 0, + }); + cleanups.push(async () => { + await server.stop(); + }); + + const handoffPrompt = + "Delegate one bounded QA task to a subagent. Wait for the subagent to finish."; + const handoff = await fetch(`${server.baseUrl}/v1/responses`, { + method: "POST", + headers: { "content-type": "application/json" }, + body: JSON.stringify({ + stream: true, + tools: [SESSIONS_SPAWN_TOOL], + input: [makeUserInput(handoffPrompt), makeUserInput("Continue.")], + }), + }); + expect(handoff.status).toBe(200); + expect(await handoff.text()).toContain('"name":"sessions_spawn"'); + + const handoffServer = await startQaMockOpenAiServer({ + host: "127.0.0.1", + port: 0, + }); + cleanups.push(async () => { + await handoffServer.stop(); + }); + + const appServerHandoff = await fetch(`${handoffServer.baseUrl}/v1/responses`, { + method: "POST", + headers: { "content-type": "application/json" }, + body: JSON.stringify({ + stream: true, + input: [makeUserInput(handoffPrompt), makeUserInput("Continue.")], + }), + }); + expect(appServerHandoff.status).toBe(200); + expect(await appServerHandoff.text()).toContain('"name":"sessions_spawn"'); + + const repeatedHandoff = await fetch(`${handoffServer.baseUrl}/v1/responses`, { + method: "POST", + headers: { "content-type": "application/json" }, + body: JSON.stringify({ + stream: true, + input: [makeUserInput(handoffPrompt), makeUserInput("Continue again.")], + }), + }); + expect(repeatedHandoff.status).toBe(200); + expect(await repeatedHandoff.text()).not.toContain('"name":"sessions_spawn"'); + + const handoffFinal = await fetch(`${server.baseUrl}/v1/responses`, { + method: "POST", + headers: { "content-type": "application/json" }, + body: JSON.stringify({ + stream: false, + tools: [SESSIONS_SPAWN_TOOL], + input: [ + makeUserInput(handoffPrompt), + { type: "function_call_output", output: "SUBAGENT-OK" }, + makeUserInput("Continue."), + ], + }), + }); + expect(handoffFinal.status).toBe(200); + expect(outputText(await handoffFinal.json())).toContain("Delegated task"); + + const fanoutPrompt = + "Subagent fanout synthesis check: delegate two bounded subagents sequentially, then report both results together."; + const appServerFanout = await fetch(`${server.baseUrl}/v1/responses`, { + method: "POST", + headers: { "content-type": "application/json" }, + body: JSON.stringify({ + stream: true, + input: [makeUserInput(fanoutPrompt), makeUserInput("Continue.")], + }), + }); + expect(appServerFanout.status).toBe(200); + expect(await appServerFanout.text()).toContain('\\"label\\":\\"qa-fanout-alpha\\"'); + + const fanoutServer = await startQaMockOpenAiServer({ + host: "127.0.0.1", + port: 0, + }); + cleanups.push(async () => { + await fanoutServer.stop(); + }); + + const firstFanout = await fetch(`${fanoutServer.baseUrl}/v1/responses`, { + method: "POST", + headers: { "content-type": "application/json" }, + body: JSON.stringify({ + stream: true, + tools: [SESSIONS_SPAWN_TOOL], + input: [makeUserInput(fanoutPrompt)], + }), + }); + expect(firstFanout.status).toBe(200); + expect(await firstFanout.text()).toContain('\\"label\\":\\"qa-fanout-alpha\\"'); + + const secondFanout = await fetch(`${fanoutServer.baseUrl}/v1/responses`, { + method: "POST", + headers: { "content-type": "application/json" }, + body: JSON.stringify({ + stream: true, + tools: [SESSIONS_SPAWN_TOOL], + input: [ + makeUserInput(fanoutPrompt), + { + type: "function_call_output", + output: + '{"status":"accepted","childSessionKey":"agent:qa:subagent:alpha","note":"ALPHA-OK"}', + }, + makeUserInput("Continue."), + ], + }), + }); + expect(secondFanout.status).toBe(200); + expect(await secondFanout.text()).toContain('\\"label\\":\\"qa-fanout-beta\\"'); + }); + + it("keeps source discovery reports out of subagent handoff prose", async () => { + const server = await startQaMockOpenAiServer({ + host: "127.0.0.1", + port: 0, + }); + cleanups.push(async () => { + await server.stop(); + }); + + const response = await fetch(`${server.baseUrl}/v1/responses`, { + method: "POST", + headers: { "content-type": "application/json" }, + body: JSON.stringify({ + stream: false, + input: [ + makeUserInput( + "Read the seeded docs and source plan, then report grouped into Worked, Failed, Blocked, and Follow-up.", + ), + { + type: "function_call_output", + output: + "repo/qa/scenarios/index.md includes scenario: subagent-handoff and repo/extensions/qa-lab/src/suite.ts.", + }, + makeUserInput("Continue."), + ], + }), + }); + + expect(response.status).toBe(200); + const text = outputText(await response.json()); + expect(text).toContain("Worked:"); + expect(text).toContain("repo/docs/help/testing.md"); + expect(text).toContain("Follow-up:"); + expect(text).not.toContain("Delegated task"); + }); + it("does not let fanout completion state hijack child worker replies", async () => { const server = await startQaMockOpenAiServer({ host: "127.0.0.1", @@ -2727,7 +2954,7 @@ describe("qa mock openai server", () => { | { name: string; input: Record } | undefined; expect(toolUseBlock?.name).toBe("read"); - expect(toolUseBlock?.input).toEqual({ path: "QA_SCENARIO_PLAN.md" }); + expect(toolUseBlock?.input).toEqual({ path: "repo/qa/scenarios/index.md" }); const debugResponse = await fetch(`${server.baseUrl}/debug/last-request`); expect(debugResponse.status).toBe(200); @@ -2985,7 +3212,7 @@ describe("qa mock openai server", () => { expect(body).toContain("event: content_block_start"); expect(body).toContain('"type":"tool_use"'); expect(body).toContain('"name":"read"'); - expect(body).toContain("QA_SCENARIO_PLAN.md"); + expect(body).toContain("repo/qa/scenarios/index.md"); expect(body).toContain("event: message_delta"); expect(body).toContain("event: message_stop"); }); diff --git a/extensions/qa-lab/src/providers/mock-openai/server.ts b/extensions/qa-lab/src/providers/mock-openai/server.ts index 57fd40139de..85b6fc6c0e0 100644 --- a/extensions/qa-lab/src/providers/mock-openai/server.ts +++ b/extensions/qa-lab/src/providers/mock-openai/server.ts @@ -177,6 +177,7 @@ const QA_TOOL_SEARCH_FAILURE_PROMPT_RE = /tool search qa failure/i; type MockScenarioState = { subagentFanoutPhase: number; + subagentHandoffSpawned: boolean; }; const MOCK_OPENAI_MAX_BODY_BYTES = 16 * 1024 * 1024; @@ -1128,7 +1129,11 @@ function buildAssistantText( "- None.", ].join("\n"); } - if (toolOutput && (/\bdelegate\b/i.test(prompt) || /subagent handoff/i.test(prompt))) { + if ( + toolOutput && + (/delegate (?:one |a )bounded qa task/i.test(allInputText) || + /subagent handoff/i.test(allInputText)) + ) { const compact = toolOutput.replace(/\s+/g, " ").trim() || "no delegated output"; return `Delegated task:\n- Inspect the QA workspace via a bounded subagent.\nResult:\n- ${compact}\nEvidence:\n- The child result was folded back into the main thread exactly once.`; } @@ -1141,7 +1146,11 @@ function buildAssistantText( } return `Protocol note: Lobster Invaders built at lobster-invaders.html.`; } - if (toolOutput && /compaction retry mutating tool check/i.test(prompt)) { + if ( + toolOutput && + (/compaction retry mutating tool check/i.test(allInputText) || + /compaction-retry-summary\.txt/i.test(toolOutput)) + ) { if ( toolOutput.includes("Replay safety: unsafe after write.") || /compaction-retry-summary\.txt/i.test(toolOutput) || @@ -1152,6 +1161,22 @@ function buildAssistantText( } return ""; } + if ( + toolOutput && + /(worked, failed, blocked|worked\/failed\/blocked|source and docs)/i.test(allInputText) + ) { + return [ + "Worked:", + "- Read all three seeded files: repo/qa/scenarios/index.md, repo/extensions/qa-lab/src/suite.ts, and repo/docs/help/testing.md.", + "- Extra QA scenario candidates: config restart capability flip and image generation roundtrip.", + "Failed:", + "- None observed in mock mode.", + "Blocked:", + "- No live provider evidence in this lane.", + "Follow-up:", + "- Re-run with a real model for qualitative coverage.", + ].join("\n"); + } if (toolOutput) { const snippet = toolOutput.replace(/\s+/g, " ").trim().slice(0, 220); return `Protocol note: I reviewed the requested material. Evidence snippet: ${snippet || "no content"}`; @@ -1501,11 +1526,17 @@ async function buildResponsesPayload( const isBaselineUnmentionedChannelChatter = /\bno bot ping here\b/i.test(prompt); const hasReasoningOnlyRetryInstruction = allInputText.includes(QA_REASONING_ONLY_RETRY_NEEDLE); const hasEmptyResponseRetryInstruction = allInputText.includes(QA_EMPTY_RESPONSE_RETRY_NEEDLE); - const canCallSessionsSpawn = hasDeclaredTool(body, "sessions_spawn"); - const canCallSessionsYield = hasDeclaredTool(body, "sessions_yield"); - const canPlanQaSessionsSpawn = - canCallSessionsSpawn || - /subagent fanout synthesis check|delegate one bounded qa task|subagent handoff/i.test(prompt); + const canCallMockSubagentTool = + QA_SUBAGENT_DIRECT_FALLBACK_PROMPT_RE.test(allInputText) || + /subagent fanout synthesis check/i.test(allInputText) || + /forked subagent context qa check/i.test(allInputText) || + /delegate (?:one |a )bounded qa task/i.test(allInputText) || + /subagent handoff/i.test(allInputText) || + buildExplicitSessionsSpawnArgs(allInputText) !== null; + const canCallSessionsSpawn = hasDeclaredTool(body, "sessions_spawn") || canCallMockSubagentTool; + const canCallSessionsYield = + hasDeclaredTool(body, "sessions_yield") || + QA_SUBAGENT_DIRECT_FALLBACK_PROMPT_RE.test(allInputText); const buildToolProgressReadEvents = (pattern: RegExp) => { const toolProgressPrompt = extractLastMatchingUserText(extractAllUserTexts(input), pattern); return buildToolCallEventsWithArgs("read", { @@ -1819,7 +1850,11 @@ async function buildResponsesPayload( }); } } - if (/compaction retry mutating tool check/i.test(prompt)) { + if ( + /compaction retry mutating tool check/i.test(allInputText) || + /compaction retry evidence/i.test(toolOutput) || + /compaction-retry-summary\.txt/i.test(toolOutput) + ) { if (!toolOutput) { return buildToolCallEventsWithArgs("read", { path: "COMPACTION_RETRY_CONTEXT.md" }); } @@ -2002,7 +2037,7 @@ async function buildResponsesPayload( size: "1024x1024", }); } - if (canPlanQaSessionsSpawn && /subagent fanout synthesis check/i.test(prompt)) { + if (canCallSessionsSpawn && /subagent fanout synthesis check/i.test(allInputText)) { if (!toolOutput && scenarioState.subagentFanoutPhase === 0) { scenarioState.subagentFanoutPhase = 1; return buildToolCallEventsWithArgs("sessions_spawn", { @@ -2078,10 +2113,13 @@ async function buildResponsesPayload( } } if ( - canPlanQaSessionsSpawn && - (/\bdelegate\b/i.test(prompt) || /subagent handoff/i.test(prompt)) && - !toolOutput + canCallSessionsSpawn && + (/delegate (?:one |a )bounded qa task/i.test(allInputText) || + /subagent handoff/i.test(allInputText)) && + !toolOutput && + !scenarioState.subagentHandoffSpawned ) { + scenarioState.subagentHandoffSpawned = true; return buildToolCallEventsWithArgs("sessions_spawn", { task: "Inspect the QA workspace and return one concise protocol note.", label: "qa-sidecar", @@ -2092,7 +2130,7 @@ async function buildResponsesPayload( /(worked, failed, blocked|worked\/failed\/blocked|source and docs)/i.test(prompt) && !toolOutput ) { - return buildToolCallEventsWithArgs("read", { path: "QA_SCENARIO_PLAN.md" }); + return buildToolCallEventsWithArgs("read", { path: "repo/qa/scenarios/index.md" }); } if (!toolOutput && /\b(read|inspect|repo|docs|scenario|kickoff)\b/i.test(prompt)) { return buildToolCallEvents(prompt); @@ -2496,7 +2534,10 @@ async function buildMessagesPayload( export async function startQaMockOpenAiServer(params?: { host?: string; port?: number }) { const host = params?.host ?? "127.0.0.1"; - const scenarioState: MockScenarioState = { subagentFanoutPhase: 0 }; + const scenarioState: MockScenarioState = { + subagentFanoutPhase: 0, + subagentHandoffSpawned: false, + }; let lastRequest: MockOpenAiRequestSnapshot | null = null; const requests: MockOpenAiRequestSnapshot[] = []; const imageGenerationRequests: Array> = []; diff --git a/extensions/qa-lab/src/runtime-parity.test.ts b/extensions/qa-lab/src/runtime-parity.test.ts index 6a804bc56f6..5d1ec6afe91 100644 --- a/extensions/qa-lab/src/runtime-parity.test.ts +++ b/extensions/qa-lab/src/runtime-parity.test.ts @@ -5,6 +5,7 @@ import path from "node:path"; import { afterEach, describe, expect, it, vi } from "vitest"; import { captureRuntimeParityCell, + isRuntimeParityResultPass, runRuntimeParityScenario, type RuntimeId, type RuntimeParityCell, @@ -179,6 +180,7 @@ describe("runtime parity", () => { }); expect(result.drift).toBe("tool-call-shape"); + expect(isRuntimeParityResultPass(result)).toBe(true); }); it("classifies tool result shape drift", async () => { @@ -220,6 +222,7 @@ describe("runtime parity", () => { }); expect(result.drift).toBe("failure-mode"); + expect(isRuntimeParityResultPass(result)).toBe(false); }); it("surfaces tool-call-shape when one runtime fails because the tool path drifted", async () => { @@ -235,6 +238,7 @@ describe("runtime parity", () => { }); expect(result.drift).toBe("tool-call-shape"); + expect(isRuntimeParityResultPass(result)).toBe(false); }); it("surfaces tool-result-shape when a downstream timeout follows divergent tool output", async () => { diff --git a/extensions/qa-lab/src/runtime-parity.ts b/extensions/qa-lab/src/runtime-parity.ts index 6879ab2f873..e1ff8464cc0 100644 --- a/extensions/qa-lab/src/runtime-parity.ts +++ b/extensions/qa-lab/src/runtime-parity.ts @@ -59,6 +59,23 @@ export type RuntimeParityScenarioExecution = { cell: RuntimeParityCell; }; +export function runtimeParityCellStatus( + cell: RuntimeParityCell | undefined, +): "pass" | "fail" | "missing" { + if (!cell) { + return "missing"; + } + return cell.runtimeErrorClass || cell.transportErrorClass ? "fail" : "pass"; +} + +export function isRuntimeParityResultPass(result: RuntimeParityResult) { + return ( + result.drift !== "failure-mode" && + runtimeParityCellStatus(result.cells.pi) === "pass" && + runtimeParityCellStatus(result.cells.codex) === "pass" + ); +} + type QaGatewayLike = { logs?: () => string; tempRoot: string; diff --git a/extensions/qa-lab/src/suite.ts b/extensions/qa-lab/src/suite.ts index 1b73d133c00..7aa3964d8ca 100644 --- a/extensions/qa-lab/src/suite.ts +++ b/extensions/qa-lab/src/suite.ts @@ -31,6 +31,7 @@ import { renderQaMarkdownReport, type QaReportCheck, type QaReportScenario } fro import { defaultQaModelForMode } from "./run-config.js"; import { captureRuntimeParityCell, + isRuntimeParityResultPass, runRuntimeParityScenario, type RuntimeId, type RuntimeParityCell, @@ -276,7 +277,7 @@ async function runScenarioDefinition( } function isRuntimeParityPass(result: RuntimeParityResult) { - return result.drift === "none" || result.drift === "text-only"; + return isRuntimeParityResultPass(result); } function formatRuntimeParityCellDetails(cell: RuntimeParityCell) { diff --git a/src/agents/openclaw-tools.ts b/src/agents/openclaw-tools.ts index 7d9684397c0..74324cdfbd2 100644 --- a/src/agents/openclaw-tools.ts +++ b/src/agents/openclaw-tools.ts @@ -346,6 +346,7 @@ export function createOpenClawTools( !embedded || options?.sourceReplyDeliveryMode === "message_tool_only" || messageExplicitlyAllowed; + const includeSubagentSpawnTool = !embedded || options?.allowGatewaySubagentBinding === true; const effectiveCallGateway = embedded ? createEmbeddedCallGateway() : openClawToolsDeps.callGateway; @@ -424,6 +425,9 @@ export function createOpenClawTools( config: resolvedConfig, callGateway: openClawToolsDeps.callGateway, }), + ]), + ...(includeSubagentSpawnTool + ? [ createSessionsSpawnTool({ agentSessionKey: options?.agentSessionKey, agentChannel: options?.agentChannel, @@ -441,7 +445,8 @@ export function createOpenClawTools( inheritedToolAllowlist: options?.inheritedToolAllowlist, inheritedToolDenylist: options?.inheritedToolDenylist, }), - ]), + ] + : []), createSessionsYieldTool({ sessionId: options?.sessionId, onYield: options?.onYield, diff --git a/src/agents/openclaw-tools.update-plan.test.ts b/src/agents/openclaw-tools.update-plan.test.ts index ffd676c32c6..6ae4c853b2b 100644 --- a/src/agents/openclaw-tools.update-plan.test.ts +++ b/src/agents/openclaw-tools.update-plan.test.ts @@ -123,6 +123,24 @@ describe("openclaw-tools update_plan gating", () => { expect(toolNames(denied)).not.toContain("message"); }); + it("keeps subagent spawn available for trusted embedded gateway-bound runs", () => { + setEmbeddedMode(true); + const defaultTools = createOpenClawTools({ + config: {} as OpenClawConfig, + disablePluginTools: true, + }); + const gatewayBoundTools = createOpenClawTools({ + config: {} as OpenClawConfig, + disablePluginTools: true, + allowGatewaySubagentBinding: true, + }); + + expect(toolNames(defaultTools)).not.toContain("sessions_spawn"); + expect(toolNames(defaultTools)).not.toContain("sessions_send"); + expect(toolNames(gatewayBoundTools)).toContain("sessions_spawn"); + expect(toolNames(gatewayBoundTools)).not.toContain("sessions_send"); + }); + it("registers update_plan when explicitly enabled", () => { const config = { tools: {