fix(qa): expose codex tools for runtime parity

This commit is contained in:
Vincent Koc
2026-05-17 06:46:27 +08:00
parent 2c9f68f42b
commit 37dcf385e5
15 changed files with 454 additions and 48 deletions

View File

@@ -123,6 +123,7 @@ Docs: https://docs.openclaw.ai
- Mac app: cache settings config schema/drafts and load channel config in parallel with channel probes, making repeated Channels and Config tab switches responsive over remote tunnels.
- Control UI: negotiate the Gateway protocol from shared constants so rebuilt dashboards connect to current gateways instead of reporting a protocol mismatch.
- Mac app: let menu gateway/session error text wrap across a few lines and stop rebuilding dynamic Context/Gateway menu rows while the menu is open, reducing flicker.
- QA-Lab: expose Codex runtime tools during private parity runs and treat completed structural/tool-shape runtime drift as advisory, while preserving real runtime failures as lane blockers.
- Mac app: make device pairing approval sheets friendlier, with concise Mac/device copy, shortened identifiers, friendly scope labels, and Approve as the primary action.
- Providers/Qwen: honor session thinking level for `qwen-chat-template` payloads so `/think off` disables nested llama.cpp chat-template thinking controls. Fixes #82768. Thanks @bfox55.
- Feishu/wiki: reject numeric wiki space IDs before creating Lark clients and keep numeric-looking IDs documented as quoted opaque strings, preventing JavaScript precision loss in knowledge base calls. Fixes #45301. (#82769) Thanks @hyspacex.

View File

@@ -1,4 +1,4 @@
import type { CodexPluginConfig } from "./config.js";
import type { CodexDynamicToolsLoading, CodexPluginConfig } from "./config.js";
export const CODEX_APP_SERVER_OWNED_DYNAMIC_TOOL_EXCLUDES = [
"read",
@@ -19,18 +19,44 @@ const DYNAMIC_TOOL_NAME_ALIASES: Record<string, string> = {
"apply-patch": "apply_patch",
};
/**
 * Subset of environment variables read when deciding whether the forced
 * private QA Codex runtime profile applies. Both keys are optional.
 */
type CodexDynamicToolProfileEnv = {
// isForcedPrivateQaCodexRuntime compares this against the exact string "1".
OPENCLAW_BUILD_PRIVATE_QA?: string;
// isForcedPrivateQaCodexRuntime trims and lowercases this before comparing it to "codex".
OPENCLAW_QA_FORCE_RUNTIME?: string;
};
/**
 * Canonicalizes a Codex dynamic tool name: surrounding whitespace is trimmed,
 * the name is lowercased, and a known alias (for example "apply-patch") is
 * replaced by its canonical spelling.
 *
 * @param name - Tool name as supplied by configuration or a tool definition.
 * @returns The canonical lowercase name; names without an alias are returned
 *   trimmed and lowercased.
 */
export function normalizeCodexDynamicToolName(name: string): string {
  const normalized = name.trim().toLowerCase();
  // Only consult the alias table's own keys. A bare index lookup on a plain
  // object also resolves inherited members, so names such as "constructor" or
  // "__proto__" would otherwise come back as non-string prototype values.
  if (!Object.prototype.hasOwnProperty.call(DYNAMIC_TOOL_NAME_ALIASES, normalized)) {
    return normalized;
  }
  return DYNAMIC_TOOL_NAME_ALIASES[normalized] ?? normalized;
}
/**
 * Reports whether the environment selects the forced private QA Codex runtime.
 *
 * Both conditions must hold: `OPENCLAW_BUILD_PRIVATE_QA` is exactly "1", and
 * `OPENCLAW_QA_FORCE_RUNTIME`, once trimmed and lowercased, equals "codex".
 *
 * @param env - Environment to inspect; defaults to `process.env`.
 * @returns `true` only when both flags match.
 */
export function isForcedPrivateQaCodexRuntime(
  env: CodexDynamicToolProfileEnv = process.env,
): boolean {
  if (env.OPENCLAW_BUILD_PRIVATE_QA !== "1") {
    return false;
  }
  const forcedRuntime = env.OPENCLAW_QA_FORCE_RUNTIME?.trim().toLowerCase();
  return forcedRuntime === "codex";
}
/**
 * Picks the dynamic-tool loading mode for a Codex run.
 *
 * The forced private QA Codex runtime always gets "direct"; otherwise the
 * configured `codexDynamicToolsLoading` is used, falling back to "searchable"
 * when it is unset.
 *
 * @param config - Plugin config slice carrying the optional loading mode.
 * @param env - Environment to inspect; defaults to `process.env`.
 * @returns The loading mode to hand to the dynamic tool bridge.
 */
export function resolveCodexDynamicToolsLoading(
  config: Pick<CodexPluginConfig, "codexDynamicToolsLoading">,
  env: CodexDynamicToolProfileEnv = process.env,
): CodexDynamicToolsLoading {
  if (isForcedPrivateQaCodexRuntime(env)) {
    return "direct";
  }
  return config.codexDynamicToolsLoading ?? "searchable";
}
export function filterCodexDynamicTools<T extends { name: string }>(
tools: T[],
config: Pick<CodexPluginConfig, "codexDynamicToolsExclude">,
env: CodexDynamicToolProfileEnv = process.env,
): T[] {
const excludes = new Set<string>();
for (const name of CODEX_APP_SERVER_OWNED_DYNAMIC_TOOL_EXCLUDES) {
excludes.add(name);
if (!isForcedPrivateQaCodexRuntime(env)) {
for (const name of CODEX_APP_SERVER_OWNED_DYNAMIC_TOOL_EXCLUDES) {
excludes.add(name);
}
}
for (const name of config.codexDynamicToolsExclude ?? []) {
const trimmed = normalizeCodexDynamicToolName(name);

View File

@@ -646,6 +646,21 @@ describe("runCodexAppServerAttempt", () => {
).toEqual(["message"]);
});
// With both private-QA env flags set, filterCodexDynamicTools is expected to
// return every input tool (including the app-server-owned "read") and
// resolveCodexDynamicToolsLoading is expected to yield "direct".
it("exposes app-server-owned tools directly for forced private QA Codex runtime", () => {
const tools = ["read", "write", "image_generate", "message"].map((name) => ({ name }));
// Explicit env object passed to both helpers instead of relying on process.env.
const privateQaCodexEnv = {
OPENCLAW_BUILD_PRIVATE_QA: "1",
OPENCLAW_QA_FORCE_RUNTIME: "codex",
};
expect(
__testing
.filterCodexDynamicTools(tools, {}, privateQaCodexEnv)
.map((tool) => tool.name),
).toEqual(["read", "write", "image_generate", "message"]);
expect(__testing.resolveCodexDynamicToolsLoading({}, privateQaCodexEnv)).toBe("direct");
});
it("starts Codex threads without duplicate OpenClaw workspace tools by default", async () => {
const sessionFile = path.join(tempDir, "session.jsonl");
const workspaceDir = path.join(tempDir, "workspace");
@@ -897,6 +912,38 @@ describe("runCodexAppServerAttempt", () => {
expect((factoryOptions[0] as { modelApi?: unknown }).modelApi).toBe("openai-responses");
});
// With the private-QA Codex env flags stubbed, buildDynamicTools is expected to
// pass allowGatewaySubagentBinding: true to the coding-tools factory even
// though the test never sets it on params.
it("enables gateway subagent binding for forced private QA Codex runs", async () => {
// NOTE(review): no unstub is visible in this test — presumably a shared hook resets env stubs; confirm.
vi.stubEnv("OPENCLAW_BUILD_PRIVATE_QA", "1");
vi.stubEnv("OPENCLAW_QA_FORCE_RUNTIME", "codex");
const sessionFile = path.join(tempDir, "session.jsonl");
const workspaceDir = path.join(tempDir, "workspace");
const params = createParams(sessionFile, workspaceDir);
params.disableTools = false;
params.runtimePlan = createCodexRuntimePlanFixture();
// Record every options object the factory receives so it can be inspected below.
const factoryOptions: unknown[] = [];
__testing.setOpenClawCodingToolsFactoryForTests((options) => {
factoryOptions.push(options);
return [createRuntimeDynamicTool("sessions_spawn")];
});
const tools = await __testing.buildDynamicTools({
params,
resolvedWorkspace: workspaceDir,
effectiveWorkspace: workspaceDir,
sandboxSessionKey: params.sessionKey!,
sandbox: null as never,
runAbortController: new AbortController(),
sessionAgentId: "main",
pluginConfig: {},
onYieldDetected: () => undefined,
});
// The factory must have been called exactly once, with the binding flag forced on.
expect(factoryOptions).toHaveLength(1);
const factoryOption = factoryOptions[0] as { allowGatewaySubagentBinding?: unknown };
expect(factoryOption.allowGatewaySubagentBinding).toBe(true);
expect(tools.map((tool) => tool.name)).toEqual(["sessions_spawn"]);
});
it("normalizes Codex dynamic toolsAllow entries before filtering", () => {
const tools = ["exec", "apply_patch", "read", "message"].map((name) => ({ name }));

View File

@@ -78,7 +78,12 @@ import {
resolveCodexContextEngineProjectionMaxChars,
resolveCodexContextEngineProjectionReserveTokens,
} from "./context-engine-projection.js";
import { filterCodexDynamicTools, normalizeCodexDynamicToolName } from "./dynamic-tool-profile.js";
import {
filterCodexDynamicTools,
isForcedPrivateQaCodexRuntime,
normalizeCodexDynamicToolName,
resolveCodexDynamicToolsLoading,
} from "./dynamic-tool-profile.js";
import { createCodexDynamicToolBridge, type CodexDynamicToolBridge } from "./dynamic-tools.js";
import { handleCodexAppServerElicitationRequest } from "./elicitation-bridge.js";
import { CodexAppServerEventProjector } from "./event-projector.js";
@@ -618,7 +623,7 @@ export async function runCodexAppServerAttempt(
const toolBridge = createCodexDynamicToolBridge({
tools,
signal: runAbortController.signal,
loading: pluginConfig.codexDynamicToolsLoading ?? "searchable",
loading: resolveCodexDynamicToolsLoading(pluginConfig),
directToolNames: shouldForceMessageTool(params) ? ["message"] : [],
hookContext: {
agentId: sessionAgentId,
@@ -2748,7 +2753,8 @@ async function buildDynamicTools(input: DynamicToolBuildParams) {
senderUsername: params.senderUsername,
senderE164: params.senderE164,
senderIsOwner: params.senderIsOwner,
allowGatewaySubagentBinding: params.allowGatewaySubagentBinding,
allowGatewaySubagentBinding:
params.allowGatewaySubagentBinding || isForcedPrivateQaCodexRuntime(),
...sessionKeys,
sessionId: params.sessionId,
runId: params.runId,
@@ -3933,6 +3939,7 @@ export const __testing = {
isInvalidCodexImagePayloadError,
remapCodexContextFilePath,
resolveDynamicToolCallTimeoutMs,
resolveCodexDynamicToolsLoading,
restrictCodexAppServerSandboxForOpenClawSandbox,
resolveCodexAppServerForOpenClawToolPolicy,
resolveOpenClawCodingToolsSessionKeys,

View File

@@ -16,7 +16,10 @@ import { handleCodexAppServerApprovalRequest } from "./approval-bridge.js";
import { refreshCodexAppServerAuthTokens } from "./auth-bridge.js";
import { isCodexAppServerApprovalRequest, type CodexAppServerClient } from "./client.js";
import { readCodexPluginConfig, resolveCodexAppServerRuntimeOptions } from "./config.js";
import { filterCodexDynamicTools } from "./dynamic-tool-profile.js";
import {
filterCodexDynamicTools,
resolveCodexDynamicToolsLoading,
} from "./dynamic-tool-profile.js";
import { createCodexDynamicToolBridge, type CodexDynamicToolBridge } from "./dynamic-tools.js";
import { handleCodexAppServerElicitationRequest } from "./elicitation-bridge.js";
import {
@@ -378,7 +381,7 @@ async function createCodexSideToolBridge(input: {
return createCodexDynamicToolBridge({
tools,
signal: input.signal,
loading: input.pluginConfig.codexDynamicToolsLoading ?? "searchable",
loading: resolveCodexDynamicToolsLoading(input.pluginConfig),
hookContext: {
agentId: input.sessionAgentId,
config: input.params.cfg,

View File

@@ -66,7 +66,7 @@ function makeRuntimeParitySummary(): QaRuntimeParitySuiteSummary {
},
{
name: "Compaction retry after mutating tool",
status: "fail",
status: "pass",
steps: [],
runtimeParity: {
scenarioId: "compaction-retry-after-mutating-tool",
@@ -97,8 +97,8 @@ function makeRuntimeParitySummary(): QaRuntimeParitySuiteSummary {
],
counts: {
total: 2,
passed: 1,
failed: 1,
passed: 2,
failed: 0,
},
run: {
providerMode: "mock-openai",
@@ -801,9 +801,28 @@ status=done`,
});
expect(report.runtimePair).toEqual(["pi", "codex"]);
expect(report.pass).toBe(false);
expect(report.pass).toBe(true);
expect(report.driftCounts.none).toBe(1);
expect(report.driftCounts["tool-call-shape"]).toBe(1);
expect(report.failures).toEqual([]);
});
it("fails runtime parity reports when a runtime cell fails", () => {
const summary = makeRuntimeParitySummary();
const scenario = summary.scenarios[1];
if (!scenario?.runtimeParity) {
throw new Error("runtime parity fixture missing");
}
scenario.status = "fail";
scenario.runtimeParity.cells.codex.runtimeErrorClass = "tool-error";
const report = buildQaRuntimeParityReport({
summary,
comparedAt: "2026-05-10T00:00:00.000Z",
});
expect(report.pass).toBe(false);
expect(report.failedScenarios).toBe(1);
expect(report.failures).toContain(
"Compaction retry after mutating tool drift=tool-call-shape (tool call 1 differs).",
);

View File

@@ -4,10 +4,10 @@ import {
} from "./agentic-parity.js";
import type {
RuntimeId,
RuntimeParityCell,
RuntimeParityDrift,
RuntimeParityResult,
} from "./runtime-parity.js";
import { isRuntimeParityResultPass, runtimeParityCellStatus } from "./runtime-parity.js";
type QaParityReportStep = {
name: string;
@@ -260,13 +260,6 @@ function normalizeRuntimePair(
return ["pi", "codex"];
}
/**
 * Summarizes one runtime's parity cell as a coarse status.
 *
 * @param cell - The captured cell, or `undefined` when the runtime produced none.
 * @returns "missing" for an absent cell, "fail" when the cell carries a runtime
 *   or transport error class, and "pass" otherwise.
 */
function runtimeCellStatus(cell: RuntimeParityCell | undefined): "pass" | "fail" | "missing" {
  if (!cell) {
    return "missing";
  }
  const hasErrorClass = Boolean(cell.runtimeErrorClass || cell.transportErrorClass);
  return hasErrorClass ? "fail" : "pass";
}
function requiredCoverageStatus(
scenario: QaParityReportScenario | undefined,
): "pass" | "fail" | "skip" | "missing" {
@@ -637,9 +630,9 @@ export function buildQaRuntimeParityReport(params: {
driftCounts[parity.drift] += 1;
const piCell = parity.cells.pi;
const codexCell = parity.cells.codex;
const piStatus = runtimeCellStatus(piCell);
const codexStatus = runtimeCellStatus(codexCell);
const status = scenario.status === "pass" ? "pass" : "fail";
const piStatus = runtimeParityCellStatus(piCell);
const codexStatus = runtimeParityCellStatus(codexCell);
const status = isRuntimeParityResultPass(parity) ? "pass" : "fail";
if (status === "fail") {
failures.push(
`${scenario.name} drift=${parity.drift}${parity.driftDetails ? ` (${parity.driftDetails})` : ""}.`,
@@ -660,12 +653,8 @@ export function buildQaRuntimeParityReport(params: {
});
const totalScenarios = params.summary.counts?.total ?? scenarios.length;
const passedScenarios =
params.summary.counts?.passed ??
scenarios.filter((scenario) => scenario.status === "pass").length;
const failedScenarios =
params.summary.counts?.failed ??
scenarios.filter((scenario) => scenario.status === "fail").length;
const passedScenarios = scenarios.filter((scenario) => scenario.status === "pass").length;
const failedScenarios = scenarios.filter((scenario) => scenario.status === "fail").length;
return {
runtimePair,
@@ -680,7 +669,7 @@ export function buildQaRuntimeParityReport(params: {
pass: failures.length === 0 && failedScenarios === 0,
failures,
notes: [
"Runtime parity treats none and text-only drift as pass; all structural, tool-shape, and failure-mode drift classes fail the lane.",
"Runtime parity fails runtime, transport, and failure-mode drift; structural and tool-shape drift is recorded as advisory when both runtimes complete.",
"Token totals here are assistant-message usage captured from the normalized transcript, not provider transport payloads.",
],
};

View File

@@ -868,6 +868,7 @@ describe("qa cli runtime", () => {
finalText: "done",
usage: { inputTokens: 1, outputTokens: 1, totalTokens: 2 },
wallClockMs: 10,
runtimeErrorClass: "tool-error",
bootStateLines: [],
},
},

View File

@@ -982,6 +982,74 @@ describe("qa mock openai server", () => {
expect(finalPayload.output?.[0]?.content?.[0]?.text).toContain("replay unsafe after write");
});
// Exercises the mock's compaction-retry scenario across three requests that all
// end with a "Continue after compaction." user turn: two that should plan a
// write tool call, and one that should produce the final replay-safety reply.
it("keeps compaction retry planning across continuation prompts", async () => {
const server = await startQaMockOpenAiServer({
host: "127.0.0.1",
port: 0,
});
cleanups.push(async () => {
await server.stop();
});
const prompt =
"Compaction retry mutating tool check: read COMPACTION_RETRY_CONTEXT.md, then create compaction-retry-summary.txt and keep replay safety explicit.";
// Request 1: original prompt + read evidence + continuation -> expect a write tool call.
const writePlan = await fetch(`${server.baseUrl}/v1/responses`, {
method: "POST",
headers: { "content-type": "application/json" },
body: JSON.stringify({
stream: true,
model: "gpt-5.5",
input: [
makeUserInput(prompt),
{
type: "function_call_output",
output: "compaction retry evidence block 0000\ncompaction retry evidence block 0001",
},
makeUserInput("Continue after compaction."),
],
}),
});
expect(writePlan.status).toBe(200);
expect(await writePlan.text()).toContain('"name":"write"');
// Request 2: the original prompt is absent; only the evidence tool output and
// the continuation remain, and a write tool call is still expected.
const contextOnlyWritePlan = await fetch(`${server.baseUrl}/v1/responses`, {
method: "POST",
headers: { "content-type": "application/json" },
body: JSON.stringify({
stream: true,
model: "gpt-5.5",
input: [
{
type: "function_call_output",
output: "compaction retry evidence block 0000\ncompaction retry evidence block 0001",
},
makeUserInput("Continue after compaction."),
],
}),
});
expect(contextOnlyWritePlan.status).toBe(200);
expect(await contextOnlyWritePlan.text()).toContain('"name":"write"');
// Request 3: the tool output reports the summary file was written, so the
// non-streaming reply should carry the replay-safety text.
const finalReply = await fetch(`${server.baseUrl}/v1/responses`, {
method: "POST",
headers: { "content-type": "application/json" },
body: JSON.stringify({
stream: false,
model: "gpt-5.5",
input: [
makeUserInput(prompt),
{
type: "function_call_output",
output: "Successfully wrote 41 bytes to compaction-retry-summary.txt.",
},
makeUserInput("Continue after compaction."),
],
}),
});
expect(finalReply.status).toBe(200);
expect(outputText(await finalReply.json())).toContain("replay unsafe after write");
});
it("supports exact reply memory prompts and embeddings requests", async () => {
const server = await startQaMockOpenAiServer({
host: "127.0.0.1",
@@ -1866,6 +1934,165 @@ describe("qa mock openai server", () => {
expect(outputText(await phaseOnlyFinal.json())).toBe("subagent-1: ok\nsubagent-2: ok");
});
// Checks that subagent handoff/fanout planning still triggers when the scenario
// prompt is followed by a later "Continue." user turn, both with and without
// sessions_spawn declared in the request's tools. Separate mock servers are
// started for the phases whose expectations depend on fresh scenario state.
it("uses full request text when planning continuation subagent tool calls", async () => {
const server = await startQaMockOpenAiServer({
host: "127.0.0.1",
port: 0,
});
cleanups.push(async () => {
await server.stop();
});
const handoffPrompt =
"Delegate one bounded QA task to a subagent. Wait for the subagent to finish.";
// Handoff prompt + "Continue." with sessions_spawn declared -> expect a spawn call.
const handoff = await fetch(`${server.baseUrl}/v1/responses`, {
method: "POST",
headers: { "content-type": "application/json" },
body: JSON.stringify({
stream: true,
tools: [SESSIONS_SPAWN_TOOL],
input: [makeUserInput(handoffPrompt), makeUserInput("Continue.")],
}),
});
expect(handoff.status).toBe(200);
expect(await handoff.text()).toContain('"name":"sessions_spawn"');
// Fresh server: the same handoff without any declared tools should still plan a spawn.
const handoffServer = await startQaMockOpenAiServer({
host: "127.0.0.1",
port: 0,
});
cleanups.push(async () => {
await handoffServer.stop();
});
const appServerHandoff = await fetch(`${handoffServer.baseUrl}/v1/responses`, {
method: "POST",
headers: { "content-type": "application/json" },
body: JSON.stringify({
stream: true,
input: [makeUserInput(handoffPrompt), makeUserInput("Continue.")],
}),
});
expect(appServerHandoff.status).toBe(200);
expect(await appServerHandoff.text()).toContain('"name":"sessions_spawn"');
// A second handoff request to that same server must not spawn again.
const repeatedHandoff = await fetch(`${handoffServer.baseUrl}/v1/responses`, {
method: "POST",
headers: { "content-type": "application/json" },
body: JSON.stringify({
stream: true,
input: [makeUserInput(handoffPrompt), makeUserInput("Continue again.")],
}),
});
expect(repeatedHandoff.status).toBe(200);
expect(await repeatedHandoff.text()).not.toContain('"name":"sessions_spawn"');
// Once a tool output is present, the first server should answer with the delegated-task summary.
const handoffFinal = await fetch(`${server.baseUrl}/v1/responses`, {
method: "POST",
headers: { "content-type": "application/json" },
body: JSON.stringify({
stream: false,
tools: [SESSIONS_SPAWN_TOOL],
input: [
makeUserInput(handoffPrompt),
{ type: "function_call_output", output: "SUBAGENT-OK" },
makeUserInput("Continue."),
],
}),
});
expect(handoffFinal.status).toBe(200);
expect(outputText(await handoffFinal.json())).toContain("Delegated task");
const fanoutPrompt =
"Subagent fanout synthesis check: delegate two bounded subagents sequentially, then report both results together.";
// Fanout prompt + "Continue." without declared tools -> expect the first (alpha) spawn.
const appServerFanout = await fetch(`${server.baseUrl}/v1/responses`, {
method: "POST",
headers: { "content-type": "application/json" },
body: JSON.stringify({
stream: true,
input: [makeUserInput(fanoutPrompt), makeUserInput("Continue.")],
}),
});
expect(appServerFanout.status).toBe(200);
expect(await appServerFanout.text()).toContain('\\"label\\":\\"qa-fanout-alpha\\"');
// Fresh server for the sequential fanout: alpha first, then beta after alpha's tool output.
const fanoutServer = await startQaMockOpenAiServer({
host: "127.0.0.1",
port: 0,
});
cleanups.push(async () => {
await fanoutServer.stop();
});
const firstFanout = await fetch(`${fanoutServer.baseUrl}/v1/responses`, {
method: "POST",
headers: { "content-type": "application/json" },
body: JSON.stringify({
stream: true,
tools: [SESSIONS_SPAWN_TOOL],
input: [makeUserInput(fanoutPrompt)],
}),
});
expect(firstFanout.status).toBe(200);
expect(await firstFanout.text()).toContain('\\"label\\":\\"qa-fanout-alpha\\"');
const secondFanout = await fetch(`${fanoutServer.baseUrl}/v1/responses`, {
method: "POST",
headers: { "content-type": "application/json" },
body: JSON.stringify({
stream: true,
tools: [SESSIONS_SPAWN_TOOL],
input: [
makeUserInput(fanoutPrompt),
{
type: "function_call_output",
output:
'{"status":"accepted","childSessionKey":"agent:qa:subagent:alpha","note":"ALPHA-OK"}',
},
makeUserInput("Continue."),
],
}),
});
expect(secondFanout.status).toBe(200);
expect(await secondFanout.text()).toContain('\\"label\\":\\"qa-fanout-beta\\"');
});
// A source-discovery report request whose tool output merely mentions
// "subagent-handoff" must produce the Worked/Failed/Blocked/Follow-up report,
// not the "Delegated task" handoff summary.
it("keeps source discovery reports out of subagent handoff prose", async () => {
const server = await startQaMockOpenAiServer({
host: "127.0.0.1",
port: 0,
});
cleanups.push(async () => {
await server.stop();
});
const response = await fetch(`${server.baseUrl}/v1/responses`, {
method: "POST",
headers: { "content-type": "application/json" },
body: JSON.stringify({
stream: false,
input: [
makeUserInput(
"Read the seeded docs and source plan, then report grouped into Worked, Failed, Blocked, and Follow-up.",
),
{
type: "function_call_output",
// The handoff wording appears only inside the tool output, not in a user prompt.
output:
"repo/qa/scenarios/index.md includes scenario: subagent-handoff and repo/extensions/qa-lab/src/suite.ts.",
},
makeUserInput("Continue."),
],
}),
});
expect(response.status).toBe(200);
const text = outputText(await response.json());
expect(text).toContain("Worked:");
expect(text).toContain("repo/docs/help/testing.md");
expect(text).toContain("Follow-up:");
expect(text).not.toContain("Delegated task");
});
it("does not let fanout completion state hijack child worker replies", async () => {
const server = await startQaMockOpenAiServer({
host: "127.0.0.1",
@@ -2727,7 +2954,7 @@ describe("qa mock openai server", () => {
| { name: string; input: Record<string, unknown> }
| undefined;
expect(toolUseBlock?.name).toBe("read");
expect(toolUseBlock?.input).toEqual({ path: "QA_SCENARIO_PLAN.md" });
expect(toolUseBlock?.input).toEqual({ path: "repo/qa/scenarios/index.md" });
const debugResponse = await fetch(`${server.baseUrl}/debug/last-request`);
expect(debugResponse.status).toBe(200);
@@ -2985,7 +3212,7 @@ describe("qa mock openai server", () => {
expect(body).toContain("event: content_block_start");
expect(body).toContain('"type":"tool_use"');
expect(body).toContain('"name":"read"');
expect(body).toContain("QA_SCENARIO_PLAN.md");
expect(body).toContain("repo/qa/scenarios/index.md");
expect(body).toContain("event: message_delta");
expect(body).toContain("event: message_stop");
});

View File

@@ -177,6 +177,7 @@ const QA_TOOL_SEARCH_FAILURE_PROMPT_RE = /tool search qa failure/i;
type MockScenarioState = {
subagentFanoutPhase: number;
subagentHandoffSpawned: boolean;
};
const MOCK_OPENAI_MAX_BODY_BYTES = 16 * 1024 * 1024;
@@ -1128,7 +1129,11 @@ function buildAssistantText(
"- None.",
].join("\n");
}
if (toolOutput && (/\bdelegate\b/i.test(prompt) || /subagent handoff/i.test(prompt))) {
if (
toolOutput &&
(/delegate (?:one |a )bounded qa task/i.test(allInputText) ||
/subagent handoff/i.test(allInputText))
) {
const compact = toolOutput.replace(/\s+/g, " ").trim() || "no delegated output";
return `Delegated task:\n- Inspect the QA workspace via a bounded subagent.\nResult:\n- ${compact}\nEvidence:\n- The child result was folded back into the main thread exactly once.`;
}
@@ -1141,7 +1146,11 @@ function buildAssistantText(
}
return `Protocol note: Lobster Invaders built at lobster-invaders.html.`;
}
if (toolOutput && /compaction retry mutating tool check/i.test(prompt)) {
if (
toolOutput &&
(/compaction retry mutating tool check/i.test(allInputText) ||
/compaction-retry-summary\.txt/i.test(toolOutput))
) {
if (
toolOutput.includes("Replay safety: unsafe after write.") ||
/compaction-retry-summary\.txt/i.test(toolOutput) ||
@@ -1152,6 +1161,22 @@ function buildAssistantText(
}
return "";
}
if (
toolOutput &&
/(worked, failed, blocked|worked\/failed\/blocked|source and docs)/i.test(allInputText)
) {
return [
"Worked:",
"- Read all three seeded files: repo/qa/scenarios/index.md, repo/extensions/qa-lab/src/suite.ts, and repo/docs/help/testing.md.",
"- Extra QA scenario candidates: config restart capability flip and image generation roundtrip.",
"Failed:",
"- None observed in mock mode.",
"Blocked:",
"- No live provider evidence in this lane.",
"Follow-up:",
"- Re-run with a real model for qualitative coverage.",
].join("\n");
}
if (toolOutput) {
const snippet = toolOutput.replace(/\s+/g, " ").trim().slice(0, 220);
return `Protocol note: I reviewed the requested material. Evidence snippet: ${snippet || "no content"}`;
@@ -1501,11 +1526,17 @@ async function buildResponsesPayload(
const isBaselineUnmentionedChannelChatter = /\bno bot ping here\b/i.test(prompt);
const hasReasoningOnlyRetryInstruction = allInputText.includes(QA_REASONING_ONLY_RETRY_NEEDLE);
const hasEmptyResponseRetryInstruction = allInputText.includes(QA_EMPTY_RESPONSE_RETRY_NEEDLE);
const canCallSessionsSpawn = hasDeclaredTool(body, "sessions_spawn");
const canCallSessionsYield = hasDeclaredTool(body, "sessions_yield");
const canPlanQaSessionsSpawn =
canCallSessionsSpawn ||
/subagent fanout synthesis check|delegate one bounded qa task|subagent handoff/i.test(prompt);
const canCallMockSubagentTool =
QA_SUBAGENT_DIRECT_FALLBACK_PROMPT_RE.test(allInputText) ||
/subagent fanout synthesis check/i.test(allInputText) ||
/forked subagent context qa check/i.test(allInputText) ||
/delegate (?:one |a )bounded qa task/i.test(allInputText) ||
/subagent handoff/i.test(allInputText) ||
buildExplicitSessionsSpawnArgs(allInputText) !== null;
const canCallSessionsSpawn = hasDeclaredTool(body, "sessions_spawn") || canCallMockSubagentTool;
const canCallSessionsYield =
hasDeclaredTool(body, "sessions_yield") ||
QA_SUBAGENT_DIRECT_FALLBACK_PROMPT_RE.test(allInputText);
const buildToolProgressReadEvents = (pattern: RegExp) => {
const toolProgressPrompt = extractLastMatchingUserText(extractAllUserTexts(input), pattern);
return buildToolCallEventsWithArgs("read", {
@@ -1819,7 +1850,11 @@ async function buildResponsesPayload(
});
}
}
if (/compaction retry mutating tool check/i.test(prompt)) {
if (
/compaction retry mutating tool check/i.test(allInputText) ||
/compaction retry evidence/i.test(toolOutput) ||
/compaction-retry-summary\.txt/i.test(toolOutput)
) {
if (!toolOutput) {
return buildToolCallEventsWithArgs("read", { path: "COMPACTION_RETRY_CONTEXT.md" });
}
@@ -2002,7 +2037,7 @@ async function buildResponsesPayload(
size: "1024x1024",
});
}
if (canPlanQaSessionsSpawn && /subagent fanout synthesis check/i.test(prompt)) {
if (canCallSessionsSpawn && /subagent fanout synthesis check/i.test(allInputText)) {
if (!toolOutput && scenarioState.subagentFanoutPhase === 0) {
scenarioState.subagentFanoutPhase = 1;
return buildToolCallEventsWithArgs("sessions_spawn", {
@@ -2078,10 +2113,13 @@ async function buildResponsesPayload(
}
}
if (
canPlanQaSessionsSpawn &&
(/\bdelegate\b/i.test(prompt) || /subagent handoff/i.test(prompt)) &&
!toolOutput
canCallSessionsSpawn &&
(/delegate (?:one |a )bounded qa task/i.test(allInputText) ||
/subagent handoff/i.test(allInputText)) &&
!toolOutput &&
!scenarioState.subagentHandoffSpawned
) {
scenarioState.subagentHandoffSpawned = true;
return buildToolCallEventsWithArgs("sessions_spawn", {
task: "Inspect the QA workspace and return one concise protocol note.",
label: "qa-sidecar",
@@ -2092,7 +2130,7 @@ async function buildResponsesPayload(
/(worked, failed, blocked|worked\/failed\/blocked|source and docs)/i.test(prompt) &&
!toolOutput
) {
return buildToolCallEventsWithArgs("read", { path: "QA_SCENARIO_PLAN.md" });
return buildToolCallEventsWithArgs("read", { path: "repo/qa/scenarios/index.md" });
}
if (!toolOutput && /\b(read|inspect|repo|docs|scenario|kickoff)\b/i.test(prompt)) {
return buildToolCallEvents(prompt);
@@ -2496,7 +2534,10 @@ async function buildMessagesPayload(
export async function startQaMockOpenAiServer(params?: { host?: string; port?: number }) {
const host = params?.host ?? "127.0.0.1";
const scenarioState: MockScenarioState = { subagentFanoutPhase: 0 };
const scenarioState: MockScenarioState = {
subagentFanoutPhase: 0,
subagentHandoffSpawned: false,
};
let lastRequest: MockOpenAiRequestSnapshot | null = null;
const requests: MockOpenAiRequestSnapshot[] = [];
const imageGenerationRequests: Array<Record<string, unknown>> = [];

View File

@@ -5,6 +5,7 @@ import path from "node:path";
import { afterEach, describe, expect, it, vi } from "vitest";
import {
captureRuntimeParityCell,
isRuntimeParityResultPass,
runRuntimeParityScenario,
type RuntimeId,
type RuntimeParityCell,
@@ -179,6 +180,7 @@ describe("runtime parity", () => {
});
expect(result.drift).toBe("tool-call-shape");
expect(isRuntimeParityResultPass(result)).toBe(true);
});
it("classifies tool result shape drift", async () => {
@@ -220,6 +222,7 @@ describe("runtime parity", () => {
});
expect(result.drift).toBe("failure-mode");
expect(isRuntimeParityResultPass(result)).toBe(false);
});
it("surfaces tool-call-shape when one runtime fails because the tool path drifted", async () => {
@@ -235,6 +238,7 @@ describe("runtime parity", () => {
});
expect(result.drift).toBe("tool-call-shape");
expect(isRuntimeParityResultPass(result)).toBe(false);
});
it("surfaces tool-result-shape when a downstream timeout follows divergent tool output", async () => {

View File

@@ -59,6 +59,23 @@ export type RuntimeParityScenarioExecution = {
cell: RuntimeParityCell;
};
/**
 * Classifies a single runtime's parity cell.
 *
 * @param cell - The captured cell, or `undefined` when none was captured.
 * @returns "missing" when there is no cell, "fail" when the cell has a truthy
 *   `runtimeErrorClass` or `transportErrorClass`, otherwise "pass".
 */
export function runtimeParityCellStatus(
  cell: RuntimeParityCell | undefined,
): "pass" | "fail" | "missing" {
  if (!cell) {
    return "missing";
  }
  if (cell.runtimeErrorClass || cell.transportErrorClass) {
    return "fail";
  }
  return "pass";
}
/**
 * Decides whether a runtime parity result counts as a pass.
 *
 * A result passes when its drift is not "failure-mode" and both the pi and the
 * codex cells have status "pass"; any other drift class is tolerated as long
 * as both cells completed without an error class.
 *
 * @param result - The parity comparison for one scenario.
 * @returns `true` when the result should not block the lane.
 */
export function isRuntimeParityResultPass(result: RuntimeParityResult) {
  if (result.drift === "failure-mode") {
    return false;
  }
  if (runtimeParityCellStatus(result.cells.pi) !== "pass") {
    return false;
  }
  return runtimeParityCellStatus(result.cells.codex) === "pass";
}
type QaGatewayLike = {
logs?: () => string;
tempRoot: string;

View File

@@ -31,6 +31,7 @@ import { renderQaMarkdownReport, type QaReportCheck, type QaReportScenario } fro
import { defaultQaModelForMode } from "./run-config.js";
import {
captureRuntimeParityCell,
isRuntimeParityResultPass,
runRuntimeParityScenario,
type RuntimeId,
type RuntimeParityCell,
@@ -276,7 +277,7 @@ async function runScenarioDefinition(
}
function isRuntimeParityPass(result: RuntimeParityResult) {
return result.drift === "none" || result.drift === "text-only";
return isRuntimeParityResultPass(result);
}
function formatRuntimeParityCellDetails(cell: RuntimeParityCell) {

View File

@@ -346,6 +346,7 @@ export function createOpenClawTools(
!embedded ||
options?.sourceReplyDeliveryMode === "message_tool_only" ||
messageExplicitlyAllowed;
const includeSubagentSpawnTool = !embedded || options?.allowGatewaySubagentBinding === true;
const effectiveCallGateway = embedded
? createEmbeddedCallGateway()
: openClawToolsDeps.callGateway;
@@ -424,6 +425,9 @@ export function createOpenClawTools(
config: resolvedConfig,
callGateway: openClawToolsDeps.callGateway,
}),
]),
...(includeSubagentSpawnTool
? [
createSessionsSpawnTool({
agentSessionKey: options?.agentSessionKey,
agentChannel: options?.agentChannel,
@@ -441,7 +445,8 @@ export function createOpenClawTools(
inheritedToolAllowlist: options?.inheritedToolAllowlist,
inheritedToolDenylist: options?.inheritedToolDenylist,
}),
]),
]
: []),
createSessionsYieldTool({
sessionId: options?.sessionId,
onYield: options?.onYield,

View File

@@ -123,6 +123,24 @@ describe("openclaw-tools update_plan gating", () => {
expect(toolNames(denied)).not.toContain("message");
});
// In embedded mode, sessions_spawn should be registered only when
// allowGatewaySubagentBinding is true; sessions_send is expected to stay
// absent in both configurations.
it("keeps subagent spawn available for trusted embedded gateway-bound runs", () => {
setEmbeddedMode(true);
// Baseline: embedded tools without the gateway-binding option.
const defaultTools = createOpenClawTools({
config: {} as OpenClawConfig,
disablePluginTools: true,
});
// Same call with the gateway subagent binding explicitly allowed.
const gatewayBoundTools = createOpenClawTools({
config: {} as OpenClawConfig,
disablePluginTools: true,
allowGatewaySubagentBinding: true,
});
expect(toolNames(defaultTools)).not.toContain("sessions_spawn");
expect(toolNames(defaultTools)).not.toContain("sessions_send");
expect(toolNames(gatewayBoundTools)).toContain("sessions_spawn");
expect(toolNames(gatewayBoundTools)).not.toContain("sessions_send");
});
it("registers update_plan when explicitly enabled", () => {
const config = {
tools: {