mirror of
https://github.com/openclaw/openclaw.git
synced 2026-05-18 19:54:46 +00:00
fix(qa): expose codex tools for runtime parity
This commit is contained in:
@@ -123,6 +123,7 @@ Docs: https://docs.openclaw.ai
|
||||
- Mac app: cache settings config schema/drafts and load channel config in parallel with channel probes, making repeated Channels and Config tab switches responsive over remote tunnels.
|
||||
- Control UI: negotiate the Gateway protocol from shared constants so rebuilt dashboards connect to current gateways instead of reporting a protocol mismatch.
|
||||
- Mac app: let menu gateway/session error text wrap across a few lines and stop rebuilding dynamic Context/Gateway menu rows while the menu is open, reducing flicker.
|
||||
- QA-Lab: expose Codex runtime tools during private parity runs and treat completed structural/tool-shape runtime drift as advisory, while preserving real runtime failures as lane blockers.
|
||||
- Mac app: make device pairing approval sheets friendlier, with concise Mac/device copy, shortened identifiers, friendly scope labels, and Approve as the primary action.
|
||||
- Providers/Qwen: honor session thinking level for `qwen-chat-template` payloads so `/think off` disables nested llama.cpp chat-template thinking controls. Fixes #82768. Thanks @bfox55.
|
||||
- Feishu/wiki: reject numeric wiki space IDs before creating Lark clients and keep numeric-looking IDs documented as quoted opaque strings, preventing JavaScript precision loss in knowledge base calls. Fixes #45301. (#82769) Thanks @hyspacex.
|
||||
|
||||
@@ -1,4 +1,4 @@
|
||||
import type { CodexPluginConfig } from "./config.js";
|
||||
import type { CodexDynamicToolsLoading, CodexPluginConfig } from "./config.js";
|
||||
|
||||
export const CODEX_APP_SERVER_OWNED_DYNAMIC_TOOL_EXCLUDES = [
|
||||
"read",
|
||||
@@ -19,18 +19,44 @@ const DYNAMIC_TOOL_NAME_ALIASES: Record<string, string> = {
|
||||
"apply-patch": "apply_patch",
|
||||
};
|
||||
|
||||
/**
 * Subset of environment variables consulted when choosing the Codex dynamic
 * tool profile. Both entries are optional so `process.env` can be passed as-is.
 */
type CodexDynamicToolProfileEnv = {
  /** Compared against the exact string `"1"` to detect a private QA build. */
  OPENCLAW_BUILD_PRIVATE_QA?: string;
  /** Runtime name forced for QA runs; compared case-insensitively to `"codex"`. */
  OPENCLAW_QA_FORCE_RUNTIME?: string;
};
|
||||
|
||||
/**
 * Canonicalizes a dynamic tool name so differently spelled entries compare equal.
 *
 * The name is trimmed and lowercased, then mapped through
 * `DYNAMIC_TOOL_NAME_ALIASES` (for example `"apply-patch"` becomes
 * `"apply_patch"`). Names without an alias are returned in their trimmed,
 * lowercased form.
 *
 * @param name - Raw tool name, e.g. from plugin configuration.
 * @returns The canonical tool name; always a string.
 */
export function normalizeCodexDynamicToolName(name: string): string {
  const normalized = name.trim().toLowerCase();
  // Only own keys count as aliases. A bare index lookup would also resolve
  // inherited members such as "constructor" or "tostring" and hand back a
  // function instead of a string.
  if (!Object.prototype.hasOwnProperty.call(DYNAMIC_TOOL_NAME_ALIASES, normalized)) {
    return normalized;
  }
  return DYNAMIC_TOOL_NAME_ALIASES[normalized] ?? normalized;
}
|
||||
|
||||
/**
 * Reports whether the environment selects a private QA build that is forced
 * onto the Codex runtime.
 *
 * Both conditions must hold:
 * - `OPENCLAW_BUILD_PRIVATE_QA` is exactly `"1"` (no trimming is applied), and
 * - `OPENCLAW_QA_FORCE_RUNTIME` equals `"codex"` after trimming and lowercasing.
 *
 * @param env - Environment to inspect; defaults to `process.env`.
 * @returns `true` only when both variables match.
 */
export function isForcedPrivateQaCodexRuntime(
  env: CodexDynamicToolProfileEnv = process.env,
): boolean {
  if (env.OPENCLAW_BUILD_PRIVATE_QA !== "1") {
    return false;
  }
  const forcedRuntime = env.OPENCLAW_QA_FORCE_RUNTIME?.trim().toLowerCase();
  return forcedRuntime === "codex";
}
|
||||
|
||||
/**
 * Chooses the loading mode for Codex dynamic tools.
 *
 * Forced private QA Codex runs always get `"direct"`. Otherwise the configured
 * `codexDynamicToolsLoading` is used, falling back to `"searchable"` when it is
 * unset.
 *
 * @param config - Plugin configuration carrying the optional loading preference.
 * @param env - Environment to inspect; defaults to `process.env`.
 * @returns The loading mode to use.
 */
export function resolveCodexDynamicToolsLoading(
  config: Pick<CodexPluginConfig, "codexDynamicToolsLoading">,
  env: CodexDynamicToolProfileEnv = process.env,
): CodexDynamicToolsLoading {
  if (isForcedPrivateQaCodexRuntime(env)) {
    return "direct";
  }
  return config.codexDynamicToolsLoading ?? "searchable";
}
|
||||
|
||||
export function filterCodexDynamicTools<T extends { name: string }>(
|
||||
tools: T[],
|
||||
config: Pick<CodexPluginConfig, "codexDynamicToolsExclude">,
|
||||
env: CodexDynamicToolProfileEnv = process.env,
|
||||
): T[] {
|
||||
const excludes = new Set<string>();
|
||||
for (const name of CODEX_APP_SERVER_OWNED_DYNAMIC_TOOL_EXCLUDES) {
|
||||
excludes.add(name);
|
||||
if (!isForcedPrivateQaCodexRuntime(env)) {
|
||||
for (const name of CODEX_APP_SERVER_OWNED_DYNAMIC_TOOL_EXCLUDES) {
|
||||
excludes.add(name);
|
||||
}
|
||||
}
|
||||
for (const name of config.codexDynamicToolsExclude ?? []) {
|
||||
const trimmed = normalizeCodexDynamicToolName(name);
|
||||
|
||||
@@ -646,6 +646,21 @@ describe("runCodexAppServerAttempt", () => {
|
||||
).toEqual(["message"]);
|
||||
});
|
||||
|
||||
it("exposes app-server-owned tools directly for forced private QA Codex runtime", () => {
|
||||
const tools = ["read", "write", "image_generate", "message"].map((name) => ({ name }));
|
||||
const privateQaCodexEnv = {
|
||||
OPENCLAW_BUILD_PRIVATE_QA: "1",
|
||||
OPENCLAW_QA_FORCE_RUNTIME: "codex",
|
||||
};
|
||||
|
||||
expect(
|
||||
__testing
|
||||
.filterCodexDynamicTools(tools, {}, privateQaCodexEnv)
|
||||
.map((tool) => tool.name),
|
||||
).toEqual(["read", "write", "image_generate", "message"]);
|
||||
expect(__testing.resolveCodexDynamicToolsLoading({}, privateQaCodexEnv)).toBe("direct");
|
||||
});
|
||||
|
||||
it("starts Codex threads without duplicate OpenClaw workspace tools by default", async () => {
|
||||
const sessionFile = path.join(tempDir, "session.jsonl");
|
||||
const workspaceDir = path.join(tempDir, "workspace");
|
||||
@@ -897,6 +912,38 @@ describe("runCodexAppServerAttempt", () => {
|
||||
expect((factoryOptions[0] as { modelApi?: unknown }).modelApi).toBe("openai-responses");
|
||||
});
|
||||
|
||||
it("enables gateway subagent binding for forced private QA Codex runs", async () => {
|
||||
vi.stubEnv("OPENCLAW_BUILD_PRIVATE_QA", "1");
|
||||
vi.stubEnv("OPENCLAW_QA_FORCE_RUNTIME", "codex");
|
||||
const sessionFile = path.join(tempDir, "session.jsonl");
|
||||
const workspaceDir = path.join(tempDir, "workspace");
|
||||
const params = createParams(sessionFile, workspaceDir);
|
||||
params.disableTools = false;
|
||||
params.runtimePlan = createCodexRuntimePlanFixture();
|
||||
const factoryOptions: unknown[] = [];
|
||||
__testing.setOpenClawCodingToolsFactoryForTests((options) => {
|
||||
factoryOptions.push(options);
|
||||
return [createRuntimeDynamicTool("sessions_spawn")];
|
||||
});
|
||||
|
||||
const tools = await __testing.buildDynamicTools({
|
||||
params,
|
||||
resolvedWorkspace: workspaceDir,
|
||||
effectiveWorkspace: workspaceDir,
|
||||
sandboxSessionKey: params.sessionKey!,
|
||||
sandbox: null as never,
|
||||
runAbortController: new AbortController(),
|
||||
sessionAgentId: "main",
|
||||
pluginConfig: {},
|
||||
onYieldDetected: () => undefined,
|
||||
});
|
||||
|
||||
expect(factoryOptions).toHaveLength(1);
|
||||
const factoryOption = factoryOptions[0] as { allowGatewaySubagentBinding?: unknown };
|
||||
expect(factoryOption.allowGatewaySubagentBinding).toBe(true);
|
||||
expect(tools.map((tool) => tool.name)).toEqual(["sessions_spawn"]);
|
||||
});
|
||||
|
||||
it("normalizes Codex dynamic toolsAllow entries before filtering", () => {
|
||||
const tools = ["exec", "apply_patch", "read", "message"].map((name) => ({ name }));
|
||||
|
||||
|
||||
@@ -78,7 +78,12 @@ import {
|
||||
resolveCodexContextEngineProjectionMaxChars,
|
||||
resolveCodexContextEngineProjectionReserveTokens,
|
||||
} from "./context-engine-projection.js";
|
||||
import { filterCodexDynamicTools, normalizeCodexDynamicToolName } from "./dynamic-tool-profile.js";
|
||||
import {
|
||||
filterCodexDynamicTools,
|
||||
isForcedPrivateQaCodexRuntime,
|
||||
normalizeCodexDynamicToolName,
|
||||
resolveCodexDynamicToolsLoading,
|
||||
} from "./dynamic-tool-profile.js";
|
||||
import { createCodexDynamicToolBridge, type CodexDynamicToolBridge } from "./dynamic-tools.js";
|
||||
import { handleCodexAppServerElicitationRequest } from "./elicitation-bridge.js";
|
||||
import { CodexAppServerEventProjector } from "./event-projector.js";
|
||||
@@ -618,7 +623,7 @@ export async function runCodexAppServerAttempt(
|
||||
const toolBridge = createCodexDynamicToolBridge({
|
||||
tools,
|
||||
signal: runAbortController.signal,
|
||||
loading: pluginConfig.codexDynamicToolsLoading ?? "searchable",
|
||||
loading: resolveCodexDynamicToolsLoading(pluginConfig),
|
||||
directToolNames: shouldForceMessageTool(params) ? ["message"] : [],
|
||||
hookContext: {
|
||||
agentId: sessionAgentId,
|
||||
@@ -2748,7 +2753,8 @@ async function buildDynamicTools(input: DynamicToolBuildParams) {
|
||||
senderUsername: params.senderUsername,
|
||||
senderE164: params.senderE164,
|
||||
senderIsOwner: params.senderIsOwner,
|
||||
allowGatewaySubagentBinding: params.allowGatewaySubagentBinding,
|
||||
allowGatewaySubagentBinding:
|
||||
params.allowGatewaySubagentBinding || isForcedPrivateQaCodexRuntime(),
|
||||
...sessionKeys,
|
||||
sessionId: params.sessionId,
|
||||
runId: params.runId,
|
||||
@@ -3933,6 +3939,7 @@ export const __testing = {
|
||||
isInvalidCodexImagePayloadError,
|
||||
remapCodexContextFilePath,
|
||||
resolveDynamicToolCallTimeoutMs,
|
||||
resolveCodexDynamicToolsLoading,
|
||||
restrictCodexAppServerSandboxForOpenClawSandbox,
|
||||
resolveCodexAppServerForOpenClawToolPolicy,
|
||||
resolveOpenClawCodingToolsSessionKeys,
|
||||
|
||||
@@ -16,7 +16,10 @@ import { handleCodexAppServerApprovalRequest } from "./approval-bridge.js";
|
||||
import { refreshCodexAppServerAuthTokens } from "./auth-bridge.js";
|
||||
import { isCodexAppServerApprovalRequest, type CodexAppServerClient } from "./client.js";
|
||||
import { readCodexPluginConfig, resolveCodexAppServerRuntimeOptions } from "./config.js";
|
||||
import { filterCodexDynamicTools } from "./dynamic-tool-profile.js";
|
||||
import {
|
||||
filterCodexDynamicTools,
|
||||
resolveCodexDynamicToolsLoading,
|
||||
} from "./dynamic-tool-profile.js";
|
||||
import { createCodexDynamicToolBridge, type CodexDynamicToolBridge } from "./dynamic-tools.js";
|
||||
import { handleCodexAppServerElicitationRequest } from "./elicitation-bridge.js";
|
||||
import {
|
||||
@@ -378,7 +381,7 @@ async function createCodexSideToolBridge(input: {
|
||||
return createCodexDynamicToolBridge({
|
||||
tools,
|
||||
signal: input.signal,
|
||||
loading: input.pluginConfig.codexDynamicToolsLoading ?? "searchable",
|
||||
loading: resolveCodexDynamicToolsLoading(input.pluginConfig),
|
||||
hookContext: {
|
||||
agentId: input.sessionAgentId,
|
||||
config: input.params.cfg,
|
||||
|
||||
@@ -66,7 +66,7 @@ function makeRuntimeParitySummary(): QaRuntimeParitySuiteSummary {
|
||||
},
|
||||
{
|
||||
name: "Compaction retry after mutating tool",
|
||||
status: "fail",
|
||||
status: "pass",
|
||||
steps: [],
|
||||
runtimeParity: {
|
||||
scenarioId: "compaction-retry-after-mutating-tool",
|
||||
@@ -97,8 +97,8 @@ function makeRuntimeParitySummary(): QaRuntimeParitySuiteSummary {
|
||||
],
|
||||
counts: {
|
||||
total: 2,
|
||||
passed: 1,
|
||||
failed: 1,
|
||||
passed: 2,
|
||||
failed: 0,
|
||||
},
|
||||
run: {
|
||||
providerMode: "mock-openai",
|
||||
@@ -801,9 +801,28 @@ status=done`,
|
||||
});
|
||||
|
||||
expect(report.runtimePair).toEqual(["pi", "codex"]);
|
||||
expect(report.pass).toBe(false);
|
||||
expect(report.pass).toBe(true);
|
||||
expect(report.driftCounts.none).toBe(1);
|
||||
expect(report.driftCounts["tool-call-shape"]).toBe(1);
|
||||
expect(report.failures).toEqual([]);
|
||||
});
|
||||
|
||||
it("fails runtime parity reports when a runtime cell fails", () => {
|
||||
const summary = makeRuntimeParitySummary();
|
||||
const scenario = summary.scenarios[1];
|
||||
if (!scenario?.runtimeParity) {
|
||||
throw new Error("runtime parity fixture missing");
|
||||
}
|
||||
scenario.status = "fail";
|
||||
scenario.runtimeParity.cells.codex.runtimeErrorClass = "tool-error";
|
||||
|
||||
const report = buildQaRuntimeParityReport({
|
||||
summary,
|
||||
comparedAt: "2026-05-10T00:00:00.000Z",
|
||||
});
|
||||
|
||||
expect(report.pass).toBe(false);
|
||||
expect(report.failedScenarios).toBe(1);
|
||||
expect(report.failures).toContain(
|
||||
"Compaction retry after mutating tool drift=tool-call-shape (tool call 1 differs).",
|
||||
);
|
||||
|
||||
@@ -4,10 +4,10 @@ import {
|
||||
} from "./agentic-parity.js";
|
||||
import type {
|
||||
RuntimeId,
|
||||
RuntimeParityCell,
|
||||
RuntimeParityDrift,
|
||||
RuntimeParityResult,
|
||||
} from "./runtime-parity.js";
|
||||
import { isRuntimeParityResultPass, runtimeParityCellStatus } from "./runtime-parity.js";
|
||||
|
||||
type QaParityReportStep = {
|
||||
name: string;
|
||||
@@ -260,13 +260,6 @@ function normalizeRuntimePair(
|
||||
return ["pi", "codex"];
|
||||
}
|
||||
|
||||
/**
 * Classifies a single runtime parity cell.
 *
 * @param cell - The captured cell, or `undefined` when the runtime produced none.
 * @returns `"missing"` when there is no cell, `"fail"` when the cell carries a
 *   runtime or transport error class, otherwise `"pass"`.
 */
function runtimeCellStatus(cell: RuntimeParityCell | undefined): "pass" | "fail" | "missing" {
  if (!cell) {
    return "missing";
  }
  // Parenthesized on purpose: either error class marks the cell as failed.
  const hasErrorClass = Boolean(cell.runtimeErrorClass || cell.transportErrorClass);
  return hasErrorClass ? "fail" : "pass";
}
|
||||
|
||||
function requiredCoverageStatus(
|
||||
scenario: QaParityReportScenario | undefined,
|
||||
): "pass" | "fail" | "skip" | "missing" {
|
||||
@@ -637,9 +630,9 @@ export function buildQaRuntimeParityReport(params: {
|
||||
driftCounts[parity.drift] += 1;
|
||||
const piCell = parity.cells.pi;
|
||||
const codexCell = parity.cells.codex;
|
||||
const piStatus = runtimeCellStatus(piCell);
|
||||
const codexStatus = runtimeCellStatus(codexCell);
|
||||
const status = scenario.status === "pass" ? "pass" : "fail";
|
||||
const piStatus = runtimeParityCellStatus(piCell);
|
||||
const codexStatus = runtimeParityCellStatus(codexCell);
|
||||
const status = isRuntimeParityResultPass(parity) ? "pass" : "fail";
|
||||
if (status === "fail") {
|
||||
failures.push(
|
||||
`${scenario.name} drift=${parity.drift}${parity.driftDetails ? ` (${parity.driftDetails})` : ""}.`,
|
||||
@@ -660,12 +653,8 @@ export function buildQaRuntimeParityReport(params: {
|
||||
});
|
||||
|
||||
const totalScenarios = params.summary.counts?.total ?? scenarios.length;
|
||||
const passedScenarios =
|
||||
params.summary.counts?.passed ??
|
||||
scenarios.filter((scenario) => scenario.status === "pass").length;
|
||||
const failedScenarios =
|
||||
params.summary.counts?.failed ??
|
||||
scenarios.filter((scenario) => scenario.status === "fail").length;
|
||||
const passedScenarios = scenarios.filter((scenario) => scenario.status === "pass").length;
|
||||
const failedScenarios = scenarios.filter((scenario) => scenario.status === "fail").length;
|
||||
|
||||
return {
|
||||
runtimePair,
|
||||
@@ -680,7 +669,7 @@ export function buildQaRuntimeParityReport(params: {
|
||||
pass: failures.length === 0 && failedScenarios === 0,
|
||||
failures,
|
||||
notes: [
|
||||
"Runtime parity treats none and text-only drift as pass; all structural, tool-shape, and failure-mode drift classes fail the lane.",
|
||||
"Runtime parity fails runtime, transport, and failure-mode drift; structural and tool-shape drift is recorded as advisory when both runtimes complete.",
|
||||
"Token totals here are assistant-message usage captured from the normalized transcript, not provider transport payloads.",
|
||||
],
|
||||
};
|
||||
|
||||
@@ -868,6 +868,7 @@ describe("qa cli runtime", () => {
|
||||
finalText: "done",
|
||||
usage: { inputTokens: 1, outputTokens: 1, totalTokens: 2 },
|
||||
wallClockMs: 10,
|
||||
runtimeErrorClass: "tool-error",
|
||||
bootStateLines: [],
|
||||
},
|
||||
},
|
||||
|
||||
@@ -982,6 +982,74 @@ describe("qa mock openai server", () => {
|
||||
expect(finalPayload.output?.[0]?.content?.[0]?.text).toContain("replay unsafe after write");
|
||||
});
|
||||
|
||||
it("keeps compaction retry planning across continuation prompts", async () => {
|
||||
const server = await startQaMockOpenAiServer({
|
||||
host: "127.0.0.1",
|
||||
port: 0,
|
||||
});
|
||||
cleanups.push(async () => {
|
||||
await server.stop();
|
||||
});
|
||||
|
||||
const prompt =
|
||||
"Compaction retry mutating tool check: read COMPACTION_RETRY_CONTEXT.md, then create compaction-retry-summary.txt and keep replay safety explicit.";
|
||||
const writePlan = await fetch(`${server.baseUrl}/v1/responses`, {
|
||||
method: "POST",
|
||||
headers: { "content-type": "application/json" },
|
||||
body: JSON.stringify({
|
||||
stream: true,
|
||||
model: "gpt-5.5",
|
||||
input: [
|
||||
makeUserInput(prompt),
|
||||
{
|
||||
type: "function_call_output",
|
||||
output: "compaction retry evidence block 0000\ncompaction retry evidence block 0001",
|
||||
},
|
||||
makeUserInput("Continue after compaction."),
|
||||
],
|
||||
}),
|
||||
});
|
||||
expect(writePlan.status).toBe(200);
|
||||
expect(await writePlan.text()).toContain('"name":"write"');
|
||||
|
||||
const contextOnlyWritePlan = await fetch(`${server.baseUrl}/v1/responses`, {
|
||||
method: "POST",
|
||||
headers: { "content-type": "application/json" },
|
||||
body: JSON.stringify({
|
||||
stream: true,
|
||||
model: "gpt-5.5",
|
||||
input: [
|
||||
{
|
||||
type: "function_call_output",
|
||||
output: "compaction retry evidence block 0000\ncompaction retry evidence block 0001",
|
||||
},
|
||||
makeUserInput("Continue after compaction."),
|
||||
],
|
||||
}),
|
||||
});
|
||||
expect(contextOnlyWritePlan.status).toBe(200);
|
||||
expect(await contextOnlyWritePlan.text()).toContain('"name":"write"');
|
||||
|
||||
const finalReply = await fetch(`${server.baseUrl}/v1/responses`, {
|
||||
method: "POST",
|
||||
headers: { "content-type": "application/json" },
|
||||
body: JSON.stringify({
|
||||
stream: false,
|
||||
model: "gpt-5.5",
|
||||
input: [
|
||||
makeUserInput(prompt),
|
||||
{
|
||||
type: "function_call_output",
|
||||
output: "Successfully wrote 41 bytes to compaction-retry-summary.txt.",
|
||||
},
|
||||
makeUserInput("Continue after compaction."),
|
||||
],
|
||||
}),
|
||||
});
|
||||
expect(finalReply.status).toBe(200);
|
||||
expect(outputText(await finalReply.json())).toContain("replay unsafe after write");
|
||||
});
|
||||
|
||||
it("supports exact reply memory prompts and embeddings requests", async () => {
|
||||
const server = await startQaMockOpenAiServer({
|
||||
host: "127.0.0.1",
|
||||
@@ -1866,6 +1934,165 @@ describe("qa mock openai server", () => {
|
||||
expect(outputText(await phaseOnlyFinal.json())).toBe("subagent-1: ok\nsubagent-2: ok");
|
||||
});
|
||||
|
||||
it("uses full request text when planning continuation subagent tool calls", async () => {
|
||||
const server = await startQaMockOpenAiServer({
|
||||
host: "127.0.0.1",
|
||||
port: 0,
|
||||
});
|
||||
cleanups.push(async () => {
|
||||
await server.stop();
|
||||
});
|
||||
|
||||
const handoffPrompt =
|
||||
"Delegate one bounded QA task to a subagent. Wait for the subagent to finish.";
|
||||
const handoff = await fetch(`${server.baseUrl}/v1/responses`, {
|
||||
method: "POST",
|
||||
headers: { "content-type": "application/json" },
|
||||
body: JSON.stringify({
|
||||
stream: true,
|
||||
tools: [SESSIONS_SPAWN_TOOL],
|
||||
input: [makeUserInput(handoffPrompt), makeUserInput("Continue.")],
|
||||
}),
|
||||
});
|
||||
expect(handoff.status).toBe(200);
|
||||
expect(await handoff.text()).toContain('"name":"sessions_spawn"');
|
||||
|
||||
const handoffServer = await startQaMockOpenAiServer({
|
||||
host: "127.0.0.1",
|
||||
port: 0,
|
||||
});
|
||||
cleanups.push(async () => {
|
||||
await handoffServer.stop();
|
||||
});
|
||||
|
||||
const appServerHandoff = await fetch(`${handoffServer.baseUrl}/v1/responses`, {
|
||||
method: "POST",
|
||||
headers: { "content-type": "application/json" },
|
||||
body: JSON.stringify({
|
||||
stream: true,
|
||||
input: [makeUserInput(handoffPrompt), makeUserInput("Continue.")],
|
||||
}),
|
||||
});
|
||||
expect(appServerHandoff.status).toBe(200);
|
||||
expect(await appServerHandoff.text()).toContain('"name":"sessions_spawn"');
|
||||
|
||||
const repeatedHandoff = await fetch(`${handoffServer.baseUrl}/v1/responses`, {
|
||||
method: "POST",
|
||||
headers: { "content-type": "application/json" },
|
||||
body: JSON.stringify({
|
||||
stream: true,
|
||||
input: [makeUserInput(handoffPrompt), makeUserInput("Continue again.")],
|
||||
}),
|
||||
});
|
||||
expect(repeatedHandoff.status).toBe(200);
|
||||
expect(await repeatedHandoff.text()).not.toContain('"name":"sessions_spawn"');
|
||||
|
||||
const handoffFinal = await fetch(`${server.baseUrl}/v1/responses`, {
|
||||
method: "POST",
|
||||
headers: { "content-type": "application/json" },
|
||||
body: JSON.stringify({
|
||||
stream: false,
|
||||
tools: [SESSIONS_SPAWN_TOOL],
|
||||
input: [
|
||||
makeUserInput(handoffPrompt),
|
||||
{ type: "function_call_output", output: "SUBAGENT-OK" },
|
||||
makeUserInput("Continue."),
|
||||
],
|
||||
}),
|
||||
});
|
||||
expect(handoffFinal.status).toBe(200);
|
||||
expect(outputText(await handoffFinal.json())).toContain("Delegated task");
|
||||
|
||||
const fanoutPrompt =
|
||||
"Subagent fanout synthesis check: delegate two bounded subagents sequentially, then report both results together.";
|
||||
const appServerFanout = await fetch(`${server.baseUrl}/v1/responses`, {
|
||||
method: "POST",
|
||||
headers: { "content-type": "application/json" },
|
||||
body: JSON.stringify({
|
||||
stream: true,
|
||||
input: [makeUserInput(fanoutPrompt), makeUserInput("Continue.")],
|
||||
}),
|
||||
});
|
||||
expect(appServerFanout.status).toBe(200);
|
||||
expect(await appServerFanout.text()).toContain('\\"label\\":\\"qa-fanout-alpha\\"');
|
||||
|
||||
const fanoutServer = await startQaMockOpenAiServer({
|
||||
host: "127.0.0.1",
|
||||
port: 0,
|
||||
});
|
||||
cleanups.push(async () => {
|
||||
await fanoutServer.stop();
|
||||
});
|
||||
|
||||
const firstFanout = await fetch(`${fanoutServer.baseUrl}/v1/responses`, {
|
||||
method: "POST",
|
||||
headers: { "content-type": "application/json" },
|
||||
body: JSON.stringify({
|
||||
stream: true,
|
||||
tools: [SESSIONS_SPAWN_TOOL],
|
||||
input: [makeUserInput(fanoutPrompt)],
|
||||
}),
|
||||
});
|
||||
expect(firstFanout.status).toBe(200);
|
||||
expect(await firstFanout.text()).toContain('\\"label\\":\\"qa-fanout-alpha\\"');
|
||||
|
||||
const secondFanout = await fetch(`${fanoutServer.baseUrl}/v1/responses`, {
|
||||
method: "POST",
|
||||
headers: { "content-type": "application/json" },
|
||||
body: JSON.stringify({
|
||||
stream: true,
|
||||
tools: [SESSIONS_SPAWN_TOOL],
|
||||
input: [
|
||||
makeUserInput(fanoutPrompt),
|
||||
{
|
||||
type: "function_call_output",
|
||||
output:
|
||||
'{"status":"accepted","childSessionKey":"agent:qa:subagent:alpha","note":"ALPHA-OK"}',
|
||||
},
|
||||
makeUserInput("Continue."),
|
||||
],
|
||||
}),
|
||||
});
|
||||
expect(secondFanout.status).toBe(200);
|
||||
expect(await secondFanout.text()).toContain('\\"label\\":\\"qa-fanout-beta\\"');
|
||||
});
|
||||
|
||||
it("keeps source discovery reports out of subagent handoff prose", async () => {
|
||||
const server = await startQaMockOpenAiServer({
|
||||
host: "127.0.0.1",
|
||||
port: 0,
|
||||
});
|
||||
cleanups.push(async () => {
|
||||
await server.stop();
|
||||
});
|
||||
|
||||
const response = await fetch(`${server.baseUrl}/v1/responses`, {
|
||||
method: "POST",
|
||||
headers: { "content-type": "application/json" },
|
||||
body: JSON.stringify({
|
||||
stream: false,
|
||||
input: [
|
||||
makeUserInput(
|
||||
"Read the seeded docs and source plan, then report grouped into Worked, Failed, Blocked, and Follow-up.",
|
||||
),
|
||||
{
|
||||
type: "function_call_output",
|
||||
output:
|
||||
"repo/qa/scenarios/index.md includes scenario: subagent-handoff and repo/extensions/qa-lab/src/suite.ts.",
|
||||
},
|
||||
makeUserInput("Continue."),
|
||||
],
|
||||
}),
|
||||
});
|
||||
|
||||
expect(response.status).toBe(200);
|
||||
const text = outputText(await response.json());
|
||||
expect(text).toContain("Worked:");
|
||||
expect(text).toContain("repo/docs/help/testing.md");
|
||||
expect(text).toContain("Follow-up:");
|
||||
expect(text).not.toContain("Delegated task");
|
||||
});
|
||||
|
||||
it("does not let fanout completion state hijack child worker replies", async () => {
|
||||
const server = await startQaMockOpenAiServer({
|
||||
host: "127.0.0.1",
|
||||
@@ -2727,7 +2954,7 @@ describe("qa mock openai server", () => {
|
||||
| { name: string; input: Record<string, unknown> }
|
||||
| undefined;
|
||||
expect(toolUseBlock?.name).toBe("read");
|
||||
expect(toolUseBlock?.input).toEqual({ path: "QA_SCENARIO_PLAN.md" });
|
||||
expect(toolUseBlock?.input).toEqual({ path: "repo/qa/scenarios/index.md" });
|
||||
|
||||
const debugResponse = await fetch(`${server.baseUrl}/debug/last-request`);
|
||||
expect(debugResponse.status).toBe(200);
|
||||
@@ -2985,7 +3212,7 @@ describe("qa mock openai server", () => {
|
||||
expect(body).toContain("event: content_block_start");
|
||||
expect(body).toContain('"type":"tool_use"');
|
||||
expect(body).toContain('"name":"read"');
|
||||
expect(body).toContain("QA_SCENARIO_PLAN.md");
|
||||
expect(body).toContain("repo/qa/scenarios/index.md");
|
||||
expect(body).toContain("event: message_delta");
|
||||
expect(body).toContain("event: message_stop");
|
||||
});
|
||||
|
||||
@@ -177,6 +177,7 @@ const QA_TOOL_SEARCH_FAILURE_PROMPT_RE = /tool search qa failure/i;
|
||||
|
||||
/**
 * Mutable scenario progress that the mock server carries across requests.
 * One instance is created per `startQaMockOpenAiServer` call.
 */
type MockScenarioState = {
  /** Fanout progress counter; starts at 0 and is set to 1 once the first fanout spawn is planned. */
  subagentFanoutPhase: number;
  /** Set once a handoff `sessions_spawn` call has been planned, so it is not planned again. */
  subagentHandoffSpawned: boolean;
};
|
||||
|
||||
const MOCK_OPENAI_MAX_BODY_BYTES = 16 * 1024 * 1024;
|
||||
@@ -1128,7 +1129,11 @@ function buildAssistantText(
|
||||
"- None.",
|
||||
].join("\n");
|
||||
}
|
||||
if (toolOutput && (/\bdelegate\b/i.test(prompt) || /subagent handoff/i.test(prompt))) {
|
||||
if (
|
||||
toolOutput &&
|
||||
(/delegate (?:one |a )bounded qa task/i.test(allInputText) ||
|
||||
/subagent handoff/i.test(allInputText))
|
||||
) {
|
||||
const compact = toolOutput.replace(/\s+/g, " ").trim() || "no delegated output";
|
||||
return `Delegated task:\n- Inspect the QA workspace via a bounded subagent.\nResult:\n- ${compact}\nEvidence:\n- The child result was folded back into the main thread exactly once.`;
|
||||
}
|
||||
@@ -1141,7 +1146,11 @@ function buildAssistantText(
|
||||
}
|
||||
return `Protocol note: Lobster Invaders built at lobster-invaders.html.`;
|
||||
}
|
||||
if (toolOutput && /compaction retry mutating tool check/i.test(prompt)) {
|
||||
if (
|
||||
toolOutput &&
|
||||
(/compaction retry mutating tool check/i.test(allInputText) ||
|
||||
/compaction-retry-summary\.txt/i.test(toolOutput))
|
||||
) {
|
||||
if (
|
||||
toolOutput.includes("Replay safety: unsafe after write.") ||
|
||||
/compaction-retry-summary\.txt/i.test(toolOutput) ||
|
||||
@@ -1152,6 +1161,22 @@ function buildAssistantText(
|
||||
}
|
||||
return "";
|
||||
}
|
||||
if (
|
||||
toolOutput &&
|
||||
/(worked, failed, blocked|worked\/failed\/blocked|source and docs)/i.test(allInputText)
|
||||
) {
|
||||
return [
|
||||
"Worked:",
|
||||
"- Read all three seeded files: repo/qa/scenarios/index.md, repo/extensions/qa-lab/src/suite.ts, and repo/docs/help/testing.md.",
|
||||
"- Extra QA scenario candidates: config restart capability flip and image generation roundtrip.",
|
||||
"Failed:",
|
||||
"- None observed in mock mode.",
|
||||
"Blocked:",
|
||||
"- No live provider evidence in this lane.",
|
||||
"Follow-up:",
|
||||
"- Re-run with a real model for qualitative coverage.",
|
||||
].join("\n");
|
||||
}
|
||||
if (toolOutput) {
|
||||
const snippet = toolOutput.replace(/\s+/g, " ").trim().slice(0, 220);
|
||||
return `Protocol note: I reviewed the requested material. Evidence snippet: ${snippet || "no content"}`;
|
||||
@@ -1501,11 +1526,17 @@ async function buildResponsesPayload(
|
||||
const isBaselineUnmentionedChannelChatter = /\bno bot ping here\b/i.test(prompt);
|
||||
const hasReasoningOnlyRetryInstruction = allInputText.includes(QA_REASONING_ONLY_RETRY_NEEDLE);
|
||||
const hasEmptyResponseRetryInstruction = allInputText.includes(QA_EMPTY_RESPONSE_RETRY_NEEDLE);
|
||||
const canCallSessionsSpawn = hasDeclaredTool(body, "sessions_spawn");
|
||||
const canCallSessionsYield = hasDeclaredTool(body, "sessions_yield");
|
||||
const canPlanQaSessionsSpawn =
|
||||
canCallSessionsSpawn ||
|
||||
/subagent fanout synthesis check|delegate one bounded qa task|subagent handoff/i.test(prompt);
|
||||
const canCallMockSubagentTool =
|
||||
QA_SUBAGENT_DIRECT_FALLBACK_PROMPT_RE.test(allInputText) ||
|
||||
/subagent fanout synthesis check/i.test(allInputText) ||
|
||||
/forked subagent context qa check/i.test(allInputText) ||
|
||||
/delegate (?:one |a )bounded qa task/i.test(allInputText) ||
|
||||
/subagent handoff/i.test(allInputText) ||
|
||||
buildExplicitSessionsSpawnArgs(allInputText) !== null;
|
||||
const canCallSessionsSpawn = hasDeclaredTool(body, "sessions_spawn") || canCallMockSubagentTool;
|
||||
const canCallSessionsYield =
|
||||
hasDeclaredTool(body, "sessions_yield") ||
|
||||
QA_SUBAGENT_DIRECT_FALLBACK_PROMPT_RE.test(allInputText);
|
||||
const buildToolProgressReadEvents = (pattern: RegExp) => {
|
||||
const toolProgressPrompt = extractLastMatchingUserText(extractAllUserTexts(input), pattern);
|
||||
return buildToolCallEventsWithArgs("read", {
|
||||
@@ -1819,7 +1850,11 @@ async function buildResponsesPayload(
|
||||
});
|
||||
}
|
||||
}
|
||||
if (/compaction retry mutating tool check/i.test(prompt)) {
|
||||
if (
|
||||
/compaction retry mutating tool check/i.test(allInputText) ||
|
||||
/compaction retry evidence/i.test(toolOutput) ||
|
||||
/compaction-retry-summary\.txt/i.test(toolOutput)
|
||||
) {
|
||||
if (!toolOutput) {
|
||||
return buildToolCallEventsWithArgs("read", { path: "COMPACTION_RETRY_CONTEXT.md" });
|
||||
}
|
||||
@@ -2002,7 +2037,7 @@ async function buildResponsesPayload(
|
||||
size: "1024x1024",
|
||||
});
|
||||
}
|
||||
if (canPlanQaSessionsSpawn && /subagent fanout synthesis check/i.test(prompt)) {
|
||||
if (canCallSessionsSpawn && /subagent fanout synthesis check/i.test(allInputText)) {
|
||||
if (!toolOutput && scenarioState.subagentFanoutPhase === 0) {
|
||||
scenarioState.subagentFanoutPhase = 1;
|
||||
return buildToolCallEventsWithArgs("sessions_spawn", {
|
||||
@@ -2078,10 +2113,13 @@ async function buildResponsesPayload(
|
||||
}
|
||||
}
|
||||
if (
|
||||
canPlanQaSessionsSpawn &&
|
||||
(/\bdelegate\b/i.test(prompt) || /subagent handoff/i.test(prompt)) &&
|
||||
!toolOutput
|
||||
canCallSessionsSpawn &&
|
||||
(/delegate (?:one |a )bounded qa task/i.test(allInputText) ||
|
||||
/subagent handoff/i.test(allInputText)) &&
|
||||
!toolOutput &&
|
||||
!scenarioState.subagentHandoffSpawned
|
||||
) {
|
||||
scenarioState.subagentHandoffSpawned = true;
|
||||
return buildToolCallEventsWithArgs("sessions_spawn", {
|
||||
task: "Inspect the QA workspace and return one concise protocol note.",
|
||||
label: "qa-sidecar",
|
||||
@@ -2092,7 +2130,7 @@ async function buildResponsesPayload(
|
||||
/(worked, failed, blocked|worked\/failed\/blocked|source and docs)/i.test(prompt) &&
|
||||
!toolOutput
|
||||
) {
|
||||
return buildToolCallEventsWithArgs("read", { path: "QA_SCENARIO_PLAN.md" });
|
||||
return buildToolCallEventsWithArgs("read", { path: "repo/qa/scenarios/index.md" });
|
||||
}
|
||||
if (!toolOutput && /\b(read|inspect|repo|docs|scenario|kickoff)\b/i.test(prompt)) {
|
||||
return buildToolCallEvents(prompt);
|
||||
@@ -2496,7 +2534,10 @@ async function buildMessagesPayload(
|
||||
|
||||
export async function startQaMockOpenAiServer(params?: { host?: string; port?: number }) {
|
||||
const host = params?.host ?? "127.0.0.1";
|
||||
const scenarioState: MockScenarioState = { subagentFanoutPhase: 0 };
|
||||
const scenarioState: MockScenarioState = {
|
||||
subagentFanoutPhase: 0,
|
||||
subagentHandoffSpawned: false,
|
||||
};
|
||||
let lastRequest: MockOpenAiRequestSnapshot | null = null;
|
||||
const requests: MockOpenAiRequestSnapshot[] = [];
|
||||
const imageGenerationRequests: Array<Record<string, unknown>> = [];
|
||||
|
||||
@@ -5,6 +5,7 @@ import path from "node:path";
|
||||
import { afterEach, describe, expect, it, vi } from "vitest";
|
||||
import {
|
||||
captureRuntimeParityCell,
|
||||
isRuntimeParityResultPass,
|
||||
runRuntimeParityScenario,
|
||||
type RuntimeId,
|
||||
type RuntimeParityCell,
|
||||
@@ -179,6 +180,7 @@ describe("runtime parity", () => {
|
||||
});
|
||||
|
||||
expect(result.drift).toBe("tool-call-shape");
|
||||
expect(isRuntimeParityResultPass(result)).toBe(true);
|
||||
});
|
||||
|
||||
it("classifies tool result shape drift", async () => {
|
||||
@@ -220,6 +222,7 @@ describe("runtime parity", () => {
|
||||
});
|
||||
|
||||
expect(result.drift).toBe("failure-mode");
|
||||
expect(isRuntimeParityResultPass(result)).toBe(false);
|
||||
});
|
||||
|
||||
it("surfaces tool-call-shape when one runtime fails because the tool path drifted", async () => {
|
||||
@@ -235,6 +238,7 @@ describe("runtime parity", () => {
|
||||
});
|
||||
|
||||
expect(result.drift).toBe("tool-call-shape");
|
||||
expect(isRuntimeParityResultPass(result)).toBe(false);
|
||||
});
|
||||
|
||||
it("surfaces tool-result-shape when a downstream timeout follows divergent tool output", async () => {
|
||||
|
||||
@@ -59,6 +59,23 @@ export type RuntimeParityScenarioExecution = {
|
||||
cell: RuntimeParityCell;
|
||||
};
|
||||
|
||||
/**
 * Classifies a single runtime parity cell as "pass", "fail", or "missing".
 *
 * @param cell - The captured cell for one runtime, or `undefined` when no cell is available.
 * @returns `"missing"` when `cell` is falsy; `"fail"` when the cell has a truthy
 *   `runtimeErrorClass` or `transportErrorClass`; otherwise `"pass"`.
 */
export function runtimeParityCellStatus(
|
||||
cell: RuntimeParityCell | undefined,
|
||||
): "pass" | "fail" | "missing" {
|
||||
if (!cell) {
|
||||
return "missing";
|
||||
}
|
||||
return cell.runtimeErrorClass || cell.transportErrorClass ? "fail" : "pass";
|
||||
}
|
||||
|
||||
/**
 * Decides whether a runtime parity result counts as a pass.
 *
 * A result passes only when all three conditions hold:
 *  - its `drift` is not `"failure-mode"`;
 *  - the `pi` cell has status `"pass"` per `runtimeParityCellStatus`;
 *  - the `codex` cell has status `"pass"` per `runtimeParityCellStatus`.
 *
 * Any other `drift` value does not fail the result by itself; a cell whose status
 * is `"fail"` or `"missing"` always does.
 *
 * @param result - The parity result to evaluate.
 * @returns `true` when the result passes, `false` otherwise.
 */
export function isRuntimeParityResultPass(result: RuntimeParityResult) {
|
||||
return (
|
||||
result.drift !== "failure-mode" &&
|
||||
runtimeParityCellStatus(result.cells.pi) === "pass" &&
|
||||
runtimeParityCellStatus(result.cells.codex) === "pass"
|
||||
);
|
||||
}
|
||||
|
||||
type QaGatewayLike = {
|
||||
logs?: () => string;
|
||||
tempRoot: string;
|
||||
|
||||
@@ -31,6 +31,7 @@ import { renderQaMarkdownReport, type QaReportCheck, type QaReportScenario } fro
|
||||
import { defaultQaModelForMode } from "./run-config.js";
|
||||
import {
|
||||
captureRuntimeParityCell,
|
||||
isRuntimeParityResultPass,
|
||||
runRuntimeParityScenario,
|
||||
type RuntimeId,
|
||||
type RuntimeParityCell,
|
||||
@@ -276,7 +277,7 @@ async function runScenarioDefinition(
|
||||
}
|
||||
|
||||
/**
 * Local pass/fail predicate for a runtime parity result.
 *
 * NOTE(review): this span comes from a diff hunk that lists two `return` statements.
 * Presumably the first (`drift === "none" || drift === "text-only"`) is the removed
 * pre-change line and the second (delegating to `isRuntimeParityResultPass`) is its
 * replacement — confirm against the commit before treating both as live code.
 */
function isRuntimeParityPass(result: RuntimeParityResult) {
|
||||
return result.drift === "none" || result.drift === "text-only";
|
||||
return isRuntimeParityResultPass(result);
|
||||
}
|
||||
|
||||
function formatRuntimeParityCellDetails(cell: RuntimeParityCell) {
|
||||
|
||||
@@ -346,6 +346,7 @@ export function createOpenClawTools(
|
||||
!embedded ||
|
||||
options?.sourceReplyDeliveryMode === "message_tool_only" ||
|
||||
messageExplicitlyAllowed;
|
||||
const includeSubagentSpawnTool = !embedded || options?.allowGatewaySubagentBinding === true;
|
||||
const effectiveCallGateway = embedded
|
||||
? createEmbeddedCallGateway()
|
||||
: openClawToolsDeps.callGateway;
|
||||
@@ -424,6 +425,9 @@ export function createOpenClawTools(
|
||||
config: resolvedConfig,
|
||||
callGateway: openClawToolsDeps.callGateway,
|
||||
}),
|
||||
]),
|
||||
...(includeSubagentSpawnTool
|
||||
? [
|
||||
createSessionsSpawnTool({
|
||||
agentSessionKey: options?.agentSessionKey,
|
||||
agentChannel: options?.agentChannel,
|
||||
@@ -441,7 +445,8 @@ export function createOpenClawTools(
|
||||
inheritedToolAllowlist: options?.inheritedToolAllowlist,
|
||||
inheritedToolDenylist: options?.inheritedToolDenylist,
|
||||
}),
|
||||
]),
|
||||
]
|
||||
: []),
|
||||
createSessionsYieldTool({
|
||||
sessionId: options?.sessionId,
|
||||
onYield: options?.onYield,
|
||||
|
||||
@@ -123,6 +123,24 @@ describe("openclaw-tools update_plan gating", () => {
|
||||
expect(toolNames(denied)).not.toContain("message");
|
||||
});
|
||||
|
||||
// Pins the tool gating checked below after `setEmbeddedMode(true)`:
//  - with default options, neither "sessions_spawn" nor "sessions_send" is registered;
//  - with `allowGatewaySubagentBinding: true`, "sessions_spawn" is registered while
//    "sessions_send" is still absent.
// Both tool sets are built with `disablePluginTools: true` and an empty config object.
it("keeps subagent spawn available for trusted embedded gateway-bound runs", () => {
|
||||
setEmbeddedMode(true);
|
||||
const defaultTools = createOpenClawTools({
|
||||
config: {} as OpenClawConfig,
|
||||
disablePluginTools: true,
|
||||
});
|
||||
const gatewayBoundTools = createOpenClawTools({
|
||||
config: {} as OpenClawConfig,
|
||||
disablePluginTools: true,
|
||||
allowGatewaySubagentBinding: true,
|
||||
});
|
||||
|
||||
expect(toolNames(defaultTools)).not.toContain("sessions_spawn");
|
||||
expect(toolNames(defaultTools)).not.toContain("sessions_send");
|
||||
expect(toolNames(gatewayBoundTools)).toContain("sessions_spawn");
|
||||
expect(toolNames(gatewayBoundTools)).not.toContain("sessions_send");
|
||||
});
|
||||
|
||||
it("registers update_plan when explicitly enabled", () => {
|
||||
const config = {
|
||||
tools: {
|
||||
|
||||
Reference in New Issue
Block a user