diff --git a/CHANGELOG.md b/CHANGELOG.md index 0b5f14e217e..2d0fc8bfc8e 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -32,6 +32,7 @@ Docs: https://docs.openclaw.ai - QA-Lab: fail live runtime parity reports when assistant-message usage is missing, preventing `0 vs 0` live token rows from being reported as passing proof. Fixes #80411. Thanks @100yenadmin. - QA-Lab: fail Codex-backed OpenAI live runtime-pair runs before launching isolated workers when no portable Codex auth is available, while staging API-key fallbacks and configured Codex keys for isolated QA agents. Fixes #80412. Thanks @100yenadmin. - QA-Lab: refresh parity gates, mock frontier fixtures, model scenarios, and workflow artifact lanes to compare GPT-5.5 against Claude Opus 4.7. Fixes #74262. Thanks @100yenadmin. +- QA-Lab: make mock parity dispatch provider-aware for source discovery and subagent scenarios so OpenAI and Anthropic lanes no longer share identical canned plans. Fixes #64879. Thanks @100yenadmin. - QA-Lab: stop returning Control UI bearer tokens from unauthenticated bootstrap payloads and bind Docker harness ports to loopback-only host addresses. (#66355) Thanks @pgondhi987. - Mac app: avoid a SwiftUI metadata crash when rendering the Cron Jobs settings pane. - Agents/OpenAI streams: yield via `setTimeout(0)` instead of `setImmediate` between bursty Responses chunks so abort timers can fire during the yield, keeping cancel-on-timeout responsive on hot streams. Refs #82462. diff --git a/extensions/qa-lab/src/providers/mock-openai/server.test.ts b/extensions/qa-lab/src/providers/mock-openai/server.test.ts index db2f56cebbe..e53bf00e59c 100644 --- a/extensions/qa-lab/src/providers/mock-openai/server.test.ts +++ b/extensions/qa-lab/src/providers/mock-openai/server.test.ts @@ -79,6 +79,14 @@ function outputItem(payload: unknown, index = 0) { return requireRecord(output[index], `response output ${index}`); } +function outputToolArgs(payload: unknown, index = 0) { + const item = outputItem(payload, index); + if (typeof item.arguments !== "string") { + throw new Error("Expected response output arguments"); + } + return requireRecord(JSON.parse(item.arguments) as unknown, "response output arguments"); +} + function outputContentItem(payload: unknown, outputIndex = 0, contentIndex = 0) { const content = requireArray(outputItem(payload, outputIndex).content, "response output content"); return requireRecord(content[contentIndex], `response content ${contentIndex}`); @@ -3017,7 +3025,7 @@ describe("qa mock openai server", () => { | { name: string; input: Record } | undefined; expect(toolUseBlock?.name).toBe("read"); - expect(toolUseBlock?.input).toEqual({ path: "repo/qa/scenarios/index.md" }); + expect(toolUseBlock?.input).toEqual({ path: "repo/docs/help/testing.md" }); const debugResponse = await fetch(`${server.baseUrl}/debug/last-request`); expect(debugResponse.status).toBe(200); @@ -3275,7 +3283,7 @@ describe("qa mock openai server", () => { expect(body).toContain("event: content_block_start"); expect(body).toContain('"type":"tool_use"'); expect(body).toContain('"name":"read"'); - expect(body).toContain("repo/qa/scenarios/index.md"); + expect(body).toContain("repo/docs/help/testing.md"); expect(body).toContain("event: message_delta"); expect(body).toContain("event: message_stop"); }); @@ -3739,6 +3747,77 @@ describe("resolveProviderVariant", () => { }); describe("qa mock openai server provider variant tagging", () => { + it("pins provider-specific plans for parity scenarios", async () => { + const sourcePrompt = + "Read the seeded docs and source plan, then report grouped into Worked, Failed, Blocked, and Follow-up."; + const handoffPrompt = + "Delegate one bounded QA task to a subagent. Wait for the subagent to finish."; + const fanoutPrompt = + "Subagent fanout synthesis check: delegate two bounded subagents sequentially, then report both results together."; + + const openaiSourceServer = await startMockServer(); + const openaiSource = await expectResponsesJson(openaiSourceServer, { + model: "openai/gpt-5.5", + stream: false, + input: [makeUserInput(sourcePrompt)], + }); + expect(outputToolArgs(openaiSource)).toEqual({ path: "repo/qa/scenarios/index.md" }); + + const anthropicSourceServer = await startMockServer(); + const anthropicSource = await expectResponsesJson(anthropicSourceServer, { + model: "anthropic/claude-opus-4-7", + stream: false, + input: [makeUserInput(sourcePrompt)], + }); + expect(outputToolArgs(anthropicSource)).toEqual({ path: "repo/docs/help/testing.md" }); + + const openaiHandoffServer = await startMockServer(); + const openaiHandoff = await expectResponsesJson(openaiHandoffServer, { + model: "gpt-5.5", + stream: false, + input: [makeUserInput(handoffPrompt)], + }); + expect(outputToolArgs(openaiHandoff)).toMatchObject({ + label: "qa-sidecar", + task: "Inspect the QA workspace and return one concise protocol note.", + }); + + const anthropicHandoffServer = await startMockServer(); + const anthropicHandoff = await expectResponsesJson(anthropicHandoffServer, { + model: "claude-opus-4-7", + stream: false, + input: [makeUserInput(handoffPrompt)], + }); + expect(outputToolArgs(anthropicHandoff)).toMatchObject({ + label: "qa-sidecar", + task: "Inspect the QA docs fixture and return one concise protocol note.", + }); + + const openaiFanoutServer = await startMockServer(); + const openaiFanout = await expectResponsesJson(openaiFanoutServer, { + model: "openai/gpt-5.5", + stream: false, + tools: [SESSIONS_SPAWN_TOOL], + input: [makeUserInput(fanoutPrompt)], + }); + expect(outputToolArgs(openaiFanout)).toMatchObject({ + label: "qa-fanout-alpha", + task: "Fanout worker alpha: inspect the QA workspace and finish with exactly ALPHA-OK.", + }); + + const anthropicFanoutServer = await startMockServer(); + const anthropicFanout = await expectResponsesJson(anthropicFanoutServer, { + model: "anthropic/claude-opus-4-7", + stream: false, + tools: [SESSIONS_SPAWN_TOOL], + input: [makeUserInput(fanoutPrompt)], + }); + expect(outputToolArgs(anthropicFanout)).toMatchObject({ + label: "qa-fanout-alpha", + task: "Fanout worker alpha: inspect the QA docs fixture and finish with exactly ALPHA-OK.", + }); + }); + it("records providerVariant on /debug/last-request for openai requests", async () => { const server = await startQaMockOpenAiServer({ host: "127.0.0.1", diff --git a/extensions/qa-lab/src/providers/mock-openai/server.ts b/extensions/qa-lab/src/providers/mock-openai/server.ts index be831461d03..7035193b522 100644 --- a/extensions/qa-lab/src/providers/mock-openai/server.ts +++ b/extensions/qa-lab/src/providers/mock-openai/server.ts @@ -109,8 +109,8 @@ type MockOpenAiRequestSnapshot = { // This is a subset of the real Anthropic Messages API — just enough so the // QA suite can run its parity pack against a "baseline" Anthropic provider // without needing real API keys. The scenarios drive their dispatch through -// the shared mock scenario logic (buildResponsesPayload), so whatever -// behavior the OpenAI mock exposes is automatically mirrored on this route. +// the shared mock scenario logic (buildResponsesPayload), with `model` +// preserved so provider-aware branches can intentionally diverge. type AnthropicMessageContentBlock = | { type: "text"; text: string } | { @@ -180,6 +180,28 @@ type MockScenarioState = { subagentHandoffSpawned: boolean; }; +function sourceDiscoveryReadPathForProvider(providerVariant: MockOpenAiProviderVariant) { + return providerVariant === "anthropic" + ? "repo/docs/help/testing.md" + : "repo/qa/scenarios/index.md"; +} + +function subagentHandoffTaskForProvider(providerVariant: MockOpenAiProviderVariant) { + return providerVariant === "anthropic" + ? "Inspect the QA docs fixture and return one concise protocol note." + : "Inspect the QA workspace and return one concise protocol note."; +} + +function subagentFanoutTaskForProvider( + providerVariant: MockOpenAiProviderVariant, + worker: "alpha" | "beta", +) { + const marker = worker === "alpha" ? "ALPHA-OK" : "BETA-OK"; + const scope = + providerVariant === "anthropic" ? "the QA docs fixture" : "the QA workspace"; + return `Fanout worker ${worker}: inspect ${scope} and finish with exactly ${marker}.`; +} + const MOCK_OPENAI_MAX_BODY_BYTES = 16 * 1024 * 1024; const MOCK_OPENAI_BODY_TIMEOUT_MS = 30_000; const MOCK_OPENAI_DEBUG_REQUEST_LIMIT = 200; @@ -1496,6 +1518,9 @@ async function buildResponsesPayload( body: Record, scenarioState: MockScenarioState, ) { + const providerVariant = resolveProviderVariant( + typeof body.model === "string" ? body.model : undefined, + ); const input = Array.isArray(body.input) ? (body.input as ResponsesInputItem[]) : []; const prompt = extractLastUserText(input); const toolOutput = extractToolOutput(input); @@ -2039,7 +2064,7 @@ async function buildResponsesPayload( if (!toolOutput && scenarioState.subagentFanoutPhase === 0) { scenarioState.subagentFanoutPhase = 1; return buildToolCallEventsWithArgs("sessions_spawn", { - task: "Fanout worker alpha: inspect the QA workspace and finish with exactly ALPHA-OK.", + task: subagentFanoutTaskForProvider(providerVariant, "alpha"), label: "qa-fanout-alpha", thread: false, }); @@ -2047,7 +2072,7 @@ async function buildResponsesPayload( if (toolOutput && scenarioState.subagentFanoutPhase === 1) { scenarioState.subagentFanoutPhase = 2; return buildToolCallEventsWithArgs("sessions_spawn", { - task: "Fanout worker beta: inspect the QA workspace and finish with exactly BETA-OK.", + task: subagentFanoutTaskForProvider(providerVariant, "beta"), label: "qa-fanout-beta", thread: false, }); @@ -2123,7 +2148,7 @@ async function buildResponsesPayload( ) { scenarioState.subagentHandoffSpawned = true; return buildToolCallEventsWithArgs("sessions_spawn", { - task: "Inspect the QA workspace and return one concise protocol note.", + task: subagentHandoffTaskForProvider(providerVariant), label: "qa-sidecar", thread: false, }); @@ -2132,7 +2157,9 @@ async function buildResponsesPayload( /(worked, failed, blocked|worked\/failed\/blocked|source and docs)/i.test(prompt) && !toolOutput ) { - return buildToolCallEventsWithArgs("read", { path: "repo/qa/scenarios/index.md" }); + return buildToolCallEventsWithArgs("read", { + path: sourceDiscoveryReadPathForProvider(providerVariant), + }); } if (!toolOutput && /\b(read|inspect|repo|docs|scenario|kickoff)\b/i.test(prompt)) { return buildToolCallEvents(prompt); @@ -2167,8 +2194,8 @@ async function buildResponsesPayload( // shapes into the shared ResponsesInputItem[] format, calls the same // buildResponsesPayload() dispatcher, and then re-serializes the resulting // events into an Anthropic response. This gives the parity harness a -// baseline lane that exercises the same scenario logic without requiring -// real Anthropic API keys. +// baseline lane that exercises the same scenario logic and selected +// provider-specific plans without requiring real Anthropic API keys. // // Scope: handles Anthropic Messages requests with text and tool_result // content blocks, supporting both non-streaming JSON responses and the