fix(qa-lab): differentiate mock provider plans

This commit is contained in:
Vincent Koc
2026-05-17 23:44:35 +08:00
parent 45a434fb23
commit b764396dee
3 changed files with 117 additions and 10 deletions

View File

@@ -32,6 +32,7 @@ Docs: https://docs.openclaw.ai
- QA-Lab: fail live runtime parity reports when assistant-message usage is missing, preventing `0 vs 0` live token rows from being reported as passing proof. Fixes #80411. Thanks @100yenadmin.
- QA-Lab: fail Codex-backed OpenAI live runtime-pair runs before launching isolated workers when no portable Codex auth is available, while staging API-key fallbacks and configured Codex keys for isolated QA agents. Fixes #80412. Thanks @100yenadmin.
- QA-Lab: refresh parity gates, mock frontier fixtures, model scenarios, and workflow artifact lanes to compare GPT-5.5 against Claude Opus 4.7. Fixes #74262. Thanks @100yenadmin.
- QA-Lab: make mock parity dispatch provider-aware for source discovery and subagent scenarios so OpenAI and Anthropic lanes no longer share identical canned plans. Fixes #64879. Thanks @100yenadmin.
- QA-Lab: stop returning Control UI bearer tokens from unauthenticated bootstrap payloads and bind Docker harness ports to loopback-only host addresses. (#66355) Thanks @pgondhi987.
- Mac app: avoid a SwiftUI metadata crash when rendering the Cron Jobs settings pane.
- Agents/OpenAI streams: yield via `setTimeout(0)` instead of `setImmediate` between bursty Responses chunks so abort timers can fire during the yield, keeping cancel-on-timeout responsive on hot streams. Refs #82462.

View File

@@ -79,6 +79,14 @@ function outputItem(payload: unknown, index = 0) {
return requireRecord(output[index], `response output ${index}`);
}
/**
 * Decodes the tool-call arguments attached to one response output item.
 *
 * Looks up the output item at `index` via `outputItem`, requires its
 * `arguments` field to be a string, JSON-parses that string, and hands the
 * parsed value to `requireRecord` under the label "response output arguments".
 *
 * @param payload - Raw response payload to inspect.
 * @param index - Position of the output item to read; defaults to the first.
 * @returns Whatever `requireRecord` returns for the parsed arguments.
 * @throws Error("Expected response output arguments") when `arguments` is not
 *   a string. `JSON.parse` failures propagate unchanged.
 */
function outputToolArgs(payload: unknown, index = 0) {
  const rawArguments = outputItem(payload, index).arguments;
  if (typeof rawArguments === "string") {
    // Keep the parsed value typed as `unknown` until requireRecord has seen it.
    const parsedArguments: unknown = JSON.parse(rawArguments);
    return requireRecord(parsedArguments, "response output arguments");
  }
  throw new Error("Expected response output arguments");
}
function outputContentItem(payload: unknown, outputIndex = 0, contentIndex = 0) {
const content = requireArray(outputItem(payload, outputIndex).content, "response output content");
return requireRecord(content[contentIndex], `response content ${contentIndex}`);
@@ -3017,7 +3025,7 @@ describe("qa mock openai server", () => {
| { name: string; input: Record<string, unknown> }
| undefined;
expect(toolUseBlock?.name).toBe("read");
expect(toolUseBlock?.input).toEqual({ path: "repo/qa/scenarios/index.md" });
expect(toolUseBlock?.input).toEqual({ path: "repo/docs/help/testing.md" });
const debugResponse = await fetch(`${server.baseUrl}/debug/last-request`);
expect(debugResponse.status).toBe(200);
@@ -3275,7 +3283,7 @@ describe("qa mock openai server", () => {
expect(body).toContain("event: content_block_start");
expect(body).toContain('"type":"tool_use"');
expect(body).toContain('"name":"read"');
expect(body).toContain("repo/qa/scenarios/index.md");
expect(body).toContain("repo/docs/help/testing.md");
expect(body).toContain("event: message_delta");
expect(body).toContain("event: message_stop");
});
@@ -3739,6 +3747,77 @@ describe("resolveProviderVariant", () => {
});
describe("qa mock openai server provider variant tagging", () => {
it("pins provider-specific plans for parity scenarios", async () => {
  // Prompts for the three scenarios whose canned plan differs per provider.
  const prompts = {
    source:
      "Read the seeded docs and source plan, then report grouped into Worked, Failed, Blocked, and Follow-up.",
    handoff: "Delegate one bounded QA task to a subagent. Wait for the subagent to finish.",
    fanout:
      "Subagent fanout synthesis check: delegate two bounded subagents sequentially, then report both results together.",
  };

  // Sends a single non-streaming request to a freshly started mock server and
  // returns the decoded arguments of the first output item. Every case gets
  // its own server — presumably so scenario state does not carry over between
  // cases (TODO confirm against startMockServer).
  const firstToolArgs = async (
    model: string,
    prompt: string,
    tools?: Array<typeof SESSIONS_SPAWN_TOOL>,
  ) => {
    const server = await startMockServer();
    const payload = await expectResponsesJson(server, {
      model,
      stream: false,
      // Only include `tools` when the case supplies it, keeping the request
      // body identical to a literal that omits the key.
      ...(tools === undefined ? {} : { tools }),
      input: [makeUserInput(prompt)],
    });
    return outputToolArgs(payload);
  };

  // Source discovery: each provider reads a different seeded file.
  expect(await firstToolArgs("openai/gpt-5.5", prompts.source)).toEqual({
    path: "repo/qa/scenarios/index.md",
  });
  expect(await firstToolArgs("anthropic/claude-opus-4-7", prompts.source)).toEqual({
    path: "repo/docs/help/testing.md",
  });

  // Subagent handoff: same label, provider-specific task text. These two
  // cases use model ids without a provider prefix.
  expect(await firstToolArgs("gpt-5.5", prompts.handoff)).toMatchObject({
    label: "qa-sidecar",
    task: "Inspect the QA workspace and return one concise protocol note.",
  });
  expect(await firstToolArgs("claude-opus-4-7", prompts.handoff)).toMatchObject({
    label: "qa-sidecar",
    task: "Inspect the QA docs fixture and return one concise protocol note.",
  });

  // Subagent fanout: the first spawned worker (alpha) carries the
  // provider-specific scope; these requests also advertise the spawn tool.
  expect(
    await firstToolArgs("openai/gpt-5.5", prompts.fanout, [SESSIONS_SPAWN_TOOL]),
  ).toMatchObject({
    label: "qa-fanout-alpha",
    task: "Fanout worker alpha: inspect the QA workspace and finish with exactly ALPHA-OK.",
  });
  expect(
    await firstToolArgs("anthropic/claude-opus-4-7", prompts.fanout, [SESSIONS_SPAWN_TOOL]),
  ).toMatchObject({
    label: "qa-fanout-alpha",
    task: "Fanout worker alpha: inspect the QA docs fixture and finish with exactly ALPHA-OK.",
  });
});
it("records providerVariant on /debug/last-request for openai requests", async () => {
const server = await startQaMockOpenAiServer({
host: "127.0.0.1",

View File

@@ -109,8 +109,8 @@ type MockOpenAiRequestSnapshot = {
// This is a subset of the real Anthropic Messages API — just enough so the
// QA suite can run its parity pack against a "baseline" Anthropic provider
// without needing real API keys. The scenarios drive their dispatch through
// the shared mock scenario logic (buildResponsesPayload), so whatever
// behavior the OpenAI mock exposes is automatically mirrored on this route.
// the shared mock scenario logic (buildResponsesPayload), with `model`
// preserved so provider-aware branches can intentionally diverge.
type AnthropicMessageContentBlock =
| { type: "text"; text: string }
| {
@@ -180,6 +180,28 @@ type MockScenarioState = {
subagentHandoffSpawned: boolean;
};
/**
 * Chooses the file path for the mock source-discovery `read` tool call.
 *
 * @param providerVariant - Provider lane resolved from the request model.
 * @returns "repo/docs/help/testing.md" for the "anthropic" variant, and
 *   "repo/qa/scenarios/index.md" for every other variant.
 */
function sourceDiscoveryReadPathForProvider(providerVariant: MockOpenAiProviderVariant) {
  if (providerVariant === "anthropic") {
    return "repo/docs/help/testing.md";
  }
  // Any non-anthropic variant keeps the scenario index path.
  return "repo/qa/scenarios/index.md";
}
/**
 * Chooses the task text for the mock subagent-handoff `sessions_spawn` call.
 *
 * @param providerVariant - Provider lane resolved from the request model.
 * @returns The "QA docs fixture" wording for the "anthropic" variant, and the
 *   "QA workspace" wording for every other variant.
 */
function subagentHandoffTaskForProvider(providerVariant: MockOpenAiProviderVariant) {
  if (providerVariant === "anthropic") {
    return "Inspect the QA docs fixture and return one concise protocol note.";
  }
  // Any non-anthropic variant keeps the workspace wording.
  return "Inspect the QA workspace and return one concise protocol note.";
}
/**
 * Builds the task text for one worker of the mock subagent-fanout scenario.
 *
 * @param providerVariant - Provider lane resolved from the request model; it
 *   selects the inspected scope ("the QA docs fixture" for "anthropic",
 *   otherwise "the QA workspace").
 * @param worker - Which fanout worker the task is for; it selects the exact
 *   completion marker ("ALPHA-OK" for "alpha", otherwise "BETA-OK").
 * @returns The full sentence the spawned worker receives as its task.
 */
function subagentFanoutTaskForProvider(
  providerVariant: MockOpenAiProviderVariant,
  worker: "alpha" | "beta",
) {
  let inspectedScope = "the QA workspace";
  if (providerVariant === "anthropic") {
    inspectedScope = "the QA docs fixture";
  }

  let completionMarker = "BETA-OK";
  if (worker === "alpha") {
    completionMarker = "ALPHA-OK";
  }

  return `Fanout worker ${worker}: inspect ${inspectedScope} and finish with exactly ${completionMarker}.`;
}
// Limits for the mock server. The code that consumes them is outside this
// excerpt, so the descriptions below follow the constant names — confirm
// against the call sites.

// 16 MiB; presumably the largest request body the mock will accept.
const MOCK_OPENAI_MAX_BODY_BYTES = 16 * 1024 ** 2;
// 30 seconds; presumably how long the mock waits for a request body.
const MOCK_OPENAI_BODY_TIMEOUT_MS = 30 * 1000;
// Presumably the number of requests retained for the debug endpoints.
const MOCK_OPENAI_DEBUG_REQUEST_LIMIT = 200;
@@ -1496,6 +1518,9 @@ async function buildResponsesPayload(
body: Record<string, unknown>,
scenarioState: MockScenarioState,
) {
const providerVariant = resolveProviderVariant(
typeof body.model === "string" ? body.model : undefined,
);
const input = Array.isArray(body.input) ? (body.input as ResponsesInputItem[]) : [];
const prompt = extractLastUserText(input);
const toolOutput = extractToolOutput(input);
@@ -2039,7 +2064,7 @@ async function buildResponsesPayload(
if (!toolOutput && scenarioState.subagentFanoutPhase === 0) {
scenarioState.subagentFanoutPhase = 1;
return buildToolCallEventsWithArgs("sessions_spawn", {
task: "Fanout worker alpha: inspect the QA workspace and finish with exactly ALPHA-OK.",
task: subagentFanoutTaskForProvider(providerVariant, "alpha"),
label: "qa-fanout-alpha",
thread: false,
});
@@ -2047,7 +2072,7 @@ async function buildResponsesPayload(
if (toolOutput && scenarioState.subagentFanoutPhase === 1) {
scenarioState.subagentFanoutPhase = 2;
return buildToolCallEventsWithArgs("sessions_spawn", {
task: "Fanout worker beta: inspect the QA workspace and finish with exactly BETA-OK.",
task: subagentFanoutTaskForProvider(providerVariant, "beta"),
label: "qa-fanout-beta",
thread: false,
});
@@ -2123,7 +2148,7 @@ async function buildResponsesPayload(
) {
scenarioState.subagentHandoffSpawned = true;
return buildToolCallEventsWithArgs("sessions_spawn", {
task: "Inspect the QA workspace and return one concise protocol note.",
task: subagentHandoffTaskForProvider(providerVariant),
label: "qa-sidecar",
thread: false,
});
@@ -2132,7 +2157,9 @@ async function buildResponsesPayload(
/(worked, failed, blocked|worked\/failed\/blocked|source and docs)/i.test(prompt) &&
!toolOutput
) {
return buildToolCallEventsWithArgs("read", { path: "repo/qa/scenarios/index.md" });
return buildToolCallEventsWithArgs("read", {
path: sourceDiscoveryReadPathForProvider(providerVariant),
});
}
if (!toolOutput && /\b(read|inspect|repo|docs|scenario|kickoff)\b/i.test(prompt)) {
return buildToolCallEvents(prompt);
@@ -2167,8 +2194,8 @@ async function buildResponsesPayload(
// shapes into the shared ResponsesInputItem[] format, calls the same
// buildResponsesPayload() dispatcher, and then re-serializes the resulting
// events into an Anthropic response. This gives the parity harness a
// baseline lane that exercises the same scenario logic without requiring
// real Anthropic API keys.
// baseline lane that exercises the same scenario logic and selected
// provider-specific plans without requiring real Anthropic API keys.
//
// Scope: handles Anthropic Messages requests with text and tool_result
// content blocks, supporting both non-streaming JSON responses and the