fix(qa-lab): differentiate mock provider plans

2026-05-22 13:04:04 +00:00 · 2026-05-17 23:44:35 +08:00
parent 45a434fb23
commit b764396dee
3 changed files with 117 additions and 10 deletions
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -32,6 +32,7 @@ Docs: https://docs.openclaw.ai
 - QA-Lab: fail live runtime parity reports when assistant-message usage is missing, preventing `0 vs 0` live token rows from being reported as passing proof. Fixes #80411. Thanks @100yenadmin.
 - QA-Lab: fail Codex-backed OpenAI live runtime-pair runs before launching isolated workers when no portable Codex auth is available, while staging API-key fallbacks and configured Codex keys for isolated QA agents. Fixes #80412. Thanks @100yenadmin.
 - QA-Lab: refresh parity gates, mock frontier fixtures, model scenarios, and workflow artifact lanes to compare GPT-5.5 against Claude Opus 4.7. Fixes #74262. Thanks @100yenadmin.
+- QA-Lab: make mock parity dispatch provider-aware for source discovery and subagent scenarios so OpenAI and Anthropic lanes no longer share identical canned plans. Fixes #64879. Thanks @100yenadmin.
 - QA-Lab: stop returning Control UI bearer tokens from unauthenticated bootstrap payloads and bind Docker harness ports to loopback-only host addresses. (#66355) Thanks @pgondhi987.
 - Mac app: avoid a SwiftUI metadata crash when rendering the Cron Jobs settings pane.
 - Agents/OpenAI streams: yield via `setTimeout(0)` instead of `setImmediate` between bursty Responses chunks so abort timers can fire during the yield, keeping cancel-on-timeout responsive on hot streams. Refs #82462.
--- a/extensions/qa-lab/src/providers/mock-openai/server.test.ts
+++ b/extensions/qa-lab/src/providers/mock-openai/server.test.ts
@@ -79,6 +79,14 @@ function outputItem(payload: unknown, index = 0) {
  return requireRecord(output[index], `response output ${index}`);
 }

+function outputToolArgs(payload: unknown, index = 0) {
+  const rawArgs = outputItem(payload, index).arguments; // JSON-encoded string, parsed below
+  if (typeof rawArgs !== "string") {
+    throw new Error("Expected response output arguments");
+  }
+  return requireRecord(JSON.parse(rawArgs) as unknown, "response output arguments");
+}
+
 function outputContentItem(payload: unknown, outputIndex = 0, contentIndex = 0) {
  const content = requireArray(outputItem(payload, outputIndex).content, "response output content");
  return requireRecord(content[contentIndex], `response content ${contentIndex}`);
@@ -3017,7 +3025,7 @@ describe("qa mock openai server", () => {
      | { name: string; input: Record<string, unknown> }
      | undefined;
    expect(toolUseBlock?.name).toBe("read");
-    expect(toolUseBlock?.input).toEqual({ path: "repo/qa/scenarios/index.md" });
+    expect(toolUseBlock?.input).toEqual({ path: "repo/docs/help/testing.md" });

    const debugResponse = await fetch(`${server.baseUrl}/debug/last-request`);
    expect(debugResponse.status).toBe(200);
@@ -3275,7 +3283,7 @@ describe("qa mock openai server", () => {
    expect(body).toContain("event: content_block_start");
    expect(body).toContain('"type":"tool_use"');
    expect(body).toContain('"name":"read"');
-    expect(body).toContain("repo/qa/scenarios/index.md");
+    expect(body).toContain("repo/docs/help/testing.md");
    expect(body).toContain("event: message_delta");
    expect(body).toContain("event: message_stop");
  });
@@ -3739,6 +3747,77 @@ describe("resolveProviderVariant", () => {
 });

 describe("qa mock openai server provider variant tagging", () => {
+  it("pins provider-specific plans for parity scenarios", async () => {
+    const sourcePrompt =
+      "Read the seeded docs and source plan, then report grouped into Worked, Failed, Blocked, and Follow-up.";
+    const handoffPrompt =
+      "Delegate one bounded QA task to a subagent. Wait for the subagent to finish.";
+    const fanoutPrompt =
+      "Subagent fanout synthesis check: delegate two bounded subagents sequentially, then report both results together.";
+
+    const openaiSourceServer = await startMockServer(); // OpenAI lane, source discovery
+    const openaiSource = await expectResponsesJson(openaiSourceServer, {
+      model: "openai/gpt-5.5",
+      stream: false,
+      input: [makeUserInput(sourcePrompt)],
+    });
+    expect(outputToolArgs(openaiSource)).toEqual({ path: "repo/qa/scenarios/index.md" });
+
+    const anthropicSourceServer = await startMockServer(); // Anthropic lane, source discovery
+    const anthropicSource = await expectResponsesJson(anthropicSourceServer, {
+      model: "anthropic/claude-opus-4-7",
+      stream: false,
+      input: [makeUserInput(sourcePrompt)],
+    });
+    expect(outputToolArgs(anthropicSource)).toEqual({ path: "repo/docs/help/testing.md" });
+
+    const openaiHandoffServer = await startMockServer(); // OpenAI lane, handoff (bare model id)
+    const openaiHandoff = await expectResponsesJson(openaiHandoffServer, {
+      model: "gpt-5.5",
+      stream: false,
+      input: [makeUserInput(handoffPrompt)],
+    });
+    expect(outputToolArgs(openaiHandoff)).toMatchObject({
+      label: "qa-sidecar",
+      task: "Inspect the QA workspace and return one concise protocol note.",
+    });
+
+    const anthropicHandoffServer = await startMockServer(); // Anthropic lane, handoff (bare id)
+    const anthropicHandoff = await expectResponsesJson(anthropicHandoffServer, {
+      model: "claude-opus-4-7",
+      stream: false,
+      input: [makeUserInput(handoffPrompt)],
+    });
+    expect(outputToolArgs(anthropicHandoff)).toMatchObject({
+      label: "qa-sidecar",
+      task: "Inspect the QA docs fixture and return one concise protocol note.",
+    });
+
+    const openaiFanoutServer = await startMockServer(); // OpenAI lane, fanout (first spawn only)
+    const openaiFanout = await expectResponsesJson(openaiFanoutServer, {
+      model: "openai/gpt-5.5",
+      stream: false,
+      tools: [SESSIONS_SPAWN_TOOL],
+      input: [makeUserInput(fanoutPrompt)],
+    });
+    expect(outputToolArgs(openaiFanout)).toMatchObject({
+      label: "qa-fanout-alpha",
+      task: "Fanout worker alpha: inspect the QA workspace and finish with exactly ALPHA-OK.",
+    });
+
+    const anthropicFanoutServer = await startMockServer(); // Anthropic lane, fanout (first spawn)
+    const anthropicFanout = await expectResponsesJson(anthropicFanoutServer, {
+      model: "anthropic/claude-opus-4-7",
+      stream: false,
+      tools: [SESSIONS_SPAWN_TOOL],
+      input: [makeUserInput(fanoutPrompt)],
+    });
+    expect(outputToolArgs(anthropicFanout)).toMatchObject({
+      label: "qa-fanout-alpha",
+      task: "Fanout worker alpha: inspect the QA docs fixture and finish with exactly ALPHA-OK.",
+    });
+  });
+
  it("records providerVariant on /debug/last-request for openai requests", async () => {
    const server = await startQaMockOpenAiServer({
      host: "127.0.0.1",
--- a/extensions/qa-lab/src/providers/mock-openai/server.ts
+++ b/extensions/qa-lab/src/providers/mock-openai/server.ts
@@ -109,8 +109,8 @@ type MockOpenAiRequestSnapshot = {
 // This is a subset of the real Anthropic Messages API — just enough so the
 // QA suite can run its parity pack against a "baseline" Anthropic provider
 // without needing real API keys. The scenarios drive their dispatch through
-// the shared mock scenario logic (buildResponsesPayload), so whatever
-// behavior the OpenAI mock exposes is automatically mirrored on this route.
+// the shared mock scenario logic (buildResponsesPayload), with `model`
+// preserved so provider-aware branches can intentionally diverge.
 type AnthropicMessageContentBlock =
  | { type: "text"; text: string }
  | {
@@ -180,6 +180,28 @@ type MockScenarioState = {
  subagentHandoffSpawned: boolean;
 };

+function sourceDiscoveryReadPathForProvider(providerVariant: MockOpenAiProviderVariant) {
+  // Only the anthropic variant diverges; every other variant keeps the scenario index path.
+  const isAnthropicLane = providerVariant === "anthropic";
+  return isAnthropicLane ? "repo/docs/help/testing.md" : "repo/qa/scenarios/index.md";
+}
+
+function subagentHandoffTaskForProvider(providerVariant: MockOpenAiProviderVariant) {
+  const docsFixtureTask = "Inspect the QA docs fixture and return one concise protocol note.";
+  const workspaceTask = "Inspect the QA workspace and return one concise protocol note.";
+  return providerVariant === "anthropic" ? docsFixtureTask : workspaceTask; // default: workspace
+}
+
+function subagentFanoutTaskForProvider(
+  providerVariant: MockOpenAiProviderVariant,
+  worker: "alpha" | "beta",
+) {
+  // The provider variant picks what gets inspected; the worker picks the completion marker.
+  const target = providerVariant === "anthropic" ? "the QA docs fixture" : "the QA workspace";
+  const doneToken = worker === "alpha" ? "ALPHA-OK" : "BETA-OK";
+  return `Fanout worker ${worker}: inspect ${target} and finish with exactly ${doneToken}.`;
+}
+
 const MOCK_OPENAI_MAX_BODY_BYTES = 16 * 1024 * 1024;
 const MOCK_OPENAI_BODY_TIMEOUT_MS = 30_000;
 const MOCK_OPENAI_DEBUG_REQUEST_LIMIT = 200;
@@ -1496,6 +1518,9 @@ async function buildResponsesPayload(
  body: Record<string, unknown>,
  scenarioState: MockScenarioState,
 ) {
+  const providerVariant = resolveProviderVariant(
+    typeof body.model === "string" ? body.model : undefined,
+  );
  const input = Array.isArray(body.input) ? (body.input as ResponsesInputItem[]) : [];
  const prompt = extractLastUserText(input);
  const toolOutput = extractToolOutput(input);
@@ -2039,7 +2064,7 @@ async function buildResponsesPayload(
    if (!toolOutput && scenarioState.subagentFanoutPhase === 0) {
      scenarioState.subagentFanoutPhase = 1;
      return buildToolCallEventsWithArgs("sessions_spawn", {
-        task: "Fanout worker alpha: inspect the QA workspace and finish with exactly ALPHA-OK.",
+        task: subagentFanoutTaskForProvider(providerVariant, "alpha"),
        label: "qa-fanout-alpha",
        thread: false,
      });
@@ -2047,7 +2072,7 @@ async function buildResponsesPayload(
    if (toolOutput && scenarioState.subagentFanoutPhase === 1) {
      scenarioState.subagentFanoutPhase = 2;
      return buildToolCallEventsWithArgs("sessions_spawn", {
-        task: "Fanout worker beta: inspect the QA workspace and finish with exactly BETA-OK.",
+        task: subagentFanoutTaskForProvider(providerVariant, "beta"),
        label: "qa-fanout-beta",
        thread: false,
      });
@@ -2123,7 +2148,7 @@ async function buildResponsesPayload(
  ) {
    scenarioState.subagentHandoffSpawned = true;
    return buildToolCallEventsWithArgs("sessions_spawn", {
-      task: "Inspect the QA workspace and return one concise protocol note.",
+      task: subagentHandoffTaskForProvider(providerVariant),
      label: "qa-sidecar",
      thread: false,
    });
@@ -2132,7 +2157,9 @@ async function buildResponsesPayload(
    /(worked, failed, blocked|worked\/failed\/blocked|source and docs)/i.test(prompt) &&
    !toolOutput
  ) {
-    return buildToolCallEventsWithArgs("read", { path: "repo/qa/scenarios/index.md" });
+    return buildToolCallEventsWithArgs("read", {
+      path: sourceDiscoveryReadPathForProvider(providerVariant),
+    });
  }
  if (!toolOutput && /\b(read|inspect|repo|docs|scenario|kickoff)\b/i.test(prompt)) {
    return buildToolCallEvents(prompt);
@@ -2167,8 +2194,8 @@ async function buildResponsesPayload(
 // shapes into the shared ResponsesInputItem[] format, calls the same
 // buildResponsesPayload() dispatcher, and then re-serializes the resulting
 // events into an Anthropic response. This gives the parity harness a
-// baseline lane that exercises the same scenario logic without requiring
-// real Anthropic API keys.
+// baseline lane that exercises the same scenario logic and selected
+// provider-specific plans without requiring real Anthropic API keys.
 //
 // Scope: handles Anthropic Messages requests with text and tool_result
 // content blocks, supporting both non-streaming JSON responses and the