mirror of
https://github.com/openclaw/openclaw.git
synced 2026-05-22 13:04:04 +00:00
fix(qa-lab): differentiate mock provider plans
This commit is contained in:
@@ -32,6 +32,7 @@ Docs: https://docs.openclaw.ai
|
||||
- QA-Lab: fail live runtime parity reports when assistant-message usage is missing, preventing `0 vs 0` live token rows from being reported as passing proof. Fixes #80411. Thanks @100yenadmin.
|
||||
- QA-Lab: fail Codex-backed OpenAI live runtime-pair runs before launching isolated workers when no portable Codex auth is available, while staging API-key fallbacks and configured Codex keys for isolated QA agents. Fixes #80412. Thanks @100yenadmin.
|
||||
- QA-Lab: refresh parity gates, mock frontier fixtures, model scenarios, and workflow artifact lanes to compare GPT-5.5 against Claude Opus 4.7. Fixes #74262. Thanks @100yenadmin.
|
||||
- QA-Lab: make mock parity dispatch provider-aware for source discovery and subagent scenarios so OpenAI and Anthropic lanes no longer share identical canned plans. Fixes #64879. Thanks @100yenadmin.
|
||||
- QA-Lab: stop returning Control UI bearer tokens from unauthenticated bootstrap payloads and bind Docker harness ports to loopback-only host addresses. (#66355) Thanks @pgondhi987.
|
||||
- Mac app: avoid a SwiftUI metadata crash when rendering the Cron Jobs settings pane.
|
||||
- Agents/OpenAI streams: yield via `setTimeout(0)` instead of `setImmediate` between bursty Responses chunks so abort timers can fire during the yield, keeping cancel-on-timeout responsive on hot streams. Refs #82462.
|
||||
|
||||
@@ -79,6 +79,14 @@ function outputItem(payload: unknown, index = 0) {
|
||||
return requireRecord(output[index], `response output ${index}`);
|
||||
}
|
||||
|
||||
/**
 * Decodes the JSON-encoded `arguments` string carried by one response output item.
 *
 * @param payload - Response payload handed to `outputItem` for lookup.
 * @param index - Position of the output item to inspect; defaults to the first item.
 * @returns The parsed arguments after they pass through `requireRecord`.
 * @throws Error when the selected item's `arguments` field is not a string.
 *   `JSON.parse` raises its own `SyntaxError` when the string is not valid JSON.
 */
function outputToolArgs(payload: unknown, index = 0) {
  const rawArguments = outputItem(payload, index).arguments;
  if (typeof rawArguments === "string") {
    // Keep the parsed value as `unknown` so requireRecord performs the shape check.
    const decoded: unknown = JSON.parse(rawArguments);
    return requireRecord(decoded, "response output arguments");
  }
  throw new Error("Expected response output arguments");
}
|
||||
|
||||
function outputContentItem(payload: unknown, outputIndex = 0, contentIndex = 0) {
|
||||
const content = requireArray(outputItem(payload, outputIndex).content, "response output content");
|
||||
return requireRecord(content[contentIndex], `response content ${contentIndex}`);
|
||||
@@ -3017,7 +3025,7 @@ describe("qa mock openai server", () => {
|
||||
| { name: string; input: Record<string, unknown> }
|
||||
| undefined;
|
||||
expect(toolUseBlock?.name).toBe("read");
|
||||
expect(toolUseBlock?.input).toEqual({ path: "repo/qa/scenarios/index.md" });
|
||||
expect(toolUseBlock?.input).toEqual({ path: "repo/docs/help/testing.md" });
|
||||
|
||||
const debugResponse = await fetch(`${server.baseUrl}/debug/last-request`);
|
||||
expect(debugResponse.status).toBe(200);
|
||||
@@ -3275,7 +3283,7 @@ describe("qa mock openai server", () => {
|
||||
expect(body).toContain("event: content_block_start");
|
||||
expect(body).toContain('"type":"tool_use"');
|
||||
expect(body).toContain('"name":"read"');
|
||||
expect(body).toContain("repo/qa/scenarios/index.md");
|
||||
expect(body).toContain("repo/docs/help/testing.md");
|
||||
expect(body).toContain("event: message_delta");
|
||||
expect(body).toContain("event: message_stop");
|
||||
});
|
||||
@@ -3739,6 +3747,77 @@ describe("resolveProviderVariant", () => {
|
||||
});
|
||||
|
||||
describe("qa mock openai server provider variant tagging", () => {
|
||||
it("pins provider-specific plans for parity scenarios", async () => {
  const sourcePrompt =
    "Read the seeded docs and source plan, then report grouped into Worked, Failed, Blocked, and Follow-up.";
  const handoffPrompt =
    "Delegate one bounded QA task to a subagent. Wait for the subagent to finish.";
  const fanoutPrompt =
    "Subagent fanout synthesis check: delegate two bounded subagents sequentially, then report both results together.";

  // Starts a new mock server for every request (as the scenarios are checked one
  // server per lane), sends a single non-streaming Responses request, and returns
  // the decoded arguments of the first tool call in the reply.
  const firstToolArgsFor = async (
    model: string,
    prompt: string,
    tools?: Array<typeof SESSIONS_SPAWN_TOOL>,
  ) => {
    const server = await startMockServer();
    const payload = await expectResponsesJson(server, {
      model,
      stream: false,
      // Only the fanout lanes advertise a tool list; omit the key otherwise.
      ...(tools ? { tools } : {}),
      input: [makeUserInput(prompt)],
    });
    return outputToolArgs(payload);
  };

  // Source discovery: each provider lane reads a different seeded file.
  expect(await firstToolArgsFor("openai/gpt-5.5", sourcePrompt)).toEqual({
    path: "repo/qa/scenarios/index.md",
  });
  expect(await firstToolArgsFor("anthropic/claude-opus-4-7", sourcePrompt)).toEqual({
    path: "repo/docs/help/testing.md",
  });

  // Subagent handoff: same label, provider-specific task text (bare model ids).
  expect(await firstToolArgsFor("gpt-5.5", handoffPrompt)).toMatchObject({
    label: "qa-sidecar",
    task: "Inspect the QA workspace and return one concise protocol note.",
  });
  expect(await firstToolArgsFor("claude-opus-4-7", handoffPrompt)).toMatchObject({
    label: "qa-sidecar",
    task: "Inspect the QA docs fixture and return one concise protocol note.",
  });

  // Subagent fanout: the first spawn (alpha worker) differs per provider.
  expect(
    await firstToolArgsFor("openai/gpt-5.5", fanoutPrompt, [SESSIONS_SPAWN_TOOL]),
  ).toMatchObject({
    label: "qa-fanout-alpha",
    task: "Fanout worker alpha: inspect the QA workspace and finish with exactly ALPHA-OK.",
  });
  expect(
    await firstToolArgsFor("anthropic/claude-opus-4-7", fanoutPrompt, [SESSIONS_SPAWN_TOOL]),
  ).toMatchObject({
    label: "qa-fanout-alpha",
    task: "Fanout worker alpha: inspect the QA docs fixture and finish with exactly ALPHA-OK.",
  });
});
|
||||
|
||||
it("records providerVariant on /debug/last-request for openai requests", async () => {
|
||||
const server = await startQaMockOpenAiServer({
|
||||
host: "127.0.0.1",
|
||||
|
||||
@@ -109,8 +109,8 @@ type MockOpenAiRequestSnapshot = {
|
||||
// This is a subset of the real Anthropic Messages API — just enough so the
|
||||
// QA suite can run its parity pack against a "baseline" Anthropic provider
|
||||
// without needing real API keys. The scenarios drive their dispatch through
|
||||
// the shared mock scenario logic (buildResponsesPayload), so whatever
|
||||
// behavior the OpenAI mock exposes is automatically mirrored on this route.
|
||||
// the shared mock scenario logic (buildResponsesPayload), with `model`
|
||||
// preserved so provider-aware branches can intentionally diverge.
|
||||
type AnthropicMessageContentBlock =
|
||||
| { type: "text"; text: string }
|
||||
| {
|
||||
@@ -180,6 +180,28 @@ type MockScenarioState = {
|
||||
subagentHandoffSpawned: boolean;
|
||||
};
|
||||
|
||||
/**
 * Chooses the file path used for the source-discovery `read` plan.
 *
 * @param providerVariant - Provider variant the request was resolved to.
 * @returns `repo/docs/help/testing.md` for the "anthropic" variant, and
 *   `repo/qa/scenarios/index.md` for every other variant.
 */
function sourceDiscoveryReadPathForProvider(providerVariant: MockOpenAiProviderVariant) {
  if (providerVariant === "anthropic") {
    return "repo/docs/help/testing.md";
  }
  return "repo/qa/scenarios/index.md";
}
|
||||
|
||||
/**
 * Chooses the task text for the subagent-handoff spawn plan.
 *
 * @param providerVariant - Provider variant the request was resolved to.
 * @returns The "QA docs fixture" wording for the "anthropic" variant, and the
 *   "QA workspace" wording for every other variant.
 */
function subagentHandoffTaskForProvider(providerVariant: MockOpenAiProviderVariant) {
  if (providerVariant === "anthropic") {
    return "Inspect the QA docs fixture and return one concise protocol note.";
  }
  return "Inspect the QA workspace and return one concise protocol note.";
}
|
||||
|
||||
/**
 * Builds the task text for one worker of the subagent-fanout spawn plan.
 *
 * @param providerVariant - Provider variant the request was resolved to; only
 *   "anthropic" switches the inspected scope to the QA docs fixture.
 * @param worker - Which fanout worker the task is for.
 * @returns A sentence naming the worker, the scope to inspect, and the exact
 *   completion marker (`ALPHA-OK` for "alpha", `BETA-OK` otherwise).
 */
function subagentFanoutTaskForProvider(
  providerVariant: MockOpenAiProviderVariant,
  worker: "alpha" | "beta",
) {
  let inspectTarget = "the QA workspace";
  if (providerVariant === "anthropic") {
    inspectTarget = "the QA docs fixture";
  }

  let completionMarker = "BETA-OK";
  if (worker === "alpha") {
    completionMarker = "ALPHA-OK";
  }

  return `Fanout worker ${worker}: inspect ${inspectTarget} and finish with exactly ${completionMarker}.`;
}
|
||||
|
||||
const MOCK_OPENAI_MAX_BODY_BYTES = 16 * 1024 * 1024;
|
||||
const MOCK_OPENAI_BODY_TIMEOUT_MS = 30_000;
|
||||
const MOCK_OPENAI_DEBUG_REQUEST_LIMIT = 200;
|
||||
@@ -1496,6 +1518,9 @@ async function buildResponsesPayload(
|
||||
body: Record<string, unknown>,
|
||||
scenarioState: MockScenarioState,
|
||||
) {
|
||||
const providerVariant = resolveProviderVariant(
|
||||
typeof body.model === "string" ? body.model : undefined,
|
||||
);
|
||||
const input = Array.isArray(body.input) ? (body.input as ResponsesInputItem[]) : [];
|
||||
const prompt = extractLastUserText(input);
|
||||
const toolOutput = extractToolOutput(input);
|
||||
@@ -2039,7 +2064,7 @@ async function buildResponsesPayload(
|
||||
if (!toolOutput && scenarioState.subagentFanoutPhase === 0) {
|
||||
scenarioState.subagentFanoutPhase = 1;
|
||||
return buildToolCallEventsWithArgs("sessions_spawn", {
|
||||
task: "Fanout worker alpha: inspect the QA workspace and finish with exactly ALPHA-OK.",
|
||||
task: subagentFanoutTaskForProvider(providerVariant, "alpha"),
|
||||
label: "qa-fanout-alpha",
|
||||
thread: false,
|
||||
});
|
||||
@@ -2047,7 +2072,7 @@ async function buildResponsesPayload(
|
||||
if (toolOutput && scenarioState.subagentFanoutPhase === 1) {
|
||||
scenarioState.subagentFanoutPhase = 2;
|
||||
return buildToolCallEventsWithArgs("sessions_spawn", {
|
||||
task: "Fanout worker beta: inspect the QA workspace and finish with exactly BETA-OK.",
|
||||
task: subagentFanoutTaskForProvider(providerVariant, "beta"),
|
||||
label: "qa-fanout-beta",
|
||||
thread: false,
|
||||
});
|
||||
@@ -2123,7 +2148,7 @@ async function buildResponsesPayload(
|
||||
) {
|
||||
scenarioState.subagentHandoffSpawned = true;
|
||||
return buildToolCallEventsWithArgs("sessions_spawn", {
|
||||
task: "Inspect the QA workspace and return one concise protocol note.",
|
||||
task: subagentHandoffTaskForProvider(providerVariant),
|
||||
label: "qa-sidecar",
|
||||
thread: false,
|
||||
});
|
||||
@@ -2132,7 +2157,9 @@ async function buildResponsesPayload(
|
||||
/(worked, failed, blocked|worked\/failed\/blocked|source and docs)/i.test(prompt) &&
|
||||
!toolOutput
|
||||
) {
|
||||
return buildToolCallEventsWithArgs("read", { path: "repo/qa/scenarios/index.md" });
|
||||
return buildToolCallEventsWithArgs("read", {
|
||||
path: sourceDiscoveryReadPathForProvider(providerVariant),
|
||||
});
|
||||
}
|
||||
if (!toolOutput && /\b(read|inspect|repo|docs|scenario|kickoff)\b/i.test(prompt)) {
|
||||
return buildToolCallEvents(prompt);
|
||||
@@ -2167,8 +2194,8 @@ async function buildResponsesPayload(
|
||||
// shapes into the shared ResponsesInputItem[] format, calls the same
|
||||
// buildResponsesPayload() dispatcher, and then re-serializes the resulting
|
||||
// events into an Anthropic response. This gives the parity harness a
|
||||
// baseline lane that exercises the same scenario logic without requiring
|
||||
// real Anthropic API keys.
|
||||
// baseline lane that exercises the same scenario logic and selected
|
||||
// provider-specific plans without requiring real Anthropic API keys.
|
||||
//
|
||||
// Scope: handles Anthropic Messages requests with text and tool_result
|
||||
// content blocks, supporting both non-streaming JSON responses and the
|
||||
|
||||
Reference in New Issue
Block a user