fix: stabilize qa lab mock suite
@@ -207,7 +207,7 @@ refs and write a judged Markdown report:

```bash
pnpm openclaw qa character-eval \
  --model openai-codex/gpt-5.5,thinking=xhigh \
  --model openai/gpt-5.4,thinking=medium,fast \
  --model openai/gpt-5.2,thinking=xhigh \
  --model openai/gpt-5,thinking=xhigh \
  --model anthropic/claude-opus-4-6,thinking=high \
@@ -215,7 +215,7 @@ pnpm openclaw qa character-eval \
  --model zai/glm-5.1,thinking=high \
  --model moonshot/kimi-k2.5,thinking=high \
  --model google/gemini-3.1-pro-preview,thinking=high \
  --judge-model openai-codex/gpt-5.5,thinking=xhigh,fast \
  --judge-model openai/gpt-5.4,thinking=xhigh,fast \
  --judge-model anthropic/claude-opus-4-6,thinking=high \
  --blind-judge-models \
  --concurrency 16 \
@@ -227,13 +227,13 @@ scenarios should set the persona through `SOUL.md`, then run ordinary user turns
such as chat, workspace help, and small file tasks. The candidate model should
not be told that it is being evaluated. The command preserves each full
transcript, records basic run stats, then asks the judge models in fast mode with
`xhigh` reasoning to rank the runs by naturalness, vibe, and humor.
`xhigh` reasoning where supported to rank the runs by naturalness, vibe, and humor.
Use `--blind-judge-models` when comparing providers: the judge prompt still gets
every transcript and run status, but candidate refs are replaced with neutral
labels such as `candidate-01`; the report maps rankings back to real refs after
parsing.
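For reference, a minimal sketch of the blinding step described above. The `candidate-01` label format comes from the text; the helper names and shapes below are assumptions, not the repo's actual implementation:

```ts
// Illustrative only: build neutral labels for candidate refs before judging,
// and keep a reverse map so parsed rankings can be translated back.
type BlindLabelMap = {
  toLabel: Map<string, string>; // "openai/gpt-5.4" -> "candidate-01"
  toRef: Map<string, string>;   // "candidate-01"   -> "openai/gpt-5.4"
};

function buildBlindLabels(candidateRefs: string[]): BlindLabelMap {
  const toLabel = new Map<string, string>();
  const toRef = new Map<string, string>();
  candidateRefs.forEach((ref, index) => {
    const label = `candidate-${String(index + 1).padStart(2, "0")}`;
    toLabel.set(ref, label);
    toRef.set(label, ref);
  });
  return { toLabel, toRef };
}

// After a judge replies, map the ranked labels back to real candidate refs.
function unblindRanking(ranking: string[], labels: BlindLabelMap): string[] {
  return ranking.map((label) => labels.toRef.get(label) ?? label);
}
```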
Candidate runs default to `high` thinking, with `xhigh` for OpenAI models that
support it. Override a specific candidate inline with
Candidate runs default to `high` thinking, with `medium` for GPT-5.4 and `xhigh`
for older OpenAI eval refs that support it. Override a specific candidate inline with
`--model provider/model,thinking=<level>`. `--thinking <level>` still sets a
global fallback, and the older `--model-thinking <provider/model=level>` form is
kept for compatibility.
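A rough sketch of how an inline candidate spec such as `openai/gpt-5.4,thinking=medium,fast` could be split into a model ref plus options. The flag syntax is taken from the docs above; this parser is only an approximation of what the CLI actually does:

```ts
// Illustrative parser for "--model provider/model[,thinking=<level>][,fast]".
type CandidateSpec = { ref: string; thinking?: string; fast?: boolean };

function parseCandidateSpec(raw: string): CandidateSpec {
  const [ref, ...options] = raw.split(",").map((part) => part.trim());
  const spec: CandidateSpec = { ref };
  for (const option of options) {
    if (option === "fast") {
      spec.fast = true;
    } else if (option.startsWith("thinking=")) {
      spec.thinking = option.slice("thinking=".length);
    }
  }
  return spec;
}

// parseCandidateSpec("openai/gpt-5.4,thinking=medium,fast")
//   -> { ref: "openai/gpt-5.4", thinking: "medium", fast: true }
```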
@@ -247,12 +247,12 @@ Candidate and judge model runs both default to concurrency 16. Lower
`--concurrency` or `--judge-concurrency` when provider limits or local gateway
pressure make a run too noisy.
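Concurrency here is a bounded worker pool over the candidate and judge runs; a generic sketch of that pattern (not the repo's code) looks like this:

```ts
// Generic bounded-concurrency map that preserves result order.
async function mapWithConcurrency<T, R>(
  items: T[],
  limit: number,
  task: (item: T, index: number) => Promise<R>,
): Promise<R[]> {
  const results = new Array<R>(items.length);
  let nextIndex = 0;

  async function worker(): Promise<void> {
    while (nextIndex < items.length) {
      const index = nextIndex;
      nextIndex += 1;
      results[index] = await task(items[index], index);
    }
  }

  const workerCount = Math.max(1, Math.min(limit, items.length));
  await Promise.all(Array.from({ length: workerCount }, () => worker()));
  return results;
}
```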
When no candidate `--model` is passed, the character eval defaults to
`openai-codex/gpt-5.5`, `openai/gpt-5.4`, `openai/gpt-5.2`, `anthropic/claude-opus-4-6`,
`openai/gpt-5.4`, `openai/gpt-5.2`, `openai/gpt-5`, `anthropic/claude-opus-4-6`,
`anthropic/claude-sonnet-4-6`, `zai/glm-5.1`,
`moonshot/kimi-k2.5`, and
`google/gemini-3.1-pro-preview` when no `--model` is passed.
When no `--judge-model` is passed, the judges default to
`openai-codex/gpt-5.5,thinking=xhigh,fast` and
`openai/gpt-5.4,thinking=xhigh,fast` and
`anthropic/claude-opus-4-6,thinking=high`.

## Related docs

@@ -680,7 +680,7 @@ Docker notes:
`agent` method:
- load the bundled `codex` plugin
- select `OPENCLAW_AGENT_RUNTIME=codex`
- send a first gateway agent turn to `openai/gpt-5.5` with the Codex harness forced
- send a first gateway agent turn to `openai/gpt-5.4` with the Codex harness forced
- send a second turn to the same OpenClaw session and verify the app-server
  thread can resume
- run `/codex status` and `/codex models` through the same gateway command
@@ -690,7 +690,7 @@ Docker notes:
  denied so the agent asks back
- Test: `src/gateway/gateway-codex-harness.live.test.ts`
- Enable: `OPENCLAW_LIVE_CODEX_HARNESS=1`
- Default model: `openai/gpt-5.5`
- Default model: `openai/gpt-5.4`
- Optional image probe: `OPENCLAW_LIVE_CODEX_HARNESS_IMAGE_PROBE=1`
- Optional MCP/tool probe: `OPENCLAW_LIVE_CODEX_HARNESS_MCP_PROBE=1`
- Optional Guardian probe: `OPENCLAW_LIVE_CODEX_HARNESS_GUARDIAN_PROBE=1`
@@ -708,7 +708,7 @@ OPENCLAW_LIVE_CODEX_HARNESS=1 \
OPENCLAW_LIVE_CODEX_HARNESS_IMAGE_PROBE=1 \
OPENCLAW_LIVE_CODEX_HARNESS_MCP_PROBE=1 \
OPENCLAW_LIVE_CODEX_HARNESS_GUARDIAN_PROBE=1 \
OPENCLAW_LIVE_CODEX_HARNESS_MODEL=openai/gpt-5.5 \
OPENCLAW_LIVE_CODEX_HARNESS_MODEL=openai/gpt-5.4 \
pnpm test:live -- src/gateway/gateway-codex-harness.live.test.ts
```

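The live harness suite is opt-in through environment variables; a hedged sketch of that gating pattern, using the env var names listed above (the real guard in `gateway-codex-harness.live.test.ts` may be structured differently):

```ts
import { describe, it } from "vitest";

// Illustrative gating for an opt-in live suite; the defaults below mirror the docs.
const liveHarnessEnabled = process.env.OPENCLAW_LIVE_CODEX_HARNESS === "1";
const liveHarnessModel =
  process.env.OPENCLAW_LIVE_CODEX_HARNESS_MODEL ?? "openai/gpt-5.4";

describe.skipIf(!liveHarnessEnabled)("codex harness live", () => {
  it("runs a first agent turn on the default model", async () => {
    // drive the gateway with liveHarnessModel here (omitted in this sketch)
  });
});
```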
@@ -125,7 +125,7 @@ describe("runQaCharacterEval", () => {
      expect.objectContaining({
        judgeModel: "openai/gpt-5.4",
        judgeThinkingDefault: "xhigh",
        judgeFastMode: false,
        judgeFastMode: true,
        timeoutMs: 300_000,
      }),
    );
@@ -223,7 +223,7 @@ describe("runQaCharacterEval", () => {
|
||||
|
||||
expect(runSuite).toHaveBeenCalledTimes(8);
|
||||
expect(runSuite.mock.calls.map(([params]) => params.primaryModel)).toEqual([
|
||||
"openai/gpt-5.5",
|
||||
"openai/gpt-5.4",
|
||||
"openai/gpt-5.2",
|
||||
"openai/gpt-5",
|
||||
"anthropic/claude-opus-4-6",
|
||||
@@ -233,7 +233,7 @@ describe("runQaCharacterEval", () => {
|
||||
"google/gemini-3.1-pro-preview",
|
||||
]);
|
||||
expect(runSuite.mock.calls.map(([params]) => params.thinkingDefault)).toEqual([
|
||||
"xhigh",
|
||||
"medium",
|
||||
"xhigh",
|
||||
"xhigh",
|
||||
"high",
|
||||
@@ -254,14 +254,14 @@ describe("runQaCharacterEval", () => {
|
||||
]);
|
||||
expect(runJudge).toHaveBeenCalledTimes(2);
|
||||
expect(runJudge.mock.calls.map(([params]) => params.judgeModel)).toEqual([
|
||||
"openai/gpt-5.5",
|
||||
"openai/gpt-5.4",
|
||||
"anthropic/claude-opus-4-6",
|
||||
]);
|
||||
expect(runJudge.mock.calls.map(([params]) => params.judgeThinkingDefault)).toEqual([
|
||||
"xhigh",
|
||||
"high",
|
||||
]);
|
||||
expect(runJudge.mock.calls.map(([params]) => params.judgeFastMode)).toEqual([false, false]);
|
||||
expect(runJudge.mock.calls.map(([params]) => params.judgeFastMode)).toEqual([true, false]);
|
||||
});
|
||||
|
||||
it("runs candidate models with bounded concurrency while preserving result order", async () => {
|
||||
|
||||
@@ -189,6 +189,7 @@ describe("qa cli runtime", () => {
|
||||
primaryModel: "openai/gpt-5.4",
|
||||
alternateModel: "anthropic/claude-sonnet-4-6",
|
||||
fastMode: true,
|
||||
thinking: "medium",
|
||||
scenarioIds: ["approval-turn-tool-followthrough"],
|
||||
});
|
||||
|
||||
@@ -200,6 +201,7 @@ describe("qa cli runtime", () => {
|
||||
primaryModel: "openai/gpt-5.4",
|
||||
alternateModel: "anthropic/claude-sonnet-4-6",
|
||||
fastMode: true,
|
||||
thinkingDefault: "medium",
|
||||
scenarioIds: ["approval-turn-tool-followthrough"],
|
||||
});
|
||||
});
|
||||
@@ -1135,8 +1137,8 @@ describe("qa cli runtime", () => {
|
||||
repoRoot: path.resolve("/tmp/openclaw-repo"),
|
||||
transportId: "qa-channel",
|
||||
providerMode: "live-frontier",
|
||||
primaryModel: "openai/gpt-5.5",
|
||||
alternateModel: "openai/gpt-5.5",
|
||||
primaryModel: "openai/gpt-5.4",
|
||||
alternateModel: "openai/gpt-5.4",
|
||||
fastMode: undefined,
|
||||
message: "read qa kickoff and reply short",
|
||||
timeoutMs: undefined,
|
||||
@@ -1166,7 +1168,7 @@ describe("qa cli runtime", () => {
|
||||
it("defaults manual frontier runs onto Codex OAuth when the runtime resolver prefers it", async () => {
|
||||
defaultQaRuntimeModelForMode.mockImplementation((mode, options) =>
|
||||
mode === "live-frontier"
|
||||
? "openai/gpt-5.5"
|
||||
? "openai/gpt-5.4"
|
||||
: defaultQaProviderModelForMode(mode as QaProviderModeInput, options),
|
||||
);
|
||||
|
||||
@@ -1179,8 +1181,8 @@ describe("qa cli runtime", () => {
|
||||
repoRoot: path.resolve("/tmp/openclaw-repo"),
|
||||
transportId: "qa-channel",
|
||||
providerMode: "live-frontier",
|
||||
primaryModel: "openai/gpt-5.5",
|
||||
alternateModel: "openai/gpt-5.5",
|
||||
primaryModel: "openai/gpt-5.4",
|
||||
alternateModel: "openai/gpt-5.4",
|
||||
fastMode: undefined,
|
||||
message: "read qa kickoff and reply short",
|
||||
timeoutMs: undefined,
|
||||
|
||||
@@ -450,6 +450,7 @@ export async function runQaSuiteCommand(opts: {
|
||||
primaryModel?: string;
|
||||
alternateModel?: string;
|
||||
fastMode?: boolean;
|
||||
thinking?: string;
|
||||
cliAuthMode?: string;
|
||||
parityPack?: string;
|
||||
scenarioIds?: string[];
|
||||
@@ -490,6 +491,7 @@ export async function runQaSuiteCommand(opts: {
|
||||
throw new Error("--cli-auth-mode requires --runner host.");
|
||||
}
|
||||
if (runner === "multipass") {
|
||||
const thinkingDefault = parseQaThinkingLevel("--thinking", opts.thinking);
|
||||
const result = await runQaMultipass({
|
||||
repoRoot,
|
||||
outputDir: resolveRepoRelativeOutputDir(repoRoot, opts.outputDir),
|
||||
@@ -498,6 +500,7 @@ export async function runQaSuiteCommand(opts: {
|
||||
primaryModel: opts.primaryModel,
|
||||
alternateModel: opts.alternateModel,
|
||||
fastMode: opts.fastMode,
|
||||
...(thinkingDefault ? { thinkingDefault } : {}),
|
||||
allowFailures: true,
|
||||
scenarioIds,
|
||||
...(opts.concurrency !== undefined
|
||||
@@ -532,6 +535,7 @@ export async function runQaSuiteCommand(opts: {
|
||||
});
|
||||
return;
|
||||
}
|
||||
const thinkingDefault = parseQaThinkingLevel("--thinking", opts.thinking);
|
||||
const result = await runQaSuiteFromRuntimeWithInfraRetry({
|
||||
repoRoot,
|
||||
outputDir: resolveRepoRelativeOutputDir(repoRoot, opts.outputDir),
|
||||
@@ -540,6 +544,7 @@ export async function runQaSuiteCommand(opts: {
|
||||
primaryModel: opts.primaryModel,
|
||||
alternateModel: opts.alternateModel,
|
||||
fastMode: opts.fastMode,
|
||||
...(thinkingDefault ? { thinkingDefault } : {}),
|
||||
...(claudeCliAuthMode ? { claudeCliAuthMode } : {}),
|
||||
scenarioIds,
|
||||
...(opts.concurrency !== undefined
|
||||
|
||||
@@ -35,6 +35,7 @@ async function runQaSuite(opts: {
|
||||
primaryModel?: string;
|
||||
alternateModel?: string;
|
||||
fastMode?: boolean;
|
||||
thinking?: string;
|
||||
allowFailures?: boolean;
|
||||
cliAuthMode?: string;
|
||||
parityPack?: string;
|
||||
@@ -247,6 +248,10 @@ export function registerQaLabCli(program: Command) {
      false,
    )
    .option("--fast", "Enable provider fast mode where supported", false)
    .option(
      "--thinking <level>",
      "Suite thinking default: off|minimal|low|medium|high|xhigh|adaptive|max",
    )
    .option("--image <alias>", "Multipass image alias")
    .option("--cpus <count>", "Multipass vCPU count", (value: string) => Number(value))
    .option("--memory <size>", "Multipass memory size")
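The `--thinking` value is validated against the level list shown in the option help; a minimal sketch of that validation, assuming exactly those levels (the repo's `parseQaThinkingLevel` may behave differently):

```ts
// Illustrative validation for the "--thinking <level>" option.
const QA_THINKING_LEVELS = [
  "off", "minimal", "low", "medium", "high", "xhigh", "adaptive", "max",
] as const;
type ThinkingLevel = (typeof QA_THINKING_LEVELS)[number];

function parseThinkingLevel(flag: string, value?: string): ThinkingLevel | undefined {
  if (value === undefined) {
    return undefined; // flag not passed; callers fall back to per-model defaults
  }
  const normalized = value.trim().toLowerCase();
  if ((QA_THINKING_LEVELS as readonly string[]).includes(normalized)) {
    return normalized as ThinkingLevel;
  }
  throw new Error(`${flag} expects one of: ${QA_THINKING_LEVELS.join("|")}`);
}
```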
@@ -266,6 +271,7 @@ export function registerQaLabCli(program: Command) {
|
||||
concurrency?: number;
|
||||
allowFailures?: boolean;
|
||||
fast?: boolean;
|
||||
thinking?: string;
|
||||
image?: string;
|
||||
cpus?: number;
|
||||
memory?: string;
|
||||
@@ -281,6 +287,7 @@ export function registerQaLabCli(program: Command) {
|
||||
primaryModel: opts.model,
|
||||
alternateModel: opts.altModel,
|
||||
fastMode: opts.fast,
|
||||
thinking: opts.thinking,
|
||||
cliAuthMode: opts.cliAuthMode,
|
||||
parityPack: opts.parityPack,
|
||||
scenarioIds: opts.scenario,
|
||||
|
||||
@@ -2,7 +2,7 @@ import { describe, expect, it } from "vitest";
import { selectQaRunnerModelOptions } from "./model-catalog.runtime.js";

describe("qa runner model catalog", () => {
  it("filters to available rows and prefers gpt-5.5 first", () => {
  it("filters to available rows and prefers gpt-5.4 first", () => {
    expect(
      selectQaRunnerModelOptions([
        {
@@ -13,8 +13,8 @@ describe("qa runner model catalog", () => {
          missing: false,
        },
        {
          key: "openai/gpt-5.5",
          name: "gpt-5.5",
          key: "openai/gpt-5.4",
          name: "gpt-5.4",
          input: "text,image",
          available: true,
          missing: false,
@@ -27,6 +27,6 @@ describe("qa runner model catalog", () => {
          missing: false,
        },
      ]).map((entry) => entry.key),
    ).toEqual(["openai/gpt-5.5", "anthropic/claude-sonnet-4-6"]);
    ).toEqual(["openai/gpt-5.4", "anthropic/claude-sonnet-4-6"]);
  });
});

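Based on the expectations in this test, one plausible shape of the catalog filter is: drop unavailable or missing rows, then sort the preferred catalog model first. This sketch is inferred from the test, not copied from `model-catalog.runtime.ts`:

```ts
// Inferred sketch of the filtering this test exercises.
type CatalogRow = {
  key: string;
  name: string;
  input: string;
  available: boolean;
  missing: boolean;
};

// Matches QA_FRONTIER_CATALOG_PRIMARY_MODEL elsewhere in this diff.
const PREFERRED_KEY = "openai/gpt-5.4";

function selectRunnerModelOptionsSketch(rows: CatalogRow[]): CatalogRow[] {
  return rows
    .filter((row) => row.available && !row.missing)
    .sort((a, b) => Number(b.key === PREFERRED_KEY) - Number(a.key === PREFERRED_KEY));
}
```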
@@ -34,7 +34,7 @@ describe("qa model selection runtime", () => {
|
||||
resolveEnvApiKey.mockReturnValue({ apiKey: "sk-test" });
|
||||
|
||||
expect(resolveQaPreferredLiveModel()).toBeUndefined();
|
||||
expect(defaultQaRuntimeModelForMode("live-frontier")).toBe("openai/gpt-5.5");
|
||||
expect(defaultQaRuntimeModelForMode("live-frontier")).toBe("openai/gpt-5.4");
|
||||
expect(loadAuthProfileStoreForRuntime).not.toHaveBeenCalled();
|
||||
});
|
||||
|
||||
@@ -43,8 +43,8 @@ describe("qa model selection runtime", () => {
|
||||
provider === "openai-codex" ? ["openai-codex:user@example.com"] : [],
|
||||
);
|
||||
|
||||
expect(resolveQaPreferredLiveModel()).toBe("openai/gpt-5.5");
|
||||
expect(defaultQaRuntimeModelForMode("live-frontier")).toBe("openai/gpt-5.5");
|
||||
expect(resolveQaPreferredLiveModel()).toBe("openai/gpt-5.4");
|
||||
expect(defaultQaRuntimeModelForMode("live-frontier")).toBe("openai/gpt-5.4");
|
||||
});
|
||||
|
||||
it("keeps the OpenAI live default when stored OpenAI profiles are available", () => {
|
||||
@@ -53,7 +53,7 @@ describe("qa model selection runtime", () => {
|
||||
);
|
||||
|
||||
expect(resolveQaPreferredLiveModel()).toBeUndefined();
|
||||
expect(defaultQaRuntimeModelForMode("live-frontier")).toBe("openai/gpt-5.5");
|
||||
expect(defaultQaRuntimeModelForMode("live-frontier")).toBe("openai/gpt-5.4");
|
||||
});
|
||||
|
||||
it("leaves mock defaults unchanged", () => {
|
||||
|
||||
@@ -71,6 +71,7 @@ export type QaMultipassPlan = {
|
||||
primaryModel?: string;
|
||||
alternateModel?: string;
|
||||
fastMode?: boolean;
|
||||
thinkingDefault?: string;
|
||||
scenarioIds: string[];
|
||||
forwardedEnv: Record<string, string>;
|
||||
hostCodexHomePath?: string;
|
||||
@@ -237,6 +238,7 @@ export function createQaMultipassPlan(params: {
|
||||
primaryModel?: string;
|
||||
alternateModel?: string;
|
||||
fastMode?: boolean;
|
||||
thinkingDefault?: string;
|
||||
allowFailures?: boolean;
|
||||
scenarioIds?: string[];
|
||||
concurrency?: number;
|
||||
@@ -276,6 +278,7 @@ export function createQaMultipassPlan(params: {
|
||||
...(params.primaryModel ? ["--model", params.primaryModel] : []),
|
||||
...(params.alternateModel ? ["--alt-model", params.alternateModel] : []),
|
||||
...(params.fastMode ? ["--fast"] : []),
|
||||
...(params.thinkingDefault ? ["--thinking", params.thinkingDefault] : []),
|
||||
...(params.allowFailures ? ["--allow-failures"] : []),
|
||||
...(params.concurrency ? ["--concurrency", String(params.concurrency)] : []),
|
||||
],
|
||||
@@ -301,6 +304,7 @@ export function createQaMultipassPlan(params: {
|
||||
primaryModel: params.primaryModel,
|
||||
alternateModel: params.alternateModel,
|
||||
fastMode: params.fastMode,
|
||||
thinkingDefault: params.thinkingDefault,
|
||||
scenarioIds,
|
||||
forwardedEnv,
|
||||
hostCodexHomePath,
|
||||
|
||||
@@ -1,5 +1,5 @@
export const QA_FRONTIER_PROVIDER_IDS = ["anthropic", "google", "openai"] as const;
export const QA_FRONTIER_CATALOG_PRIMARY_MODEL = "openai/gpt-5.5";
export const QA_FRONTIER_CATALOG_PRIMARY_MODEL = "openai/gpt-5.4";
export const QA_FRONTIER_CATALOG_ALTERNATE_MODEL = "anthropic/claude-sonnet-4-6";

export function isPreferredQaLiveFrontierCatalogModel(modelRef: string) {
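The run-config tests elsewhere in this diff expect the default live-frontier selection to use the same catalog primary for both model slots with fast mode on; a hedged sketch of how these constants could feed that default selection (assumed glue code, not the repo's):

```ts
// Illustrative default selection built on the catalog constants above.
type QaRunSelection = {
  providerMode: "live-frontier";
  primaryModel: string;
  alternateModel: string;
  fastMode: boolean;
  scenarioIds: string[];
};

const CATALOG_PRIMARY = "openai/gpt-5.4"; // QA_FRONTIER_CATALOG_PRIMARY_MODEL

function defaultRunSelectionSketch(
  scenarioIds: string[],
  preferredLiveModel?: string,
): QaRunSelection {
  const model = preferredLiveModel ?? CATALOG_PRIMARY;
  return {
    providerMode: "live-frontier",
    primaryModel: model,
    alternateModel: model, // the run-config tests expect primary and alternate to match
    fastMode: true,
    scenarioIds,
  };
}
```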
@@ -6,7 +6,7 @@ type QaFrontierCharacterModelOptions = {
};

export const QA_FRONTIER_CHARACTER_EVAL_MODELS = Object.freeze([
  "openai/gpt-5.5",
  "openai/gpt-5.4",
  "openai/gpt-5.2",
  "openai/gpt-5",
  "anthropic/claude-opus-4-6",
@@ -18,19 +18,19 @@ export const QA_FRONTIER_CHARACTER_EVAL_MODELS = Object.freeze([

export const QA_FRONTIER_CHARACTER_THINKING_BY_MODEL: Readonly<Record<string, QaThinkingLevel>> =
  Object.freeze({
    "openai/gpt-5.5": "xhigh",
    "openai/gpt-5.4": "medium",
    "openai/gpt-5.2": "xhigh",
    "openai/gpt-5": "xhigh",
  });

export const QA_FRONTIER_CHARACTER_JUDGE_MODELS = Object.freeze([
  "openai/gpt-5.5",
  "openai/gpt-5.4",
  "anthropic/claude-opus-4-6",
]);

export const QA_FRONTIER_CHARACTER_JUDGE_MODEL_OPTIONS: Readonly<
  Record<string, QaFrontierCharacterModelOptions>
> = Object.freeze({
  "openai/gpt-5.5": { thinkingDefault: "xhigh" },
  "openai/gpt-5.4": { thinkingDefault: "xhigh", fastMode: true },
  "anthropic/claude-opus-4-6": { thinkingDefault: "high" },
});

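These maps back the documented defaults (`high` overall, `medium` for GPT-5.4, `xhigh` for the older OpenAI refs). A small sketch of how a per-candidate default might be resolved from them, with an assumed helper name rather than the repo's actual resolution code:

```ts
// Illustrative lookup over the per-model map with the documented "high" fallback.
type ThinkingLevel =
  | "off" | "minimal" | "low" | "medium" | "high" | "xhigh" | "adaptive" | "max";

const THINKING_BY_MODEL: Readonly<Record<string, ThinkingLevel>> = {
  "openai/gpt-5.4": "medium",
  "openai/gpt-5.2": "xhigh",
  "openai/gpt-5": "xhigh",
};

function thinkingDefaultFor(modelRef: string, inlineOverride?: ThinkingLevel): ThinkingLevel {
  // An inline "--model ref,thinking=<level>" override wins over the per-model map.
  return inlineOverride ?? THINKING_BY_MODEL[modelRef] ?? "high";
}

// thinkingDefaultFor("openai/gpt-5.4")            -> "medium"
// thinkingDefaultFor("anthropic/claude-opus-4-6") -> "high"
```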
@@ -23,7 +23,7 @@ function isClaudeOpusModel(modelRef: string) {
export const liveFrontierProviderDefinition: QaProviderDefinition = {
  mode: "live-frontier",
  kind: "live",
  defaultModel: (options) => options?.preferredLiveModel ?? "openai/gpt-5.5",
  defaultModel: (options) => options?.preferredLiveModel ?? "openai/gpt-5.4",
  defaultImageGenerationProviderIds: ["openai"],
  defaultImageGenerationModel: ({ modelProviderIds }) =>
    modelProviderIds.includes("openai") ? "openai/gpt-image-1" : null,

@@ -4,7 +4,7 @@ import {
} from "openclaw/plugin-sdk/agent-runtime";
import { resolveEnvApiKey } from "openclaw/plugin-sdk/provider-auth";

const QA_CODEX_OAUTH_LIVE_MODEL = "openai/gpt-5.5";
const QA_CODEX_OAUTH_LIVE_MODEL = "openai/gpt-5.4";

export function resolveQaLiveFrontierPreferredModel() {
  if (resolveEnvApiKey("openai")?.apiKey) {
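The hunk above is cut off mid-function. Based on the model-selection tests earlier in this diff (an env OpenAI key leaves the preference unset, a stored openai-codex auth profile prefers the Codex OAuth model), one plausible continuation is the following, inferred rather than taken from the actual source:

```ts
// Inferred sketch of the resolver; the real function may differ.
declare function resolveEnvApiKey(provider: string): { apiKey?: string } | undefined;
declare function listCodexOAuthProfiles(): string[]; // hypothetical profile lookup

const QA_CODEX_OAUTH_LIVE_MODEL = "openai/gpt-5.4";

export function resolveQaLiveFrontierPreferredModelSketch(): string | undefined {
  if (resolveEnvApiKey("openai")?.apiKey) {
    // An env OpenAI key keeps the standard live default (openai/gpt-5.4).
    return undefined;
  }
  if (listCodexOAuthProfiles().length > 0) {
    return QA_CODEX_OAUTH_LIVE_MODEL;
  }
  return undefined;
}
```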
@@ -1,2 +1,2 @@
|
||||
export const QA_FRONTIER_PARITY_CANDIDATE_LABEL = "openai/gpt-5.5";
|
||||
export const QA_FRONTIER_PARITY_CANDIDATE_LABEL = "openai/gpt-5.4";
|
||||
export const QA_FRONTIER_PARITY_BASELINE_LABEL = "anthropic/claude-opus-4-6";
|
||||
|
||||
@@ -151,6 +151,10 @@ const QA_REASONING_ONLY_RETRY_NEEDLE =
|
||||
"recorded reasoning but did not produce a user-visible answer";
|
||||
const QA_EMPTY_RESPONSE_RETRY_NEEDLE =
|
||||
"The previous attempt did not produce a user-visible answer.";
|
||||
const QA_SKILL_WORKSHOP_GIF_PROMPT_RE =
|
||||
/externally sourced animated GIF asset|animated GIF asset in a product UI/i;
|
||||
const QA_SKILL_WORKSHOP_REVIEW_PROMPT_RE = /Review transcript for durable skill updates/i;
|
||||
const QA_RELEASE_AUDIT_PROMPT_RE = /release readiness audit for the small project/i;
|
||||
|
||||
type MockScenarioState = {
|
||||
subagentFanoutPhase: number;
|
||||
@@ -727,6 +731,16 @@ function buildAssistantText(
|
||||
if (/(image generation check|capability flip image check)/i.test(prompt) && mediaPath) {
|
||||
return `Protocol note: generated the QA lighthouse image successfully.\nMEDIA:${mediaPath}`;
|
||||
}
|
||||
if (QA_SKILL_WORKSHOP_GIF_PROMPT_RE.test(prompt) && toolOutput) {
|
||||
return [
|
||||
"Animated GIF QA checklist ready.",
|
||||
"- Confirm true animation, not a static preview.",
|
||||
"- Verify dimensions and product UI fit.",
|
||||
"- Record attribution and license.",
|
||||
"- Keep a local copy before using the asset.",
|
||||
"- Re-open the copied file for final verification.",
|
||||
].join("\n");
|
||||
}
|
||||
if (/roundtrip image inspection check/i.test(prompt) && imageInputCount > 0) {
|
||||
return "Protocol note: the generated attachment shows the same QA lighthouse scene from the previous step.";
|
||||
}
|
||||
@@ -808,6 +822,79 @@ function buildToolCallEvents(prompt: string): StreamEvent[] {
|
||||
return buildToolCallEventsWithArgs("read", { path: targetPath });
|
||||
}
|
||||
|
||||
function buildReleaseAuditJson() {
|
||||
return `${JSON.stringify(
|
||||
{
|
||||
verified: true,
|
||||
findings: [
|
||||
{
|
||||
id: "REL-GATEWAY-417",
|
||||
source: "src/gateway/reconnect.ts",
|
||||
status: "retry jitter verified, resume token fallback still needs manual spot check",
|
||||
},
|
||||
{
|
||||
id: "REL-CHANNEL-238",
|
||||
source: "src/channels/delivery.ts",
|
||||
status: "thread replies preserve ordering, root-channel fallback needs handoff note",
|
||||
},
|
||||
{
|
||||
id: "REL-CRON-904",
|
||||
source: "src/scheduling/cron.ts",
|
||||
status: "single-run lock verified for restart wakeups",
|
||||
},
|
||||
{
|
||||
id: "REL-MEMORY-552",
|
||||
source: "src/memory/recall.ts",
|
||||
status:
|
||||
"fallback summary survives empty memory search; ranking sample needs second reviewer",
|
||||
},
|
||||
{
|
||||
id: "REL-PLUGIN-319",
|
||||
source: "src/plugins/runtime.ts",
|
||||
status: "bundled runtime manifest loads cleanly after restart",
|
||||
},
|
||||
{
|
||||
id: "REL-INSTALL-846",
|
||||
source: "install/update.ts",
|
||||
status: "update smoke passed from previous stable tag",
|
||||
},
|
||||
{
|
||||
id: "REL-DOCS-611",
|
||||
source: "docs/operator-notes.md",
|
||||
status:
|
||||
"docs mention reconnect, cron, memory, plugin, and installer checks; channel ordering and UI notes need maintainer handoff",
|
||||
},
|
||||
{
|
||||
id: "REL-UI-BLOCKED",
|
||||
source: "ui/control-panel.ts",
|
||||
status: "blocked: source file was referenced by checklist but missing from the fixture",
|
||||
},
|
||||
],
|
||||
},
|
||||
null,
|
||||
2,
|
||||
)}\n`;
|
||||
}
|
||||
|
||||
function buildReleaseHandoffMarkdown() {
|
||||
return [
|
||||
"# Release Handoff",
|
||||
"",
|
||||
"Ready:",
|
||||
"- REL-GATEWAY-417: gateway reconnect handling checked in `src/gateway/reconnect.ts`.",
|
||||
"- REL-CRON-904: cron duplicate prevention checked in `src/scheduling/cron.ts`.",
|
||||
"- REL-PLUGIN-319: plugin runtime loading checked in `src/plugins/runtime.ts`.",
|
||||
"- REL-INSTALL-846: installer update path checked in `install/update.ts`.",
|
||||
"",
|
||||
"Follow-up:",
|
||||
"- REL-CHANNEL-238: channel delivery ordering needs maintainer handoff.",
|
||||
"- REL-MEMORY-552: memory recall fallback ranking sample needs a second reviewer.",
|
||||
"- REL-DOCS-611: docs update status needs channel ordering and UI notes.",
|
||||
"- `ui/control-panel.ts` is blocked/not found in the fixture.",
|
||||
"",
|
||||
].join("\n");
|
||||
}
|
||||
|
||||
function extractPlannedToolName(events: StreamEvent[]) {
|
||||
for (const event of events) {
|
||||
if (event.type !== "response.output_item.done") {
|
||||
@@ -1128,6 +1215,63 @@ async function buildResponsesPayload(
|
||||
},
|
||||
]);
|
||||
}
|
||||
if (QA_SKILL_WORKSHOP_REVIEW_PROMPT_RE.test(allInputText)) {
|
||||
return buildAssistantEvents(
|
||||
JSON.stringify({
|
||||
action: "create",
|
||||
skillName: "animated-gif-workflow",
|
||||
title: "Animated GIF Workflow",
|
||||
reason: "Transcript captured a reusable animated media QA checklist.",
|
||||
description: "Reusable workflow notes for animated GIF QA tasks.",
|
||||
body: [
|
||||
"- Confirm the asset has true animation, not a static preview.",
|
||||
"- Check dimensions against the target product UI slot.",
|
||||
"- Record attribution and license before using the file.",
|
||||
"- Keep a local copy under the workspace before integration.",
|
||||
"- Re-open the local copy for final verification.",
|
||||
].join("\n"),
|
||||
}),
|
||||
);
|
||||
}
|
||||
if (QA_SKILL_WORKSHOP_GIF_PROMPT_RE.test(prompt) && !toolOutput) {
|
||||
return buildToolCallEventsWithArgs("write", {
|
||||
path: "animated-gif-qa-checklist.md",
|
||||
content: [
|
||||
"# Animated GIF QA Checklist",
|
||||
"",
|
||||
"- Confirm true animation.",
|
||||
"- Verify dimensions.",
|
||||
"- Record attribution.",
|
||||
"- Keep a local copy.",
|
||||
"- Perform final verification.",
|
||||
].join("\n"),
|
||||
});
|
||||
}
|
||||
if (QA_RELEASE_AUDIT_PROMPT_RE.test(prompt)) {
|
||||
if (!toolOutput) {
|
||||
return buildToolCallEventsWithArgs("read", { path: "audit-fixture/README.md" });
|
||||
}
|
||||
if (/Release readiness task|current checklist/i.test(toolOutput)) {
|
||||
return buildToolCallEventsWithArgs("read", {
|
||||
path: "audit-fixture/docs/current-readiness-checklist.md",
|
||||
});
|
||||
}
|
||||
if (/Current release readiness requires checking eight areas/i.test(toolOutput)) {
|
||||
return buildToolCallEventsWithArgs("write", {
|
||||
path: "audit-fixture/release-audit.json",
|
||||
content: buildReleaseAuditJson(),
|
||||
});
|
||||
}
|
||||
if (/release-audit\.json/i.test(toolOutput)) {
|
||||
return buildToolCallEventsWithArgs("write", {
|
||||
path: "audit-fixture/release-handoff.md",
|
||||
content: buildReleaseHandoffMarkdown(),
|
||||
});
|
||||
}
|
||||
if (/release-handoff\.md/i.test(toolOutput)) {
|
||||
return buildAssistantEvents("RELEASE-AUDIT-COMPLETE");
|
||||
}
|
||||
}
|
||||
if (/lobster invaders/i.test(prompt)) {
|
||||
if (!toolOutput) {
|
||||
return buildToolCallEventsWithArgs("read", { path: "QA_KICKOFF_TASK.md" });
|
||||
|
||||
@@ -45,8 +45,8 @@ describe("qa run config", () => {
|
||||
it("creates a live-by-default selection that arms every scenario", () => {
|
||||
expect(createDefaultQaRunSelection(scenarios)).toEqual({
|
||||
providerMode: "live-frontier",
|
||||
primaryModel: "openai/gpt-5.5",
|
||||
alternateModel: "openai/gpt-5.5",
|
||||
primaryModel: "openai/gpt-5.4",
|
||||
alternateModel: "openai/gpt-5.4",
|
||||
fastMode: true,
|
||||
scenarioIds: ["dm-chat-baseline", "thread-lifecycle"],
|
||||
});
|
||||
@@ -57,7 +57,7 @@ describe("qa run config", () => {
|
||||
normalizeQaRunSelection(
|
||||
{
|
||||
providerMode: "live-frontier",
|
||||
primaryModel: "openai/gpt-5.5",
|
||||
primaryModel: "openai/gpt-5.4",
|
||||
alternateModel: "",
|
||||
fastMode: false,
|
||||
scenarioIds: ["thread-lifecycle", "missing", "thread-lifecycle"],
|
||||
@@ -66,8 +66,8 @@ describe("qa run config", () => {
|
||||
),
|
||||
).toEqual({
|
||||
providerMode: "live-frontier",
|
||||
primaryModel: "openai/gpt-5.5",
|
||||
alternateModel: "openai/gpt-5.5",
|
||||
primaryModel: "openai/gpt-5.4",
|
||||
alternateModel: "openai/gpt-5.4",
|
||||
fastMode: true,
|
||||
scenarioIds: ["thread-lifecycle"],
|
||||
});
|
||||
@@ -99,13 +99,13 @@ describe("qa run config", () => {
|
||||
});
|
||||
|
||||
it("keeps idle snapshots on static defaults so startup does not inspect auth profiles", () => {
|
||||
defaultQaRuntimeModelForMode.mockReturnValue("openai/gpt-5.5");
|
||||
defaultQaRuntimeModelForMode.mockReturnValue("openai/gpt-5.4");
|
||||
defaultQaRuntimeModelForMode.mockClear();
|
||||
|
||||
expect(createIdleQaRunnerSnapshot(scenarios).selection).toMatchObject({
|
||||
providerMode: "live-frontier",
|
||||
primaryModel: "openai/gpt-5.5",
|
||||
alternateModel: "openai/gpt-5.5",
|
||||
primaryModel: "openai/gpt-5.4",
|
||||
alternateModel: "openai/gpt-5.4",
|
||||
});
|
||||
expect(defaultQaRuntimeModelForMode).not.toHaveBeenCalled();
|
||||
});
|
||||
@@ -138,14 +138,14 @@ describe("qa run config", () => {
|
||||
it("prefers the Codex OAuth default when the runtime resolver says it is available", () => {
|
||||
defaultQaRuntimeModelForMode.mockImplementation((mode, options) =>
|
||||
mode === "live-frontier"
|
||||
? "openai/gpt-5.5"
|
||||
? "openai/gpt-5.4"
|
||||
: defaultQaProviderModelForMode(mode as QaProviderModeInput, options),
|
||||
);
|
||||
|
||||
expect(createDefaultQaRunSelection(scenarios)).toEqual({
|
||||
providerMode: "live-frontier",
|
||||
primaryModel: "openai/gpt-5.5",
|
||||
alternateModel: "openai/gpt-5.5",
|
||||
primaryModel: "openai/gpt-5.4",
|
||||
alternateModel: "openai/gpt-5.4",
|
||||
fastMode: true,
|
||||
scenarioIds: ["dm-chat-baseline", "thread-lifecycle"],
|
||||
});
|
||||
|
||||
@@ -137,15 +137,15 @@ describe("qa scenario catalog", () => {
|
||||
|
||||
expect(scenario.sourcePath).toBe("qa/scenarios/models/gpt54-thinking-visibility-switch.md");
|
||||
expect(config?.requiredLiveProvider).toBe("openai");
|
||||
expect(config?.requiredLiveModel).toBe("gpt-5.5");
|
||||
expect(config?.requiredLiveModel).toBe("gpt-5.4");
|
||||
expect(config?.offDirective).toBe("/think off");
|
||||
expect(config?.maxDirective).toBe("/think max");
|
||||
expect(config?.maxDirective).toBe("/think medium");
|
||||
expect(config?.reasoningDirective).toBe("/reasoning on");
|
||||
expect(scenario.execution.flow?.steps.map((step) => step.name)).toEqual([
|
||||
"enables reasoning display and disables thinking",
|
||||
"switches to max thinking",
|
||||
"verifies max thinking emits visible reasoning",
|
||||
"verifies max thinking completes the answer",
|
||||
"switches to medium thinking",
|
||||
"verifies medium thinking emits visible reasoning",
|
||||
"verifies medium thinking completes the answer",
|
||||
]);
|
||||
});
|
||||
|
||||
@@ -169,10 +169,10 @@ describe("qa scenario catalog", () => {
|
||||
},
|
||||
});
|
||||
expect(config?.requiredProvider).toBe("openai");
|
||||
expect(config?.requiredModel).toBe("gpt-5.5");
|
||||
expect(config?.requiredModel).toBe("gpt-5.4");
|
||||
expect(config?.expectedMarker).toBe("WEB-SEARCH-OK");
|
||||
expect(scenario.execution.flow?.steps.map((step) => step.name)).toEqual([
|
||||
"confirms live OpenAI GPT-5.5 web search auto mode",
|
||||
"confirms live OpenAI GPT-5.4 web search auto mode",
|
||||
"searches official OpenAI News through the live model",
|
||||
]);
|
||||
});
|
||||
@@ -191,7 +191,7 @@ describe("qa scenario catalog", () => {
|
||||
expect(scenario.sourcePath).toBe("qa/scenarios/models/thinking-slash-model-remap.md");
|
||||
expect(config?.requiredProviderMode).toBe("live-frontier");
|
||||
expect(config?.anthropicModelRef).toBe("anthropic/claude-sonnet-4-6");
|
||||
expect(config?.openAiXhighModelRef).toBe("openai/gpt-5.5");
|
||||
expect(config?.openAiXhighModelRef).toBe("openai/gpt-5.4");
|
||||
expect(config?.noXhighModelRef).toBe("anthropic/claude-sonnet-4-6");
|
||||
expect(scenario.execution.flow?.steps.map((step) => step.name)).toEqual([
|
||||
"selects Anthropic and verifies adaptive options",
|
||||
|
||||
@@ -250,4 +250,32 @@ describe("qa suite planning helpers", () => {
|
||||
}).map((scenario) => scenario.id),
|
||||
).toEqual(["generic", "claude-subscription"]);
|
||||
});
|
||||
|
||||
it("filters provider-mode-specific scenarios from implicit suite selections", () => {
|
||||
const scenarios = [
|
||||
makeQaSuiteTestScenario("generic"),
|
||||
makeQaSuiteTestScenario("live-only", {
|
||||
config: { requiredProviderMode: "live-frontier" },
|
||||
}),
|
||||
makeQaSuiteTestScenario("mock-only", {
|
||||
config: { requiredProviderMode: "mock-openai" },
|
||||
}),
|
||||
];
|
||||
|
||||
expect(
|
||||
selectQaSuiteScenarios({
|
||||
scenarios,
|
||||
providerMode: "mock-openai",
|
||||
primaryModel: "mock-openai/gpt-5.4",
|
||||
}).map((scenario) => scenario.id),
|
||||
).toEqual(["generic", "mock-only"]);
|
||||
|
||||
expect(
|
||||
selectQaSuiteScenarios({
|
||||
scenarios,
|
||||
providerMode: "live-frontier",
|
||||
primaryModel: "openai/gpt-5.4",
|
||||
}).map((scenario) => scenario.id),
|
||||
).toEqual(["generic", "live-only"]);
|
||||
});
|
||||
});
|
||||
|
||||
@@ -33,11 +33,15 @@ function scenarioMatchesLiveLane(params: {
|
||||
providerMode: QaProviderMode;
|
||||
claudeCliAuthMode?: QaCliBackendAuthMode;
|
||||
}) {
|
||||
const config = params.scenario.execution.config ?? {};
|
||||
const requiredProviderMode = normalizeQaConfigString(config.requiredProviderMode);
|
||||
if (requiredProviderMode && params.providerMode !== requiredProviderMode) {
|
||||
return false;
|
||||
}
|
||||
if (getQaProvider(params.providerMode).kind !== "live") {
|
||||
return true;
|
||||
}
|
||||
const selected = splitModelRef(params.primaryModel);
|
||||
const config = params.scenario.execution.config ?? {};
|
||||
const requiredProvider = normalizeQaConfigString(config.requiredProvider);
|
||||
if (requiredProvider && selected?.provider !== requiredProvider) {
|
||||
return false;
|
||||
|
||||
@@ -50,6 +50,9 @@ steps:
|
||||
expr: "tools.has('image_generate')"
|
||||
message: image_generate not present after imageGenerationModel patch
|
||||
- call: reset
|
||||
- set: generationStartedAt
|
||||
value:
|
||||
expr: Date.now()
|
||||
- call: runAgentPrompt
|
||||
args:
|
||||
- ref: env
|
||||
@@ -70,17 +73,18 @@ steps:
|
||||
expr: "!env.mock || ((await fetchJson(`${env.mock.baseUrl}/debug/requests`)).find((request) => String(request.allInputText ?? '').includes(config.promptSnippet))?.plannedToolName === 'image_generate')"
|
||||
message:
|
||||
expr: "`expected image_generate, got ${String((await fetchJson(`${env.mock.baseUrl}/debug/requests`)).find((request) => String(request.allInputText ?? '').includes(config.promptSnippet))?.plannedToolName ?? '')}`"
|
||||
- call: waitForCondition
|
||||
saveAs: generated
|
||||
- call: resolveGeneratedImagePath
|
||||
saveAs: generatedPath
|
||||
args:
|
||||
- lambda:
|
||||
async: true
|
||||
expr: "!env.mock ? true : (await fetchJson(`${env.mock.baseUrl}/debug/image-generations`)).find((request) => request.model === 'gpt-image-1' && String(request.prompt ?? '').includes(config.generatedNeedle))"
|
||||
- 15000
|
||||
- 250
|
||||
- env:
|
||||
ref: env
|
||||
promptSnippet:
|
||||
expr: config.promptSnippet
|
||||
startedAtMs:
|
||||
ref: generationStartedAt
|
||||
timeoutMs: 15000
|
||||
- assert:
|
||||
expr: "!env.mock || Boolean(generated)"
|
||||
message:
|
||||
expr: "`image provider was never invoked`"
|
||||
detailsExpr: "env.mock ? `${outbound.text}\\nIMAGE_PROMPT:${generated.prompt ?? ''}` : outbound.text"
|
||||
expr: "typeof generatedPath === 'string' && generatedPath.length > 0"
|
||||
message: image generation did not produce a saved media path
|
||||
detailsExpr: "`${outbound.text}\\nIMAGE_PATH:${generatedPath}`"
|
||||
```
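The flow above polls the mock provider's `/debug` endpoints until the generated image shows up; the earlier version passed a lambda plus a 15000 ms timeout and 250 ms interval to `waitForCondition`. A generic poll-until-truthy helper in that spirit, as an illustration only (the qa-flow runtime provides its own implementation):

```ts
// Generic polling helper in the spirit of the flow's waitForCondition call.
async function pollUntil<T>(
  check: () => Promise<T | undefined | false>,
  timeoutMs = 15000,
  intervalMs = 250,
): Promise<T> {
  const deadline = Date.now() + timeoutMs;
  for (;;) {
    const result = await check();
    if (result) {
      return result;
    }
    if (Date.now() >= deadline) {
      throw new Error(`condition not met within ${timeoutMs}ms`);
    }
    await new Promise((resolve) => setTimeout(resolve, intervalMs));
  }
}

// Example shape, mirroring the flow's debug query (helper names from the flow):
// const generated = await pollUntil(async () =>
//   (await fetchJson(`${baseUrl}/debug/image-generations`))
//     .find((request: { model?: string }) => request.model === "gpt-image-1"),
// );
```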
|
||||
|
||||
@@ -24,10 +24,10 @@ codeRefs:
|
||||
- extensions/qa-lab/src/suite.ts
|
||||
execution:
|
||||
kind: flow
|
||||
summary: Run with `pnpm openclaw qa suite --provider-mode live-frontier --model openai/gpt-5.5 --alt-model openai/gpt-5.5 --scenario codex-harness-no-meta-leak`.
|
||||
summary: Run with `pnpm openclaw qa suite --provider-mode live-frontier --model openai/gpt-5.4 --alt-model openai/gpt-5.4 --fast --thinking medium --scenario codex-harness-no-meta-leak`.
|
||||
config:
|
||||
requiredProvider: codex
|
||||
requiredModel: gpt-5.5
|
||||
requiredModel: gpt-5.4
|
||||
harnessRuntime: codex
|
||||
harnessFallback: none
|
||||
expectedReply: QA_LEAK_OK
|
||||
@@ -47,7 +47,7 @@ execution:
|
||||
|
||||
```yaml qa-flow
|
||||
steps:
|
||||
- name: confirms GPT-5.5 Codex harness target
|
||||
- name: confirms GPT-5.4 Codex harness target
|
||||
actions:
|
||||
- set: selected
|
||||
value:
|
||||
|
||||
@@ -1,20 +1,20 @@
|
||||
# GPT-5.5 thinking visibility switch
|
||||
# GPT-5.4 thinking visibility switch
|
||||
|
||||
```yaml qa-scenario
|
||||
id: gpt54-thinking-visibility-switch
|
||||
title: GPT-5.5 thinking visibility switch
|
||||
title: GPT-5.4 thinking visibility switch
|
||||
surface: models
|
||||
coverage:
|
||||
primary:
|
||||
- models.thinking
|
||||
secondary:
|
||||
- runtime.reasoning-visibility
|
||||
objective: Verify GPT-5.5 can switch from disabled thinking to max thinking while reasoning display stays enabled.
|
||||
objective: Verify GPT-5.4 can switch from disabled thinking to medium thinking while reasoning display stays enabled.
|
||||
successCriteria:
|
||||
- Live runs target openai/gpt-5.5, not a mini or pro variant.
|
||||
- Live runs target openai/gpt-5.4, not a mini or pro variant.
|
||||
- The session enables reasoning display before the comparison turns.
|
||||
- The disabled-thinking turn returns its visible marker without a Reasoning-prefixed message.
|
||||
- The max-thinking turn returns its visible marker and a separate Reasoning-prefixed message.
|
||||
- The medium-thinking turn returns its visible marker and a separate Reasoning-prefixed message.
|
||||
docsRefs:
|
||||
- docs/tools/thinking.md
|
||||
- docs/help/testing.md
|
||||
@@ -27,12 +27,12 @@ codeRefs:
|
||||
- extensions/qa-lab/src/providers/mock-openai/server.ts
|
||||
execution:
|
||||
kind: flow
|
||||
summary: Toggle reasoning display and GPT-5.5 thinking between off/none and max/high, then verify visible reasoning only on the max turn.
|
||||
summary: Toggle reasoning display and GPT-5.4 thinking between off/none and medium, then verify visible reasoning only on the medium turn.
|
||||
config:
|
||||
requiredLiveProvider: openai
|
||||
requiredLiveModel: gpt-5.5
|
||||
requiredLiveModel: gpt-5.4
|
||||
offDirective: /think off
|
||||
maxDirective: /think max
|
||||
maxDirective: /think medium
|
||||
reasoningDirective: /reasoning on
|
||||
conversationId: qa-thinking-visibility
|
||||
offPrompt: "QA thinking visibility check off: answer exactly THINKING-OFF-OK."
|
||||
@@ -60,7 +60,7 @@ steps:
|
||||
- assert:
|
||||
expr: "env.providerMode !== 'live-frontier' || (selected?.provider === config.requiredLiveProvider && selected?.model === config.requiredLiveModel)"
|
||||
message:
|
||||
expr: "`expected live GPT-5.5, got ${env.primaryModel}`"
|
||||
expr: "`expected live GPT-5.4, got ${env.primaryModel}`"
|
||||
- call: state.addInboundMessage
|
||||
args:
|
||||
- conversation:
|
||||
@@ -133,11 +133,11 @@ steps:
|
||||
value:
|
||||
expr: "requests.find((request) => String(request.allInputText ?? '').includes(config.offPrompt))"
|
||||
- assert:
|
||||
expr: "String(offRequest?.model ?? '').includes('gpt-5.5')"
|
||||
expr: "String(offRequest?.model ?? '').includes('gpt-5.4')"
|
||||
message:
|
||||
expr: "`expected GPT-5.5 off mock request, got ${String(offRequest?.model ?? '')}`"
|
||||
expr: "`expected GPT-5.4 off mock request, got ${String(offRequest?.model ?? '')}`"
|
||||
detailsExpr: "`off ack=${offAck.text}; off answer=${offAnswer.text}`"
|
||||
- name: switches to max thinking
|
||||
- name: switches to medium thinking
|
||||
actions:
|
||||
- call: state.addInboundMessage
|
||||
args:
|
||||
@@ -153,10 +153,10 @@ steps:
|
||||
saveAs: maxAck
|
||||
args:
|
||||
- lambda:
|
||||
expr: "state.getSnapshot().messages.filter((candidate) => candidate.direction === 'outbound' && candidate.conversation.id === config.conversationId && /Thinking level set to high/i.test(candidate.text)).at(-1)"
|
||||
expr: "state.getSnapshot().messages.filter((candidate) => candidate.direction === 'outbound' && candidate.conversation.id === config.conversationId && /Thinking level set to medium/i.test(candidate.text)).at(-1)"
|
||||
- expr: liveTurnTimeoutMs(env, 20000)
|
||||
detailsExpr: "`max ack=${maxAck.text}`"
|
||||
- name: verifies max thinking emits visible reasoning
|
||||
- name: verifies medium thinking emits visible reasoning
|
||||
actions:
|
||||
- set: maxCursor
|
||||
value:
|
||||
@@ -182,7 +182,7 @@ steps:
|
||||
message:
|
||||
expr: "`missing max reasoning message near answer: ${recentOutboundSummary(state, 6)}`"
|
||||
detailsExpr: "`reasoning=${maxReasoning.text}`"
|
||||
- name: verifies max thinking completes the answer
|
||||
- name: verifies medium thinking completes the answer
|
||||
actions:
|
||||
- call: waitForCondition
|
||||
saveAs: maxAnswer
|
||||
@@ -204,8 +204,8 @@ steps:
|
||||
value:
|
||||
expr: "requests.find((request) => String(request.allInputText ?? '').includes(config.maxPrompt))"
|
||||
- assert:
|
||||
expr: "String(maxRequest?.model ?? '').includes('gpt-5.5')"
|
||||
expr: "String(maxRequest?.model ?? '').includes('gpt-5.4')"
|
||||
message:
|
||||
expr: "`expected GPT-5.5 mock request, got ${String(maxRequest?.model ?? '')}`"
|
||||
expr: "`expected GPT-5.4 mock request, got ${String(maxRequest?.model ?? '')}`"
|
||||
detailsExpr: "`answer=${maxAnswer.text}`"
|
||||
```
|
||||
|
||||
@@ -12,7 +12,7 @@ coverage:
|
||||
objective: Verify a live OpenAI GPT model can use OpenAI native web_search when OpenClaw web search is enabled in auto mode.
|
||||
successCriteria:
|
||||
- A live-frontier run fails fast unless the selected primary provider is openai.
|
||||
- The selected primary model is GPT-5.5, not a mini or pro variant.
|
||||
- The selected primary model is GPT-5.4, not a mini or pro variant.
|
||||
- Web search is enabled without pinning a managed web_search provider.
|
||||
- The live reply includes the required marker plus an official OpenAI News URL and headline found through web search.
|
||||
gatewayConfigPatch:
|
||||
@@ -32,10 +32,10 @@ codeRefs:
|
||||
- extensions/qa-lab/src/suite.ts
|
||||
execution:
|
||||
kind: flow
|
||||
summary: Run with `OPENCLAW_LIVE_OPENAI_KEY="${OPENAI_API_KEY}" pnpm openclaw qa suite --provider-mode live-frontier --model openai/gpt-5.5 --alt-model openai/gpt-5.5 --scenario openai-native-web-search-live`.
|
||||
summary: Run with `OPENCLAW_LIVE_OPENAI_KEY="${OPENAI_API_KEY}" pnpm openclaw qa suite --provider-mode live-frontier --model openai/gpt-5.4 --alt-model openai/gpt-5.4 --fast --thinking medium --scenario openai-native-web-search-live`.
|
||||
config:
|
||||
requiredProvider: openai
|
||||
requiredModel: gpt-5.5
|
||||
requiredModel: gpt-5.4
|
||||
expectedMarker: WEB-SEARCH-OK
|
||||
failureMarker: WEB-SEARCH-FAILED
|
||||
searchPrompt: |-
|
||||
@@ -49,7 +49,7 @@ execution:
|
||||
|
||||
```yaml qa-flow
|
||||
steps:
|
||||
- name: confirms live OpenAI GPT-5.5 web search auto mode
|
||||
- name: confirms live OpenAI GPT-5.4 web search auto mode
|
||||
actions:
|
||||
- call: waitForGatewayHealthy
|
||||
args:
|
||||
|
||||
@@ -13,8 +13,8 @@ coverage:
|
||||
objective: Verify /think lists provider-owned levels and remaps stored thinking levels when /model changes provider capabilities.
|
||||
successCriteria:
|
||||
- Anthropic Claude Sonnet 4.6 advertises adaptive but not OpenAI-only xhigh or Opus max.
|
||||
- A stored adaptive level remaps to medium when switching to OpenAI GPT-5.5.
|
||||
- OpenAI GPT-5.5 advertises xhigh but not adaptive or max.
|
||||
- A stored adaptive level remaps to medium when switching to OpenAI GPT-5.4.
|
||||
- OpenAI GPT-5.4 advertises xhigh but not adaptive or max.
|
||||
- A stored xhigh level remaps to high when switching to an Anthropic model without xhigh support.
|
||||
docsRefs:
|
||||
- docs/tools/thinking.md
|
||||
@@ -33,7 +33,7 @@ execution:
|
||||
config:
|
||||
requiredProviderMode: live-frontier
|
||||
anthropicModelRef: anthropic/claude-sonnet-4-6
|
||||
openAiXhighModelRef: openai/gpt-5.5
|
||||
openAiXhighModelRef: openai/gpt-5.4
|
||||
noXhighModelRef: anthropic/claude-sonnet-4-6
|
||||
conversationId: qa-thinking-slash-remap
|
||||
```
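The remap behaviour this scenario checks (a stored `adaptive` level becomes `medium` when switching to the OpenAI model, a stored `xhigh` becomes `high` on a model without xhigh) can be sketched as a capability-based clamp. This is an assumed helper, not the repo's implementation:

```ts
// Illustrative remap of a stored thinking level onto a model's supported set,
// matching the success criteria of this scenario.
type ThinkingLevel =
  | "off" | "minimal" | "low" | "medium" | "high" | "xhigh" | "adaptive" | "max";

function remapThinkingLevel(
  stored: ThinkingLevel,
  supported: readonly ThinkingLevel[],
): ThinkingLevel {
  if (supported.includes(stored)) {
    return stored;
  }
  if (stored === "adaptive") {
    return "medium"; // adaptive remaps to medium when switching to OpenAI GPT-5.4
  }
  if (stored === "xhigh" || stored === "max") {
    return "high"; // xhigh/max remap to high on models without those levels
  }
  return "medium"; // arbitrary fallback for this sketch
}

// remapThinkingLevel("adaptive", ["off", "low", "medium", "high", "xhigh"]) -> "medium"
// remapThinkingLevel("xhigh", ["off", "low", "medium", "high", "adaptive"]) -> "high"
```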
|
||||
@@ -165,7 +165,7 @@ steps:
|
||||
- assert:
|
||||
expr: "/Options: .*\\bxhigh\\b/i.test(openAiThinkStatus.text) && !/Options: .*\\badaptive\\b/i.test(openAiThinkStatus.text) && !/Options: .*\\bmax\\b/i.test(openAiThinkStatus.text)"
|
||||
message:
|
||||
expr: "`expected OpenAI GPT-5.5 /think options to include xhigh only, got ${openAiThinkStatus.text}`"
|
||||
expr: "`expected OpenAI GPT-5.4 /think options to include xhigh only, got ${openAiThinkStatus.text}`"
|
||||
detailsExpr: "`adaptive=${adaptiveAck.text}; switch=${openAiModelAck.text}; think=${openAiThinkStatus.text}`"
|
||||
- name: maps xhigh to high on a model without xhigh
|
||||
actions:
|
||||
|
||||
@@ -11,7 +11,7 @@ coverage:
|
||||
- models.codex-cli
|
||||
objective: Verify the Codex app-server harness can plan and build a medium-complex self-contained browser game.
|
||||
successCriteria:
|
||||
- A live-frontier run fails fast unless the selected primary model is openai/gpt-5.5 with the Codex harness forced.
|
||||
- A live-frontier run fails fast unless the selected primary model is openai/gpt-5.4 with the Codex harness forced.
|
||||
- The scenario forces the Codex embedded harness and disables PI fallback.
|
||||
- The prompt explicitly asks the agent to enter plan mode before editing.
|
||||
- The agent writes a self-contained HTML game with a canvas loop, controls, scoring, waves, pause, and restart.
|
||||
@@ -25,10 +25,10 @@ codeRefs:
|
||||
- extensions/qa-lab/src/suite.ts
|
||||
execution:
|
||||
kind: flow
|
||||
summary: Run with `pnpm openclaw qa suite --provider-mode live-frontier --model openai/gpt-5.5 --alt-model openai/gpt-5.5 --scenario medium-game-plan-codex-harness`.
|
||||
summary: Run with `pnpm openclaw qa suite --provider-mode live-frontier --model openai/gpt-5.4 --alt-model openai/gpt-5.4 --fast --thinking medium --scenario medium-game-plan-codex-harness`.
|
||||
config:
|
||||
requiredProvider: codex
|
||||
requiredModel: gpt-5.5
|
||||
requiredModel: gpt-5.4
|
||||
harnessRuntime: codex
|
||||
harnessFallback: none
|
||||
artifactFile: star-garden-defenders-codex.html
|
||||
@@ -52,7 +52,7 @@ execution:
|
||||
|
||||
```yaml qa-flow
|
||||
steps:
|
||||
- name: confirms GPT-5.5 Codex harness target
|
||||
- name: confirms GPT-5.4 Codex harness target
|
||||
actions:
|
||||
- set: selected
|
||||
value:
|
||||
|
||||
@@ -9,9 +9,9 @@ coverage:
|
||||
- workspace.planning
|
||||
secondary:
|
||||
- agents.pi-harness
|
||||
objective: Verify GPT-5.5 can use the PI harness to plan and build a medium-complex self-contained browser game.
|
||||
objective: Verify GPT-5.4 can use the PI harness to plan and build a medium-complex self-contained browser game.
|
||||
successCriteria:
|
||||
- A live-frontier run fails fast unless the selected primary model is openai/gpt-5.5.
|
||||
- A live-frontier run fails fast unless the selected primary model is openai/gpt-5.4.
|
||||
- The scenario forces the embedded PI harness before the build turn.
|
||||
- The prompt explicitly asks the agent to enter plan mode before editing.
|
||||
- The agent writes a self-contained HTML game with a canvas loop, controls, scoring, waves, pause, and restart.
|
||||
@@ -25,10 +25,10 @@ codeRefs:
|
||||
- extensions/qa-lab/src/suite.ts
|
||||
execution:
|
||||
kind: flow
|
||||
summary: Run with `pnpm openclaw qa suite --provider-mode live-frontier --model openai/gpt-5.5 --alt-model openai/gpt-5.5 --scenario medium-game-plan-pi-harness`.
|
||||
summary: Run with `pnpm openclaw qa suite --provider-mode live-frontier --model openai/gpt-5.4 --alt-model openai/gpt-5.4 --fast --thinking medium --scenario medium-game-plan-pi-harness`.
|
||||
config:
|
||||
requiredProvider: openai
|
||||
requiredModel: gpt-5.5
|
||||
requiredModel: gpt-5.4
|
||||
harnessRuntime: pi
|
||||
harnessFallback: pi
|
||||
artifactFile: star-garden-defenders-pi.html
|
||||
@@ -52,7 +52,7 @@ execution:
|
||||
|
||||
```yaml qa-flow
|
||||
steps:
|
||||
- name: confirms GPT-5.5 PI harness target
|
||||
- name: confirms GPT-5.4 PI harness target
|
||||
actions:
|
||||
- set: selected
|
||||
value: