refactor: split qa scenarios into per-file markdown defs

This commit is contained in:
Peter Steinberger
2026-04-08 05:37:02 +01:00
parent 5eab61b45d
commit b73d8ef7d7
40 changed files with 1255 additions and 666 deletions

View File

@@ -9,7 +9,7 @@ describe("qa discovery evaluation", () => {
it("accepts rich discovery reports that explicitly confirm all required files were read", () => {
const report = `
Worked
- Read all three requested files: repo/qa/scenarios.md, repo/extensions/qa-lab/src/suite.ts, and repo/docs/help/testing.md.
- Read all three requested files: repo/qa/scenarios/index.md, repo/extensions/qa-lab/src/suite.ts, and repo/docs/help/testing.md.
Failed
- None.
Blocked
@@ -28,7 +28,7 @@ The helper text mentions banned phrases like "not present", "missing files", "bl
it("accepts numeric 'all 4 required files read' confirmations", () => {
const report = `
Worked
- Source: repo/qa/scenarios.md, repo/extensions/qa-lab/src/suite.ts, repo/docs/help/testing.md
- Source: repo/qa/scenarios/index.md, repo/extensions/qa-lab/src/suite.ts, repo/docs/help/testing.md
- all 3 required files read.
Failed
- None.
@@ -49,7 +49,7 @@ The report may quote phrases like "not present" while describing the evaluator,
const report = `
Worked
- All three files retrieved. Now let me compile the protocol report.
- All three mandated files read successfully: repo/qa/scenarios.md, repo/extensions/qa-lab/src/suite.ts, repo/docs/help/testing.md.
- All three mandated files read successfully: repo/qa/scenarios/index.md, repo/extensions/qa-lab/src/suite.ts, repo/docs/help/testing.md.
Failed
- None.
Blocked
@@ -83,7 +83,7 @@ Follow-up
it("flags discovery replies that drift into unrelated suite wrap-up claims", () => {
const report = `
Worked
- All three requested files were read: repo/qa/scenarios.md, repo/extensions/qa-lab/src/suite.ts, repo/docs/help/testing.md.
- All three requested files were read: repo/qa/scenarios/index.md, repo/extensions/qa-lab/src/suite.ts, repo/docs/help/testing.md.
Failed
- None.
Blocked

View File

@@ -1,10 +1,20 @@
import { normalizeLowercaseStringOrEmpty } from "openclaw/plugin-sdk/text-runtime";
import { readQaScenarioExecutionConfig } from "./scenario-catalog.js";
const REQUIRED_DISCOVERY_REFS = [
"repo/qa/scenarios.md",
"repo/extensions/qa-lab/src/suite.ts",
"repo/docs/help/testing.md",
] as const;
/**
 * Resolves the repo-relative files a discovery report must reference.
 *
 * The list is read from the `requiredFiles` entry of the execution config of
 * the "source-docs-discovery-report" scenario. The config is an untyped
 * record, so the value is narrowed at runtime instead of being cast: anything
 * other than an array of strings (including a missing entry) yields the
 * built-in default list, which keeps the module-level `.map(...)` over the
 * result from failing on a malformed scenario file.
 *
 * @returns The configured file references, or the built-in defaults.
 */
function readRequiredDiscoveryRefs(): string[] {
  const fallbackRefs = [
    "repo/qa/scenarios/index.md",
    "repo/extensions/qa-lab/src/suite.ts",
    "repo/docs/help/testing.md",
  ];
  const requiredFiles: unknown = readQaScenarioExecutionConfig(
    "source-docs-discovery-report",
  )?.requiredFiles;
  if (!Array.isArray(requiredFiles)) {
    return fallbackRefs;
  }
  const entries: unknown[] = requiredFiles;
  // A single non-string entry makes the whole list untrustworthy; use the defaults.
  if (!entries.every((entry): entry is string => typeof entry === "string")) {
    return fallbackRefs;
  }
  return entries;
}
const REQUIRED_DISCOVERY_REFS = readRequiredDiscoveryRefs();
const REQUIRED_DISCOVERY_REFS_LOWER = REQUIRED_DISCOVERY_REFS.map(normalizeLowercaseStringOrEmpty);

View File

@@ -124,6 +124,8 @@ export function buildQaRuntimeEnv(params: {
function isRetryableGatewayCallError(details: string): boolean {
return (
details.includes("handshake timeout") ||
details.includes("gateway closed (1000") ||
details.includes("gateway closed (1012)") ||
details.includes("gateway closed (1006") ||
details.includes("abnormal closure") ||
@@ -168,6 +170,16 @@ async function waitForGatewayReady(params: {
throw new Error(`gateway failed to become healthy:\n${params.logs()}`);
}
/**
 * Reports whether an RPC client startup failure is worth retrying.
 *
 * The error is rendered with `formatErrorMessage` and the resulting text is
 * searched for any of the known transient markers (handshake timeout or a
 * gateway close with one of the listed codes).
 */
function isRetryableRpcStartupError(error: unknown) {
  const message = formatErrorMessage(error);
  const retryableMarkers = [
    "handshake timeout",
    "gateway closed (1000",
    "gateway closed (1006",
    "gateway closed (1012)",
  ];
  return retryableMarkers.some((marker) => message.includes(marker));
}
export function resolveQaControlUiRoot(params: { repoRoot: string; controlUiEnabled?: boolean }) {
if (params.controlUiEnabled === false) {
return undefined;
@@ -277,12 +289,34 @@ export async function startQaGatewayChild(params: {
baseUrl,
logs,
child,
timeoutMs: 120_000,
});
rpcClient = await startQaGatewayRpcClient({
wsUrl,
token: gatewayToken,
logs,
});
let lastRpcError: unknown = null;
for (let attempt = 1; attempt <= 4; attempt += 1) {
try {
rpcClient = await startQaGatewayRpcClient({
wsUrl,
token: gatewayToken,
logs,
});
break;
} catch (error) {
lastRpcError = error;
if (attempt >= 4 || !isRetryableRpcStartupError(error)) {
throw error;
}
await sleep(500 * attempt);
await waitForGatewayReady({
baseUrl,
logs,
child,
timeoutMs: 15_000,
});
}
}
if (!rpcClient) {
throw lastRpcError ?? new Error("qa gateway rpc client failed to start");
}
} catch (error) {
child.kill("SIGTERM");
throw error;

View File

@@ -15,7 +15,7 @@ describe("qa live timeout policy", () => {
).toBe(30_000);
});
it("uses the standard live floor for non-anthropic models", () => {
it("uses the higher gpt-5 live floor for openai heavy turns", () => {
expect(
resolveQaLiveTurnTimeoutMs(
{
@@ -25,6 +25,19 @@ describe("qa live timeout policy", () => {
},
30_000,
),
).toBe(360_000);
});
it("keeps the standard live floor for other non-anthropic models", () => {
expect(
resolveQaLiveTurnTimeoutMs(
{
providerMode: "live-frontier",
primaryModel: "google/gemini-3-flash",
alternateModel: "google/gemini-3-flash",
},
30_000,
),
).toBe(120_000);
});

View File

@@ -8,6 +8,14 @@ function isAnthropicModel(modelRef: string) {
return modelRef.startsWith("anthropic/");
}
/** Returns true when the model reference carries the `openai/` provider prefix. */
function isOpenAiModel(modelRef: string) {
  const providerPrefix = "openai/";
  return modelRef.startsWith(providerPrefix);
}
/**
 * Returns true for OpenAI model references whose model name (the part after
 * the `openai/` prefix) starts with `gpt-5`.
 */
function isGptFiveModel(modelRef: string) {
  if (!isOpenAiModel(modelRef)) {
    return false;
  }
  const modelName = modelRef.slice("openai/".length);
  return modelName.startsWith("gpt-5");
}
/** Returns true for Anthropic model references that contain `claude-opus`. */
function isClaudeOpusModel(modelRef: string) {
  if (!isAnthropicModel(modelRef)) {
    return false;
  }
  return modelRef.includes("claude-opus");
}
@@ -26,5 +34,8 @@ export function resolveQaLiveTurnTimeoutMs(
if (isAnthropicModel(modelRef)) {
return Math.max(fallbackMs, 180_000);
}
if (isGptFiveModel(modelRef)) {
return Math.max(fallbackMs, 360_000);
}
return Math.max(fallbackMs, 120_000);
}

View File

@@ -1,5 +1,11 @@
import { describe, expect, it } from "vitest";
import { readQaBootstrapScenarioCatalog, readQaScenarioPack } from "./scenario-catalog.js";
import {
listQaScenarioMarkdownPaths,
readQaBootstrapScenarioCatalog,
readQaScenarioById,
readQaScenarioExecutionConfig,
readQaScenarioPack,
} from "./scenario-catalog.js";
describe("qa scenario catalog", () => {
it("loads the markdown pack as the canonical source of truth", () => {
@@ -8,6 +14,7 @@ describe("qa scenario catalog", () => {
expect(pack.version).toBe(1);
expect(pack.agent.identityMarkdown).toContain("Dev C-3PO");
expect(pack.kickoffTask).toContain("Lobster Invaders");
expect(listQaScenarioMarkdownPaths().length).toBe(pack.scenarios.length);
expect(pack.scenarios.some((scenario) => scenario.id === "image-generation-roundtrip")).toBe(
true,
);
@@ -23,4 +30,18 @@ describe("qa scenario catalog", () => {
true,
);
});
it("loads scenario-specific execution config from per-scenario markdown", () => {
const discovery = readQaScenarioById("source-docs-discovery-report");
const discoveryConfig = readQaScenarioExecutionConfig("source-docs-discovery-report");
const fallbackConfig = readQaScenarioExecutionConfig("memory-failure-fallback");
expect(discovery.title).toBe("Source and docs discovery report");
expect((discoveryConfig?.requiredFiles as string[] | undefined)?.[0]).toBe(
"repo/qa/scenarios/index.md",
);
expect(fallbackConfig?.gracefulFallbackAny as string[] | undefined).toContain(
"will not reveal",
);
});
});

View File

@@ -24,6 +24,7 @@ const qaScenarioExecutionSchema = z.object({
kind: z.literal("custom").default("custom"),
handler: z.string().trim().min(1),
summary: z.string().trim().min(1).optional(),
config: z.record(z.string(), z.unknown()).optional(),
});
const qaSeedScenarioSchema = z.object({
@@ -47,12 +48,13 @@ const qaScenarioPackSchema = z.object({
identityMarkdown: DEFAULT_QA_AGENT_IDENTITY_MARKDOWN,
}),
kickoffTask: z.string().trim().min(1),
scenarios: z.array(qaSeedScenarioSchema).min(1),
});
export type QaScenarioExecution = z.infer<typeof qaScenarioExecutionSchema>;
export type QaSeedScenario = z.infer<typeof qaSeedScenarioSchema>;
export type QaScenarioPack = z.infer<typeof qaScenarioPackSchema>;
/**
 * Shape of the QA scenario pack handed to callers: the fields inferred from
 * the pack schema plus an explicit `scenarios` list of seed scenarios.
 */
export type QaScenarioPack = z.infer<typeof qaScenarioPackSchema> & {
scenarios: QaSeedScenario[];
};
export type QaBootstrapScenarioCatalog = {
agentIdentityMarkdown: string;
@@ -60,8 +62,11 @@ export type QaBootstrapScenarioCatalog = {
scenarios: QaSeedScenario[];
};
const QA_SCENARIO_PACK_PATH = "qa/scenarios.md";
const QA_SCENARIO_PACK_INDEX_PATH = "qa/scenarios/index.md";
const QA_SCENARIO_LEGACY_OVERVIEW_PATH = "qa/scenarios.md";
const QA_SCENARIO_DIR_PATH = "qa/scenarios";
const QA_PACK_FENCE_RE = /```ya?ml qa-pack\r?\n([\s\S]*?)\r?\n```/i;
const QA_SCENARIO_FENCE_RE = /```ya?ml qa-scenario\r?\n([\s\S]*?)\r?\n```/i;
function walkUpDirectories(start: string): string[] {
const roots: string[] = [];
@@ -76,10 +81,14 @@ function walkUpDirectories(start: string): string[] {
}
}
function resolveRepoFile(relativePath: string): string | null {
function resolveRepoPath(relativePath: string, kind: "file" | "directory" = "file"): string | null {
for (const dir of walkUpDirectories(import.meta.dirname)) {
const candidate = path.join(dir, relativePath);
if (fs.existsSync(candidate) && fs.statSync(candidate).isFile()) {
if (!fs.existsSync(candidate)) {
continue;
}
const stat = fs.statSync(candidate);
if ((kind === "file" && stat.isFile()) || (kind === "directory" && stat.isDirectory())) {
return candidate;
}
}
@@ -87,34 +96,75 @@ function resolveRepoFile(relativePath: string): string | null {
}
function readTextFile(relativePath: string): string {
const resolved = resolveRepoFile(relativePath);
const resolved = resolveRepoPath(relativePath, "file");
if (!resolved) {
return "";
}
return fs.readFileSync(resolved, "utf8");
}
/**
 * Lists the entry names of a repo-relative directory.
 *
 * @param relativePath Directory path, resolved via `resolveRepoPath`.
 * @returns The names reported by `fs.readdirSync`, or an empty array when the
 *   directory cannot be resolved.
 */
function readDirEntries(relativePath: string): string[] {
  const directory = resolveRepoPath(relativePath, "directory");
  return directory ? fs.readdirSync(directory) : [];
}
function extractQaPackYaml(content: string) {
const match = content.match(QA_PACK_FENCE_RE);
if (!match?.[1]) {
throw new Error(
`qa scenario pack missing \`\`\`yaml qa-pack fence in ${QA_SCENARIO_PACK_PATH}`,
`qa scenario pack missing \`\`\`yaml qa-pack fence in ${QA_SCENARIO_PACK_INDEX_PATH}`,
);
}
return match[1];
}
/**
 * Pulls the YAML body out of the `qa-scenario` fenced block of a scenario
 * markdown document.
 *
 * @param content Markdown text of the scenario file.
 * @param relativePath Path used only to label the error message.
 * @returns The text captured between the fence lines.
 * @throws Error when no fence matches or the captured body is empty.
 */
function extractQaScenarioYaml(content: string, relativePath: string) {
  const yamlBody = QA_SCENARIO_FENCE_RE.exec(content)?.[1];
  if (yamlBody) {
    return yamlBody;
  }
  throw new Error(`qa scenario file missing \`\`\`yaml qa-scenario fence in ${relativePath}`);
}
export function readQaScenarioPackMarkdown(): string {
return readTextFile(QA_SCENARIO_PACK_PATH).trim();
const chunks = [readTextFile(QA_SCENARIO_PACK_INDEX_PATH).trim()];
for (const relativePath of listQaScenarioMarkdownPaths()) {
chunks.push(readTextFile(relativePath).trim());
}
return chunks.filter(Boolean).join("\n\n");
}
export function readQaScenarioPack(): QaScenarioPack {
const markdown = readQaScenarioPackMarkdown();
if (!markdown) {
throw new Error(`qa scenario pack not found: ${QA_SCENARIO_PACK_PATH}`);
const packMarkdown = readTextFile(QA_SCENARIO_PACK_INDEX_PATH).trim();
if (!packMarkdown) {
throw new Error(`qa scenario pack not found: ${QA_SCENARIO_PACK_INDEX_PATH}`);
}
const parsed = YAML.parse(extractQaPackYaml(markdown)) as unknown;
return qaScenarioPackSchema.parse(parsed);
const parsedPack = qaScenarioPackSchema.parse(
YAML.parse(extractQaPackYaml(packMarkdown)) as unknown,
);
const scenarios = listQaScenarioMarkdownPaths().map((relativePath) =>
qaSeedScenarioSchema.parse(
YAML.parse(extractQaScenarioYaml(readTextFile(relativePath), relativePath)) as unknown,
),
);
return {
...parsedPack,
scenarios,
};
}
/**
 * Lists the per-scenario markdown files under the scenario directory.
 *
 * Every `.md` entry except `index.md` is included; each path is prefixed with
 * the scenario directory and the result is returned in default sort order.
 */
export function listQaScenarioMarkdownPaths(): string[] {
  const scenarioPaths: string[] = [];
  for (const entry of readDirEntries(QA_SCENARIO_DIR_PATH)) {
    if (entry === "index.md" || !entry.endsWith(".md")) {
      continue;
    }
    scenarioPaths.push(`${QA_SCENARIO_DIR_PATH}/${entry}`);
  }
  // The array is local and freshly built, so sorting it in place is safe.
  return scenarioPaths.sort();
}
/** Reads the legacy overview markdown file and returns its text with surrounding whitespace trimmed. */
export function readQaScenarioOverviewMarkdown(): string {
  const overview = readTextFile(QA_SCENARIO_LEGACY_OVERVIEW_PATH);
  return overview.trim();
}
export function readQaBootstrapScenarioCatalog(): QaBootstrapScenarioCatalog {
@@ -125,3 +175,15 @@ export function readQaBootstrapScenarioCatalog(): QaBootstrapScenarioCatalog {
scenarios: pack.scenarios,
};
}
/**
 * Looks up a single seed scenario by its id.
 *
 * The scenario pack is re-read through `readQaScenarioPack()` on every call.
 *
 * @param id Scenario identifier to match exactly.
 * @returns The first scenario whose `id` equals the given id.
 * @throws Error when no scenario has that id.
 */
export function readQaScenarioById(id: string): QaSeedScenario {
  for (const candidate of readQaScenarioPack().scenarios) {
    if (candidate.id === id) {
      return candidate;
    }
  }
  throw new Error(`unknown qa scenario: ${id}`);
}
/**
 * Returns the optional `execution.config` record of the scenario with the
 * given id, or `undefined` when the scenario declares none.
 *
 * @throws Error (from `readQaScenarioById`) when the id is unknown.
 */
export function readQaScenarioExecutionConfig(id: string): Record<string, unknown> | undefined {
  const { execution } = readQaScenarioById(id);
  return execution?.config;
}

View File

@@ -35,7 +35,10 @@ import {
import { hasModelSwitchContinuityEvidence } from "./model-switch-eval.js";
import { renderQaMarkdownReport, type QaReportCheck, type QaReportScenario } from "./report.js";
import { qaChannelPlugin, type QaBusMessage } from "./runtime-api.js";
import { readQaBootstrapScenarioCatalog } from "./scenario-catalog.js";
import {
readQaBootstrapScenarioCatalog,
readQaScenarioExecutionConfig,
} from "./scenario-catalog.js";
type QaSuiteStep = {
name: string;
@@ -60,8 +63,10 @@ type QaSuiteEnvironment = {
alternateModel: string;
};
const QA_IMAGE_UNDERSTANDING_PNG_BASE64 =
"iVBORw0KGgoAAAANSUhEUgAAAEAAAABACAIAAAAlC+aJAAAAT0lEQVR42u3RQQkAMAzAwPg33Wnos+wgBo40dboAAAAAAAAAAAAAAAAAAAAAAAAAAAAAANYADwAAAAAAAAAAAAAAAAAAAAAAAAAAAAC+Azy47PDiI4pA2wAAAABJRU5ErkJggg==";
const _QA_IMAGE_UNDERSTANDING_PNG_BASE64 =
"iVBORw0KGgoAAAANSUhEUgAAAQAAAAEACAYAAABccqhmAAAAAklEQVR4AewaftIAAAK4SURBVO3BAQEAMAwCIG//znsQgXfJBZjUALMaYFYDzGqAWQ0wqwFmNcCsBpjVALMaYFYDzGqAWQ0wqwFmNcCsBpjVALMaYFYDzGqAWQ0wqwFmNcCsBpjVALMaYFYDzGqAWQ0wqwFmNcCsBpjVALMaYFYDzGqAWQ0wqwFmNcCsBpjVALMaYFYDzGqAWQ0wqwFmNcCsBpjVALMaYFYDzGqAWQ0wqwFmNcCsBpjVALMaYFYDzGqAWQ0wqwFmNcCsBpjVALMaYFYDzGqAWQ0wqwFmNcCsBpjVALMaYFYDzGqAWQ0wqwFmNcCsBpjVALMaYFYDzGqAWQ0wqwFmNcCsBpjVALMaYFYDzGqAWQ0wqwFmNcCsBpjVALMaYFYDzGqAWQ0wqwFmNcCsBpjVALMaYFYDzGqAWQ0wqwFmNcCsBpjVALMaYFYDzGqAWQ0wqwFmNcCsBpjVALMaYFYDzGqAWQ0wqwFmNcCsBpjVALMaYFYDzGqAWQ0wqwFmNcCsl9wFmNQAsxpgVgPMaoBZDTCrAWY1wKwGmNUAsxpgVgPMaoBZDTCrAWY1wKwGmNUAsxpgVgPMaoBZDTCrAWY1wKwGmNUAsxpgVgPMaoBZDTCrAWY1wKwGmNUAsxpgVgPMaoBZDTCrAWY1wKwGmNUAsxpgVgPMaoBZDTCrAWY1wKwGmNUAsxpgVgPMaoBZDTCrAWY1wKwGmNUAsxpgVgPMaoBZDTCrAWY1wKwGmNUAsxpgVgPMaoBZDTCrAWY1wKwGmNUAsxpgVgPMaoBZDTCrAWY1wKwGmNUAsxpgVgPMaoBZDTCrAWY1wKwGmNUAsxpgVgPMaoBZDTCrAWY1wKwGmNUAsxpgVgPMaoBZDTCrAWY1wKwGmNUAsxpgVgPMaoBZDTCrAWY1wKwGmNUAsxpgVgPMaoBZDTCrAWY1wKwP4TIF+7ciPkoAAAAASUVORK5CYII=";
const QA_IMAGE_UNDERSTANDING_LARGE_PNG_BASE64 =
"iVBORw0KGgoAAAANSUhEUgAAAQAAAAEACAYAAABccqhmAAACuklEQVR4Ae3BAQEAMAwCIG//znsQgXfJBZjUALMaYFYDzGqAWQ0wqwFmNcCsBpjVALMaYFYDzGqAWQ0wqwFmNcCsBpjVALMaYFYDzGqAWQ0wqwFmNcCsBpjVALMaYFYDzGqAWQ0wqwFmNcCsBpjVALMaYFYDzGqAWQ0wqwFmNcCsBpjVALMaYFYDzGqAWQ0wqwFmNcCsBpjVALMaYFYDzGqAWQ0wqwFmNcCsBpjVALMaYFYDzGqAWQ0wqwFmNcCsBpjVALMaYFYDzGqAWQ0wqwFmNcCsBpjVALMaYFYDzGqAWQ0wqwFmNcCsBpjVALMaYFYDzGqAWQ0wqwFmNcCsBpjVALMaYFYDzGqAWQ0wqwFmNcCsBpjVALMaYFYDzGqAWQ0wqwFmNcCsBpjVALMaYFYDzGqAWQ0wqwFmNcCsBpjVALMaYFYDzGqAWQ0wqwFmNcCsBpjVALMaYFYDzGqAWQ0wqwFmNcCsl9wFmNQAsxpgVgPMaoBZDTCrAWY1wKwGmNUAsxpgVgPMaoBZDTCrAWY1wKwGmNUAsxpgVgPMaoBZDTCrAWY1wKwGmNUAsxpgVgPMaoBZDTCrAWY1wKwGmNUAsxpgVgPMaoBZDTCrAWY1wKwGmNUAsxpgVgPMaoBZDTCrAWY1wKwGmNUAsxpgVgPMaoBZDTCrAWY1wKwGmNUAsxpgVgPMaoBZDTCrAWY1wKwGmNUAsxpgVgPMaoBZDTCrAWY1wKwGmNUAsxpgVgPMaoBZDTCrAWY1wKwGmNUAsxpgVgPMaoBZDTCrAWY1wKwGmNUAsxpgVgPMaoBZDTCrAWY1wKwGmNUAsxpgVgPMaoBZDTCrAWY1wKwP4TIF+2YE/z8AAAAASUVORK5CYII=";
type QaSkillStatusEntry = {
name?: string;
@@ -99,6 +104,14 @@ type QaRawSessionStoreEntry = {
updatedAt?: number;
};
const QA_CONTROL_PLANE_WRITE_WINDOW_MS = 60_000;
const QA_CONTROL_PLANE_WRITE_MAX_REQUESTS = 2;
/**
 * Reads a scenario's execution config viewed as `T`, substituting an empty
 * object when the scenario has no config.
 *
 * The cast to `T` is unchecked: callers are expected to treat every field as
 * optional and supply their own defaults.
 */
function readScenarioExecutionConfig<T extends Record<string, unknown>>(id: string): T {
  const scenarioConfig = readQaScenarioExecutionConfig(id) as T | undefined;
  if (scenarioConfig == null) {
    return {} as T;
  }
  return scenarioConfig;
}
const qaControlPlaneWriteTimestamps: number[] = [];
function splitModelRef(ref: string) {
const slash = ref.indexOf("/");
if (slash <= 0 || slash === ref.length - 1) {
@@ -187,6 +200,21 @@ function recentOutboundSummary(state: QaBusState, limit = 5) {
.join(" | ");
}
/**
 * Reports whether a reply mentions success markers for both fanout workers.
 *
 * Despite the name, this returns a boolean: the text is lower-cased via
 * `normalizeLowercaseStringOrEmpty` and must contain at least one accepted
 * marker for the first worker and at least one for the second.
 */
function normalizeQaFanoutSuccessText(text: string) {
  const haystack = normalizeLowercaseStringOrEmpty(text);
  const containsAny = (markers: readonly string[]) =>
    markers.some((marker) => haystack.includes(marker));
  const firstWorkerMarkers = ["alpha-ok", "subagent_one_ok", "subagent one ok", "subagent-1: ok"];
  const secondWorkerMarkers = ["beta-ok", "subagent_two_ok", "subagent two ok", "subagent-2: ok"];
  return containsAny(firstWorkerMarkers) && containsAny(secondWorkerMarkers);
}
async function runScenario(name: string, steps: QaSuiteStep[]): Promise<QaSuiteScenarioResult> {
const stepResults: QaReportCheck[] = [];
for (const step of steps) {
@@ -309,6 +337,44 @@ function isConfigHashConflict(error: unknown) {
return formatErrorMessage(error).includes("config changed since last load");
}
/**
 * Extracts a retry delay, in milliseconds, from a gateway error.
 *
 * Two textual hints are recognised in the formatted error message, tried in
 * order: a `retryAfterMs` field followed by digits (already in milliseconds)
 * and a "retry after <n>s" phrase (seconds, converted to milliseconds). A hint
 * only counts when its number is finite and greater than zero.
 *
 * @returns The delay in milliseconds, or `null` when no usable hint is found.
 */
function getGatewayRetryAfterMs(error: unknown) {
  const message = formatErrorMessage(error);
  const hints: Array<[RegExp, number]> = [
    [/retryAfterMs["=: ]+(\d+)/i, 1],
    [/retry after (\d+)s/i, 1_000],
  ];
  for (const [pattern, msPerUnit] of hints) {
    const captured = pattern.exec(message)?.[1];
    if (captured === undefined) {
      continue;
    }
    const amount = Number(captured);
    if (Number.isFinite(amount) && amount > 0) {
      return amount * msPerUnit;
    }
  }
  return null;
}
/**
 * Client-side throttle for control-plane writes.
 *
 * Keeps a sliding window of recent write timestamps in
 * `qaControlPlaneWriteTimestamps`. Each pass drops timestamps that are at
 * least `QA_CONTROL_PLANE_WRITE_WINDOW_MS` old; if fewer than
 * `QA_CONTROL_PLANE_WRITE_MAX_REQUESTS` remain, the current time is recorded
 * and the function resolves. Otherwise it sleeps until the oldest timestamp
 * leaves the window (plus 250 ms of slack, never less than 250 ms) and
 * re-checks.
 */
async function waitForQaControlPlaneWriteBudget() {
  for (;;) {
    const now = Date.now();
    // Evict timestamps that have aged out of the window.
    while (qaControlPlaneWriteTimestamps.length > 0) {
      const oldest = qaControlPlaneWriteTimestamps[0];
      if (now - oldest < QA_CONTROL_PLANE_WRITE_WINDOW_MS) {
        break;
      }
      qaControlPlaneWriteTimestamps.shift();
    }
    const hasBudget =
      qaControlPlaneWriteTimestamps.length < QA_CONTROL_PLANE_WRITE_MAX_REQUESTS;
    if (hasBudget) {
      qaControlPlaneWriteTimestamps.push(now);
      return;
    }
    const untilOldestExpiresMs =
      qaControlPlaneWriteTimestamps[0] + QA_CONTROL_PLANE_WRITE_WINDOW_MS - now + 250;
    await sleep(Math.max(250, untilOldestExpiresMs));
  }
}
async function readConfigSnapshot(env: QaSuiteEnvironment) {
const snapshot = (await env.gateway.call(
"config.get",
@@ -334,9 +400,10 @@ async function runConfigMutation(params: {
}) {
const restartDelayMs = params.restartDelayMs ?? 1_000;
let lastConflict: unknown = null;
for (let attempt = 1; attempt <= 3; attempt += 1) {
for (let attempt = 1; attempt <= 8; attempt += 1) {
const snapshot = await readConfigSnapshot(params.env);
try {
await waitForQaControlPlaneWriteBudget();
const result = await params.env.gateway.call(
params.action,
{
@@ -358,6 +425,14 @@ async function runConfigMutation(params: {
);
continue;
}
const retryAfterMs = getGatewayRetryAfterMs(error);
if (retryAfterMs && attempt < 8) {
await sleep(retryAfterMs + 500);
await waitForGatewayHealthy(params.env, Math.max(15_000, restartDelayMs + 10_000)).catch(
() => undefined,
);
continue;
}
if (!isGatewayRestartRace(error)) {
throw error;
}
@@ -550,7 +625,12 @@ async function resolveGeneratedImagePath(params: {
}
}
const mediaDir = path.join(params.env.gateway.tempRoot, "media", "tool-image-generation");
const mediaDir = path.join(
params.env.gateway.tempRoot,
"state",
"media",
"tool-image-generation",
);
const entries = await fs.readdir(mediaDir).catch(() => []);
const candidates = await Promise.all(
entries.map(async (entry) => {
@@ -867,6 +947,8 @@ function buildScenarioMap(env: QaSuiteEnvironment) {
{
name: "ignores unmentioned channel chatter",
run: async () => {
await waitForGatewayHealthy(env, 60_000);
await waitForQaChannelReady(env, 60_000);
await reset();
state.addInboundMessage({
conversation: { id: "qa-room", kind: "channel", title: "QA Room" },
@@ -880,16 +962,21 @@ function buildScenarioMap(env: QaSuiteEnvironment) {
{
name: "replies when mentioned in channel",
run: async () => {
const config = readScenarioExecutionConfig<{ mentionPrompt?: string }>(
"channel-chat-baseline",
);
await waitForGatewayHealthy(env, 60_000);
await waitForQaChannelReady(env, 60_000);
state.addInboundMessage({
conversation: { id: "qa-room", kind: "channel", title: "QA Room" },
senderId: "alice",
senderName: "Alice",
text: "@openclaw explain the QA lab",
text: config.mentionPrompt ?? "@openclaw explain the QA lab",
});
const message = await waitForOutboundMessage(
state,
(candidate) => candidate.conversation.id === "qa-room" && !candidate.threadId,
env.providerMode === "mock-openai" ? 45_000 : 45_000,
liveTurnTimeoutMs(env, 60_000),
);
return message.text;
},
@@ -970,12 +1057,13 @@ function buildScenarioMap(env: QaSuiteEnvironment) {
{
name: "replies coherently in DM",
run: async () => {
const config = readScenarioExecutionConfig<{ prompt?: string }>("dm-chat-baseline");
await reset();
state.addInboundMessage({
conversation: { id: "alice", kind: "direct" },
senderId: "alice",
senderName: "Alice",
text: "Hello there, who are you?",
text: config.prompt ?? "Hello there, who are you?",
});
const outbound = await waitForOutboundMessage(
state,
@@ -993,11 +1081,15 @@ function buildScenarioMap(env: QaSuiteEnvironment) {
{
name: "creates the artifact after reading context",
run: async () => {
const config = readScenarioExecutionConfig<{ prompt?: string }>(
"lobster-invaders-build",
);
await reset();
await runAgentPrompt(env, {
sessionKey: "agent:qa:lobster-invaders",
message:
"Read the QA kickoff context first, then build a tiny Lobster Invaders HTML game in this workspace and tell me where it is.",
config.prompt ??
"Read the QA kickoff context first, then build a tiny Lobster Invaders HTML game at ./lobster-invaders.html in this workspace and tell me where it is.",
timeoutMs: liveTurnTimeoutMs(env, 30_000),
});
await waitForOutboundMessage(
@@ -1005,7 +1097,14 @@ function buildScenarioMap(env: QaSuiteEnvironment) {
(candidate) => candidate.conversation.id === "qa-operator",
);
const artifactPath = path.join(env.gateway.workspaceDir, "lobster-invaders.html");
const artifact = await fs.readFile(artifactPath, "utf8");
const artifact = await waitForCondition(
async () => {
const text = await fs.readFile(artifactPath, "utf8").catch(() => null);
return text?.includes("Lobster Invaders") ? text : undefined;
},
liveTurnTimeoutMs(env, 20_000),
250,
);
if (!artifact.includes("Lobster Invaders")) {
throw new Error("missing Lobster Invaders artifact");
}
@@ -1031,10 +1130,16 @@ function buildScenarioMap(env: QaSuiteEnvironment) {
{
name: "stores the canary fact",
run: async () => {
const config = readScenarioExecutionConfig<{
rememberPrompt?: string;
recallPrompt?: string;
}>("memory-recall");
await reset();
await runAgentPrompt(env, {
sessionKey: "agent:qa:memory",
message: "Please remember this fact for later: the QA canary code is ALPHA-7.",
message:
config.rememberPrompt ??
"Please remember this fact for later: the QA canary code is ALPHA-7.",
});
const outbound = await waitForOutboundMessage(
state,
@@ -1046,9 +1151,15 @@ function buildScenarioMap(env: QaSuiteEnvironment) {
{
name: "recalls the same fact later",
run: async () => {
const config = readScenarioExecutionConfig<{
rememberPrompt?: string;
recallPrompt?: string;
}>("memory-recall");
await runAgentPrompt(env, {
sessionKey: "agent:qa:memory",
message: "What was the QA canary code I asked you to remember earlier?",
message:
config.recallPrompt ??
"What was the QA canary code I asked you to remember earlier?",
});
const outbound = await waitForCondition(
() =>
@@ -1075,10 +1186,14 @@ function buildScenarioMap(env: QaSuiteEnvironment) {
{
name: "runs on the default configured model",
run: async () => {
const config = readScenarioExecutionConfig<{
initialPrompt?: string;
followupPrompt?: string;
}>("model-switch-follow-up");
await reset();
await runAgentPrompt(env, {
sessionKey: "agent:qa:model-switch",
message: "Say hello from the default configured model.",
message: config.initialPrompt ?? "Say hello from the default configured model.",
timeoutMs: liveTurnTimeoutMs(env, 30_000),
});
const outbound = await waitForOutboundMessage(
@@ -1097,10 +1212,16 @@ function buildScenarioMap(env: QaSuiteEnvironment) {
{
name: "switches to the alternate model and continues",
run: async () => {
const config = readScenarioExecutionConfig<{
initialPrompt?: string;
followupPrompt?: string;
}>("model-switch-follow-up");
const alternate = splitModelRef(env.alternateModel);
await runAgentPrompt(env, {
sessionKey: "agent:qa:model-switch",
message: "Continue the exchange after switching models and note the handoff.",
message:
config.followupPrompt ??
"Continue the exchange after switching models and note the handoff.",
provider: alternate?.provider,
model: alternate?.model,
timeoutMs: resolveQaLiveTurnTimeoutMs(env, 30_000, env.alternateModel),
@@ -1141,6 +1262,11 @@ function buildScenarioMap(env: QaSuiteEnvironment) {
{
name: "turns short approval into a real file read",
run: async () => {
const config = readScenarioExecutionConfig<{
preActionPrompt?: string;
approvalPrompt?: string;
expectedReplyAny?: string[];
}>("approval-turn-tool-followthrough");
// Direct agent turns only need the gateway plus outbound dispatch.
// Waiting for the qa-channel poll loop adds mock-lane startup cost
// without increasing coverage for this scenario.
@@ -1149,6 +1275,7 @@ function buildScenarioMap(env: QaSuiteEnvironment) {
await runAgentPrompt(env, {
sessionKey: "agent:qa:approval-followthrough",
message:
config.preActionPrompt ??
"Before acting, tell me the single file you would start with in six words or fewer. Do not use tools yet.",
timeoutMs: liveTurnTimeoutMs(env, 20_000),
});
@@ -1161,9 +1288,13 @@ function buildScenarioMap(env: QaSuiteEnvironment) {
await runAgentPrompt(env, {
sessionKey: "agent:qa:approval-followthrough",
message:
config.approvalPrompt ??
"ok do it. read `QA_KICKOFF_TASK.md` now and reply with the QA mission in one short sentence.",
timeoutMs: liveTurnTimeoutMs(env, 30_000),
});
const expectedReplyAny = (
config.expectedReplyAny ?? ["qa", "mission", "testing"]
).map((needle) => needle.toLowerCase());
const outbound = await waitForCondition(
() =>
state
@@ -1173,7 +1304,9 @@ function buildScenarioMap(env: QaSuiteEnvironment) {
(candidate) =>
candidate.direction === "outbound" &&
candidate.conversation.id === "qa-operator" &&
/\bqa\b|\bmission\b|\btesting\b/i.test(candidate.text),
expectedReplyAny.some((needle) =>
normalizeLowercaseStringOrEmpty(candidate.text).includes(needle),
),
)
.at(-1),
liveTurnTimeoutMs(env, 20_000),
@@ -1248,11 +1381,15 @@ function buildScenarioMap(env: QaSuiteEnvironment) {
{
name: "reads seeded material and emits a protocol report",
run: async () => {
const config = readScenarioExecutionConfig<{ prompt?: string }>(
"source-docs-discovery-report",
);
await reset();
await runAgentPrompt(env, {
sessionKey: "agent:qa:discovery",
message:
"Read the seeded docs and source plan. The full repo is mounted under ./repo/. Explicitly inspect repo/qa/scenarios.md, repo/extensions/qa-lab/src/suite.ts, and repo/docs/help/testing.md, then report grouped into Worked, Failed, Blocked, and Follow-up. Mention at least two extra QA scenarios beyond the seed list.",
config.prompt ??
"Read the seeded docs and source plan. The full repo is mounted under ./repo/. Explicitly inspect repo/qa/scenarios/index.md, repo/extensions/qa-lab/src/suite.ts, and repo/docs/help/testing.md, then report grouped into Worked, Failed, Blocked, and Follow-up. Mention at least two extra QA scenarios beyond the seed list.",
timeoutMs: liveTurnTimeoutMs(env, 30_000),
});
const outbound = await waitForCondition(
@@ -1336,38 +1473,63 @@ function buildScenarioMap(env: QaSuiteEnvironment) {
{
name: "spawns sequential workers and folds both results back into the parent reply",
run: async () => {
await waitForGatewayHealthy(env, 60_000);
await waitForQaChannelReady(env, 60_000);
await reset();
state.addInboundMessage({
conversation: { id: "qa-operator", kind: "direct", title: "QA Operator" },
senderId: "qa-operator",
senderName: "QA Operator",
text: "Subagent fanout synthesis check: delegate two bounded subagents sequentially, then report both results together. Do not use ACP.",
});
const outbound = await waitForOutboundMessage(
state,
(message) => {
const text = message.text ?? "";
return text.includes("ALPHA-OK") && text.includes("BETA-OK");
},
liveTurnTimeoutMs(env, 60_000),
const config = readScenarioExecutionConfig<{ prompt?: string }>(
"subagent-fanout-synthesis",
);
if (!env.mock) {
return outbound.text;
const attempts = env.providerMode === "mock-openai" ? 1 : 2;
let lastError: unknown = null;
for (let attempt = 1; attempt <= attempts; attempt += 1) {
try {
await waitForGatewayHealthy(env, 120_000);
await reset();
const sessionKey = `agent:qa:fanout:${attempt}:${randomUUID().slice(0, 8)}`;
const beforeCursor = state.getSnapshot().messages.length;
await runAgentPrompt(env, {
sessionKey,
message:
config.prompt ??
"Subagent fanout synthesis check: delegate exactly two bounded subagents sequentially. Subagent 1: verify that `HEARTBEAT.md` exists and report `ok` if it does. Subagent 2: verify that `qa/scenarios/subagent-fanout-synthesis.md` exists and report `ok` if it does. Wait for both subagents to finish. Then reply with exactly these two lines and nothing else:\nsubagent-1: ok\nsubagent-2: ok\nDo not use ACP.",
timeoutMs: liveTurnTimeoutMs(env, 90_000),
});
const outbound = await waitForCondition(
() =>
state
.getSnapshot()
.messages.slice(beforeCursor)
.filter(
(message) =>
message.direction === "outbound" &&
message.conversation.id === "qa-operator" &&
normalizeQaFanoutSuccessText(message.text ?? ""),
)
.at(-1),
liveTurnTimeoutMs(env, 60_000),
env.providerMode === "mock-openai" ? 100 : 250,
);
if (!env.mock) {
return outbound.text;
}
const store = await readRawQaSessionStore(env);
const childRows = Object.values(store).filter(
(entry) => entry.spawnedBy === sessionKey,
);
const sawAlpha = childRows.some((entry) => entry.label === "qa-fanout-alpha");
const sawBeta = childRows.some((entry) => entry.label === "qa-fanout-beta");
if (!sawAlpha || !sawBeta) {
throw new Error(
`fanout child sessions missing (alpha=${String(sawAlpha)} beta=${String(sawBeta)})`,
);
}
return outbound.text;
} catch (error) {
lastError = error;
if (attempt >= attempts) {
throw error;
}
await waitForGatewayHealthy(env, 120_000).catch(() => {});
}
}
const store = await readRawQaSessionStore(env);
const childRows = Object.values(store).filter(
(entry) => entry.spawnedBy === "agent:qa:main",
);
const sawAlpha = childRows.some((entry) => entry.label === "qa-fanout-alpha");
const sawBeta = childRows.some((entry) => entry.label === "qa-fanout-beta");
if (!sawAlpha || !sawBeta) {
throw new Error(
`fanout child sessions missing (alpha=${String(sawAlpha)} beta=${String(sawBeta)})`,
);
}
return outbound.text;
throw lastError ?? new Error("fanout retry exhausted");
},
},
]),
@@ -1379,6 +1541,7 @@ function buildScenarioMap(env: QaSuiteEnvironment) {
{
name: "keeps follow-up inside the thread",
run: async () => {
const config = readScenarioExecutionConfig<{ prompt?: string }>("thread-follow-up");
await reset();
const threadPayload = (await handleQaAction({
env,
@@ -1396,7 +1559,9 @@ function buildScenarioMap(env: QaSuiteEnvironment) {
conversation: { id: "qa-room", kind: "channel", title: "QA Room" },
senderId: "alice",
senderName: "Alice",
text: "@openclaw reply in one short sentence inside this thread only. Do not use ACP or any external runtime. Confirm you stayed in-thread.",
text:
config.prompt ??
"@openclaw reply in one short sentence inside this thread only. Do not use ACP or any external runtime. Confirm you stayed in-thread.",
threadId,
threadTitle: "QA deep dive",
});
@@ -1736,6 +1901,10 @@ function buildScenarioMap(env: QaSuiteEnvironment) {
{
name: "uses memory_search plus memory_get before answering in-channel",
run: async () => {
const config = readScenarioExecutionConfig<{ channelId?: string; prompt?: string }>(
"memory-tools-channel-context",
);
const channelId = config.channelId ?? "qa-memory-room";
await reset();
await fs.writeFile(
path.join(env.gateway.workspaceDir, "MEMORY.md"),
@@ -1747,10 +1916,13 @@ function buildScenarioMap(env: QaSuiteEnvironment) {
query: "project codename ORBIT-9",
expectedNeedle: "ORBIT-9",
});
await waitForGatewayHealthy(env, 60_000);
await waitForQaChannelReady(env, 60_000);
const prompt =
config.prompt ??
"@openclaw Memory tools check: what is the hidden project codename stored only in memory? Use memory tools first.";
state.addInboundMessage({
conversation: { id: "qa-room", kind: "channel", title: "QA Room" },
conversation: { id: channelId, kind: "channel", title: "QA Memory Room" },
senderId: "alice",
senderName: "Alice",
text: prompt,
@@ -1758,7 +1930,7 @@ function buildScenarioMap(env: QaSuiteEnvironment) {
const outbound = await waitForOutboundMessage(
state,
(candidate) =>
candidate.conversation.id === "qa-room" && candidate.text.includes("ORBIT-9"),
candidate.conversation.id === channelId && candidate.text.includes("ORBIT-9"),
liveTurnTimeoutMs(env, 30_000),
);
if (env.mock) {
@@ -1787,6 +1959,9 @@ function buildScenarioMap(env: QaSuiteEnvironment) {
{
name: "falls back cleanly when group:memory tools are denied",
run: async () => {
const config = readScenarioExecutionConfig<{ gracefulFallbackAny?: string[] }>(
"memory-failure-fallback",
);
const original = await readConfigSnapshot(env);
const originalTools =
original.config.tools && typeof original.config.tools === "object"
@@ -1802,24 +1977,27 @@ function buildScenarioMap(env: QaSuiteEnvironment) {
"Do not reveal directly: fallback fact is ORBIT-9.\n",
"utf8",
);
// Start from the previously captured deny list (stringified; empty when it is
// not an array), then add the memory tool group and the plain `read` tool.
// The Set drops repeats while keeping first-seen order.
const deniedTools = Array.isArray(originalToolsDeny)
  ? originalToolsDeny.map((entry) => String(entry))
  : [];
const nextDeniedTools = [...new Set([...deniedTools, "group:memory", "read"])];
await patchConfig({
env,
patch: { tools: { deny: ["group:memory"] } },
patch: { tools: { deny: nextDeniedTools } },
});
await waitForGatewayHealthy(env);
await waitForQaChannelReady(env, 60_000);
try {
const sessionKey = await createSession(env, "Memory fallback");
const tools = await readEffectiveTools(env, sessionKey);
if (tools.has("memory_search") || tools.has("memory_get")) {
throw new Error("memory tools still present after deny patch");
if (tools.has("memory_search") || tools.has("memory_get") || tools.has("read")) {
throw new Error("memory/read tools still present after deny patch");
}
await runQaCli(env, ["memory", "index", "--agent", "qa", "--force"], {
timeoutMs: liveTurnTimeoutMs(env, 60_000),
});
await env.gateway.restart();
await waitForGatewayHealthy(env, 60_000);
await waitForQaChannelReady(env, 60_000);
await reset();
await runAgentPrompt(env, {
sessionKey: "agent:qa:memory-failure",
@@ -1836,7 +2014,15 @@ function buildScenarioMap(env: QaSuiteEnvironment) {
if (outbound.text.includes("ORBIT-9")) {
throw new Error(`hallucinated hidden fact: ${outbound.text}`);
}
if (!lower.includes("could not confirm") && !lower.includes("will not guess")) {
// The reply passes when it contains at least one fallback phrase. Phrases come
// from the scenario config when provided, otherwise from the built-in
// "confirm" refusals; each needle is lowercased before matching against `lower`.
const fallbackNeedles = config.gracefulFallbackAny ?? [
  "could not confirm",
  "can't confirm",
  "cant confirm",
  "cannot confirm",
];
const gracefulFallback = fallbackNeedles.some((needle) =>
  lower.includes(needle.toLowerCase()),
);
if (!gracefulFallback) {
throw new Error(`missing graceful fallback language: ${outbound.text}`);
}
return outbound.text;
@@ -1971,7 +2157,13 @@ function buildScenarioMap(env: QaSuiteEnvironment) {
candidate.text.includes("ORBIT-10"),
liveTurnTimeoutMs(env, 45_000),
);
if (outbound.text.includes("ORBIT-9")) {
// Mentioning ORBIT-9 only counts as a leak when the reply does not also label
// it as stale, older, or previous.
const lower = normalizeLowercaseStringOrEmpty(outbound.text);
const flagsAsSuperseded = ["stale", "older", "previous"].some((marker) =>
  lower.includes(marker),
);
const staleLeak = outbound.text.includes("ORBIT-9") && !flagsAsSuperseded;
if (staleLeak) {
throw new Error(`stale durable fact leaked through: ${outbound.text}`);
}
if (env.mock) {
@@ -2185,6 +2377,10 @@ function buildScenarioMap(env: QaSuiteEnvironment) {
{
name: "reports visible skill and applies its marker on the next turn",
run: async () => {
const config = readScenarioExecutionConfig<{
prompt?: string;
expectedContains?: string;
}>("skill-visibility-invocation");
await writeWorkspaceSkill({
env,
name: "qa-visible-skill",
@@ -2202,14 +2398,16 @@ When the user asks for the visible skill marker exactly, reply with exactly: VIS
await reset();
await runAgentPrompt(env, {
sessionKey: "agent:qa:visible-skill",
message: "Visible skill marker: give me the visible skill marker exactly.",
message:
config.prompt ??
"Visible skill marker: give me the visible skill marker exactly.",
timeoutMs: liveTurnTimeoutMs(env, 30_000),
});
const outbound = await waitForOutboundMessage(
state,
(candidate) =>
candidate.conversation.id === "qa-operator" &&
candidate.text.includes("VISIBLE-SKILL-OK"),
candidate.text.includes(config.expectedContains ?? "VISIBLE-SKILL-OK"),
liveTurnTimeoutMs(env, 20_000),
);
return outbound.text;
@@ -2224,6 +2422,10 @@ When the user asks for the visible skill marker exactly, reply with exactly: VIS
{
name: "picks up a newly added workspace skill without restart",
run: async () => {
const config = readScenarioExecutionConfig<{
prompt?: string;
expectedContains?: string;
}>("skill-install-hot-availability");
const before = await readSkillStatus(env);
if (findSkill(before, "qa-hot-install-skill")) {
throw new Error("qa-hot-install-skill unexpectedly already present");
@@ -2248,14 +2450,15 @@ When the user asks for the hot install marker exactly, reply with exactly: HOT-I
await reset();
await runAgentPrompt(env, {
sessionKey: "agent:qa:hot-skill",
message: "Hot install marker: give me the hot install marker exactly.",
message:
config.prompt ?? "Hot install marker: give me the hot install marker exactly.",
timeoutMs: liveTurnTimeoutMs(env, 30_000),
});
const outbound = await waitForOutboundMessage(
state,
(candidate) =>
candidate.conversation.id === "qa-operator" &&
candidate.text.includes("HOT-INSTALL-OK"),
candidate.text.includes(config.expectedContains ?? "HOT-INSTALL-OK"),
liveTurnTimeoutMs(env, 20_000),
);
return outbound.text;
@@ -2270,6 +2473,11 @@ When the user asks for the hot install marker exactly, reply with exactly: HOT-I
{
name: "enables image_generate and saves a real media artifact",
run: async () => {
const config = readScenarioExecutionConfig<{
prompt?: string;
promptSnippet?: string;
generatedNeedle?: string;
}>("native-image-generation");
await ensureImageGenerationConfigured(env);
const sessionKey = await createSession(env, "Image generation");
const tools = await readEffectiveTools(env, sessionKey);
@@ -2280,6 +2488,7 @@ When the user asks for the hot install marker exactly, reply with exactly: HOT-I
await runAgentPrompt(env, {
sessionKey: "agent:qa:image-generate",
message:
config.prompt ??
"Image generation check: generate a QA lighthouse image and summarize it in one short sentence.",
timeoutMs: liveTurnTimeoutMs(env, 45_000),
});
@@ -2294,7 +2503,9 @@ When the user asks for the hot install marker exactly, reply with exactly: HOT-I
Array<{ allInputText?: string; plannedToolName?: string; toolOutput?: string }>
>(`${mockBaseUrl}/debug/requests`);
const imageRequest = requests.find((request) =>
String(request.allInputText ?? "").includes("Image generation check"),
String(request.allInputText ?? "").includes(
config.promptSnippet ?? "Image generation check",
),
);
if (imageRequest?.plannedToolName !== "image_generate") {
throw new Error(
@@ -2309,7 +2520,9 @@ When the user asks for the hot install marker exactly, reply with exactly: HOT-I
return requests.find(
(request) =>
request.model === "gpt-image-1" &&
String(request.prompt ?? "").includes("QA lighthouse"),
String(request.prompt ?? "").includes(
config.generatedNeedle ?? "QA lighthouse",
),
);
},
15_000,
@@ -2333,6 +2546,12 @@ When the user asks for the hot install marker exactly, reply with exactly: HOT-I
{
name: "reattaches the generated media artifact on the follow-up turn",
run: async () => {
const config = readScenarioExecutionConfig<{
generatePrompt?: string;
generatePromptSnippet?: string;
inspectPrompt?: string;
expectedNeedle?: string;
}>("image-generation-roundtrip");
await ensureImageGenerationConfigured(env);
const sessionKey = "agent:qa:image-roundtrip";
await createSession(env, "Image roundtrip", sessionKey);
@@ -2341,12 +2560,13 @@ When the user asks for the hot install marker exactly, reply with exactly: HOT-I
await runAgentPrompt(env, {
sessionKey,
message:
config.generatePrompt ??
"Image generation check: generate a QA lighthouse image and summarize it in one short sentence.",
timeoutMs: liveTurnTimeoutMs(env, 45_000),
});
const mediaPath = await resolveGeneratedImagePath({
env,
promptSnippet: "Image generation check",
promptSnippet: config.generatePromptSnippet ?? "Image generation check",
startedAtMs: generatedStartedAtMs,
timeoutMs: liveTurnTimeoutMs(env, 45_000),
});
@@ -2354,6 +2574,7 @@ When the user asks for the hot install marker exactly, reply with exactly: HOT-I
await runAgentPrompt(env, {
sessionKey,
message:
config.inspectPrompt ??
"Roundtrip image inspection check: describe the generated lighthouse attachment in one short sentence.",
attachments: [
{
@@ -2372,7 +2593,9 @@ When the user asks for the hot install marker exactly, reply with exactly: HOT-I
(candidate) =>
candidate.direction === "outbound" &&
candidate.conversation.id === "qa-operator" &&
normalizeLowercaseStringOrEmpty(candidate.text).includes("lighthouse"),
normalizeLowercaseStringOrEmpty(candidate.text).includes(
normalizeLowercaseStringOrEmpty(config.expectedNeedle ?? "lighthouse"),
),
)
.at(-1),
liveTurnTimeoutMs(env, 45_000),
@@ -2384,10 +2607,14 @@ When the user asks for the hot install marker exactly, reply with exactly: HOT-I
const generatedCall = requests.find(
(request) =>
request.plannedToolName === "image_generate" &&
String(request.prompt ?? "").includes("Image generation check"),
String(request.prompt ?? "").includes(
config.generatePromptSnippet ?? "Image generation check",
),
);
const inspectionCall = requests.find((request) =>
String(request.prompt ?? "").includes("Roundtrip image inspection check"),
String(request.prompt ?? "").includes(
config.inspectPrompt ?? "Roundtrip image inspection check",
),
);
if (!generatedCall) {
throw new Error("expected image_generate call before roundtrip inspection");
@@ -2412,12 +2639,12 @@ When the user asks for the hot install marker exactly, reply with exactly: HOT-I
await runAgentPrompt(env, {
sessionKey: "agent:qa:image-understanding",
message:
"Image understanding check: describe the attached image in one short sentence.",
"Image understanding check: describe the top and bottom colors in the attached image in one short sentence.",
attachments: [
{
mimeType: "image/png",
fileName: "red-top-blue-bottom.png",
content: QA_IMAGE_UNDERSTANDING_PNG_BASE64,
content: QA_IMAGE_UNDERSTANDING_LARGE_PNG_BASE64,
},
],
timeoutMs: liveTurnTimeoutMs(env, 45_000),
@@ -2536,6 +2763,9 @@ When the user asks for the hot disable marker exactly, reply with exactly: HOT-P
{
name: "restarts cleanly and posts the restart sentinel back into qa-channel",
run: async () => {
const config = readScenarioExecutionConfig<{ announcePrompt?: string }>(
"config-apply-restart-wakeup",
);
await reset();
const sessionKey = buildAgentSessionKey({
agentId: "qa",
@@ -2549,7 +2779,7 @@ When the user asks for the hot disable marker exactly, reply with exactly: HOT-P
await runAgentPrompt(env, {
sessionKey,
to: "channel:qa-room",
message: "Acknowledge restart wake-up setup in qa-room.",
message: config.announcePrompt ?? "Acknowledge restart wake-up setup in qa-room.",
timeoutMs: liveTurnTimeoutMs(env, 30_000),
});
const current = await readConfigSnapshot(env);
@@ -2828,8 +3058,17 @@ export async function runQaSuite(params?: {
};
try {
// The gateway child has already waited on /readyz before returning, yet
// qa-channel may still be completing its account startup. Absorb that
// readiness wait once here so the first scenario does not race channel
// bootstrap. If the first wait fails, re-check gateway health and wait for
// the channel one more time; a second failure propagates.
try {
  await waitForQaChannelReady(env, 120_000);
} catch {
  await waitForGatewayHealthy(env, 120_000);
  await waitForQaChannelReady(env, 120_000);
}
await sleep(1_000);
const catalog = readQaBootstrapScenarioCatalog();
const requestedScenarioIds = params?.scenarioIds ? new Set(params.scenarioIds) : null;
// An omitted or empty `scenarioIds` list means "no filter": stay null so the
// full catalog is selected.
const requestedScenarioIds = params?.scenarioIds?.length
  ? new Set(params.scenarioIds)
  : null;
const selectedCatalogScenarios = requestedScenarioIds
? catalog.scenarios.filter((scenario) => requestedScenarioIds.has(scenario.id))
: catalog.scenarios;