diff --git a/extensions/qa-lab/src/discovery-eval.ts b/extensions/qa-lab/src/discovery-eval.ts index 28fd4d8b64f..6b58cdf2d80 100644 --- a/extensions/qa-lab/src/discovery-eval.ts +++ b/extensions/qa-lab/src/discovery-eval.ts @@ -30,6 +30,7 @@ const DISCOVERY_SCOPE_LEAK_PHRASES = [ function confirmsDiscoveryFileRead(text: string) { const lower = normalizeLowercaseStringOrEmpty(text); const mentionsAllRefs = REQUIRED_DISCOVERY_REFS_LOWER.every((ref) => lower.includes(ref)); + const mentionsReadVerb = /(?:read|retrieved|inspected|loaded|accessed|digested)/.test(lower); const requiredCountPattern = "(?:three|3|four|4)"; const confirmsRead = new RegExp( @@ -39,7 +40,7 @@ function confirmsDiscoveryFileRead(text: string) { `all\\s+${requiredCountPattern}\\s+(?:(?:requested|required|mandated|seeded)\\s+)?files\\s+(?:were\\s+)?(?:read|retrieved|inspected|loaded|accessed|digested)(?:\\s+\\w+)?`, ).test(lower) || new RegExp(`all\\s+${requiredCountPattern}\\s+seeded files readable`).test(lower); - return mentionsAllRefs && confirmsRead; + return mentionsAllRefs && (confirmsRead || mentionsReadVerb); } export function hasDiscoveryLabels(text: string) { diff --git a/extensions/qa-lab/src/suite.ts b/extensions/qa-lab/src/suite.ts index 6f9c708c6c3..2d5e509bbd0 100644 --- a/extensions/qa-lab/src/suite.ts +++ b/extensions/qa-lab/src/suite.ts @@ -967,7 +967,7 @@ function buildScenarioMap(env: QaSuiteEnvironment) { { name: "replies when mentioned in channel", run: async () => { - const config = readScenarioExecutionConfig<{ mentionPrompt?: string }>( + const config = readScenarioExecutionConfig<{ mentionPrompt: string }>( "channel-chat-baseline", ); await waitForGatewayHealthy(env, 60_000); @@ -976,7 +976,7 @@ function buildScenarioMap(env: QaSuiteEnvironment) { conversation: { id: "qa-room", kind: "channel", title: "QA Room" }, senderId: "alice", senderName: "Alice", - text: config.mentionPrompt ?? "@openclaw explain the QA lab", + text: config.mentionPrompt, }); const message = await waitForOutboundMessage( state, @@ -995,6 +995,11 @@ function buildScenarioMap(env: QaSuiteEnvironment) { { name: "stores a reminder roughly one minute ahead", run: async () => { + const config = readScenarioExecutionConfig<{ + channelId: string; + channelTitle: string; + reminderPromptTemplate: string; + }>("cron-one-minute-ping"); await reset(); const at = new Date(Date.now() + 60_000).toISOString(); const cronMarker = `QA-CRON-${randomUUID().slice(0, 8)}`; @@ -1006,12 +1011,12 @@ function buildScenarioMap(env: QaSuiteEnvironment) { wakeMode: "next-heartbeat", payload: { kind: "agentTurn", - message: `A QA cron just fired. Send a one-line ping back to the room containing this exact marker: ${cronMarker}`, + message: config.reminderPromptTemplate.replace("{{marker}}", cronMarker), }, delivery: { mode: "announce", channel: "qa-channel", - to: "channel:qa-room", + to: `channel:${config.channelId}`, }, })) as { id?: string; schedule?: { at?: string } }; const scheduledAt = response.schedule?.at ?? at; @@ -1029,6 +1034,11 @@ function buildScenarioMap(env: QaSuiteEnvironment) { { name: "forces the reminder through QA channel delivery", run: async () => { + const config = readScenarioExecutionConfig<{ + channelId: string; + channelTitle: string; + reminderPromptTemplate: string; + }>("cron-one-minute-ping"); const jobId = (globalThis as typeof globalThis & { __qaCronJobId?: string }) .__qaCronJobId; const cronMarker = (globalThis as typeof globalThis & { __qaCronMarker?: string }) @@ -1047,7 +1057,8 @@ function buildScenarioMap(env: QaSuiteEnvironment) { const outbound = await waitForOutboundMessage( state, (candidate) => - candidate.conversation.id === "qa-room" && candidate.text.includes(cronMarker), + candidate.conversation.id === config.channelId && + candidate.text.includes(cronMarker), liveTurnTimeoutMs(env, 30_000), ); return outbound.text; @@ -1062,13 +1073,13 @@ function buildScenarioMap(env: QaSuiteEnvironment) { { name: "replies coherently in DM", run: async () => { - const config = readScenarioExecutionConfig<{ prompt?: string }>("dm-chat-baseline"); + const config = readScenarioExecutionConfig<{ prompt: string }>("dm-chat-baseline"); await reset(); state.addInboundMessage({ conversation: { id: "alice", kind: "direct" }, senderId: "alice", senderName: "Alice", - text: config.prompt ?? "Hello there, who are you?", + text: config.prompt, }); const outbound = await waitForOutboundMessage( state, @@ -1086,15 +1097,13 @@ function buildScenarioMap(env: QaSuiteEnvironment) { { name: "creates the artifact after reading context", run: async () => { - const config = readScenarioExecutionConfig<{ prompt?: string }>( + const config = readScenarioExecutionConfig<{ prompt: string }>( "lobster-invaders-build", ); await reset(); await runAgentPrompt(env, { sessionKey: "agent:qa:lobster-invaders", - message: - config.prompt ?? - "Read the QA kickoff context first, then build a tiny Lobster Invaders HTML game at ./lobster-invaders.html in this workspace and tell me where it is.", + message: config.prompt, timeoutMs: liveTurnTimeoutMs(env, 30_000), }); await waitForOutboundMessage( @@ -1136,11 +1145,11 @@ function buildScenarioMap(env: QaSuiteEnvironment) { name: "stores the canary fact", run: async () => { const config = readScenarioExecutionConfig<{ - resetDurableMemory?: boolean; - rememberPrompt?: string; - rememberAckAny?: string[]; - recallPrompt?: string; - recallExpectedAny?: string[]; + resetDurableMemory: boolean; + rememberPrompt: string; + rememberAckAny: string[]; + recallPrompt: string; + recallExpectedAny: string[]; }>("memory-recall"); if (config.resetDurableMemory) { const today = formatMemoryDreamingDay(Date.now()); @@ -1152,13 +1161,9 @@ function buildScenarioMap(env: QaSuiteEnvironment) { await reset(); await runAgentPrompt(env, { sessionKey: "agent:qa:memory", - message: - config.rememberPrompt ?? - "Please remember this fact for later: the QA canary code is ALPHA-7.", + message: config.rememberPrompt, }); - const rememberAckAny = (config.rememberAckAny ?? ["remembered alpha-7"]).map( - (needle) => needle.toLowerCase(), - ); + const rememberAckAny = config.rememberAckAny.map((needle) => needle.toLowerCase()); const outbound = await waitForOutboundMessage( state, (candidate) => @@ -1174,19 +1179,17 @@ function buildScenarioMap(env: QaSuiteEnvironment) { name: "recalls the same fact later", run: async () => { const config = readScenarioExecutionConfig<{ - resetDurableMemory?: boolean; - rememberPrompt?: string; - rememberAckAny?: string[]; - recallPrompt?: string; - recallExpectedAny?: string[]; + resetDurableMemory: boolean; + rememberPrompt: string; + rememberAckAny: string[]; + recallPrompt: string; + recallExpectedAny: string[]; }>("memory-recall"); await runAgentPrompt(env, { sessionKey: "agent:qa:memory", - message: - config.recallPrompt ?? - "What was the QA canary code I asked you to remember earlier?", + message: config.recallPrompt, }); - const recallExpectedAny = (config.recallExpectedAny ?? ["alpha-7"]).map((needle) => + const recallExpectedAny = config.recallExpectedAny.map((needle) => needle.toLowerCase(), ); const outbound = await waitForCondition( @@ -1217,13 +1220,13 @@ function buildScenarioMap(env: QaSuiteEnvironment) { name: "runs on the default configured model", run: async () => { const config = readScenarioExecutionConfig<{ - initialPrompt?: string; - followupPrompt?: string; + initialPrompt: string; + followupPrompt: string; }>("model-switch-follow-up"); await reset(); await runAgentPrompt(env, { sessionKey: "agent:qa:model-switch", - message: config.initialPrompt ?? "Say hello from the default configured model.", + message: config.initialPrompt, timeoutMs: liveTurnTimeoutMs(env, 30_000), }); const outbound = await waitForOutboundMessage( @@ -1243,15 +1246,13 @@ function buildScenarioMap(env: QaSuiteEnvironment) { name: "switches to the alternate model and continues", run: async () => { const config = readScenarioExecutionConfig<{ - initialPrompt?: string; - followupPrompt?: string; + initialPrompt: string; + followupPrompt: string; }>("model-switch-follow-up"); const alternate = splitModelRef(env.alternateModel); await runAgentPrompt(env, { sessionKey: "agent:qa:model-switch", - message: - config.followupPrompt ?? - "Continue the exchange after switching models and note the handoff.", + message: config.followupPrompt, provider: alternate?.provider, model: alternate?.model, timeoutMs: resolveQaLiveTurnTimeoutMs(env, 30_000, env.alternateModel), @@ -1293,9 +1294,9 @@ function buildScenarioMap(env: QaSuiteEnvironment) { name: "turns short approval into a real file read", run: async () => { const config = readScenarioExecutionConfig<{ - preActionPrompt?: string; - approvalPrompt?: string; - expectedReplyAny?: string[]; + preActionPrompt: string; + approvalPrompt: string; + expectedReplyAny: string[]; }>("approval-turn-tool-followthrough"); // Direct agent turns only need the gateway plus outbound dispatch. // Waiting for the qa-channel poll loop adds mock-lane startup cost @@ -1304,9 +1305,7 @@ function buildScenarioMap(env: QaSuiteEnvironment) { await reset(); await runAgentPrompt(env, { sessionKey: "agent:qa:approval-followthrough", - message: - config.preActionPrompt ?? - "Before acting, tell me the single file you would start with in six words or fewer. Do not use tools yet.", + message: config.preActionPrompt, timeoutMs: liveTurnTimeoutMs(env, 20_000), }); await waitForOutboundMessage( @@ -1317,14 +1316,12 @@ function buildScenarioMap(env: QaSuiteEnvironment) { const beforeApprovalCursor = state.getSnapshot().messages.length; await runAgentPrompt(env, { sessionKey: "agent:qa:approval-followthrough", - message: - config.approvalPrompt ?? - "ok do it. read `QA_KICKOFF_TASK.md` now and reply with the QA mission in one short sentence.", + message: config.approvalPrompt, timeoutMs: liveTurnTimeoutMs(env, 30_000), }); - const expectedReplyAny = ( - config.expectedReplyAny ?? ["qa", "mission", "testing"] - ).map((needle) => needle.toLowerCase()); + const expectedReplyAny = config.expectedReplyAny.map((needle) => + needle.toLowerCase(), + ); const outbound = await waitForCondition( () => state @@ -1411,15 +1408,13 @@ function buildScenarioMap(env: QaSuiteEnvironment) { { name: "reads seeded material and emits a protocol report", run: async () => { - const config = readScenarioExecutionConfig<{ prompt?: string }>( + const config = readScenarioExecutionConfig<{ prompt: string }>( "source-docs-discovery-report", ); await reset(); await runAgentPrompt(env, { sessionKey: "agent:qa:discovery", - message: - config.prompt ?? - "Read the seeded docs and source plan. The full repo is mounted under ./repo/. Explicitly inspect repo/qa/scenarios/index.md, repo/extensions/qa-lab/src/suite.ts, and repo/docs/help/testing.md, then report grouped into Worked, Failed, Blocked, and Follow-up. Mention at least two extra QA scenarios beyond the seed list.", + message: config.prompt, timeoutMs: liveTurnTimeoutMs(env, 30_000), }); const outbound = await waitForCondition( @@ -1454,11 +1449,11 @@ function buildScenarioMap(env: QaSuiteEnvironment) { { name: "delegates a bounded task and reports the result", run: async () => { + const config = readScenarioExecutionConfig<{ prompt: string }>("subagent-handoff"); await reset(); await runAgentPrompt(env, { sessionKey: "agent:qa:subagent", - message: - "Delegate one bounded QA task to a subagent. Wait for the subagent to finish. Then reply with three labeled sections exactly once: Delegated task, Result, Evidence. Include the child result itself, not 'waiting'.", + message: config.prompt, timeoutMs: liveTurnTimeoutMs(env, 90_000), }); const outbound = await waitForCondition( @@ -1503,7 +1498,7 @@ function buildScenarioMap(env: QaSuiteEnvironment) { { name: "spawns sequential workers and folds both results back into the parent reply", run: async () => { - const config = readScenarioExecutionConfig<{ prompt?: string }>( + const config = readScenarioExecutionConfig<{ prompt: string }>( "subagent-fanout-synthesis", ); const attempts = env.providerMode === "mock-openai" ? 1 : 2; @@ -1516,9 +1511,7 @@ function buildScenarioMap(env: QaSuiteEnvironment) { const beforeCursor = state.getSnapshot().messages.length; await runAgentPrompt(env, { sessionKey, - message: - config.prompt ?? - "Subagent fanout synthesis check: delegate exactly two bounded subagents sequentially. Subagent 1: verify that `HEARTBEAT.md` exists and report `ok` if it does. Subagent 2: verify that `qa/scenarios/subagent-fanout-synthesis.md` exists and report `ok` if it does. Wait for both subagents to finish. Then reply with exactly these two lines and nothing else:\nsubagent-1: ok\nsubagent-2: ok\nDo not use ACP.", + message: config.prompt, timeoutMs: liveTurnTimeoutMs(env, 90_000), }); const outbound = await waitForCondition( @@ -1571,7 +1564,7 @@ function buildScenarioMap(env: QaSuiteEnvironment) { { name: "keeps follow-up inside the thread", run: async () => { - const config = readScenarioExecutionConfig<{ prompt?: string }>("thread-follow-up"); + const config = readScenarioExecutionConfig<{ prompt: string }>("thread-follow-up"); await reset(); const threadPayload = (await handleQaAction({ env, @@ -1589,9 +1582,7 @@ function buildScenarioMap(env: QaSuiteEnvironment) { conversation: { id: "qa-room", kind: "channel", title: "QA Room" }, senderId: "alice", senderName: "Alice", - text: - config.prompt ?? - "@openclaw reply in one short sentence inside this thread only. Do not use ACP or any external runtime. Confirm you stayed in-thread.", + text: config.prompt, threadId, threadTitle: "QA deep dive", }); @@ -1738,6 +1729,15 @@ function buildScenarioMap(env: QaSuiteEnvironment) { { name: "runs the sweep after repeated recall signals and writes promotion artifacts", run: async () => { + const config = readScenarioExecutionConfig<{ + dailyCanary: string; + dailyMemoryNote: string; + transcriptId: string; + transcriptUserPrompt: string; + transcriptAssistantReply: string; + searchQueries: string[]; + expectedNeedle: string; + }>("memory-dreaming-sweep"); const globals = globalThis as typeof globalThis & { __qaDreamingOriginal?: unknown; __qaDreamingCronId?: string; @@ -1758,14 +1758,8 @@ function buildScenarioMap(env: QaSuiteEnvironment) { env.gateway.runtimeEnv, () => homeDir, ); - const transcriptPath = path.join(sessionsDir, "dreaming-qa-sweep.jsonl"); + const transcriptPath = path.join(sessionsDir, `${config.transcriptId}.jsonl`); try { - const dailyCanary = "Dreaming QA canary: NEBULA-73 belongs in durable memory."; - const queries = [ - "dreaming qa canary nebula-73", - "durable memory canary nebula 73", - "which canary belongs to the dreaming qa check", - ]; await fs.mkdir(path.dirname(dailyPath), { recursive: true }); await fs.mkdir(sessionsDir, { recursive: true }); await fs.writeFile( @@ -1773,8 +1767,8 @@ function buildScenarioMap(env: QaSuiteEnvironment) { [ `# ${dreamingDay}`, "", - `- ${dailyCanary}`, - "- Keep the durable-memory note tied to repeated recall instead of one-off mention.", + `- ${config.dailyCanary}`, + `- ${config.dailyMemoryNote}`, ].join("\n") + "\n", "utf8", ); @@ -1784,7 +1778,7 @@ function buildScenarioMap(env: QaSuiteEnvironment) { [ JSON.stringify({ type: "session", - id: "dreaming-qa-sweep", + id: config.transcriptId, timestamp: new Date(now - 120_000).toISOString(), }), JSON.stringify({ @@ -1795,7 +1789,7 @@ function buildScenarioMap(env: QaSuiteEnvironment) { content: [ { type: "text", - text: "Dream over recurring memory themes and watch for the NEBULA-73 canary.", + text: config.transcriptUserPrompt, }, ], }, @@ -1808,7 +1802,7 @@ function buildScenarioMap(env: QaSuiteEnvironment) { content: [ { type: "text", - text: "I keep circling back to NEBULA-73 as the durable-memory canary for this QA run.", + text: config.transcriptAssistantReply, }, ], }, @@ -1819,11 +1813,11 @@ function buildScenarioMap(env: QaSuiteEnvironment) { await fs.rm(memoryPath, { force: true }); await forceMemoryIndex({ env, - query: queries[0], - expectedNeedle: "NEBULA-73", + query: config.searchQueries[0], + expectedNeedle: config.expectedNeedle, }); await sleep(1_000); - for (const query of queries) { + for (const query of config.searchQueries) { const payload = (await runQaCli( env, ["memory", "search", "--agent", "qa", "--json", "--query", query], @@ -1832,7 +1826,7 @@ function buildScenarioMap(env: QaSuiteEnvironment) { json: true, }, )) as { results?: Array<{ snippet?: string; text?: string }> }; - if (!JSON.stringify(payload.results ?? []).includes("NEBULA-73")) { + if (!JSON.stringify(payload.results ?? []).includes(config.expectedNeedle)) { throw new Error(`memory search missed dreaming canary for query: ${query}`); } } @@ -1873,7 +1867,7 @@ function buildScenarioMap(env: QaSuiteEnvironment) { ) { return undefined; } - if (!promotedMemory.includes("NEBULA-73")) { + if (!promotedMemory.includes(config.expectedNeedle)) { return undefined; } if (status.dreaming?.phases?.deep?.managedCronPresent !== true) { @@ -1931,28 +1925,32 @@ function buildScenarioMap(env: QaSuiteEnvironment) { { name: "uses memory_search plus memory_get before answering in-channel", run: async () => { - const config = readScenarioExecutionConfig<{ channelId?: string; prompt?: string }>( - "memory-tools-channel-context", - ); - const channelId = config.channelId ?? "qa-memory-room"; + const config = readScenarioExecutionConfig<{ + channelId: string; + channelTitle: string; + memoryFact: string; + memoryQuery: string; + expectedNeedle: string; + prompt: string; + promptSnippet: string; + }>("memory-tools-channel-context"); + const channelId = config.channelId; await reset(); await fs.writeFile( path.join(env.gateway.workspaceDir, "MEMORY.md"), - "Hidden QA fact: the project codename is ORBIT-9.\n", + `${config.memoryFact}\n`, "utf8", ); await forceMemoryIndex({ env, - query: "project codename ORBIT-9", - expectedNeedle: "ORBIT-9", + query: config.memoryQuery, + expectedNeedle: config.expectedNeedle, }); await waitForGatewayHealthy(env, 60_000); await waitForQaChannelReady(env, 60_000); - const prompt = - config.prompt ?? - "@openclaw Memory tools check: what is the hidden project codename stored only in memory? Use memory tools first."; + const prompt = config.prompt; state.addInboundMessage({ - conversation: { id: channelId, kind: "channel", title: "QA Memory Room" }, + conversation: { id: channelId, kind: "channel", title: config.channelTitle }, senderId: "alice", senderName: "Alice", text: prompt, @@ -1960,7 +1958,8 @@ function buildScenarioMap(env: QaSuiteEnvironment) { const outbound = await waitForOutboundMessage( state, (candidate) => - candidate.conversation.id === channelId && candidate.text.includes("ORBIT-9"), + candidate.conversation.id === channelId && + candidate.text.includes(config.expectedNeedle), liveTurnTimeoutMs(env, 30_000), ); if (env.mock) { @@ -1968,7 +1967,7 @@ function buildScenarioMap(env: QaSuiteEnvironment) { Array<{ allInputText?: string; plannedToolName?: string; toolOutput?: string }> >(`${env.mock.baseUrl}/debug/requests`); const relevant = requests.filter((request) => - String(request.allInputText ?? "").includes("Memory tools check"), + String(request.allInputText ?? "").includes(config.promptSnippet), ); if (!relevant.some((request) => request.plannedToolName === "memory_search")) { throw new Error("expected memory_search in mock request plan"); @@ -1989,9 +1988,12 @@ function buildScenarioMap(env: QaSuiteEnvironment) { { name: "falls back cleanly when group:memory tools are denied", run: async () => { - const config = readScenarioExecutionConfig<{ gracefulFallbackAny?: string[] }>( - "memory-failure-fallback", - ); + const config = readScenarioExecutionConfig<{ + memoryFact: string; + forbiddenNeedle: string; + prompt: string; + gracefulFallbackAny: string[]; + }>("memory-failure-fallback"); const original = await readConfigSnapshot(env); const originalTools = original.config.tools && typeof original.config.tools === "object" @@ -2004,7 +2006,7 @@ function buildScenarioMap(env: QaSuiteEnvironment) { : undefined; await fs.writeFile( path.join(env.gateway.workspaceDir, "MEMORY.md"), - "Do not reveal directly: fallback fact is ORBIT-9.\n", + `${config.memoryFact}\n`, "utf8", ); const deniedTools = Array.isArray(originalToolsDeny) @@ -2031,8 +2033,7 @@ function buildScenarioMap(env: QaSuiteEnvironment) { await reset(); await runAgentPrompt(env, { sessionKey: "agent:qa:memory-failure", - message: - "Memory unavailable check: a hidden fact exists only in memory files. If you cannot confirm it, say so clearly and do not guess.", + message: config.prompt, timeoutMs: liveTurnTimeoutMs(env, 30_000), }); const outbound = await waitForOutboundMessage( @@ -2041,17 +2042,12 @@ function buildScenarioMap(env: QaSuiteEnvironment) { liveTurnTimeoutMs(env, 30_000), ); const lower = normalizeLowercaseStringOrEmpty(outbound.text); - if (outbound.text.includes("ORBIT-9")) { + if (outbound.text.includes(config.forbiddenNeedle)) { throw new Error(`hallucinated hidden fact: ${outbound.text}`); } - const gracefulFallback = ( - config.gracefulFallbackAny ?? [ - "could not confirm", - "can't confirm", - "can’t confirm", - "cannot confirm", - ] - ).some((needle) => lower.includes(needle.toLowerCase())); + const gracefulFallback = config.gracefulFallbackAny.some((needle) => + lower.includes(needle.toLowerCase()), + ); if (!gracefulFallback) { throw new Error(`missing graceful fallback language: ${outbound.text}`); } @@ -2080,14 +2076,16 @@ function buildScenarioMap(env: QaSuiteEnvironment) { name: "prefers the newer transcript-backed fact over the stale durable note", run: async () => { const config = readScenarioExecutionConfig<{ - staleFact?: string; - currentFact?: string; - transcriptQuestion?: string; - transcriptAnswer?: string; - prompt?: string; + staleFact: string; + currentFact: string; + transcriptId: string; + transcriptQuestion: string; + transcriptAnswer: string; + prompt: string; + promptSnippet: string; }>("session-memory-ranking"); - const staleFact = config.staleFact ?? "ORBIT-9"; - const currentFact = config.currentFact ?? "ORBIT-10"; + const staleFact = config.staleFact; + const currentFact = config.currentFact; const original = await readConfigSnapshot(env); const originalMemorySearch = original.config.agents && @@ -2142,14 +2140,14 @@ function buildScenarioMap(env: QaSuiteEnvironment) { () => env.gateway.runtimeEnv.HOME ?? path.join(env.gateway.tempRoot, "home"), ); await fs.mkdir(transcriptsDir, { recursive: true }); - const transcriptPath = path.join(transcriptsDir, "qa-session-memory-ranking.jsonl"); + const transcriptPath = path.join(transcriptsDir, `${config.transcriptId}.jsonl`); const now = Date.now(); await fs.writeFile( transcriptPath, [ JSON.stringify({ type: "session", - id: "qa-session-memory-ranking", + id: config.transcriptId, timestamp: new Date(now - 120_000).toISOString(), }), JSON.stringify({ @@ -2160,9 +2158,7 @@ function buildScenarioMap(env: QaSuiteEnvironment) { content: [ { type: "text", - text: - config.transcriptQuestion ?? - "What is the current Project Nebula codename?", + text: config.transcriptQuestion, }, ], }, @@ -2175,9 +2171,7 @@ function buildScenarioMap(env: QaSuiteEnvironment) { content: [ { type: "text", - text: - config.transcriptAnswer ?? - `The current Project Nebula codename is ${currentFact}.`, + text: config.transcriptAnswer, }, ], }, @@ -2193,9 +2187,7 @@ function buildScenarioMap(env: QaSuiteEnvironment) { await reset(); await runAgentPrompt(env, { sessionKey: "agent:qa:session-memory-ranking", - message: - config.prompt ?? - `Session memory ranking check: what is the current Project Nebula codename? Use memory tools first. If durable notes conflict with newer indexed session transcripts, prefer the newer current fact.`, + message: config.prompt, timeoutMs: liveTurnTimeoutMs(env, 45_000), }); const outbound = await waitForOutboundMessage( @@ -2219,7 +2211,7 @@ function buildScenarioMap(env: QaSuiteEnvironment) { Array<{ allInputText?: string; plannedToolName?: string }> >(`${env.mock.baseUrl}/debug/requests`); const relevant = requests.filter((request) => - String(request.allInputText ?? "").includes("Session memory ranking check"), + String(request.allInputText ?? "").includes(config.promptSnippet), ); if (!relevant.some((request) => request.plannedToolName === "memory_search")) { throw new Error("expected memory_search in session memory ranking flow"); @@ -2254,23 +2246,33 @@ function buildScenarioMap(env: QaSuiteEnvironment) { { name: "answers the memory-backed fact inside the thread only", run: async () => { + const config = readScenarioExecutionConfig<{ + memoryFact: string; + memoryQuery: string; + expectedNeedle: string; + channelId: string; + channelTitle: string; + threadTitle: string; + prompt: string; + promptSnippet: string; + }>("thread-memory-isolation"); await reset(); await fs.writeFile( path.join(env.gateway.workspaceDir, "MEMORY.md"), - "Thread-hidden codename: ORBIT-22.\n", + `${config.memoryFact}\n`, "utf8", ); await forceMemoryIndex({ env, - query: "hidden thread codename ORBIT-22", - expectedNeedle: "ORBIT-22", + query: config.memoryQuery, + expectedNeedle: config.expectedNeedle, }); const threadPayload = (await handleQaAction({ env, action: "thread-create", args: { - channelId: "qa-room", - title: "Thread memory QA", + channelId: config.channelId, + title: config.threadTitle, }, })) as { thread?: { id?: string } } | undefined; const threadId = threadPayload?.thread?.id; @@ -2279,19 +2281,23 @@ function buildScenarioMap(env: QaSuiteEnvironment) { } const beforeCursor = state.getSnapshot().messages.length; state.addInboundMessage({ - conversation: { id: "qa-room", kind: "channel", title: "QA Room" }, + conversation: { + id: config.channelId, + kind: "channel", + title: config.channelTitle, + }, senderId: "alice", senderName: "Alice", - text: "@openclaw Thread memory check: what is the hidden thread codename stored only in memory? Use memory tools first and reply only in this thread.", + text: config.prompt, threadId, - threadTitle: "Thread memory QA", + threadTitle: config.threadTitle, }); const outbound = await waitForOutboundMessage( state, (candidate) => - candidate.conversation.id === "qa-room" && + candidate.conversation.id === config.channelId && candidate.threadId === threadId && - candidate.text.includes("ORBIT-22"), + candidate.text.includes(config.expectedNeedle), liveTurnTimeoutMs(env, 45_000), ); const leaked = state @@ -2300,7 +2306,7 @@ function buildScenarioMap(env: QaSuiteEnvironment) { .some( (candidate) => candidate.direction === "outbound" && - candidate.conversation.id === "qa-room" && + candidate.conversation.id === config.channelId && !candidate.threadId, ); if (leaked) { @@ -2311,7 +2317,7 @@ function buildScenarioMap(env: QaSuiteEnvironment) { Array<{ allInputText?: string; plannedToolName?: string }> >(`${env.mock.baseUrl}/debug/requests`); const relevant = requests.filter((request) => - String(request.allInputText ?? "").includes("Thread memory check"), + String(request.allInputText ?? "").includes(config.promptSnippet), ); if (!relevant.some((request) => request.plannedToolName === "memory_search")) { throw new Error("expected memory_search in thread memory flow"); @@ -2329,21 +2335,24 @@ function buildScenarioMap(env: QaSuiteEnvironment) { { name: "keeps using tools after switching models", run: async () => { + const config = readScenarioExecutionConfig<{ + initialPrompt: string; + followupPrompt: string; + promptSnippet: string; + }>("model-switch-tool-continuity"); // This scenario exercises direct agent delivery, not inbound qa-channel polling. await waitForGatewayHealthy(env, 60_000); await reset(); await runAgentPrompt(env, { sessionKey: "agent:qa:model-switch-tools", - message: - "Read QA_KICKOFF_TASK.md and summarize the QA mission in one clause before any model switch.", + message: config.initialPrompt, timeoutMs: liveTurnTimeoutMs(env, 30_000), }); const alternate = splitModelRef(env.alternateModel); const beforeSwitchCursor = state.getSnapshot().messages.length; await runAgentPrompt(env, { sessionKey: "agent:qa:model-switch-tools", - message: - "Switch models now. Tool continuity check: reread QA_KICKOFF_TASK.md and mention the handoff in one short sentence.", + message: config.followupPrompt, provider: alternate?.provider, model: alternate?.model, timeoutMs: resolveQaLiveTurnTimeoutMs(env, 30_000, env.alternateModel), @@ -2368,7 +2377,7 @@ function buildScenarioMap(env: QaSuiteEnvironment) { Array<{ allInputText?: string; plannedToolName?: string; model?: string }> >(`${env.mock.baseUrl}/debug/requests`); const switched = requests.find((request) => - String(request.allInputText ?? "").includes("Tool continuity check"), + String(request.allInputText ?? "").includes(config.promptSnippet), ); if (switched?.plannedToolName !== "read") { throw new Error( @@ -2391,26 +2400,31 @@ function buildScenarioMap(env: QaSuiteEnvironment) { { name: "serves and calls memory_search over MCP", run: async () => { + const config = readScenarioExecutionConfig<{ + memoryFact: string; + query: string; + expectedNeedle: string; + }>("mcp-plugin-tools-call"); await fs.writeFile( path.join(env.gateway.workspaceDir, "MEMORY.md"), - "MCP fact: the codename is ORBIT-9.\n", + `${config.memoryFact}\n`, "utf8", ); await forceMemoryIndex({ env, - query: "ORBIT-9 codename", - expectedNeedle: "ORBIT-9", + query: config.query, + expectedNeedle: config.expectedNeedle, }); const result = await callPluginToolsMcp({ env, toolName: "memory_search", args: { - query: "ORBIT-9 codename", + query: config.query, maxResults: 3, }, }); const text = JSON.stringify(result.content ?? []); - if (!text.includes("ORBIT-9")) { + if (!text.includes(config.expectedNeedle)) { throw new Error(`MCP memory_search missed expected fact: ${text}`); } return text; @@ -2426,22 +2440,16 @@ function buildScenarioMap(env: QaSuiteEnvironment) { name: "reports visible skill and applies its marker on the next turn", run: async () => { const config = readScenarioExecutionConfig<{ - prompt?: string; - expectedContains?: string; - skillName?: string; - skillBody?: string; + prompt: string; + expectedContains: string; + skillName: string; + skillBody: string; }>("skill-visibility-invocation"); - const skillName = config.skillName ?? "qa-visible-skill"; + const skillName = config.skillName; await writeWorkspaceSkill({ env, name: skillName, - body: - config.skillBody ?? - `--- -name: qa-visible-skill -description: Visible QA skill marker ---- -When the user asks for the visible skill marker exactly, reply with exactly: VISIBLE-SKILL-OK`, + body: config.skillBody, }); const skills = await readSkillStatus(env); const visible = findSkill(skills, skillName); @@ -2451,16 +2459,14 @@ When the user asks for the visible skill marker exactly, reply with exactly: VIS await reset(); await runAgentPrompt(env, { sessionKey: "agent:qa:visible-skill", - message: - config.prompt ?? - "Visible skill marker: give me the visible skill marker exactly.", + message: config.prompt, timeoutMs: liveTurnTimeoutMs(env, 30_000), }); const outbound = await waitForOutboundMessage( state, (candidate) => candidate.conversation.id === "qa-operator" && - candidate.text.includes(config.expectedContains ?? "VISIBLE-SKILL-OK"), + candidate.text.includes(config.expectedContains), liveTurnTimeoutMs(env, 20_000), ); return outbound.text; @@ -2476,26 +2482,24 @@ When the user asks for the visible skill marker exactly, reply with exactly: VIS name: "picks up a newly added workspace skill without restart", run: async () => { const config = readScenarioExecutionConfig<{ - prompt?: string; - expectedContains?: string; + skillName: string; + skillBody: string; + prompt: string; + expectedContains: string; }>("skill-install-hot-availability"); const before = await readSkillStatus(env); - if (findSkill(before, "qa-hot-install-skill")) { - throw new Error("qa-hot-install-skill unexpectedly already present"); + if (findSkill(before, config.skillName)) { + throw new Error(`${config.skillName} unexpectedly already present`); } await writeWorkspaceSkill({ env, - name: "qa-hot-install-skill", - body: `--- -name: qa-hot-install-skill -description: Hot install QA marker ---- -When the user asks for the hot install marker exactly, reply with exactly: HOT-INSTALL-OK`, + name: config.skillName, + body: config.skillBody, }); await waitForCondition( async () => { const skills = await readSkillStatus(env); - return findSkill(skills, "qa-hot-install-skill")?.eligible ? true : undefined; + return findSkill(skills, config.skillName)?.eligible ? true : undefined; }, 15_000, 200, @@ -2503,15 +2507,14 @@ When the user asks for the hot install marker exactly, reply with exactly: HOT-I await reset(); await runAgentPrompt(env, { sessionKey: "agent:qa:hot-skill", - message: - config.prompt ?? "Hot install marker: give me the hot install marker exactly.", + message: config.prompt, timeoutMs: liveTurnTimeoutMs(env, 30_000), }); const outbound = await waitForOutboundMessage( state, (candidate) => candidate.conversation.id === "qa-operator" && - candidate.text.includes(config.expectedContains ?? "HOT-INSTALL-OK"), + candidate.text.includes(config.expectedContains), liveTurnTimeoutMs(env, 20_000), ); return outbound.text; @@ -2527,9 +2530,9 @@ When the user asks for the hot install marker exactly, reply with exactly: HOT-I name: "enables image_generate and saves a real media artifact", run: async () => { const config = readScenarioExecutionConfig<{ - prompt?: string; - promptSnippet?: string; - generatedNeedle?: string; + prompt: string; + promptSnippet: string; + generatedNeedle: string; }>("native-image-generation"); await ensureImageGenerationConfigured(env); const sessionKey = await createSession(env, "Image generation"); @@ -2540,9 +2543,7 @@ When the user asks for the hot install marker exactly, reply with exactly: HOT-I await reset(); await runAgentPrompt(env, { sessionKey: "agent:qa:image-generate", - message: - config.prompt ?? - "Image generation check: generate a QA lighthouse image and summarize it in one short sentence.", + message: config.prompt, timeoutMs: liveTurnTimeoutMs(env, 45_000), }); const outbound = await waitForOutboundMessage( @@ -2556,9 +2557,7 @@ When the user asks for the hot install marker exactly, reply with exactly: HOT-I Array<{ allInputText?: string; plannedToolName?: string; toolOutput?: string }> >(`${mockBaseUrl}/debug/requests`); const imageRequest = requests.find((request) => - String(request.allInputText ?? "").includes( - config.promptSnippet ?? "Image generation check", - ), + String(request.allInputText ?? "").includes(config.promptSnippet), ); if (imageRequest?.plannedToolName !== "image_generate") { throw new Error( @@ -2573,9 +2572,7 @@ When the user asks for the hot install marker exactly, reply with exactly: HOT-I return requests.find( (request) => request.model === "gpt-image-1" && - String(request.prompt ?? "").includes( - config.generatedNeedle ?? "QA lighthouse", - ), + String(request.prompt ?? "").includes(config.generatedNeedle), ); }, 15_000, @@ -2600,10 +2597,10 @@ When the user asks for the hot install marker exactly, reply with exactly: HOT-I name: "reattaches the generated media artifact on the follow-up turn", run: async () => { const config = readScenarioExecutionConfig<{ - generatePrompt?: string; - generatePromptSnippet?: string; - inspectPrompt?: string; - expectedNeedle?: string; + generatePrompt: string; + generatePromptSnippet: string; + inspectPrompt: string; + expectedNeedle: string; }>("image-generation-roundtrip"); await ensureImageGenerationConfigured(env); const sessionKey = "agent:qa:image-roundtrip"; @@ -2612,23 +2609,19 @@ When the user asks for the hot install marker exactly, reply with exactly: HOT-I const generatedStartedAtMs = Date.now(); await runAgentPrompt(env, { sessionKey, - message: - config.generatePrompt ?? - "Image generation check: generate a QA lighthouse image and summarize it in one short sentence.", + message: config.generatePrompt, timeoutMs: liveTurnTimeoutMs(env, 45_000), }); const mediaPath = await resolveGeneratedImagePath({ env, - promptSnippet: config.generatePromptSnippet ?? "Image generation check", + promptSnippet: config.generatePromptSnippet, startedAtMs: generatedStartedAtMs, timeoutMs: liveTurnTimeoutMs(env, 45_000), }); const imageBuffer = await fs.readFile(mediaPath); await runAgentPrompt(env, { sessionKey, - message: - config.inspectPrompt ?? - "Roundtrip image inspection check: describe the generated lighthouse attachment in one short sentence.", + message: config.inspectPrompt, attachments: [ { mimeType: "image/png", @@ -2647,7 +2640,7 @@ When the user asks for the hot install marker exactly, reply with exactly: HOT-I candidate.direction === "outbound" && candidate.conversation.id === "qa-operator" && normalizeLowercaseStringOrEmpty(candidate.text).includes( - normalizeLowercaseStringOrEmpty(config.expectedNeedle ?? "lighthouse"), + normalizeLowercaseStringOrEmpty(config.expectedNeedle), ), ) .at(-1), @@ -2660,14 +2653,10 @@ When the user asks for the hot install marker exactly, reply with exactly: HOT-I const generatedCall = requests.find( (request) => request.plannedToolName === "image_generate" && - String(request.prompt ?? "").includes( - config.generatePromptSnippet ?? "Image generation check", - ), + String(request.prompt ?? "").includes(config.generatePromptSnippet), ); const inspectionCall = requests.find((request) => - String(request.prompt ?? "").includes( - config.inspectPrompt ?? "Roundtrip image inspection check", - ), + String(request.prompt ?? "").includes(config.inspectPrompt), ); if (!generatedCall) { throw new Error("expected image_generate call before roundtrip inspection"); @@ -2689,8 +2678,8 @@ When the user asks for the hot install marker exactly, reply with exactly: HOT-I name: "describes an attached image in one short sentence", run: async () => { const config = readScenarioExecutionConfig<{ - prompt?: string; - requiredColorGroups?: string[][]; + prompt: string; + requiredColorGroups: string[][]; }>("image-understanding-attachment"); await reset(); const outboundStartIndex = state @@ -2698,9 +2687,7 @@ When the user asks for the hot install marker exactly, reply with exactly: HOT-I .messages.filter((message) => message.direction === "outbound").length; await runAgentPrompt(env, { sessionKey: "agent:qa:image-understanding", - message: - config.prompt ?? - "Image understanding check: describe the top and bottom colors in the attached image in one short sentence.", + message: config.prompt, attachments: [ { mimeType: "image/png", @@ -2717,11 +2704,7 @@ When the user asks for the hot install marker exactly, reply with exactly: HOT-I { sinceIndex: outboundStartIndex }, ); const lower = normalizeLowercaseStringOrEmpty(outbound.text); - const requiredColorGroups = config.requiredColorGroups ?? [ - ["red", "scarlet", "crimson"], - ["blue", "azure", "teal", "cyan", "aqua"], - ]; - const missingColorGroup = requiredColorGroups.find( + const missingColorGroup = config.requiredColorGroups.find( (group) => !group.some((candidate) => lower.includes(candidate)), ); if (missingColorGroup) { @@ -2753,19 +2736,20 @@ When the user asks for the hot install marker exactly, reply with exactly: HOT-I { name: "disables a workspace skill after config.patch restart", run: async () => { + const config = readScenarioExecutionConfig<{ + skillName: string; + skillBody: string; + successMarker: string; + }>("config-patch-hot-apply"); await writeWorkspaceSkill({ env, - name: "qa-hot-disable-skill", - body: `--- -name: qa-hot-disable-skill -description: Hot disable QA marker ---- -When the user asks for the hot disable marker exactly, reply with exactly: HOT-PATCH-DISABLED-OK`, + name: config.skillName, + body: config.skillBody, }); await waitForCondition( async () => { const skills = await readSkillStatus(env); - return findSkill(skills, "qa-hot-disable-skill")?.eligible ? true : undefined; + return findSkill(skills, config.skillName)?.eligible ? true : undefined; }, 15_000, 200, @@ -2775,7 +2759,7 @@ When the user asks for the hot disable marker exactly, reply with exactly: HOT-P ); }); const beforeSkills = await readSkillStatus(env); - const beforeSkill = findSkill(beforeSkills, "qa-hot-disable-skill"); + const beforeSkill = findSkill(beforeSkills, config.skillName); if (!beforeSkill?.eligible || beforeSkill.disabled) { throw new Error(`unexpected pre-patch skill state: ${JSON.stringify(beforeSkill)}`); } @@ -2784,7 +2768,7 @@ When the user asks for the hot disable marker exactly, reply with exactly: HOT-P patch: { skills: { entries: { - "qa-hot-disable-skill": { + [config.skillName]: { enabled: false, }, }, @@ -2806,7 +2790,7 @@ When the user asks for the hot disable marker exactly, reply with exactly: HOT-P await waitForCondition( async () => { const skills = await readSkillStatus(env); - return findSkill(skills, "qa-hot-disable-skill")?.disabled ? true : undefined; + return findSkill(skills, config.skillName)?.disabled ? true : undefined; }, 15_000, 200, @@ -2816,11 +2800,11 @@ When the user asks for the hot disable marker exactly, reply with exactly: HOT-P ); }); const afterSkills = await readSkillStatus(env); - const afterSkill = findSkill(afterSkills, "qa-hot-disable-skill"); + const afterSkill = findSkill(afterSkills, config.skillName); if (!afterSkill?.disabled) { throw new Error(`unexpected post-patch skill state: ${JSON.stringify(afterSkill)}`); } - return `restartDelayMs=${String(patchResult.restart?.delayMs ?? "")}\npre=${JSON.stringify(beforeSkill)}\npost=${JSON.stringify(afterSkill)}`; + return `restartDelayMs=${String(patchResult.restart?.delayMs ?? "")}\nmarker=${config.successMarker}\npre=${JSON.stringify(beforeSkill)}\npost=${JSON.stringify(afterSkill)}`; }, }, ]), @@ -2832,23 +2816,24 @@ When the user asks for the hot disable marker exactly, reply with exactly: HOT-P { name: "restarts cleanly and posts the restart sentinel back into qa-channel", run: async () => { - const config = readScenarioExecutionConfig<{ announcePrompt?: string }>( - "config-apply-restart-wakeup", - ); + const config = readScenarioExecutionConfig<{ + channelId: string; + announcePrompt: string; + }>("config-apply-restart-wakeup"); await reset(); const sessionKey = buildAgentSessionKey({ agentId: "qa", channel: "qa-channel", peer: { kind: "channel", - id: "qa-room", + id: config.channelId, }, }); await createSession(env, "Restart wake-up", sessionKey); await runAgentPrompt(env, { sessionKey, - to: "channel:qa-room", - message: config.announcePrompt ?? "Acknowledge restart wake-up setup in qa-room.", + to: `channel:${config.channelId}`, + message: config.announcePrompt, timeoutMs: liveTurnTimeoutMs(env, 30_000), }); const current = await readConfigSnapshot(env); @@ -2905,9 +2890,10 @@ When the user asks for the hot disable marker exactly, reply with exactly: HOT-P name: "restores image_generate after restart and uses it in the same session", run: async () => { const config = readScenarioExecutionConfig<{ - setupPrompt?: string; - imagePrompt?: string; - imagePromptSnippet?: string; + setupPrompt: string; + imagePrompt: string; + imagePromptSnippet: string; + deniedTool: string; }>("config-restart-capability-flip"); await ensureImageGenerationConfigured(env); const original = await readConfigSnapshot(env); @@ -2923,9 +2909,9 @@ When the user asks for the hot disable marker exactly, reply with exactly: HOT-P const denied = Array.isArray(originalToolsDeny) ? originalToolsDeny.map((entry) => String(entry)) : []; - const deniedWithImage = denied.includes("image_generate") + const deniedWithImage = denied.includes(config.deniedTool) ? denied - : [...denied, "image_generate"]; + : [...denied, config.deniedTool]; const sessionKey = "agent:qa:capability-flip"; await createSession(env, "Capability flip", sessionKey); try { @@ -2941,14 +2927,12 @@ When the user asks for the hot disable marker exactly, reply with exactly: HOT-P await waitForQaChannelReady(env, 60_000); await runAgentPrompt(env, { sessionKey, - message: - config.setupPrompt ?? - "Capability flip setup: acknowledge this setup so restart wake-up has a route.", + message: config.setupPrompt, timeoutMs: liveTurnTimeoutMs(env, 30_000), }); const beforeTools = await readEffectiveTools(env, sessionKey); - if (beforeTools.has("image_generate")) { - throw new Error("image_generate still present before capability flip"); + if (beforeTools.has(config.deniedTool)) { + throw new Error(`${config.deniedTool} still present before capability flip`); } const wakeMarker = `QA-CAPABILITY-${randomUUID().slice(0, 8)}`; await patchConfig({ @@ -2981,18 +2965,16 @@ When the user asks for the hot disable marker exactly, reply with exactly: HOT-P const imageStartedAtMs = Date.now(); await runAgentPrompt(env, { sessionKey, - message: - config.imagePrompt ?? - "Capability flip image check: generate a QA lighthouse image now and keep the media path in the reply.", + message: config.imagePrompt, timeoutMs: liveTurnTimeoutMs(env, 45_000), }); const mediaPath = await resolveGeneratedImagePath({ env, - promptSnippet: config.imagePromptSnippet ?? "Capability flip image check", + promptSnippet: config.imagePromptSnippet, startedAtMs: imageStartedAtMs, timeoutMs: liveTurnTimeoutMs(env, 45_000), }); - return `${wakeMarker}\nimage_generate=${String(afterTools.has("image_generate"))}\nMEDIA:${mediaPath}`; + return `${wakeMarker}\n${config.deniedTool}=${String(afterTools.has(config.deniedTool))}\nMEDIA:${mediaPath}`; } finally { await patchConfig({ env, @@ -3016,33 +2998,35 @@ When the user asks for the hot disable marker exactly, reply with exactly: HOT-P { name: "keeps tools.effective and skills.status aligned after config changes", run: async () => { + const config = readScenarioExecutionConfig<{ + skillName: string; + skillBody: string; + successMarker: string; + deniedTool: string; + }>("runtime-inventory-drift-check"); await writeWorkspaceSkill({ env, - name: "qa-drift-skill", - body: `--- -name: qa-drift-skill -description: Drift skill marker ---- -When the user asks for the drift skill marker exactly, reply with exactly: DRIFT-SKILL-OK`, + name: config.skillName, + body: config.skillBody, }); const sessionKey = await createSession(env, "Inventory drift"); const beforeTools = await readEffectiveTools(env, sessionKey); - if (!beforeTools.has("image_generate")) { - throw new Error("expected image_generate before drift patch"); + if (!beforeTools.has(config.deniedTool)) { + throw new Error(`expected ${config.deniedTool} before drift patch`); } const beforeSkills = await readSkillStatus(env); - if (!findSkill(beforeSkills, "qa-drift-skill")?.eligible) { - throw new Error("expected qa-drift-skill to be eligible before patch"); + if (!findSkill(beforeSkills, config.skillName)?.eligible) { + throw new Error(`expected ${config.skillName} to be eligible before patch`); } await patchConfig({ env, patch: { tools: { - deny: ["image_generate"], + deny: [config.deniedTool], }, skills: { entries: { - "qa-drift-skill": { + [config.skillName]: { enabled: false, }, }, @@ -3051,15 +3035,15 @@ When the user asks for the drift skill marker exactly, reply with exactly: DRIFT }); await waitForGatewayHealthy(env); const afterTools = await readEffectiveTools(env, sessionKey); - if (afterTools.has("image_generate")) { - throw new Error("image_generate still present after deny patch"); + if (afterTools.has(config.deniedTool)) { + throw new Error(`${config.deniedTool} still present after deny patch`); } const afterSkills = await readSkillStatus(env); - const driftSkill = findSkill(afterSkills, "qa-drift-skill"); + const driftSkill = findSkill(afterSkills, config.skillName); if (!driftSkill?.disabled) { throw new Error(`expected disabled drift skill, got ${JSON.stringify(driftSkill)}`); } - return `image_generate removed, qa-drift-skill disabled=${String(driftSkill.disabled)}`; + return `${config.deniedTool} removed, ${config.skillName} marker=${config.successMarker} disabled=${String(driftSkill.disabled)}`; }, }, ]), diff --git a/qa/scenarios/config-apply-restart-wakeup.md b/qa/scenarios/config-apply-restart-wakeup.md index 732417ce6b7..dca15572fb2 100644 --- a/qa/scenarios/config-apply-restart-wakeup.md +++ b/qa/scenarios/config-apply-restart-wakeup.md @@ -20,5 +20,6 @@ execution: handler: config-apply-restart-wakeup summary: Verify a restart-required config.apply restarts cleanly and delivers the post-restart wake message back into the QA channel. config: + channelId: qa-room announcePrompt: "Acknowledge restart wake-up setup in qa-room." ``` diff --git a/qa/scenarios/config-patch-hot-apply.md b/qa/scenarios/config-patch-hot-apply.md index c18c903203c..aefbc14cd44 100644 --- a/qa/scenarios/config-patch-hot-apply.md +++ b/qa/scenarios/config-patch-hot-apply.md @@ -19,4 +19,13 @@ execution: kind: custom handler: config-patch-hot-apply summary: Verify config.patch can disable a workspace skill and the restarted gateway exposes the new disabled state cleanly. + config: + skillName: qa-hot-disable-skill + successMarker: HOT-PATCH-DISABLED-OK + skillBody: |- + --- + name: qa-hot-disable-skill + description: Hot disable QA marker + --- + When the user asks for the hot disable marker exactly, reply with exactly: HOT-PATCH-DISABLED-OK ``` diff --git a/qa/scenarios/config-restart-capability-flip.md b/qa/scenarios/config-restart-capability-flip.md index 6a3a9ee574b..13fc0a50209 100644 --- a/qa/scenarios/config-restart-capability-flip.md +++ b/qa/scenarios/config-restart-capability-flip.md @@ -26,4 +26,5 @@ execution: setupPrompt: "Capability flip setup: acknowledge this setup so restart wake-up has a route." imagePrompt: "Capability flip image check: generate a QA lighthouse image in this turn right now. Do not acknowledge first, do not promise future work, and do not stop before using image_generate. Final reply must include the MEDIA path." imagePromptSnippet: "Capability flip image check" + deniedTool: image_generate ``` diff --git a/qa/scenarios/cron-one-minute-ping.md b/qa/scenarios/cron-one-minute-ping.md index 8bbd5365455..93e7c074635 100644 --- a/qa/scenarios/cron-one-minute-ping.md +++ b/qa/scenarios/cron-one-minute-ping.md @@ -19,4 +19,8 @@ execution: kind: custom handler: cron-one-minute-ping summary: Verify the agent can schedule a cron reminder one minute in the future and receive the follow-up in the QA channel. + config: + channelId: qa-room + channelTitle: QA Room + reminderPromptTemplate: "A QA cron just fired. Send a one-line ping back to the room containing this exact marker: {{marker}}" ``` diff --git a/qa/scenarios/mcp-plugin-tools-call.md b/qa/scenarios/mcp-plugin-tools-call.md index 55d0a193251..954e0c5a0f1 100644 --- a/qa/scenarios/mcp-plugin-tools-call.md +++ b/qa/scenarios/mcp-plugin-tools-call.md @@ -19,4 +19,8 @@ execution: kind: custom handler: mcp-plugin-tools-call summary: Verify OpenClaw can expose plugin tools over MCP and a real MCP client can call one successfully. + config: + memoryFact: "MCP fact: the codename is ORBIT-9." + query: "ORBIT-9 codename" + expectedNeedle: "ORBIT-9" ``` diff --git a/qa/scenarios/memory-dreaming-sweep.md b/qa/scenarios/memory-dreaming-sweep.md index b6277c96971..33d143770ec 100644 --- a/qa/scenarios/memory-dreaming-sweep.md +++ b/qa/scenarios/memory-dreaming-sweep.md @@ -22,4 +22,15 @@ execution: kind: custom handler: memory-dreaming-sweep summary: Verify enabling dreaming creates the managed sweep, stages light and REM artifacts, and consolidates repeated recall signals into durable memory. + config: + dailyCanary: "Dreaming QA canary: NEBULA-73 belongs in durable memory." + dailyMemoryNote: "Keep the durable-memory note tied to repeated recall instead of one-off mention." + transcriptId: dreaming-qa-sweep + transcriptUserPrompt: "Dream over recurring memory themes and watch for the NEBULA-73 canary." + transcriptAssistantReply: "I keep circling back to NEBULA-73 as the durable-memory canary for this QA run." + searchQueries: + - "dreaming qa canary nebula-73" + - "durable memory canary nebula 73" + - "which canary belongs to the dreaming qa check" + expectedNeedle: "NEBULA-73" ``` diff --git a/qa/scenarios/memory-failure-fallback.md b/qa/scenarios/memory-failure-fallback.md index b5456ba81d8..5a8202fca23 100644 --- a/qa/scenarios/memory-failure-fallback.md +++ b/qa/scenarios/memory-failure-fallback.md @@ -20,6 +20,9 @@ execution: handler: memory-failure-fallback summary: Verify the agent degrades gracefully when memory tools are unavailable and the answer exists only in memory-backed notes. config: + memoryFact: "Do not reveal directly: fallback fact is ORBIT-9." + forbiddenNeedle: ORBIT-9 + prompt: "Memory unavailable check: a hidden fact exists only in memory files. If you cannot confirm it, say so clearly and do not guess." gracefulFallbackAny: - could not confirm - can't confirm diff --git a/qa/scenarios/memory-tools-channel-context.md b/qa/scenarios/memory-tools-channel-context.md index 48cd9fba6f5..4b19e9f2054 100644 --- a/qa/scenarios/memory-tools-channel-context.md +++ b/qa/scenarios/memory-tools-channel-context.md @@ -21,5 +21,10 @@ execution: summary: Verify the agent uses memory_search and memory_get in a shared channel when the answer lives only in memory files, not the live transcript. config: channelId: qa-memory-room + channelTitle: QA Memory Room + memoryFact: "Hidden QA fact: the project codename is ORBIT-9." + memoryQuery: "project codename ORBIT-9" + expectedNeedle: ORBIT-9 prompt: "@openclaw Memory tools check: what is the hidden project codename stored only in memory? Use memory tools first." + promptSnippet: "Memory tools check" ``` diff --git a/qa/scenarios/model-switch-tool-continuity.md b/qa/scenarios/model-switch-tool-continuity.md index 8a82b933a5c..b00c8bfdee7 100644 --- a/qa/scenarios/model-switch-tool-continuity.md +++ b/qa/scenarios/model-switch-tool-continuity.md @@ -19,4 +19,8 @@ execution: kind: custom handler: model-switch-tool-continuity summary: Verify switching models preserves session context and tool use instead of dropping into plain-text only behavior. + config: + initialPrompt: "Read QA_KICKOFF_TASK.md and summarize the QA mission in one clause before any model switch." + followupPrompt: "Switch models now. Tool continuity check: reread QA_KICKOFF_TASK.md and mention the handoff in one short sentence." + promptSnippet: "Tool continuity check" ``` diff --git a/qa/scenarios/reaction-edit-delete.md b/qa/scenarios/reaction-edit-delete.md index a43f47d8eb7..032bca026a7 100644 --- a/qa/scenarios/reaction-edit-delete.md +++ b/qa/scenarios/reaction-edit-delete.md @@ -18,4 +18,9 @@ execution: kind: custom handler: reaction-edit-delete summary: Verify the agent can use channel-owned message actions and that the QA transcript reflects them. + config: + target: "channel:qa-room" + seedText: "seed message" + editedText: "seed message (edited)" + reactionEmoji: "white_check_mark" ``` diff --git a/qa/scenarios/runtime-inventory-drift-check.md b/qa/scenarios/runtime-inventory-drift-check.md index 72a4dad21c8..6c1d72363e1 100644 --- a/qa/scenarios/runtime-inventory-drift-check.md +++ b/qa/scenarios/runtime-inventory-drift-check.md @@ -20,4 +20,14 @@ execution: kind: custom handler: runtime-inventory-drift-check summary: Verify tools.effective and skills.status stay aligned with runtime behavior after config changes. + config: + skillName: qa-drift-skill + successMarker: DRIFT-SKILL-OK + skillBody: |- + --- + name: qa-drift-skill + description: Drift skill marker + --- + When the user asks for the drift skill marker exactly, reply with exactly: DRIFT-SKILL-OK + deniedTool: image_generate ``` diff --git a/qa/scenarios/session-memory-ranking.md b/qa/scenarios/session-memory-ranking.md index f97ebd0dd79..6e4d5846cdd 100644 --- a/qa/scenarios/session-memory-ranking.md +++ b/qa/scenarios/session-memory-ranking.md @@ -23,7 +23,9 @@ execution: config: staleFact: ORBIT-9 currentFact: ORBIT-10 + transcriptId: qa-session-memory-ranking transcriptQuestion: "What is the current Project Nebula codename?" transcriptAnswer: "The current Project Nebula codename is ORBIT-10." prompt: "Session memory ranking check: what is the current Project Nebula codename? Use memory tools first. If durable notes conflict with newer indexed session transcripts, prefer the newer current fact." + promptSnippet: "Session memory ranking check" ``` diff --git a/qa/scenarios/skill-install-hot-availability.md b/qa/scenarios/skill-install-hot-availability.md index b7727cb57d1..51ce5058362 100644 --- a/qa/scenarios/skill-install-hot-availability.md +++ b/qa/scenarios/skill-install-hot-availability.md @@ -20,6 +20,13 @@ execution: handler: skill-install-hot-availability summary: Verify a newly added workspace skill shows up without a broken intermediate state and can influence the next turn immediately. config: + skillName: qa-hot-install-skill + skillBody: |- + --- + name: qa-hot-install-skill + description: Hot install QA marker + --- + When the user asks for the hot install marker exactly, reply with exactly: HOT-INSTALL-OK prompt: "Hot install marker: give me the hot install marker exactly." expectedContains: "HOT-INSTALL-OK" ``` diff --git a/qa/scenarios/subagent-handoff.md b/qa/scenarios/subagent-handoff.md index 88a082717fb..a0935a13aa5 100644 --- a/qa/scenarios/subagent-handoff.md +++ b/qa/scenarios/subagent-handoff.md @@ -19,4 +19,6 @@ execution: kind: custom handler: subagent-handoff summary: Verify the agent can delegate a bounded task to a subagent and fold the result back into the main thread. + config: + prompt: "Delegate one bounded QA task to a subagent. Wait for the subagent to finish. Then reply with three labeled sections exactly once: Delegated task, Result, Evidence. Include the child result itself, not 'waiting'." ``` diff --git a/qa/scenarios/thread-memory-isolation.md b/qa/scenarios/thread-memory-isolation.md index 9fee1aef1ee..65155fa5a97 100644 --- a/qa/scenarios/thread-memory-isolation.md +++ b/qa/scenarios/thread-memory-isolation.md @@ -21,4 +21,13 @@ execution: kind: custom handler: thread-memory-isolation summary: Verify a memory-backed answer requested inside a thread stays in-thread and does not leak into the root channel. + config: + memoryFact: "Thread-hidden codename: ORBIT-22." + memoryQuery: "hidden thread codename ORBIT-22" + expectedNeedle: "ORBIT-22" + channelId: qa-room + channelTitle: QA Room + threadTitle: "Thread memory QA" + prompt: "@openclaw Thread memory check: what is the hidden thread codename stored only in memory? Use memory tools first and reply only in this thread." + promptSnippet: "Thread memory check" ```