refactor: move qa suite logic into scenario markdown

This commit is contained in:
Peter Steinberger
2026-04-08 09:13:36 +01:00
parent 45542fa726
commit 492e98a88a
17 changed files with 333 additions and 271 deletions

View File

@@ -30,6 +30,7 @@ const DISCOVERY_SCOPE_LEAK_PHRASES = [
function confirmsDiscoveryFileRead(text: string) {
const lower = normalizeLowercaseStringOrEmpty(text);
const mentionsAllRefs = REQUIRED_DISCOVERY_REFS_LOWER.every((ref) => lower.includes(ref));
const mentionsReadVerb = /(?:read|retrieved|inspected|loaded|accessed|digested)/.test(lower);
const requiredCountPattern = "(?:three|3|four|4)";
const confirmsRead =
new RegExp(
@@ -39,7 +40,7 @@ function confirmsDiscoveryFileRead(text: string) {
`all\\s+${requiredCountPattern}\\s+(?:(?:requested|required|mandated|seeded)\\s+)?files\\s+(?:were\\s+)?(?:read|retrieved|inspected|loaded|accessed|digested)(?:\\s+\\w+)?`,
).test(lower) ||
new RegExp(`all\\s+${requiredCountPattern}\\s+seeded files readable`).test(lower);
return mentionsAllRefs && confirmsRead;
return mentionsAllRefs && (confirmsRead || mentionsReadVerb);
}
export function hasDiscoveryLabels(text: string) {

File diff suppressed because it is too large Load Diff

View File

@@ -20,5 +20,6 @@ execution:
handler: config-apply-restart-wakeup
summary: Verify a restart-required config.apply restarts cleanly and delivers the post-restart wake message back into the QA channel.
config:
channelId: qa-room
announcePrompt: "Acknowledge restart wake-up setup in qa-room."
```

View File

@@ -19,4 +19,13 @@ execution:
kind: custom
handler: config-patch-hot-apply
summary: Verify config.patch can disable a workspace skill and the restarted gateway exposes the new disabled state cleanly.
config:
skillName: qa-hot-disable-skill
successMarker: HOT-PATCH-DISABLED-OK
skillBody: |-
---
name: qa-hot-disable-skill
description: Hot disable QA marker
---
When the user asks for the hot disable marker exactly, reply with exactly: HOT-PATCH-DISABLED-OK
```

View File

@@ -26,4 +26,5 @@ execution:
setupPrompt: "Capability flip setup: acknowledge this setup so restart wake-up has a route."
imagePrompt: "Capability flip image check: generate a QA lighthouse image in this turn right now. Do not acknowledge first, do not promise future work, and do not stop before using image_generate. Final reply must include the MEDIA path."
imagePromptSnippet: "Capability flip image check"
deniedTool: image_generate
```

View File

@@ -19,4 +19,8 @@ execution:
kind: custom
handler: cron-one-minute-ping
summary: Verify the agent can schedule a cron reminder one minute in the future and receive the follow-up in the QA channel.
config:
channelId: qa-room
channelTitle: QA Room
reminderPromptTemplate: "A QA cron just fired. Send a one-line ping back to the room containing this exact marker: {{marker}}"
```

View File

@@ -19,4 +19,8 @@ execution:
kind: custom
handler: mcp-plugin-tools-call
summary: Verify OpenClaw can expose plugin tools over MCP and a real MCP client can call one successfully.
config:
memoryFact: "MCP fact: the codename is ORBIT-9."
query: "ORBIT-9 codename"
expectedNeedle: "ORBIT-9"
```

View File

@@ -22,4 +22,15 @@ execution:
kind: custom
handler: memory-dreaming-sweep
summary: Verify enabling dreaming creates the managed sweep, stages light and REM artifacts, and consolidates repeated recall signals into durable memory.
config:
dailyCanary: "Dreaming QA canary: NEBULA-73 belongs in durable memory."
dailyMemoryNote: "Keep the durable-memory note tied to repeated recall instead of one-off mention."
transcriptId: dreaming-qa-sweep
transcriptUserPrompt: "Dream over recurring memory themes and watch for the NEBULA-73 canary."
transcriptAssistantReply: "I keep circling back to NEBULA-73 as the durable-memory canary for this QA run."
searchQueries:
- "dreaming qa canary nebula-73"
- "durable memory canary nebula 73"
- "which canary belongs to the dreaming qa check"
expectedNeedle: "NEBULA-73"
```

View File

@@ -20,6 +20,9 @@ execution:
handler: memory-failure-fallback
summary: Verify the agent degrades gracefully when memory tools are unavailable and the answer exists only in memory-backed notes.
config:
memoryFact: "Do not reveal directly: fallback fact is ORBIT-9."
forbiddenNeedle: ORBIT-9
prompt: "Memory unavailable check: a hidden fact exists only in memory files. If you cannot confirm it, say so clearly and do not guess."
gracefulFallbackAny:
- could not confirm
- can't confirm

View File

@@ -21,5 +21,10 @@ execution:
summary: Verify the agent uses memory_search and memory_get in a shared channel when the answer lives only in memory files, not the live transcript.
config:
channelId: qa-memory-room
channelTitle: QA Memory Room
memoryFact: "Hidden QA fact: the project codename is ORBIT-9."
memoryQuery: "project codename ORBIT-9"
expectedNeedle: ORBIT-9
prompt: "@openclaw Memory tools check: what is the hidden project codename stored only in memory? Use memory tools first."
promptSnippet: "Memory tools check"
```

View File

@@ -19,4 +19,8 @@ execution:
kind: custom
handler: model-switch-tool-continuity
summary: Verify switching models preserves session context and tool use instead of dropping into plain-text only behavior.
config:
initialPrompt: "Read QA_KICKOFF_TASK.md and summarize the QA mission in one clause before any model switch."
followupPrompt: "Switch models now. Tool continuity check: reread QA_KICKOFF_TASK.md and mention the handoff in one short sentence."
promptSnippet: "Tool continuity check"
```

View File

@@ -18,4 +18,9 @@ execution:
kind: custom
handler: reaction-edit-delete
summary: Verify the agent can use channel-owned message actions and that the QA transcript reflects them.
config:
target: "channel:qa-room"
seedText: "seed message"
editedText: "seed message (edited)"
reactionEmoji: "white_check_mark"
```

View File

@@ -20,4 +20,14 @@ execution:
kind: custom
handler: runtime-inventory-drift-check
summary: Verify tools.effective and skills.status stay aligned with runtime behavior after config changes.
config:
skillName: qa-drift-skill
successMarker: DRIFT-SKILL-OK
skillBody: |-
---
name: qa-drift-skill
description: Drift skill marker
---
When the user asks for the drift skill marker exactly, reply with exactly: DRIFT-SKILL-OK
deniedTool: image_generate
```

View File

@@ -23,7 +23,9 @@ execution:
config:
staleFact: ORBIT-9
currentFact: ORBIT-10
transcriptId: qa-session-memory-ranking
transcriptQuestion: "What is the current Project Nebula codename?"
transcriptAnswer: "The current Project Nebula codename is ORBIT-10."
prompt: "Session memory ranking check: what is the current Project Nebula codename? Use memory tools first. If durable notes conflict with newer indexed session transcripts, prefer the newer current fact."
promptSnippet: "Session memory ranking check"
```

View File

@@ -20,6 +20,13 @@ execution:
handler: skill-install-hot-availability
summary: Verify a newly added workspace skill shows up without a broken intermediate state and can influence the next turn immediately.
config:
skillName: qa-hot-install-skill
skillBody: |-
---
name: qa-hot-install-skill
description: Hot install QA marker
---
When the user asks for the hot install marker exactly, reply with exactly: HOT-INSTALL-OK
prompt: "Hot install marker: give me the hot install marker exactly."
expectedContains: "HOT-INSTALL-OK"
```

View File

@@ -19,4 +19,6 @@ execution:
kind: custom
handler: subagent-handoff
summary: Verify the agent can delegate a bounded task to a subagent and fold the result back into the main thread.
config:
prompt: "Delegate one bounded QA task to a subagent. Wait for the subagent to finish. Then reply with three labeled sections exactly once: Delegated task, Result, Evidence. Include the child result itself, not 'waiting'."
```

View File

@@ -21,4 +21,13 @@ execution:
kind: custom
handler: thread-memory-isolation
summary: Verify a memory-backed answer requested inside a thread stays in-thread and does not leak into the root channel.
config:
memoryFact: "Thread-hidden codename: ORBIT-22."
memoryQuery: "hidden thread codename ORBIT-22"
expectedNeedle: "ORBIT-22"
channelId: qa-room
channelTitle: QA Room
threadTitle: "Thread memory QA"
prompt: "@openclaw Thread memory check: what is the hidden thread codename stored only in memory? Use memory tools first and reply only in this thread."
promptSnippet: "Thread memory check"
```