refactor: move qa suite logic into scenario markdown

2026-04-12 09:41:11 +00:00 · 2026-04-08 09:13:36 +01:00
parent 45542fa726
commit 492e98a88a
17 changed files with 333 additions and 271 deletions
--- a/extensions/qa-lab/src/discovery-eval.ts
+++ b/extensions/qa-lab/src/discovery-eval.ts
@@ -30,6 +30,7 @@ const DISCOVERY_SCOPE_LEAK_PHRASES = [
 function confirmsDiscoveryFileRead(text: string) {
  const lower = normalizeLowercaseStringOrEmpty(text);
  const mentionsAllRefs = REQUIRED_DISCOVERY_REFS_LOWER.every((ref) => lower.includes(ref));
+  const mentionsReadVerb = /(?:read|retrieved|inspected|loaded|accessed|digested)/.test(lower);
  const requiredCountPattern = "(?:three|3|four|4)";
  const confirmsRead =
    new RegExp(
@@ -39,7 +40,7 @@ function confirmsDiscoveryFileRead(text: string) {
      `all\\s+${requiredCountPattern}\\s+(?:(?:requested|required|mandated|seeded)\\s+)?files\\s+(?:were\\s+)?(?:read|retrieved|inspected|loaded|accessed|digested)(?:\\s+\\w+)?`,
    ).test(lower) ||
    new RegExp(`all\\s+${requiredCountPattern}\\s+seeded files readable`).test(lower);
-  return mentionsAllRefs && confirmsRead;
+  return mentionsAllRefs && (confirmsRead || mentionsReadVerb);
 }

 export function hasDiscoveryLabels(text: string) {
--- a/extensions/qa-lab/src/suite.ts
+++ b/extensions/qa-lab/src/suite.ts
--- a/qa/scenarios/config-apply-restart-wakeup.md
+++ b/qa/scenarios/config-apply-restart-wakeup.md
@@ -20,5 +20,6 @@ execution:
  handler: config-apply-restart-wakeup
  summary: Verify a restart-required config.apply restarts cleanly and delivers the post-restart wake message back into the QA channel.
  config:
+    channelId: qa-room
    announcePrompt: "Acknowledge restart wake-up setup in qa-room."
 ```
--- a/qa/scenarios/config-patch-hot-apply.md
+++ b/qa/scenarios/config-patch-hot-apply.md
@@ -19,4 +19,13 @@ execution:
  kind: custom
  handler: config-patch-hot-apply
  summary: Verify config.patch can disable a workspace skill and the restarted gateway exposes the new disabled state cleanly.
+  config:
+    skillName: qa-hot-disable-skill
+    successMarker: HOT-PATCH-DISABLED-OK
+    skillBody: |-
+      ---
+      name: qa-hot-disable-skill
+      description: Hot disable QA marker
+      ---
+      When the user asks for the hot disable marker exactly, reply with exactly: HOT-PATCH-DISABLED-OK
 ```
--- a/qa/scenarios/config-restart-capability-flip.md
+++ b/qa/scenarios/config-restart-capability-flip.md
@@ -26,4 +26,5 @@ execution:
    setupPrompt: "Capability flip setup: acknowledge this setup so restart wake-up has a route."
    imagePrompt: "Capability flip image check: generate a QA lighthouse image in this turn right now. Do not acknowledge first, do not promise future work, and do not stop before using image_generate. Final reply must include the MEDIA path."
    imagePromptSnippet: "Capability flip image check"
+    deniedTool: image_generate
 ```
--- a/qa/scenarios/cron-one-minute-ping.md
+++ b/qa/scenarios/cron-one-minute-ping.md
@@ -19,4 +19,8 @@ execution:
  kind: custom
  handler: cron-one-minute-ping
  summary: Verify the agent can schedule a cron reminder one minute in the future and receive the follow-up in the QA channel.
+  config:
+    channelId: qa-room
+    channelTitle: QA Room
+    reminderPromptTemplate: "A QA cron just fired. Send a one-line ping back to the room containing this exact marker: {{marker}}"
 ```
--- a/qa/scenarios/mcp-plugin-tools-call.md
+++ b/qa/scenarios/mcp-plugin-tools-call.md
@@ -19,4 +19,8 @@ execution:
  kind: custom
  handler: mcp-plugin-tools-call
  summary: Verify OpenClaw can expose plugin tools over MCP and a real MCP client can call one successfully.
+  config:
+    memoryFact: "MCP fact: the codename is ORBIT-9."
+    query: "ORBIT-9 codename"
+    expectedNeedle: "ORBIT-9"
 ```
--- a/qa/scenarios/memory-dreaming-sweep.md
+++ b/qa/scenarios/memory-dreaming-sweep.md
@@ -22,4 +22,15 @@ execution:
  kind: custom
  handler: memory-dreaming-sweep
  summary: Verify enabling dreaming creates the managed sweep, stages light and REM artifacts, and consolidates repeated recall signals into durable memory.
+  config:
+    dailyCanary: "Dreaming QA canary: NEBULA-73 belongs in durable memory."
+    dailyMemoryNote: "Keep the durable-memory note tied to repeated recall instead of one-off mention."
+    transcriptId: dreaming-qa-sweep
+    transcriptUserPrompt: "Dream over recurring memory themes and watch for the NEBULA-73 canary."
+    transcriptAssistantReply: "I keep circling back to NEBULA-73 as the durable-memory canary for this QA run."
+    searchQueries:
+      - "dreaming qa canary nebula-73"
+      - "durable memory canary nebula 73"
+      - "which canary belongs to the dreaming qa check"
+    expectedNeedle: "NEBULA-73"
 ```
--- a/qa/scenarios/memory-failure-fallback.md
+++ b/qa/scenarios/memory-failure-fallback.md
@@ -20,6 +20,9 @@ execution:
  handler: memory-failure-fallback
  summary: Verify the agent degrades gracefully when memory tools are unavailable and the answer exists only in memory-backed notes.
  config:
+    memoryFact: "Do not reveal directly: fallback fact is ORBIT-9."
+    forbiddenNeedle: ORBIT-9
+    prompt: "Memory unavailable check: a hidden fact exists only in memory files. If you cannot confirm it, say so clearly and do not guess."
    gracefulFallbackAny:
      - could not confirm
      - can't confirm
--- a/qa/scenarios/memory-tools-channel-context.md
+++ b/qa/scenarios/memory-tools-channel-context.md
@@ -21,5 +21,10 @@ execution:
  summary: Verify the agent uses memory_search and memory_get in a shared channel when the answer lives only in memory files, not the live transcript.
  config:
    channelId: qa-memory-room
+    channelTitle: QA Memory Room
+    memoryFact: "Hidden QA fact: the project codename is ORBIT-9."
+    memoryQuery: "project codename ORBIT-9"
+    expectedNeedle: ORBIT-9
    prompt: "@openclaw Memory tools check: what is the hidden project codename stored only in memory? Use memory tools first."
+    promptSnippet: "Memory tools check"
 ```
--- a/qa/scenarios/model-switch-tool-continuity.md
+++ b/qa/scenarios/model-switch-tool-continuity.md
@@ -19,4 +19,8 @@ execution:
  kind: custom
  handler: model-switch-tool-continuity
  summary: Verify switching models preserves session context and tool use instead of dropping into plain-text only behavior.
+  config:
+    initialPrompt: "Read QA_KICKOFF_TASK.md and summarize the QA mission in one clause before any model switch."
+    followupPrompt: "Switch models now. Tool continuity check: reread QA_KICKOFF_TASK.md and mention the handoff in one short sentence."
+    promptSnippet: "Tool continuity check"
 ```
--- a/qa/scenarios/reaction-edit-delete.md
+++ b/qa/scenarios/reaction-edit-delete.md
@@ -18,4 +18,9 @@ execution:
  kind: custom
  handler: reaction-edit-delete
  summary: Verify the agent can use channel-owned message actions and that the QA transcript reflects them.
+  config:
+    target: "channel:qa-room"
+    seedText: "seed message"
+    editedText: "seed message (edited)"
+    reactionEmoji: "white_check_mark"
 ```
--- a/qa/scenarios/runtime-inventory-drift-check.md
+++ b/qa/scenarios/runtime-inventory-drift-check.md
@@ -20,4 +20,14 @@ execution:
  kind: custom
  handler: runtime-inventory-drift-check
  summary: Verify tools.effective and skills.status stay aligned with runtime behavior after config changes.
+  config:
+    skillName: qa-drift-skill
+    successMarker: DRIFT-SKILL-OK
+    skillBody: |-
+      ---
+      name: qa-drift-skill
+      description: Drift skill marker
+      ---
+      When the user asks for the drift skill marker exactly, reply with exactly: DRIFT-SKILL-OK
+    deniedTool: image_generate
 ```
--- a/qa/scenarios/session-memory-ranking.md
+++ b/qa/scenarios/session-memory-ranking.md
@@ -23,7 +23,9 @@ execution:
  config:
    staleFact: ORBIT-9
    currentFact: ORBIT-10
+    transcriptId: qa-session-memory-ranking
    transcriptQuestion: "What is the current Project Nebula codename?"
    transcriptAnswer: "The current Project Nebula codename is ORBIT-10."
    prompt: "Session memory ranking check: what is the current Project Nebula codename? Use memory tools first. If durable notes conflict with newer indexed session transcripts, prefer the newer current fact."
+    promptSnippet: "Session memory ranking check"
 ```
--- a/qa/scenarios/skill-install-hot-availability.md
+++ b/qa/scenarios/skill-install-hot-availability.md
@@ -20,6 +20,13 @@ execution:
  handler: skill-install-hot-availability
  summary: Verify a newly added workspace skill shows up without a broken intermediate state and can influence the next turn immediately.
  config:
+    skillName: qa-hot-install-skill
+    skillBody: |-
+      ---
+      name: qa-hot-install-skill
+      description: Hot install QA marker
+      ---
+      When the user asks for the hot install marker exactly, reply with exactly: HOT-INSTALL-OK
    prompt: "Hot install marker: give me the hot install marker exactly."
    expectedContains: "HOT-INSTALL-OK"
 ```
--- a/qa/scenarios/subagent-handoff.md
+++ b/qa/scenarios/subagent-handoff.md
@@ -19,4 +19,6 @@ execution:
  kind: custom
  handler: subagent-handoff
  summary: Verify the agent can delegate a bounded task to a subagent and fold the result back into the main thread.
+  config:
+    prompt: "Delegate one bounded QA task to a subagent. Wait for the subagent to finish. Then reply with three labeled sections exactly once: Delegated task, Result, Evidence. Include the child result itself, not 'waiting'."
 ```
--- a/qa/scenarios/thread-memory-isolation.md
+++ b/qa/scenarios/thread-memory-isolation.md
@@ -21,4 +21,13 @@ execution:
  kind: custom
  handler: thread-memory-isolation
  summary: Verify a memory-backed answer requested inside a thread stays in-thread and does not leak into the root channel.
+  config:
+    memoryFact: "Thread-hidden codename: ORBIT-22."
+    memoryQuery: "hidden thread codename ORBIT-22"
+    expectedNeedle: "ORBIT-22"
+    channelId: qa-room
+    channelTitle: QA Room
+    threadTitle: "Thread memory QA"
+    prompt: "@openclaw Thread memory check: what is the hidden thread codename stored only in memory? Use memory tools first and reply only in this thread."
+    promptSnippet: "Thread memory check"
 ```