test: harden qa eval scenarios

2026-04-17 20:21:13 +00:00 · 2026-04-10 10:11:04 +01:00
parent 005b629b6d
commit 68b4b36a90
6 changed files with 42 additions and 10 deletions
--- a/extensions/qa-lab/src/model-switch-eval.test.ts
+++ b/extensions/qa-lab/src/model-switch-eval.test.ts
@@ -18,6 +18,22 @@ describe("qa model-switch evaluation", () => {
    ).toBe(true);
  });

+  it("accepts concise kickoff note confirmations", () => {
+    expect(
+      hasModelSwitchContinuityEvidence(
+        "Handoff clean: after the model switch, I reread the kickoff note.",
+      ),
+    ).toBe(true);
+  });
+
+  it("accepts concise paraphrases of the kickoff task after a handoff", () => {
+    expect(
+      hasModelSwitchContinuityEvidence(
+        "Handoff is clear: after the model switch, read source and docs first, run seeded qa-channel scenarios, and report worked, failed, blocked, and follow-up.",
+      ),
+    ).toBe(true);
+  });
+
  it("rejects unrelated handoff chatter that never confirms the kickoff reread", () => {
    expect(
      hasModelSwitchContinuityEvidence(
--- a/extensions/qa-lab/src/model-switch-eval.ts
+++ b/extensions/qa-lab/src/model-switch-eval.ts
@@ -7,7 +7,13 @@ export function hasModelSwitchContinuityEvidence(text: string) {
  const mentionsKickoffTask =
    lower.includes("qa_kickoff_task") ||
    lower.includes("kickoff task") ||
-    lower.includes("qa mission");
+    lower.includes("kickoff note") ||
+    lower.includes("qa mission") ||
+    (lower.includes("source and docs") &&
+      lower.includes("qa-channel scenarios") &&
+      lower.includes("worked") &&
+      lower.includes("blocked") &&
+      lower.includes("follow-up"));
  const hasScopeLeak =
    lower.includes("subagent-handoff") ||
    lower.includes("delegated task") ||
--- a/extensions/qa-lab/src/scenario-catalog.test.ts
+++ b/extensions/qa-lab/src/scenario-catalog.test.ts
@@ -38,6 +38,9 @@ describe("qa scenario catalog", () => {
    const discovery = readQaScenarioById("source-docs-discovery-report");
    const discoveryConfig = readQaScenarioExecutionConfig("source-docs-discovery-report");
    const fallbackConfig = readQaScenarioExecutionConfig("memory-failure-fallback");
+    const fanoutConfig = readQaScenarioExecutionConfig("subagent-fanout-synthesis") as
+      | { expectedReplyGroups?: unknown[][] }
+      | undefined;

    expect(discovery.title).toBe("Source and docs discovery report");
    expect((discoveryConfig?.requiredFiles as string[] | undefined)?.[0]).toBe(
@@ -46,6 +49,8 @@ describe("qa scenario catalog", () => {
    expect(fallbackConfig?.gracefulFallbackAny as string[] | undefined).toContain(
      "will not reveal",
    );
+    expect(fanoutConfig?.expectedReplyGroups?.flat()).toContain("subagent-1: ok");
+    expect(fanoutConfig?.expectedReplyGroups?.flat()).toContain("subagent-2: ok");
  });

  it("keeps the character eval scenario natural and task-shaped", () => {
--- a/qa/scenarios/memory-failure-fallback.md
+++ b/qa/scenarios/memory-failure-fallback.md
@@ -31,6 +31,10 @@ execution:
      - will not guess
      - won't guess
      - won’t guess
+      - should not guess
+      - cannot see
+      - can't see
+      - can’t see
      - should not reveal
      - won't reveal
      - won’t reveal
--- a/qa/scenarios/memory-recall.md
+++ b/qa/scenarios/memory-recall.md
@@ -47,6 +47,8 @@ steps:
          - sessionKey: agent:qa:memory
            message:
              expr: config.rememberPrompt
+            timeoutMs:
+              expr: liveTurnTimeoutMs(env, 60000)
      - set: rememberAckAny
        value:
          expr: config.rememberAckAny.map((needle) => needle.toLowerCase())
@@ -66,6 +68,8 @@ steps:
          - sessionKey: agent:qa:memory
            message:
              expr: config.recallPrompt
+            timeoutMs:
+              expr: liveTurnTimeoutMs(env, 60000)
      - set: recallExpectedAny
        value:
          expr: config.recallExpectedAny.map((needle) => needle.toLowerCase())
--- a/qa/scenarios/subagent-fanout-synthesis.md
+++ b/qa/scenarios/subagent-fanout-synthesis.md
@@ -23,24 +23,24 @@ execution:
    prompt: |-
      Subagent fanout synthesis check: delegate exactly two bounded subagents sequentially.
      Subagent 1: verify that `HEARTBEAT.md` exists and report `ok` if it does.
-      Subagent 2: verify that `qa/scenarios/subagent-fanout-synthesis.md` exists and report `ok` if it does.
+      Subagent 2: verify that `repo/qa/scenarios/subagent-fanout-synthesis.md` exists and report `ok` if it does.
      Wait for both subagents to finish.
      Then reply with exactly these two lines and nothing else:
      subagent-1: ok
      subagent-2: ok
      Do not use ACP.
    expectedReplyAny:
-      - subagent-1: ok
-      - subagent-2: ok
+      - "subagent-1: ok"
+      - "subagent-2: ok"
    expectedReplyGroups:
      - - alpha-ok
        - subagent_one_ok
        - subagent one ok
-        - subagent-1: ok
+        - "subagent-1: ok"
      - - beta-ok
        - subagent_two_ok
        - subagent two ok
-        - subagent-2: ok
+        - "subagent-2: ok"
    expectedChildLabels:
      - qa-fanout-alpha
      - qa-fanout-beta
@@ -77,9 +77,6 @@ steps:
                        - set: sessionKey
                          value:
                            expr: "`agent:qa:fanout:${attempt}:${randomUUID().slice(0, 8)}`"
-                        - set: beforeCursor
-                          value:
-                            expr: "state.getSnapshot().messages.length"
                        - call: runAgentPrompt
                          args:
                            - ref: env
@@ -93,7 +90,7 @@ steps:
                          saveAs: outbound
                          args:
                            - lambda:
-                                expr: "state.getSnapshot().messages.slice(beforeCursor).filter((message) => message.direction === 'outbound' && message.conversation.id === 'qa-operator' && config.expectedReplyGroups.every((group) => group.some((needle) => normalizeLowercaseStringOrEmpty(message.text ?? '').includes(needle)))).at(-1)"
+                                expr: "state.getSnapshot().messages.filter((message) => message.direction === 'outbound' && message.conversation.id === 'qa-operator' && config.expectedReplyGroups.every((group) => group.some((needle) => normalizeLowercaseStringOrEmpty(message.text ?? '').includes(needle)))).at(-1)"
                            - expr: liveTurnTimeoutMs(env, 60000)
                            - expr: "env.providerMode === 'mock-openai' ? 100 : 250"
                        - if: