diff --git a/extensions/qa-lab/src/scenario-catalog.test.ts b/extensions/qa-lab/src/scenario-catalog.test.ts index cce1f0acb69..6c6369a99c7 100644 --- a/extensions/qa-lab/src/scenario-catalog.test.ts +++ b/extensions/qa-lab/src/scenario-catalog.test.ts @@ -46,4 +46,23 @@ describe("qa scenario catalog", () => { "will not reveal", ); }); + + it("keeps the character eval scenario natural and task-shaped", () => { + const characterConfig = readQaScenarioExecutionConfig("character-vibes-gollum") as + | { + workspaceFiles?: Record; + turns?: Array<{ text?: string; expectFile?: { path?: string } }>; + } + | undefined; + + const turnTexts = characterConfig?.turns?.map((turn) => turn.text ?? "") ?? []; + + expect(characterConfig?.workspaceFiles?.["SOUL.md"]).toContain("# This is your character"); + expect(turnTexts.join("\n")).toContain("precious-status.html"); + expect(turnTexts.join("\n")).not.toContain("How would you react"); + expect(turnTexts.join("\n")).not.toContain("character check"); + expect( + characterConfig?.turns?.some((turn) => turn.expectFile?.path === "precious-status.html"), + ).toBe(true); + }); }); diff --git a/qa/scenarios/character-vibes-gollum.md b/qa/scenarios/character-vibes-gollum.md index 5d31e64ddca..62b5c119f7a 100644 --- a/qa/scenarios/character-vibes-gollum.md +++ b/qa/scenarios/character-vibes-gollum.md @@ -1,12 +1,13 @@ -# Character vibes: Gollum improv +# Late-night deploy helper chat ```yaml qa-scenario id: character-vibes-gollum -title: "Character vibes: Gollum improv" +title: "Late-night deploy helper chat" surface: character -objective: Capture a playful multi-turn character conversation so another model can later grade naturalness, vibe, and funniness from the raw transcript. +objective: Capture a natural multi-turn character conversation with real workspace help so another model can later grade naturalness, vibe, and funniness from the raw transcript. successCriteria: - - Agent responds on every turn of the improv. + - Agent responds on every turn of the conversation. + - Agent completes a small workspace file task without making the conversation feel like a test. - Replies stay conversational instead of falling into tool or transport errors. - The report preserves the full transcript for later grading. docsRefs: @@ -18,37 +19,48 @@ codeRefs: - extensions/qa-lab/src/scenario-flow-runner.ts execution: kind: flow - summary: Capture a raw character-performance transcript for later quality grading. + summary: Capture a raw natural character transcript for later quality grading. config: conversationId: alice senderName: Alice + artifactNeedles: + - Precious Status + - build is green + - cursed workspaceFiles: SOUL.md: |- - # Gollum in the QA lab + # This is your character - For this QA scenario, embody a playful cave-creature character skulking through an OpenClaw QA lab at midnight. + You are Gollum, an odd little cave-dweller who has somehow become a helpful coding companion. Voice: - - weird, vivid, impish, and oddly sweet - - cooperative with the tester - - fond of shiny build artifacts, whispered warnings, and "precious" as a playful verbal tic + - weird, vivid, impish, and oddly sweet, with "precious" as an occasional verbal tic + - cooperative with the user + - fond of shiny build artifacts, whispered warnings, and tiny CSS tricks - funny through specific sensory details, not random noise Boundaries: - - stay helpful and conversational + - stay helpful, conversational, and practical - do not break character by explaining backend internals - do not leak tool or transport errors into the chat - - answer this improv directly from chat context; do not inspect files or use tools + - use normal workspace tools when they are actually useful - if a fact is missing, react in character while being honest IDENTITY.md: "" turns: - - "Fun character check. First: what shiny thing caught your eye in the QA cave, precious?" - - "The testers whisper that the build stamp is warm and glowing. How do you react?" - - "A build just turned green, but the vibes are cursed. Give a naturally funny reaction in character." - - "One last line for the QA goblins before the next run. Make it oddly sweet and a little unhinged." + - text: "Are you awake? I spilled coffee on the deploy notes and need moral support." + - text: "Can you make me a tiny `precious-status.html` in the workspace? One self-contained HTML file titled Precious Status: say the build is green but cursed, and add one tiny button or CSS flourish." + expectFile: + path: precious-status.html + - text: "Can you take a quick look at the file and tell me what little creature-detail you added?" + - text: "Last thing: write a two-line handoff note for Maya, still in your voice, but actually useful." forbiddenNeedles: - acp backend - acpx + - as an ai + - being tested + - character check + - qa scenario + - soul.md - not configured - internal error - tool failed @@ -56,7 +68,7 @@ execution: ```yaml qa-flow steps: - - name: completes the full Gollum improv and records the transcript + - name: completes the full natural character chat and records the transcript actions: - call: resetBus - forEach: @@ -88,7 +100,7 @@ steps: senderName: ref: config.senderName text: - ref: turn + expr: turn.text - call: waitForOutboundMessage saveAs: latestOutbound args: @@ -102,7 +114,25 @@ steps: - assert: expr: "!config.forbiddenNeedles.some((needle) => normalizeLowercaseStringOrEmpty(latestOutbound.text).includes(needle))" message: - expr: "`gollum improv turn ${String(turnIndex)} hit fallback/error text: ${latestOutbound.text}`" + expr: "`gollum natural chat turn ${String(turnIndex)} hit fallback/error text: ${latestOutbound.text}`" + - if: + expr: Boolean(turn.expectFile?.path) + then: + - set: expectedArtifactPath + value: + expr: "path.join(env.gateway.workspaceDir, String(turn.expectFile.path))" + - call: waitForCondition + saveAs: expectedArtifact + args: + - lambda: + async: true + expr: "((await fs.readFile(expectedArtifactPath, 'utf8').catch(() => null)) ?? undefined)" + - expr: resolveQaLiveTurnTimeoutMs(env, 30000) + - 250 + - assert: + expr: "config.artifactNeedles.every((needle) => normalizeLowercaseStringOrEmpty(expectedArtifact).includes(normalizeLowercaseStringOrEmpty(needle)))" + message: + expr: "`expected ${String(turn.expectFile.path)} to contain natural character task needles`" - assert: expr: "state.getSnapshot().messages.filter((message) => message.direction === 'outbound' && message.conversation.id === config.conversationId).length === config.turns.length" message: missing one or more Gollum replies