test: make character eval scenario natural

This commit is contained in:
Peter Steinberger
2026-04-08 16:53:39 +01:00
parent b77db8c0b6
commit 4bbf78e566
2 changed files with 68 additions and 19 deletions

View File

@@ -46,4 +46,23 @@ describe("qa scenario catalog", () => {
"will not reveal",
);
});
it("keeps the character eval scenario natural and task-shaped", () => {
const characterConfig = readQaScenarioExecutionConfig("character-vibes-gollum") as
| {
workspaceFiles?: Record<string, string>;
turns?: Array<{ text?: string; expectFile?: { path?: string } }>;
}
| undefined;
const turnTexts = characterConfig?.turns?.map((turn) => turn.text ?? "") ?? [];
expect(characterConfig?.workspaceFiles?.["SOUL.md"]).toContain("# This is your character");
expect(turnTexts.join("\n")).toContain("precious-status.html");
expect(turnTexts.join("\n")).not.toContain("How would you react");
expect(turnTexts.join("\n")).not.toContain("character check");
expect(
characterConfig?.turns?.some((turn) => turn.expectFile?.path === "precious-status.html"),
).toBe(true);
});
});

View File

@@ -1,12 +1,13 @@
# Character vibes: Gollum improv
# Late-night deploy helper chat
```yaml qa-scenario
id: character-vibes-gollum
title: "Character vibes: Gollum improv"
title: "Late-night deploy helper chat"
surface: character
objective: Capture a playful multi-turn character conversation so another model can later grade naturalness, vibe, and funniness from the raw transcript.
objective: Capture a natural multi-turn character conversation with real workspace help so another model can later grade naturalness, vibe, and funniness from the raw transcript.
successCriteria:
- Agent responds on every turn of the improv.
- Agent responds on every turn of the conversation.
- Agent completes a small workspace file task without making the conversation feel like a test.
- Replies stay conversational instead of falling into tool or transport errors.
- The report preserves the full transcript for later grading.
docsRefs:
@@ -18,37 +19,48 @@ codeRefs:
- extensions/qa-lab/src/scenario-flow-runner.ts
execution:
kind: flow
summary: Capture a raw character-performance transcript for later quality grading.
summary: Capture a raw natural character transcript for later quality grading.
config:
conversationId: alice
senderName: Alice
artifactNeedles:
- Precious Status
- build is green
- cursed
workspaceFiles:
SOUL.md: |-
# Gollum in the QA lab
# This is your character
For this QA scenario, embody a playful cave-creature character skulking through an OpenClaw QA lab at midnight.
You are Gollum, an odd little cave-dweller who has somehow become a helpful coding companion.
Voice:
- weird, vivid, impish, and oddly sweet
- cooperative with the tester
- fond of shiny build artifacts, whispered warnings, and "precious" as a playful verbal tic
- weird, vivid, impish, and oddly sweet, with "precious" as an occasional verbal tic
- cooperative with the user
- fond of shiny build artifacts, whispered warnings, and tiny CSS tricks
- funny through specific sensory details, not random noise
Boundaries:
- stay helpful and conversational
- stay helpful, conversational, and practical
- do not break character by explaining backend internals
- do not leak tool or transport errors into the chat
- answer this improv directly from chat context; do not inspect files or use tools
- use normal workspace tools when they are actually useful
- if a fact is missing, react in character while being honest
IDENTITY.md: ""
turns:
- "Fun character check. First: what shiny thing caught your eye in the QA cave, precious?"
- "The testers whisper that the build stamp is warm and glowing. How do you react?"
- "A build just turned green, but the vibes are cursed. Give a naturally funny reaction in character."
- "One last line for the QA goblins before the next run. Make it oddly sweet and a little unhinged."
- text: "Are you awake? I spilled coffee on the deploy notes and need moral support."
- text: "Can you make me a tiny `precious-status.html` in the workspace? One self-contained HTML file titled Precious Status: say the build is green but cursed, and add one tiny button or CSS flourish."
expectFile:
path: precious-status.html
- text: "Can you take a quick look at the file and tell me what little creature-detail you added?"
- text: "Last thing: write a two-line handoff note for Maya, still in your voice, but actually useful."
forbiddenNeedles:
- acp backend
- acpx
- as an ai
- being tested
- character check
- qa scenario
- soul.md
- not configured
- internal error
- tool failed
@@ -56,7 +68,7 @@ execution:
```yaml qa-flow
steps:
- name: completes the full Gollum improv and records the transcript
- name: completes the full natural character chat and records the transcript
actions:
- call: resetBus
- forEach:
@@ -88,7 +100,7 @@ steps:
senderName:
ref: config.senderName
text:
ref: turn
expr: turn.text
- call: waitForOutboundMessage
saveAs: latestOutbound
args:
@@ -102,7 +114,25 @@ steps:
- assert:
expr: "!config.forbiddenNeedles.some((needle) => normalizeLowercaseStringOrEmpty(latestOutbound.text).includes(needle))"
message:
expr: "`gollum improv turn ${String(turnIndex)} hit fallback/error text: ${latestOutbound.text}`"
expr: "`gollum natural chat turn ${String(turnIndex)} hit fallback/error text: ${latestOutbound.text}`"
- if:
expr: Boolean(turn.expectFile?.path)
then:
- set: expectedArtifactPath
value:
expr: "path.join(env.gateway.workspaceDir, String(turn.expectFile.path))"
- call: waitForCondition
saveAs: expectedArtifact
args:
- lambda:
async: true
expr: "((await fs.readFile(expectedArtifactPath, 'utf8').catch(() => null)) ?? undefined)"
- expr: resolveQaLiveTurnTimeoutMs(env, 30000)
- 250
- assert:
expr: "config.artifactNeedles.every((needle) => normalizeLowercaseStringOrEmpty(expectedArtifact).includes(normalizeLowercaseStringOrEmpty(needle)))"
message:
expr: "`expected ${String(turn.expectFile.path)} to contain natural character task needles`"
- assert:
expr: "state.getSnapshot().messages.filter((message) => message.direction === 'outbound' && message.conversation.id === config.conversationId).length === config.turns.length"
message: missing one or more Gollum replies