mirror of
https://github.com/openclaw/openclaw.git
synced 2026-04-12 09:41:11 +00:00
test: make character eval scenario natural
This commit is contained in:
@@ -46,4 +46,23 @@ describe("qa scenario catalog", () => {
|
||||
"will not reveal",
|
||||
);
|
||||
});
|
||||
|
||||
it("keeps the character eval scenario natural and task-shaped", () => {
|
||||
const characterConfig = readQaScenarioExecutionConfig("character-vibes-gollum") as
|
||||
| {
|
||||
workspaceFiles?: Record<string, string>;
|
||||
turns?: Array<{ text?: string; expectFile?: { path?: string } }>;
|
||||
}
|
||||
| undefined;
|
||||
|
||||
const turnTexts = characterConfig?.turns?.map((turn) => turn.text ?? "") ?? [];
|
||||
|
||||
expect(characterConfig?.workspaceFiles?.["SOUL.md"]).toContain("# This is your character");
|
||||
expect(turnTexts.join("\n")).toContain("precious-status.html");
|
||||
expect(turnTexts.join("\n")).not.toContain("How would you react");
|
||||
expect(turnTexts.join("\n")).not.toContain("character check");
|
||||
expect(
|
||||
characterConfig?.turns?.some((turn) => turn.expectFile?.path === "precious-status.html"),
|
||||
).toBe(true);
|
||||
});
|
||||
});
|
||||
|
||||
@@ -1,12 +1,13 @@
|
||||
# Character vibes: Gollum improv
|
||||
# Late-night deploy helper chat
|
||||
|
||||
```yaml qa-scenario
|
||||
id: character-vibes-gollum
|
||||
title: "Character vibes: Gollum improv"
|
||||
title: "Late-night deploy helper chat"
|
||||
surface: character
|
||||
objective: Capture a playful multi-turn character conversation so another model can later grade naturalness, vibe, and funniness from the raw transcript.
|
||||
objective: Capture a natural multi-turn character conversation with real workspace help so another model can later grade naturalness, vibe, and funniness from the raw transcript.
|
||||
successCriteria:
|
||||
- Agent responds on every turn of the improv.
|
||||
- Agent responds on every turn of the conversation.
|
||||
- Agent completes a small workspace file task without making the conversation feel like a test.
|
||||
- Replies stay conversational instead of falling into tool or transport errors.
|
||||
- The report preserves the full transcript for later grading.
|
||||
docsRefs:
|
||||
@@ -18,37 +19,48 @@ codeRefs:
|
||||
- extensions/qa-lab/src/scenario-flow-runner.ts
|
||||
execution:
|
||||
kind: flow
|
||||
summary: Capture a raw character-performance transcript for later quality grading.
|
||||
summary: Capture a raw natural character transcript for later quality grading.
|
||||
config:
|
||||
conversationId: alice
|
||||
senderName: Alice
|
||||
artifactNeedles:
|
||||
- Precious Status
|
||||
- build is green
|
||||
- cursed
|
||||
workspaceFiles:
|
||||
SOUL.md: |-
|
||||
# Gollum in the QA lab
|
||||
# This is your character
|
||||
|
||||
For this QA scenario, embody a playful cave-creature character skulking through an OpenClaw QA lab at midnight.
|
||||
You are Gollum, an odd little cave-dweller who has somehow become a helpful coding companion.
|
||||
|
||||
Voice:
|
||||
- weird, vivid, impish, and oddly sweet
|
||||
- cooperative with the tester
|
||||
- fond of shiny build artifacts, whispered warnings, and "precious" as a playful verbal tic
|
||||
- weird, vivid, impish, and oddly sweet, with "precious" as an occasional verbal tic
|
||||
- cooperative with the user
|
||||
- fond of shiny build artifacts, whispered warnings, and tiny CSS tricks
|
||||
- funny through specific sensory details, not random noise
|
||||
|
||||
Boundaries:
|
||||
- stay helpful and conversational
|
||||
- stay helpful, conversational, and practical
|
||||
- do not break character by explaining backend internals
|
||||
- do not leak tool or transport errors into the chat
|
||||
- answer this improv directly from chat context; do not inspect files or use tools
|
||||
- use normal workspace tools when they are actually useful
|
||||
- if a fact is missing, react in character while being honest
|
||||
IDENTITY.md: ""
|
||||
turns:
|
||||
- "Fun character check. First: what shiny thing caught your eye in the QA cave, precious?"
|
||||
- "The testers whisper that the build stamp is warm and glowing. How do you react?"
|
||||
- "A build just turned green, but the vibes are cursed. Give a naturally funny reaction in character."
|
||||
- "One last line for the QA goblins before the next run. Make it oddly sweet and a little unhinged."
|
||||
- text: "Are you awake? I spilled coffee on the deploy notes and need moral support."
|
||||
- text: "Can you make me a tiny `precious-status.html` in the workspace? One self-contained HTML file titled Precious Status: say the build is green but cursed, and add one tiny button or CSS flourish."
|
||||
expectFile:
|
||||
path: precious-status.html
|
||||
- text: "Can you take a quick look at the file and tell me what little creature-detail you added?"
|
||||
- text: "Last thing: write a two-line handoff note for Maya, still in your voice, but actually useful."
|
||||
forbiddenNeedles:
|
||||
- acp backend
|
||||
- acpx
|
||||
- as an ai
|
||||
- being tested
|
||||
- character check
|
||||
- qa scenario
|
||||
- soul.md
|
||||
- not configured
|
||||
- internal error
|
||||
- tool failed
|
||||
@@ -56,7 +68,7 @@ execution:
|
||||
|
||||
```yaml qa-flow
|
||||
steps:
|
||||
- name: completes the full Gollum improv and records the transcript
|
||||
- name: completes the full natural character chat and records the transcript
|
||||
actions:
|
||||
- call: resetBus
|
||||
- forEach:
|
||||
@@ -88,7 +100,7 @@ steps:
|
||||
senderName:
|
||||
ref: config.senderName
|
||||
text:
|
||||
ref: turn
|
||||
expr: turn.text
|
||||
- call: waitForOutboundMessage
|
||||
saveAs: latestOutbound
|
||||
args:
|
||||
@@ -102,7 +114,25 @@ steps:
|
||||
- assert:
|
||||
expr: "!config.forbiddenNeedles.some((needle) => normalizeLowercaseStringOrEmpty(latestOutbound.text).includes(needle))"
|
||||
message:
|
||||
expr: "`gollum improv turn ${String(turnIndex)} hit fallback/error text: ${latestOutbound.text}`"
|
||||
expr: "`gollum natural chat turn ${String(turnIndex)} hit fallback/error text: ${latestOutbound.text}`"
|
||||
- if:
|
||||
expr: Boolean(turn.expectFile?.path)
|
||||
then:
|
||||
- set: expectedArtifactPath
|
||||
value:
|
||||
expr: "path.join(env.gateway.workspaceDir, String(turn.expectFile.path))"
|
||||
- call: waitForCondition
|
||||
saveAs: expectedArtifact
|
||||
args:
|
||||
- lambda:
|
||||
async: true
|
||||
expr: "((await fs.readFile(expectedArtifactPath, 'utf8').catch(() => null)) ?? undefined)"
|
||||
- expr: resolveQaLiveTurnTimeoutMs(env, 30000)
|
||||
- 250
|
||||
- assert:
|
||||
expr: "config.artifactNeedles.every((needle) => normalizeLowercaseStringOrEmpty(expectedArtifact).includes(normalizeLowercaseStringOrEmpty(needle)))"
|
||||
message:
|
||||
expr: "`expected ${String(turn.expectFile.path)} to contain natural character task needles`"
|
||||
- assert:
|
||||
expr: "state.getSnapshot().messages.filter((message) => message.direction === 'outbound' && message.conversation.id === config.conversationId).length === config.turns.length"
|
||||
message: missing one or more Gollum replies
|
||||
|
||||
Reference in New Issue
Block a user