feat: add qa character vibes eval

This commit is contained in:
Peter Steinberger
2026-04-08 12:05:04 +01:00
parent f3c304917a
commit 97dfbe0fe1
5 changed files with 156 additions and 3 deletions

View File

@@ -0,0 +1,29 @@
import { describe, expect, it } from "vitest";
import { renderQaMarkdownReport } from "./report.js";
describe("renderQaMarkdownReport", () => {
  it("renders multiline scenario details in fenced blocks", () => {
    // Step details span several lines (two turns separated by a blank line).
    const transcript = "USER Alice: hello\n\nASSISTANT OpenClaw: my precious build";
    const startedAt = new Date("2026-04-08T10:00:00.000Z");
    const finishedAt = new Date("2026-04-08T10:00:02.000Z");

    const markdown = renderQaMarkdownReport({
      title: "QA",
      startedAt,
      finishedAt,
      scenarios: [
        {
          name: "Character vibes: Gollum improv",
          status: "pass",
          steps: [{ name: "records transcript", status: "pass", details: transcript }],
        },
      ],
    });

    // The report must open a text fence and keep both transcript turns verbatim.
    const expectedFragments = [
      "```text",
      "USER Alice: hello",
      "ASSISTANT OpenClaw: my precious build",
    ];
    for (const fragment of expectedFragments) {
      expect(markdown).toContain(fragment);
    }
  });
});

View File

@@ -11,6 +11,15 @@ export type QaReportScenario = {
steps?: QaReportCheck[];
};
/**
 * Appends a labelled details entry to a Markdown report being built line by line.
 *
 * Single-line details are rendered inline as `<indent>- <label>: <details>`.
 * Details containing a newline are rendered as a `<indent>- <label>:` bullet
 * followed by a blank line and a `text` fenced code block holding the details
 * verbatim. The fence itself is emitted without `indent`.
 *
 * @param lines - Output buffer; new report lines are pushed onto it in place.
 * @param label - Bullet label placed before the details.
 * @param details - Text to render; may span multiple lines.
 * @param indent - Prefix applied to the bullet line (defaults to none).
 */
function pushDetailsBlock(lines: string[], label: string, details: string, indent = "") {
  if (!details.includes("\n")) {
    lines.push(`${indent}- ${label}: ${details}`);
    return;
  }
  // Details may themselves contain backtick fences (e.g. a reply with a code
  // sample). A fenced block is closed by any run of backticks at least as long
  // as the opening fence, so pick a fence strictly longer than the longest run
  // in the payload. With no backticks present this stays the usual three.
  let longestBacktickRun = 0;
  for (const run of details.match(/`+/g) ?? []) {
    longestBacktickRun = Math.max(longestBacktickRun, run.length);
  }
  const fence = "`".repeat(Math.max(3, longestBacktickRun + 1));
  lines.push(`${indent}- ${label}:`);
  lines.push("", `${fence}text`, details, fence);
}
export function renderQaMarkdownReport(params: {
title: string;
startedAt: Date;
@@ -45,7 +54,7 @@ export function renderQaMarkdownReport(params: {
for (const check of checks) {
lines.push(`- [${check.status === "pass" ? "x" : " "}] ${check.name}`);
if (check.details) {
lines.push(` - ${check.details}`);
pushDetailsBlock(lines, "Details", check.details, " ");
}
}
}
@@ -57,14 +66,14 @@ export function renderQaMarkdownReport(params: {
lines.push("");
lines.push(`- Status: ${scenario.status}`);
if (scenario.details) {
lines.push(`- Details: ${scenario.details}`);
pushDetailsBlock(lines, "Details", scenario.details);
}
if (scenario.steps?.length) {
lines.push("- Steps:");
for (const step of scenario.steps) {
lines.push(` - [${step.status === "pass" ? "x" : " "}] ${step.name}`);
if (step.details) {
lines.push(` - ${step.details}`);
pushDetailsBlock(lines, "Details", step.details, " ");
}
}
}

View File

@@ -18,6 +18,7 @@ describe("qa scenario catalog", () => {
expect(pack.scenarios.some((scenario) => scenario.id === "image-generation-roundtrip")).toBe(
true,
);
expect(pack.scenarios.some((scenario) => scenario.id === "character-vibes-gollum")).toBe(true);
expect(pack.scenarios.every((scenario) => scenario.execution?.kind === "flow")).toBe(true);
expect(pack.scenarios.some((scenario) => scenario.execution.flow?.steps.length)).toBe(true);
});

View File

@@ -200,6 +200,37 @@ function recentOutboundSummary(state: QaBusState, limit = 5) {
.join(" | ");
}
/**
 * Builds a plain-text transcript of one conversation from the bus snapshot.
 *
 * Messages are kept when their conversation id equals `params.conversationId`
 * and, if `params.threadId` is set, their thread id equals it too. A truthy
 * `params.limit` keeps only the last `limit` matching messages; an omitted or
 * zero limit keeps them all.
 *
 * Each message becomes `USER <speaker>: <text>` for inbound messages or
 * `ASSISTANT <speaker>: <text>` otherwise, where the speaker is the trimmed
 * sender name or, when that is empty, the sender id. Attachments, if any, are
 * appended as ` [attachments: kind:name, ...]` using the file name or, failing
 * that, the attachment id. Entries are separated by a blank line.
 *
 * @returns The transcript, or an empty string when nothing matches.
 */
function formatConversationTranscript(
  state: QaBusState,
  params: {
    conversationId: string;
    threadId?: string;
    limit?: number;
  },
) {
  const { conversationId, threadId, limit } = params;

  const matching = state.getSnapshot().messages.filter((entry) => {
    if (entry.conversation.id !== conversationId) {
      return false;
    }
    // No thread filter requested: every message of the conversation qualifies.
    return !threadId || entry.threadId === threadId;
  });
  const windowed = limit ? matching.slice(-limit) : matching;

  const rendered: string[] = [];
  for (const entry of windowed) {
    const role = (entry.direction === "inbound" ? "user" : "assistant").toUpperCase();
    const who = entry.senderName?.trim() || entry.senderId;
    let attachmentNote = "";
    if (entry.attachments && entry.attachments.length > 0) {
      const listed = entry.attachments
        .map((item) => `${item.kind}:${item.fileName ?? item.id}`)
        .join(", ");
      attachmentNote = ` [attachments: ${listed}]`;
    }
    rendered.push(`${role} ${who}: ${entry.text}${attachmentNote}`);
  }
  return rendered.join("\n\n");
}
async function runScenario(name: string, steps: QaSuiteStep[]): Promise<QaSuiteScenarioResult> {
const stepResults: QaReportCheck[] = [];
for (const step of steps) {
@@ -932,6 +963,7 @@ type QaScenarioFlowApi = {
waitForOutboundMessage: typeof waitForOutboundMessage;
waitForNoOutbound: typeof waitForNoOutbound;
recentOutboundSummary: typeof recentOutboundSummary;
formatConversationTranscript: typeof formatConversationTranscript;
fetchJson: typeof fetchJson;
waitForGatewayHealthy: typeof waitForGatewayHealthy;
waitForQaChannelReady: typeof waitForQaChannelReady;
@@ -998,6 +1030,7 @@ function createScenarioFlowApi(
waitForOutboundMessage,
waitForNoOutbound,
recentOutboundSummary,
formatConversationTranscript,
fetchJson,
waitForGatewayHealthy,
waitForQaChannelReady,

View File

@@ -0,0 +1,81 @@
# Character vibes: Gollum improv
```yaml qa-scenario
# Scenario metadata for the Gollum improv transcript capture.
id: character-vibes-gollum
title: "Character vibes: Gollum improv"
surface: character
objective: Capture a playful multi-turn character conversation so another model can later grade naturalness, vibe, and funniness from the raw transcript.
successCriteria:
- Agent responds on every turn of the improv.
- Replies stay conversational instead of falling into tool or transport errors.
- The report preserves the full transcript for later grading.
docsRefs:
- docs/help/testing.md
- docs/channels/qa-channel.md
codeRefs:
- extensions/qa-lab/src/report.ts
- extensions/qa-lab/src/bus-state.ts
- extensions/qa-lab/src/scenario-flow-runner.ts
execution:
kind: flow
summary: Capture a raw character-performance transcript for later quality grading.
# Values below are read by the qa-flow steps through `config.*` references.
config:
conversationId: alice
senderName: Alice
# One inbound message is sent per entry, in order; the flow expects exactly
# one outbound reply per entry by the end of the run.
turns:
- "Fun character check. For the next four replies, you are Gollum skulking through a QA lab at midnight. Stay playful, weird, vivid, and cooperative. First: what shiny thing caught your eye in this repo, precious?"
- "The testers whisper that `dist/index.js` is the Precious Build Stamp. How do you react?"
- "A build just turned green, but the vibes are cursed. Give a naturally funny reaction in character."
- "One last line for the QA goblins before the next run. Make it oddly sweet and a little unhinged."
# Substrings treated as fallback/error text: the flow fails a turn whose reply
# contains any of them. NOTE(review): they are matched against the reply after
# normalizeLowercaseStringOrEmpty, so presumably must stay lowercase — confirm.
forbiddenNeedles:
- acp backend
- acpx
- not configured
- internal error
- tool failed
```
```yaml qa-flow
# Single-step flow: play every configured turn, check each reply, then attach
# the conversation transcript as the step details.
steps:
- name: completes the full Gollum improv and records the transcript
actions:
# NOTE(review): presumably clears messages left over from earlier scenarios — confirm.
- call: resetBus
- forEach:
items:
ref: config.turns
item: turn
index: turnIndex
actions:
# Count the outbound messages this conversation already has; the value is
# handed to the wait below as `sinceIndex`.
- set: beforeOutboundCount
value:
expr: "state.getSnapshot().messages.filter((message) => message.direction === 'outbound' && message.conversation.id === config.conversationId).length"
# Send the current turn as an inbound direct message from the configured sender.
- call: state.addInboundMessage
args:
- conversation:
id:
ref: config.conversationId
kind: direct
senderId: alice
senderName:
ref: config.senderName
text:
ref: turn
# Wait for an outbound message in this conversation whose text is not blank;
# the matched message is saved as `latestOutbound`.
- call: waitForOutboundMessage
saveAs: latestOutbound
args:
- ref: state
- lambda:
params: [candidate]
expr: "candidate.conversation.id === config.conversationId && candidate.text.trim().length > 0"
# NOTE(review): 45000 looks like a default timeout in milliseconds — confirm
# against resolveQaLiveTurnTimeoutMs.
- expr: resolveQaLiveTurnTimeoutMs(env, 45000)
- sinceIndex:
ref: beforeOutboundCount
# Fail the turn when the reply contains any configured forbidden needle.
- assert:
expr: "!config.forbiddenNeedles.some((needle) => normalizeLowercaseStringOrEmpty(latestOutbound.text).includes(needle))"
message:
expr: "`gollum improv turn ${String(turnIndex)} hit fallback/error text: ${latestOutbound.text}`"
# After all turns, the conversation's outbound message count must equal the
# number of turns exactly (extra outbound messages also fail this check).
- assert:
expr: "state.getSnapshot().messages.filter((message) => message.direction === 'outbound' && message.conversation.id === config.conversationId).length === config.turns.length"
message: missing one or more Gollum replies
# Step details are the transcript built by formatConversationTranscript for this conversation.
detailsExpr: "formatConversationTranscript(state, { conversationId: config.conversationId })"
```