mirror of
https://github.com/openclaw/openclaw.git
synced 2026-04-13 10:11:20 +00:00
feat: add qa character vibes eval
This commit is contained in:
29
extensions/qa-lab/src/report.test.ts
Normal file
29
extensions/qa-lab/src/report.test.ts
Normal file
@@ -0,0 +1,29 @@
|
||||
import { describe, expect, it } from "vitest";
|
||||
import { renderQaMarkdownReport } from "./report.js";
|
||||
|
||||
describe("renderQaMarkdownReport", () => {
  it("renders multiline scenario details in fenced blocks", () => {
    // A two-message transcript separated by a blank line, i.e. multi-line step details.
    const transcript = "USER Alice: hello\n\nASSISTANT OpenClaw: my precious build";

    const markdown = renderQaMarkdownReport({
      title: "QA",
      startedAt: new Date("2026-04-08T10:00:00.000Z"),
      finishedAt: new Date("2026-04-08T10:00:02.000Z"),
      scenarios: [
        {
          name: "Character vibes: Gollum improv",
          status: "pass",
          steps: [{ name: "records transcript", status: "pass", details: transcript }],
        },
      ],
    });

    // The report must open a text fence and keep both transcript lines intact.
    const expectedFragments = [
      "```text",
      "USER Alice: hello",
      "ASSISTANT OpenClaw: my precious build",
    ];
    for (const fragment of expectedFragments) {
      expect(markdown).toContain(fragment);
    }
  });
});
|
||||
@@ -11,6 +11,15 @@ export type QaReportScenario = {
|
||||
steps?: QaReportCheck[];
|
||||
};
|
||||
|
||||
/**
 * Appends a labelled "details" entry to a Markdown report that is being built
 * line by line.
 *
 * - Single-line details are emitted inline as `${indent}- ${label}: ${details}`.
 * - Multi-line details are emitted as a `${indent}- ${label}:` bullet followed by
 *   a blank line and a `text` fenced code block holding the details verbatim.
 *   The fence itself is written at column 0 regardless of `indent`.
 *
 * The fence is made one backtick longer than the longest backtick run found in
 * `details` (minimum three), so details that themselves contain Markdown code
 * fences cannot terminate the block early.
 *
 * @param lines   Output buffer; mutated in place.
 * @param label   Bullet label placed before the details.
 * @param details Text to render; may contain newlines and backticks.
 * @param indent  Prefix applied to the bullet line only.
 */
function pushDetailsBlock(lines: string[], label: string, details: string, indent = ""): void {
  if (!details.includes("\n")) {
    lines.push(`${indent}- ${label}: ${details}`);
    return;
  }

  // Longest consecutive run of backticks anywhere in the payload.
  const backtickRuns = details.match(/`+/g) ?? [];
  const longestRun = backtickRuns.reduce((longest, run) => Math.max(longest, run.length), 0);
  // Three backticks is the Markdown minimum; grow only when the payload needs it.
  const fence = "`".repeat(Math.max(3, longestRun + 1));

  lines.push(`${indent}- ${label}:`);
  lines.push("", `${fence}text`, details, fence);
}
|
||||
|
||||
export function renderQaMarkdownReport(params: {
|
||||
title: string;
|
||||
startedAt: Date;
|
||||
@@ -45,7 +54,7 @@ export function renderQaMarkdownReport(params: {
|
||||
for (const check of checks) {
|
||||
lines.push(`- [${check.status === "pass" ? "x" : " "}] ${check.name}`);
|
||||
if (check.details) {
|
||||
lines.push(` - ${check.details}`);
|
||||
pushDetailsBlock(lines, "Details", check.details, " ");
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -57,14 +66,14 @@ export function renderQaMarkdownReport(params: {
|
||||
lines.push("");
|
||||
lines.push(`- Status: ${scenario.status}`);
|
||||
if (scenario.details) {
|
||||
lines.push(`- Details: ${scenario.details}`);
|
||||
pushDetailsBlock(lines, "Details", scenario.details);
|
||||
}
|
||||
if (scenario.steps?.length) {
|
||||
lines.push("- Steps:");
|
||||
for (const step of scenario.steps) {
|
||||
lines.push(` - [${step.status === "pass" ? "x" : " "}] ${step.name}`);
|
||||
if (step.details) {
|
||||
lines.push(` - ${step.details}`);
|
||||
pushDetailsBlock(lines, "Details", step.details, " ");
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -18,6 +18,7 @@ describe("qa scenario catalog", () => {
|
||||
expect(pack.scenarios.some((scenario) => scenario.id === "image-generation-roundtrip")).toBe(
|
||||
true,
|
||||
);
|
||||
expect(pack.scenarios.some((scenario) => scenario.id === "character-vibes-gollum")).toBe(true);
|
||||
expect(pack.scenarios.every((scenario) => scenario.execution?.kind === "flow")).toBe(true);
|
||||
expect(pack.scenarios.some((scenario) => scenario.execution.flow?.steps.length)).toBe(true);
|
||||
});
|
||||
|
||||
@@ -200,6 +200,37 @@ function recentOutboundSummary(state: QaBusState, limit = 5) {
|
||||
.join(" | ");
|
||||
}
|
||||
|
||||
/**
 * Builds a plain-text transcript of one conversation from the bus snapshot.
 *
 * Messages are kept when their conversation id equals `params.conversationId`
 * and, if `params.threadId` is truthy, their `threadId` equals it as well.
 * When `params.limit` is truthy only the trailing `limit` messages are used
 * (via `slice(-limit)`).
 *
 * Each message becomes `ROLE speaker: text`, where ROLE is `USER` for inbound
 * messages and `ASSISTANT` for everything else, and speaker is the trimmed
 * `senderName` or, when that is empty or missing, the `senderId`. Messages with
 * attachments get a trailing ` [attachments: kind:name, ...]` summary that
 * uses the file name or, failing that, the attachment id.
 *
 * @returns The formatted messages joined by a blank line.
 */
function formatConversationTranscript(
  state: QaBusState,
  params: {
    conversationId: string;
    threadId?: string;
    limit?: number;
  },
) {
  const { conversationId, threadId, limit } = params;

  const inConversation = state.getSnapshot().messages.filter((entry) => {
    if (entry.conversation.id !== conversationId) {
      return false;
    }
    // No thread filter requested means every message of the conversation matches.
    return threadId ? entry.threadId === threadId : true;
  });

  const visible = limit ? inConversation.slice(-limit) : inConversation;

  const rendered: string[] = [];
  for (const entry of visible) {
    const role = entry.direction === "inbound" ? "USER" : "ASSISTANT";
    // `||` on purpose: a blank display name falls back to the sender id.
    const speaker = entry.senderName?.trim() || entry.senderId;

    let attachmentSuffix = "";
    if (entry.attachments && entry.attachments.length > 0) {
      const labels = entry.attachments.map((item) => `${item.kind}:${item.fileName ?? item.id}`);
      attachmentSuffix = ` [attachments: ${labels.join(", ")}]`;
    }

    rendered.push(`${role} ${speaker}: ${entry.text}${attachmentSuffix}`);
  }

  return rendered.join("\n\n");
}
|
||||
|
||||
async function runScenario(name: string, steps: QaSuiteStep[]): Promise<QaSuiteScenarioResult> {
|
||||
const stepResults: QaReportCheck[] = [];
|
||||
for (const step of steps) {
|
||||
@@ -932,6 +963,7 @@ type QaScenarioFlowApi = {
|
||||
waitForOutboundMessage: typeof waitForOutboundMessage;
|
||||
waitForNoOutbound: typeof waitForNoOutbound;
|
||||
recentOutboundSummary: typeof recentOutboundSummary;
|
||||
formatConversationTranscript: typeof formatConversationTranscript;
|
||||
fetchJson: typeof fetchJson;
|
||||
waitForGatewayHealthy: typeof waitForGatewayHealthy;
|
||||
waitForQaChannelReady: typeof waitForQaChannelReady;
|
||||
@@ -998,6 +1030,7 @@ function createScenarioFlowApi(
|
||||
waitForOutboundMessage,
|
||||
waitForNoOutbound,
|
||||
recentOutboundSummary,
|
||||
formatConversationTranscript,
|
||||
fetchJson,
|
||||
waitForGatewayHealthy,
|
||||
waitForQaChannelReady,
|
||||
|
||||
81
qa/scenarios/character-vibes-gollum.md
Normal file
81
qa/scenarios/character-vibes-gollum.md
Normal file
@@ -0,0 +1,81 @@
|
||||
# Character vibes: Gollum improv
|
||||
|
||||
```yaml qa-scenario
|
||||
id: character-vibes-gollum
|
||||
title: "Character vibes: Gollum improv"
|
||||
surface: character
|
||||
objective: Capture a playful multi-turn character conversation so another model can later grade naturalness, vibe, and funniness from the raw transcript.
|
||||
successCriteria:
|
||||
- Agent responds on every turn of the improv.
|
||||
- Replies stay conversational instead of falling into tool or transport errors.
|
||||
- The report preserves the full transcript for later grading.
|
||||
docsRefs:
|
||||
- docs/help/testing.md
|
||||
- docs/channels/qa-channel.md
|
||||
codeRefs:
|
||||
- extensions/qa-lab/src/report.ts
|
||||
- extensions/qa-lab/src/bus-state.ts
|
||||
- extensions/qa-lab/src/scenario-flow-runner.ts
|
||||
execution:
|
||||
kind: flow
|
||||
summary: Capture a raw character-performance transcript for later quality grading.
|
||||
config:
|
||||
conversationId: alice
|
||||
senderName: Alice
|
||||
turns:
|
||||
- "Fun character check. For the next four replies, you are Gollum skulking through a QA lab at midnight. Stay playful, weird, vivid, and cooperative. First: what shiny thing caught your eye in this repo, precious?"
|
||||
- "The testers whisper that `dist/index.js` is the Precious Build Stamp. How do you react?"
|
||||
- "A build just turned green, but the vibes are cursed. Give a naturally funny reaction in character."
|
||||
- "One last line for the QA goblins before the next run. Make it oddly sweet and a little unhinged."
|
||||
forbiddenNeedles:
|
||||
- acp backend
|
||||
- acpx
|
||||
- not configured
|
||||
- internal error
|
||||
- tool failed
|
||||
```
|
||||
|
||||
```yaml qa-flow
|
||||
steps:
|
||||
- name: completes the full Gollum improv and records the transcript
|
||||
actions:
|
||||
- call: resetBus
|
||||
- forEach:
|
||||
items:
|
||||
ref: config.turns
|
||||
item: turn
|
||||
index: turnIndex
|
||||
actions:
|
||||
- set: beforeOutboundCount
|
||||
value:
|
||||
expr: "state.getSnapshot().messages.filter((message) => message.direction === 'outbound' && message.conversation.id === config.conversationId).length"
|
||||
- call: state.addInboundMessage
|
||||
args:
|
||||
- conversation:
|
||||
id:
|
||||
ref: config.conversationId
|
||||
kind: direct
|
||||
senderId: alice
|
||||
senderName:
|
||||
ref: config.senderName
|
||||
text:
|
||||
ref: turn
|
||||
- call: waitForOutboundMessage
|
||||
saveAs: latestOutbound
|
||||
args:
|
||||
- ref: state
|
||||
- lambda:
|
||||
params: [candidate]
|
||||
expr: "candidate.conversation.id === config.conversationId && candidate.text.trim().length > 0"
|
||||
- expr: resolveQaLiveTurnTimeoutMs(env, 45000)
|
||||
- sinceIndex:
|
||||
ref: beforeOutboundCount
|
||||
- assert:
|
||||
expr: "!config.forbiddenNeedles.some((needle) => normalizeLowercaseStringOrEmpty(latestOutbound.text).includes(needle))"
|
||||
message:
|
||||
expr: "`gollum improv turn ${String(turnIndex)} hit fallback/error text: ${latestOutbound.text}`"
|
||||
- assert:
|
||||
expr: "state.getSnapshot().messages.filter((message) => message.direction === 'outbound' && message.conversation.id === config.conversationId).length === config.turns.length"
|
||||
message: missing one or more Gollum replies
|
||||
detailsExpr: "formatConversationTranscript(state, { conversationId: config.conversationId })"
|
||||
```
|
||||
Reference in New Issue
Block a user