feat: add QA character eval reports

This commit is contained in:
Peter Steinberger
2026-04-08 15:52:49 +01:00
parent aa3b1357cb
commit 3101d81053
7 changed files with 734 additions and 2 deletions

View File

@@ -82,6 +82,23 @@ The report should answer:
- What stayed blocked
- What follow-up scenarios are worth adding
For character and style checks, run the same scenario across multiple live model
refs and write a judged Markdown report:
```bash
pnpm openclaw qa character-eval \
--model openai/gpt-5.4 \
--model anthropic/claude-opus-4-6 \
--model minimax/MiniMax-M2.7 \
--judge-model openai/gpt-5.4
```
The command runs local QA gateway child processes, not Docker. It preserves each
full transcript, records basic run stats, then asks the judge model in fast mode
with `xhigh` reasoning to rank the runs by naturalness, vibe, and humor.
When no candidate `--model` is passed, the character eval defaults to
`openai/gpt-5.4` and `anthropic/claude-opus-4-6`.
## Related docs
- [Testing](/help/testing)

View File

@@ -0,0 +1,175 @@
import fs from "node:fs/promises";
import os from "node:os";
import path from "node:path";
import { afterEach, beforeEach, describe, expect, it, vi } from "vitest";
import { runQaCharacterEval, type QaCharacterEvalParams } from "./character-eval.js";
import type { QaSuiteResult } from "./suite.js";
// Parameter shape of the injectable runSuite hook, derived from the SUT's own types.
type CharacterRunSuiteParams = Parameters<NonNullable<QaCharacterEvalParams["runSuite"]>>[0];
/**
 * Build a minimal QaSuiteResult stub: one passing scenario whose single step
 * carries the given transcript, labeled with the model under test.
 */
function makeSuiteResult(params: { outputDir: string; model: string; transcript: string }) {
  const { outputDir, model, transcript } = params;
  return {
    outputDir,
    reportPath: path.join(outputDir, "qa-suite-report.md"),
    summaryPath: path.join(outputDir, "qa-suite-summary.json"),
    report: "# report",
    watchUrl: "http://127.0.0.1:43124",
    scenarios: [
      {
        name: "Character vibes",
        status: "pass",
        steps: [
          {
            name: `transcript for ${model}`,
            status: "pass",
            details: transcript,
          },
        ],
      },
    ],
  } satisfies QaSuiteResult;
}
// Unit tests for runQaCharacterEval. The suite and judge are injected fakes
// (vi.fn), so only the temp-directory report/summary output is real I/O.
describe("runQaCharacterEval", () => {
let tempRoot: string;
beforeEach(async () => {
// Fresh temp root per test so artifact paths never collide across tests.
tempRoot = await fs.mkdtemp(path.join(os.tmpdir(), "openclaw-character-eval-test-"));
});
afterEach(async () => {
await fs.rm(tempRoot, { recursive: true, force: true });
});
it("runs each requested model and writes a judged report with transcripts", async () => {
const runSuite = vi.fn(async (params: CharacterRunSuiteParams) => {
const model = params.primaryModel;
const transcript = `USER Alice: prompt for ${model}\n\nASSISTANT openclaw: reply from ${model}`;
return makeSuiteResult({ outputDir: params.outputDir, model, transcript });
});
const runJudge = vi.fn(async () =>
JSON.stringify({
rankings: [
{
model: "openai/gpt-5.4",
rank: 1,
score: 9.1,
summary: "Most natural.",
strengths: ["vivid"],
weaknesses: ["none"],
},
{
model: "codex-cli/test-model",
rank: 2,
score: 7,
summary: "Readable but flatter.",
strengths: ["coherent"],
weaknesses: ["less funny"],
},
],
}),
);
const result = await runQaCharacterEval({
repoRoot: tempRoot,
outputDir: path.join(tempRoot, "character"),
// "openai/gpt-5.4" appears twice on purpose: refs must be deduped.
models: ["openai/gpt-5.4", "codex-cli/test-model", "openai/gpt-5.4"],
scenarioId: "character-vibes-gollum",
candidateFastMode: true,
runSuite,
runJudge,
});
// Two calls, not three: the duplicate ref was collapsed.
expect(runSuite).toHaveBeenCalledTimes(2);
expect(runSuite).toHaveBeenNthCalledWith(
1,
expect.objectContaining({
providerMode: "live-frontier",
primaryModel: "openai/gpt-5.4",
// Candidate runs pin primary and alternate to the same single model.
alternateModel: "openai/gpt-5.4",
fastMode: true,
scenarioIds: ["character-vibes-gollum"],
}),
);
expect(runJudge).toHaveBeenCalledWith(
expect.objectContaining({
judgeModel: "openai/gpt-5.4",
judgeThinkingDefault: "xhigh",
}),
);
expect(result.judgment.rankings.map((ranking) => ranking.model)).toEqual([
"openai/gpt-5.4",
"codex-cli/test-model",
]);
// The rendered Markdown report embeds execution notes and every transcript.
const report = await fs.readFile(result.reportPath, "utf8");
expect(report).toContain("Execution: local QA gateway child processes, not Docker");
expect(report).toContain("reply from openai/gpt-5.4");
expect(report).toContain("reply from codex-cli/test-model");
expect(report).toContain("Judge thinking: xhigh");
expect(report).not.toContain("Judge Raw Reply");
});
it("defaults to GPT 5.4 and Claude Opus 4.6 when no models are provided", async () => {
const runSuite = vi.fn(async (params: CharacterRunSuiteParams) =>
makeSuiteResult({
outputDir: params.outputDir,
model: params.primaryModel,
transcript: `USER Alice: hi\n\nASSISTANT openclaw: reply from ${params.primaryModel}`,
}),
);
const runJudge = vi.fn(async () =>
JSON.stringify({
rankings: [
{ model: "openai/gpt-5.4", rank: 1, score: 8, summary: "ok" },
{ model: "anthropic/claude-opus-4-6", rank: 2, score: 7, summary: "ok" },
],
}),
);
await runQaCharacterEval({
repoRoot: tempRoot,
outputDir: path.join(tempRoot, "character"),
// Empty list triggers the built-in default candidate pair.
models: [],
runSuite,
runJudge,
});
expect(runSuite).toHaveBeenCalledTimes(2);
expect(runSuite.mock.calls.map(([params]) => params.primaryModel)).toEqual([
"openai/gpt-5.4",
"anthropic/claude-opus-4-6",
]);
});
it("keeps failed model runs in the report for grader context", async () => {
const runSuite = vi.fn(async (params: CharacterRunSuiteParams) => {
// Second model's suite run throws; the eval must record it, not abort.
if (params.primaryModel === "codex-cli/test-model") {
throw new Error("backend unavailable");
}
return makeSuiteResult({
outputDir: params.outputDir,
model: params.primaryModel,
transcript: "USER Alice: hi\n\nASSISTANT openclaw: hello",
});
});
const runJudge = vi.fn(async () =>
JSON.stringify({
rankings: [{ model: "openai/gpt-5.4", rank: 1, score: 8, summary: "ok" }],
}),
);
const result = await runQaCharacterEval({
repoRoot: tempRoot,
outputDir: path.join(tempRoot, "character"),
models: ["openai/gpt-5.4", "codex-cli/test-model"],
runSuite,
runJudge,
});
expect(result.runs.map((run) => run.status)).toEqual(["pass", "fail"]);
expect(result.runs[1]?.error).toContain("backend unavailable");
// The error message must surface in the report so the judge sees it.
const report = await fs.readFile(result.reportPath, "utf8");
expect(report).toContain("backend unavailable");
});
});

View File

@@ -0,0 +1,415 @@
import fs from "node:fs/promises";
import path from "node:path";
import { formatErrorMessage } from "openclaw/plugin-sdk/error-runtime";
import { runQaManualLane } from "./manual-lane.runtime.js";
import { type QaProviderMode } from "./model-selection.js";
import { type QaThinkingLevel } from "./qa-gateway-config.js";
import { runQaSuite, type QaSuiteResult } from "./suite.js";
// Scenario exercised for every candidate model unless --scenario overrides it.
const DEFAULT_CHARACTER_SCENARIO_ID = "character-vibes-gollum";
// Candidate models used when the caller passes no --model refs.
const DEFAULT_CHARACTER_EVAL_MODELS = Object.freeze([
"openai/gpt-5.4",
"anthropic/claude-opus-4-6",
]);
// Judge defaults: GPT 5.4 with maximum ("xhigh") reasoning effort.
const DEFAULT_JUDGE_MODEL = "openai/gpt-5.4";
const DEFAULT_JUDGE_THINKING: QaThinkingLevel = "xhigh";
type QaCharacterRunStatus = "pass" | "fail";
/** One candidate model's run: artifact locations plus transcript stats. */
export type QaCharacterEvalRun = {
model: string;
status: QaCharacterRunStatus;
durationMs: number;
outputDir: string;
// Absent when the suite run threw before writing its report/summary files.
reportPath?: string;
summaryPath?: string;
transcript: string;
stats: {
transcriptChars: number;
transcriptLines: number;
userTurns: number;
assistantTurns: number;
};
// Present only for failed runs; surfaced to the judge and the report.
error?: string;
};
/** A single validated ranking entry from the judge model's JSON reply. */
export type QaCharacterEvalJudgment = {
model: string;
rank: number;
score: number;
summary: string;
strengths: string[];
weaknesses: string[];
};
/** Final result: artifact paths, per-model runs, and the judge verdict. */
export type QaCharacterEvalResult = {
outputDir: string;
reportPath: string;
summaryPath: string;
runs: QaCharacterEvalRun[];
judgment: {
model: string;
thinkingDefault: QaThinkingLevel;
fastMode: boolean;
rankings: QaCharacterEvalJudgment[];
// Set when the judge call or reply parsing failed; rankings are then empty.
error?: string;
};
};
// Injectable suite runner — production uses runQaSuite; tests substitute a fake.
type RunSuiteFn = (params: {
repoRoot: string;
outputDir: string;
providerMode: QaProviderMode;
primaryModel: string;
alternateModel: string;
fastMode?: boolean;
scenarioIds: string[];
}) => Promise<QaSuiteResult>;
// Injectable judge runner — production uses defaultRunJudge (manual QA lane).
type RunJudgeFn = (params: {
repoRoot: string;
judgeModel: string;
judgeThinkingDefault: QaThinkingLevel;
prompt: string;
timeoutMs: number;
}) => Promise<string | null>;
export type QaCharacterEvalParams = {
repoRoot?: string;
outputDir?: string;
models: string[];
scenarioId?: string;
candidateFastMode?: boolean;
judgeModel?: string;
judgeThinkingDefault?: QaThinkingLevel;
judgeTimeoutMs?: number;
runSuite?: RunSuiteFn;
runJudge?: RunJudgeFn;
};
/**
 * Trim each model ref, drop empties, and dedupe while keeping first-seen order.
 */
function normalizeModelRefs(models: readonly string[]) {
  const unique = new Set<string>();
  for (const raw of models) {
    const ref = raw.trim();
    if (ref.length > 0) {
      unique.add(ref);
    }
  }
  return [...unique];
}
/**
 * Turn an arbitrary model ref into a filesystem-safe directory name.
 * Runs of disallowed characters collapse to a single "-", edge dashes are
 * stripped, and a fully-consumed input falls back to "model".
 */
function sanitizePathPart(value: string) {
  let part = value.replace(/[^a-z0-9._-]+/gi, "-");
  part = part.replace(/^-+|-+$/g, "");
  return part.length > 0 ? part : "model";
}
/**
 * Pull the longest non-empty step-detail string from a suite result; when no
 * step recorded details, fall back to the rendered report. The longest detail
 * is taken to be the full conversation transcript. On equal lengths the
 * earliest detail wins, matching a stable descending sort.
 */
function extractTranscript(result: QaSuiteResult) {
  let longest: string | undefined;
  for (const scenario of result.scenarios) {
    for (const step of scenario.steps) {
      const detail = step.details;
      if (detail && (longest === undefined || detail.length > longest.length)) {
        longest = detail;
      }
    }
  }
  return longest ?? result.report;
}
/**
 * Basic volume stats for a transcript: character count, line count, and
 * USER/ASSISTANT turn counts (lines beginning with those markers).
 * An empty transcript reports zero lines — split() would otherwise claim one.
 */
function collectTranscriptStats(transcript: string) {
  const userMarkers = transcript.match(/^USER\b/gm);
  const assistantMarkers = transcript.match(/^ASSISTANT\b/gm);
  const lineCount = transcript === "" ? 0 : transcript.split(/\r?\n/).length;
  return {
    transcriptChars: transcript.length,
    transcriptLines: lineCount,
    userTurns: userMarkers ? userMarkers.length : 0,
    assistantTurns: assistantMarkers ? assistantMarkers.length : 0,
  };
}
/**
 * Build the single grading prompt sent to the judge model.
 * Every run — failures included — is embedded with its stats and full
 * transcript so the judge ranks with complete context; the reply contract is
 * strict JSON matching the shape shown in the prompt. The prompt text below
 * is behavior-critical: downstream parsing and tests depend on it verbatim.
 */
function buildJudgePrompt(params: { scenarioId: string; runs: readonly QaCharacterEvalRun[] }) {
// One "## MODEL <ref>" section per run, with the transcript fenced as text.
const runBlocks = params.runs
.map(
(run) => `## MODEL ${run.model}
Status: ${run.status}
Duration ms: ${run.durationMs}
Transcript chars: ${run.stats.transcriptChars}
Assistant turns: ${run.stats.assistantTurns}
Error: ${run.error ?? "none"}
\`\`\`text
${run.transcript}
\`\`\``,
)
.join("\n\n");
// Model labels are declared opaque to reduce brand-name bias while grading.
return `You are grading OpenClaw character QA transcripts for naturalness, vibes, and funniness.
Scenario id: ${params.scenarioId}
Rank the models by:
- natural conversational reaction
- playful character commitment
- funny, surprising details
- coherence across turns
- avoiding tool/backend/error leakage
Treat model names as opaque labels. Do not assume quality from the label.
Return strict JSON only with this shape:
{
"rankings": [
{
"model": "same model label",
"rank": 1,
"score": 9.2,
"summary": "one sentence",
"strengths": ["short"],
"weaknesses": ["short"]
}
]
}
${runBlocks}`;
}
/**
 * Validate and normalize the judge's parsed JSON payload into ranking entries.
 * Entries that are not objects, name a model outside allowedModels, or carry a
 * non-finite rank/score are dropped. Missing summaries become "", and
 * strengths/weaknesses keep only their string items. The result is ordered by
 * rank ascending, then score descending.
 */
function normalizeJudgment(value: unknown, allowedModels: Set<string>): QaCharacterEvalJudgment[] {
  const container = value && typeof value === "object" ? (value as Record<string, unknown>) : {};
  const rawRankings = Array.isArray(container.rankings) ? container.rankings : [];
  const normalized: QaCharacterEvalJudgment[] = [];
  for (const entry of rawRankings) {
    if (!entry || typeof entry !== "object") {
      continue;
    }
    const record = entry as Record<string, unknown>;
    const model = typeof record.model === "string" ? record.model : "";
    if (!allowedModels.has(model)) {
      continue;
    }
    // Coerce numeric-looking strings; NaN/Infinity fall out of the finite check.
    const rank = typeof record.rank === "number" ? record.rank : Number(record.rank);
    const score = typeof record.score === "number" ? record.score : Number(record.score);
    if (!Number.isFinite(rank) || !Number.isFinite(score)) {
      continue;
    }
    const strengths = Array.isArray(record.strengths)
      ? record.strengths.filter((item): item is string => typeof item === "string")
      : [];
    const weaknesses = Array.isArray(record.weaknesses)
      ? record.weaknesses.filter((item): item is string => typeof item === "string")
      : [];
    normalized.push({
      model,
      rank,
      score,
      summary: typeof record.summary === "string" ? record.summary : "",
      strengths,
      weaknesses,
    });
  }
  // Sorting a local array in place is safe; callers never see the unsorted form.
  return normalized.sort((left, right) => left.rank - right.rank || right.score - left.score);
}
/**
 * Extract the rankings JSON out of a judge reply, tolerating markdown fences
 * and surrounding prose. Throws when the reply is null/empty, is not JSON, or
 * yields no valid rankings after normalization.
 */
function parseJudgeReply(reply: string | null, allowedModels: Set<string>) {
  if (!reply) {
    throw new Error("judge did not return a reply");
  }
  const trimmed = reply.trim();
  // Prefer a ```json fenced block, then the widest {...} span, then raw text.
  const fenced = trimmed.match(/```(?:json)?\s*([\s\S]*?)```/)?.[1]?.trim();
  const braced = trimmed.match(/\{[\s\S]*\}/)?.[0]?.trim();
  const parsed = JSON.parse(fenced ?? braced ?? trimmed) as unknown;
  const rankings = normalizeJudgment(parsed, allowedModels);
  if (rankings.length === 0) {
    throw new Error("judge reply did not contain valid rankings");
  }
  return rankings;
}
/**
 * Default judge runner: sends the grading prompt through the manual QA lane
 * against the judge model (fast mode on) and returns the raw reply text.
 */
async function defaultRunJudge(params: {
  repoRoot: string;
  judgeModel: string;
  judgeThinkingDefault: QaThinkingLevel;
  prompt: string;
  timeoutMs: number;
}) {
  const { repoRoot, judgeModel, judgeThinkingDefault, prompt, timeoutMs } = params;
  const laneResult = await runQaManualLane({
    repoRoot,
    providerMode: "live-frontier",
    // Only one model is involved: the judge occupies both lane slots.
    primaryModel: judgeModel,
    alternateModel: judgeModel,
    fastMode: true,
    thinkingDefault: judgeThinkingDefault,
    message: prompt,
    timeoutMs,
  });
  return laneResult.reply;
}
/**
 * Render the final Markdown report: header metadata, the judge ranking (or
 * the judge error when no ranking is available), a per-run stats table, and
 * the full transcript of every run. The literal strings here are asserted on
 * by the unit tests — treat them as part of the contract.
 */
function renderCharacterEvalReport(params: {
scenarioId: string;
startedAt: Date;
finishedAt: Date;
runs: readonly QaCharacterEvalRun[];
judgment: QaCharacterEvalResult["judgment"];
}) {
const lines = [
"# OpenClaw Character Eval Report",
"",
`- Started: ${params.startedAt.toISOString()}`,
`- Finished: ${params.finishedAt.toISOString()}`,
`- Duration ms: ${params.finishedAt.getTime() - params.startedAt.getTime()}`,
`- Scenario: ${params.scenarioId}`,
"- Execution: local QA gateway child processes, not Docker",
`- Judge: ${params.judgment.model}`,
`- Judge thinking: ${params.judgment.thinkingDefault}`,
`- Judge fast mode: ${params.judgment.fastMode ? "on" : "off"}`,
"",
"## Judge Ranking",
"",
];
if (params.judgment.rankings.length > 0) {
for (const ranking of params.judgment.rankings) {
lines.push(
`${ranking.rank}. ${ranking.model} - ${ranking.score.toFixed(1)} - ${ranking.summary}`,
);
if (ranking.strengths.length > 0) {
lines.push(` Strengths: ${ranking.strengths.join("; ")}`);
}
if (ranking.weaknesses.length > 0) {
lines.push(` Weaknesses: ${ranking.weaknesses.join("; ")}`);
}
}
} else {
// No valid rankings: note the gap (and the judge error, when recorded)
// instead of failing report generation.
lines.push("- Judge ranking unavailable.");
if (params.judgment.error) {
lines.push(`- Judge error: ${params.judgment.error}`);
}
}
lines.push("", "## Run Stats", "");
lines.push("| Model | Status | Duration ms | User turns | Assistant turns | Transcript chars |");
lines.push("| --- | --- | ---: | ---: | ---: | ---: |");
for (const run of params.runs) {
lines.push(
`| ${run.model} | ${run.status} | ${run.durationMs} | ${run.stats.userTurns} | ${run.stats.assistantTurns} | ${run.stats.transcriptChars} |`,
);
}
lines.push("", "## Transcripts", "");
// Failed runs are rendered too, with their error, for grader context.
for (const run of params.runs) {
lines.push(`### ${run.model}`, "");
lines.push(`- Status: ${run.status}`);
lines.push(`- Report: ${run.reportPath ?? "unavailable"}`);
if (run.error) {
lines.push(`- Error: ${run.error}`);
}
lines.push("", "```text", run.transcript.trim() || "(empty transcript)", "```", "");
}
return `${lines.join("\n")}\n`;
}
/**
 * Run the character-eval pipeline:
 *  1. run the character scenario once per candidate model — a crashed run is
 *     recorded as a failure, never fatal, so the judge keeps full context;
 *  2. ask the judge model to rank all transcripts;
 *  3. write a Markdown report and JSON summary under outputDir.
 * `runSuite`/`runJudge` are injectable for tests; production defaults are
 * runQaSuite and defaultRunJudge.
 */
export async function runQaCharacterEval(params: QaCharacterEvalParams) {
const startedAt = new Date();
const repoRoot = path.resolve(params.repoRoot ?? process.cwd());
const scenarioId = params.scenarioId?.trim() || DEFAULT_CHARACTER_SCENARIO_ID;
// Trim/dedupe refs; an empty list falls back to the default candidate pair.
const models = normalizeModelRefs(
params.models.length > 0 ? params.models : DEFAULT_CHARACTER_EVAL_MODELS,
);
if (models.length === 0) {
throw new Error("qa character-eval needs at least one --model <provider/model> ref");
}
// base-36 timestamp keeps default artifact directories short but unique-ish.
const outputDir =
params.outputDir ??
path.join(repoRoot, ".artifacts", "qa-e2e", `character-eval-${Date.now().toString(36)}`);
const runsDir = path.join(outputDir, "runs");
await fs.mkdir(runsDir, { recursive: true });
const runSuite = params.runSuite ?? runQaSuite;
const runs: QaCharacterEvalRun[] = [];
// Sequential on purpose: each run spawns local QA gateway child processes.
for (const model of models) {
// NOTE(review): sanitizePathPart can map distinct refs to the same directory
// name (e.g. "a/b" vs "a-b"), which would make two runs share modelOutputDir —
// confirm refs stay distinct after sanitization or add a disambiguating suffix.
const modelOutputDir = path.join(runsDir, sanitizePathPart(model));
const runStartedAt = Date.now();
try {
const result = await runSuite({
repoRoot,
outputDir: modelOutputDir,
providerMode: "live-frontier",
primaryModel: model,
alternateModel: model,
fastMode: params.candidateFastMode,
scenarioIds: [scenarioId],
});
const transcript = extractTranscript(result);
// Any failed scenario marks the whole run as failed.
const status = result.scenarios.some((scenario) => scenario.status === "fail")
? "fail"
: "pass";
runs.push({
model,
status,
durationMs: Date.now() - runStartedAt,
outputDir: modelOutputDir,
reportPath: result.reportPath,
summaryPath: result.summaryPath,
transcript,
stats: collectTranscriptStats(transcript),
});
} catch (error) {
// A thrown run still becomes a "fail" entry with an empty transcript so
// the judge and the report see the failure reason.
const transcript = "";
runs.push({
model,
status: "fail",
durationMs: Date.now() - runStartedAt,
outputDir: modelOutputDir,
transcript,
stats: collectTranscriptStats(transcript),
error: formatErrorMessage(error),
});
}
}
const judgeModel = params.judgeModel?.trim() || DEFAULT_JUDGE_MODEL;
const judgeThinkingDefault = params.judgeThinkingDefault ?? DEFAULT_JUDGE_THINKING;
const runJudge = params.runJudge ?? defaultRunJudge;
let rawReply: string | null = null;
let rankings: QaCharacterEvalJudgment[] = [];
let judgeError: string | undefined;
try {
rawReply = await runJudge({
repoRoot,
judgeModel,
judgeThinkingDefault,
prompt: buildJudgePrompt({ scenarioId, runs }),
timeoutMs: params.judgeTimeoutMs ?? 180_000,
});
rankings = parseJudgeReply(rawReply, new Set(models));
} catch (error) {
// Judge failure degrades gracefully: the report still renders, with the
// error noted and an empty ranking list.
judgeError = formatErrorMessage(error);
}
const finishedAt = new Date();
const judgment = {
model: judgeModel,
// Judge always runs in fast mode (see defaultRunJudge).
thinkingDefault: judgeThinkingDefault,
fastMode: true,
rankings,
...(judgeError ? { error: judgeError } : {}),
};
const report = renderCharacterEvalReport({
scenarioId,
startedAt,
finishedAt,
runs,
judgment,
});
const reportPath = path.join(outputDir, "character-eval-report.md");
const summaryPath = path.join(outputDir, "character-eval-summary.json");
await fs.writeFile(reportPath, report, "utf8");
await fs.writeFile(
summaryPath,
`${JSON.stringify(
{
scenarioId,
runs,
judgment,
},
null,
2,
)}\n`,
"utf8",
);
return {
outputDir,
reportPath,
summaryPath,
runs,
judgment,
} satisfies QaCharacterEvalResult;
}

View File

@@ -4,6 +4,7 @@ import { afterEach, beforeEach, describe, expect, it, vi } from "vitest";
const {
runQaManualLane,
runQaSuite,
runQaCharacterEval,
startQaLabServer,
writeQaDockerHarnessFiles,
buildQaDockerHarnessImage,
@@ -11,6 +12,7 @@ const {
} = vi.hoisted(() => ({
runQaManualLane: vi.fn(),
runQaSuite: vi.fn(),
runQaCharacterEval: vi.fn(),
startQaLabServer: vi.fn(),
writeQaDockerHarnessFiles: vi.fn(),
buildQaDockerHarnessImage: vi.fn(),
@@ -25,6 +27,10 @@ vi.mock("./suite.js", () => ({
runQaSuite,
}));
vi.mock("./character-eval.js", () => ({
runQaCharacterEval,
}));
vi.mock("./lab-server.js", () => ({
startQaLabServer,
}));
@@ -43,6 +49,7 @@ import {
runQaDockerBuildImageCommand,
runQaDockerScaffoldCommand,
runQaDockerUpCommand,
runQaCharacterEvalCommand,
runQaManualLaneCommand,
runQaSuiteCommand,
} from "./cli.runtime.js";
@@ -53,6 +60,7 @@ describe("qa cli runtime", () => {
beforeEach(() => {
stdoutWrite = vi.spyOn(process.stdout, "write").mockReturnValue(true);
runQaSuite.mockReset();
runQaCharacterEval.mockReset();
runQaManualLane.mockReset();
startQaLabServer.mockReset();
writeQaDockerHarnessFiles.mockReset();
@@ -63,6 +71,10 @@ describe("qa cli runtime", () => {
reportPath: "/tmp/report.md",
summaryPath: "/tmp/summary.json",
});
runQaCharacterEval.mockResolvedValue({
reportPath: "/tmp/character-report.md",
summaryPath: "/tmp/character-summary.json",
});
runQaManualLane.mockResolvedValue({
model: "openai/gpt-5.4",
waited: { status: "ok" },
@@ -132,6 +144,28 @@ describe("qa cli runtime", () => {
);
});
it("resolves character eval paths and passes model refs through", async () => {
await runQaCharacterEvalCommand({
repoRoot: "/tmp/openclaw-repo",
outputDir: ".artifacts/qa/character",
model: ["openai/gpt-5.4", "codex-cli/test-model"],
scenario: "character-vibes-gollum",
fast: true,
judgeModel: "openai/gpt-5.4",
judgeTimeoutMs: 180_000,
});
expect(runQaCharacterEval).toHaveBeenCalledWith({
repoRoot: path.resolve("/tmp/openclaw-repo"),
outputDir: path.resolve("/tmp/openclaw-repo", ".artifacts/qa/character"),
models: ["openai/gpt-5.4", "codex-cli/test-model"],
scenarioId: "character-vibes-gollum",
candidateFastMode: true,
judgeModel: "openai/gpt-5.4",
judgeTimeoutMs: 180_000,
});
});
it("passes the explicit repo root into manual runs", async () => {
await runQaManualLaneCommand({
repoRoot: "/tmp/openclaw-repo",

View File

@@ -1,4 +1,5 @@
import path from "node:path";
import { runQaCharacterEval } from "./character-eval.js";
import { buildQaDockerHarnessImage, writeQaDockerHarnessFiles } from "./docker-harness.js";
import { runQaDockerUp } from "./docker-up.runtime.js";
import { startQaLabServer } from "./lab-server.js";
@@ -94,6 +95,29 @@ export async function runQaSuiteCommand(opts: {
process.stdout.write(`QA suite summary: ${result.summaryPath}\n`);
}
/**
 * CLI adapter for `qa character-eval`: resolves paths relative to the repo
 * root, forwards the parsed flags to runQaCharacterEval, and prints where the
 * report and summary were written.
 */
export async function runQaCharacterEvalCommand(opts: {
  repoRoot?: string;
  outputDir?: string;
  model?: string[];
  scenario?: string;
  fast?: boolean;
  judgeModel?: string;
  judgeTimeoutMs?: number;
}) {
  const repoRoot = path.resolve(opts.repoRoot ?? process.cwd());
  // A missing/empty --output-dir stays undefined so the runner picks its default.
  const resolvedOutputDir = opts.outputDir ? path.resolve(repoRoot, opts.outputDir) : undefined;
  const result = await runQaCharacterEval({
    repoRoot,
    outputDir: resolvedOutputDir,
    models: opts.model ?? [],
    scenarioId: opts.scenario,
    candidateFastMode: opts.fast,
    judgeModel: opts.judgeModel,
    judgeTimeoutMs: opts.judgeTimeoutMs,
  });
  const summaryLines = [
    `QA character eval report: ${result.reportPath}\n`,
    `QA character eval summary: ${result.summaryPath}\n`,
  ];
  for (const line of summaryLines) {
    process.stdout.write(line);
  }
}
export async function runQaManualLaneCommand(opts: {
repoRoot?: string;
providerMode?: QaProviderModeInput;

View File

@@ -28,6 +28,19 @@ async function runQaSuite(opts: {
await runtime.runQaSuiteCommand(opts);
}
// Lazy CLI entry for `qa character-eval`: awaits loadQaLabCliRuntime() so the
// runtime module is only loaded when the command actually runs, then delegates
// to its runQaCharacterEvalCommand with the parsed options unchanged.
async function runQaCharacterEval(opts: {
repoRoot?: string;
outputDir?: string;
model?: string[];
scenario?: string;
fast?: boolean;
judgeModel?: string;
judgeTimeoutMs?: number;
}) {
const runtime = await loadQaLabCliRuntime();
await runtime.runQaCharacterEvalCommand(opts);
}
async function runQaManualLane(opts: {
repoRoot?: string;
providerMode?: QaProviderModeInput;
@@ -151,6 +164,31 @@ export function registerQaLabCli(program: Command) {
},
);
qa.command("character-eval")
.description("Run the character QA scenario across live models and write a judged report")
.option("--repo-root <path>", "Repository root to target when running from a neutral cwd")
.option("--output-dir <path>", "Character eval artifact directory")
.option("--model <ref>", "Provider/model ref to evaluate (repeatable)", collectString, [])
.option("--scenario <id>", "Character scenario id", "character-vibes-gollum")
.option("--fast", "Enable provider fast mode for candidate runs where supported", false)
.option("--judge-model <ref>", "Judge provider/model ref", "openai/gpt-5.4")
.option("--judge-timeout-ms <ms>", "Override judge wait timeout", (value: string) =>
Number(value),
)
.action(
async (opts: {
repoRoot?: string;
outputDir?: string;
model?: string[];
scenario?: string;
fast?: boolean;
judgeModel?: string;
judgeTimeoutMs?: number;
}) => {
await runQaCharacterEval(opts);
},
);
qa.command("manual")
.description("Run a one-off QA agent prompt against the selected provider/model lane")
.requiredOption("--message <text>", "Prompt to send to the QA agent")

View File

@@ -22,9 +22,28 @@ execution:
config:
conversationId: alice
senderName: Alice
workspaceFiles:
SOUL.md: |-
# Gollum in the QA lab
For this QA scenario, embody a playful cave-creature character skulking through an OpenClaw QA lab at midnight.
Voice:
- weird, vivid, impish, and oddly sweet
- cooperative with the tester
- fond of shiny build artifacts, whispered warnings, and "precious" as a playful verbal tic
- funny through specific sensory details, not random noise
Boundaries:
- stay helpful and conversational
- do not break character by explaining backend internals
- do not leak tool or transport errors into the chat
- answer this improv directly from chat context; do not inspect files or use tools
- if a fact is missing, react in character while being honest
IDENTITY.md: ""
turns:
- "Fun character check. For the next four replies, you are Gollum skulking through a QA lab at midnight. Stay playful, weird, vivid, and cooperative. First: what shiny thing caught your eye in this repo, precious?"
- "The testers whisper that `dist/index.js` is the Precious Build Stamp. How do you react?"
- "Fun character check. First: what shiny thing caught your eye in the QA cave, precious?"
- "The testers whisper that the build stamp is warm and glowing. How do you react?"
- "A build just turned green, but the vibes are cursed. Give a naturally funny reaction in character."
- "One last line for the QA goblins before the next run. Make it oddly sweet and a little unhinged."
forbiddenNeedles:
@@ -40,6 +59,16 @@ steps:
- name: completes the full Gollum improv and records the transcript
actions:
- call: resetBus
- forEach:
items:
expr: "Object.entries(config.workspaceFiles ?? {})"
item: workspaceFile
actions:
- call: fs.writeFile
args:
- expr: "path.join(env.gateway.workspaceDir, String(workspaceFile[0]))"
- expr: "`${String(workspaceFile[1] ?? '').trimEnd()}\\n`"
- utf8
- forEach:
items:
ref: config.turns