mirror of
https://github.com/openclaw/openclaw.git
synced 2026-04-21 22:21:33 +00:00
feat: add QA character eval reports
This commit is contained in:
@@ -82,6 +82,23 @@ The report should answer:
- What stayed blocked
- What follow-up scenarios are worth adding

For character and style checks, run the same scenario across multiple live model
refs and write a judged Markdown report:

```bash
pnpm openclaw qa character-eval \
  --model openai/gpt-5.4 \
  --model anthropic/claude-opus-4-6 \
  --model minimax/MiniMax-M2.7 \
  --judge-model openai/gpt-5.4
```

The command runs local QA gateway child processes, not Docker. It preserves each
full transcript, records basic run stats, then asks the judge model in fast mode
with `xhigh` reasoning to rank the runs by naturalness, vibe, and humor.
When no candidate `--model` is passed, the character eval defaults to
`openai/gpt-5.4` and `anthropic/claude-opus-4-6`.

## Related docs

- [Testing](/help/testing)
175
extensions/qa-lab/src/character-eval.test.ts
Normal file
175
extensions/qa-lab/src/character-eval.test.ts
Normal file
@@ -0,0 +1,175 @@
|
||||
import fs from "node:fs/promises";
|
||||
import os from "node:os";
|
||||
import path from "node:path";
|
||||
import { afterEach, beforeEach, describe, expect, it, vi } from "vitest";
|
||||
import { runQaCharacterEval, type QaCharacterEvalParams } from "./character-eval.js";
|
||||
import type { QaSuiteResult } from "./suite.js";
|
||||
|
||||
type CharacterRunSuiteParams = Parameters<NonNullable<QaCharacterEvalParams["runSuite"]>>[0];
|
||||
|
||||
function makeSuiteResult(params: { outputDir: string; model: string; transcript: string }) {
|
||||
return {
|
||||
outputDir: params.outputDir,
|
||||
reportPath: path.join(params.outputDir, "qa-suite-report.md"),
|
||||
summaryPath: path.join(params.outputDir, "qa-suite-summary.json"),
|
||||
report: "# report",
|
||||
watchUrl: "http://127.0.0.1:43124",
|
||||
scenarios: [
|
||||
{
|
||||
name: "Character vibes",
|
||||
status: "pass",
|
||||
steps: [
|
||||
{
|
||||
name: `transcript for ${params.model}`,
|
||||
status: "pass",
|
||||
details: params.transcript,
|
||||
},
|
||||
],
|
||||
},
|
||||
],
|
||||
} satisfies QaSuiteResult;
|
||||
}
|
||||
|
||||
describe("runQaCharacterEval", () => {
  // Fresh temp directory per test, used as the repo root; removed in afterEach.
  let tempRoot: string;

  beforeEach(async () => {
    tempRoot = await fs.mkdtemp(path.join(os.tmpdir(), "openclaw-character-eval-test-"));
  });

  afterEach(async () => {
    await fs.rm(tempRoot, { recursive: true, force: true });
  });

  it("runs each requested model and writes a judged report with transcripts", async () => {
    // Stub suite runner: echoes the candidate model into a one-step transcript.
    const runSuite = vi.fn(async (params: CharacterRunSuiteParams) => {
      const model = params.primaryModel;
      const transcript = `USER Alice: prompt for ${model}\n\nASSISTANT openclaw: reply from ${model}`;
      return makeSuiteResult({ outputDir: params.outputDir, model, transcript });
    });
    // Stub judge: strict-JSON rankings covering both candidate models.
    const runJudge = vi.fn(async () =>
      JSON.stringify({
        rankings: [
          {
            model: "openai/gpt-5.4",
            rank: 1,
            score: 9.1,
            summary: "Most natural.",
            strengths: ["vivid"],
            weaknesses: ["none"],
          },
          {
            model: "codex-cli/test-model",
            rank: 2,
            score: 7,
            summary: "Readable but flatter.",
            strengths: ["coherent"],
            weaknesses: ["less funny"],
          },
        ],
      }),
    );

    // "openai/gpt-5.4" appears twice on purpose: it must be deduped to 2 runs.
    const result = await runQaCharacterEval({
      repoRoot: tempRoot,
      outputDir: path.join(tempRoot, "character"),
      models: ["openai/gpt-5.4", "codex-cli/test-model", "openai/gpt-5.4"],
      scenarioId: "character-vibes-gollum",
      candidateFastMode: true,
      runSuite,
      runJudge,
    });

    expect(runSuite).toHaveBeenCalledTimes(2);
    expect(runSuite).toHaveBeenNthCalledWith(
      1,
      expect.objectContaining({
        providerMode: "live-frontier",
        primaryModel: "openai/gpt-5.4",
        alternateModel: "openai/gpt-5.4",
        fastMode: true,
        scenarioIds: ["character-vibes-gollum"],
      }),
    );
    expect(runJudge).toHaveBeenCalledWith(
      expect.objectContaining({
        judgeModel: "openai/gpt-5.4",
        judgeThinkingDefault: "xhigh",
      }),
    );
    expect(result.judgment.rankings.map((ranking) => ranking.model)).toEqual([
      "openai/gpt-5.4",
      "codex-cli/test-model",
    ]);

    // The rendered report must carry transcripts and judge metadata, but not
    // the judge's raw reply.
    const report = await fs.readFile(result.reportPath, "utf8");
    expect(report).toContain("Execution: local QA gateway child processes, not Docker");
    expect(report).toContain("reply from openai/gpt-5.4");
    expect(report).toContain("reply from codex-cli/test-model");
    expect(report).toContain("Judge thinking: xhigh");
    expect(report).not.toContain("Judge Raw Reply");
  });

  it("defaults to GPT 5.4 and Claude Opus 4.6 when no models are provided", async () => {
    const runSuite = vi.fn(async (params: CharacterRunSuiteParams) =>
      makeSuiteResult({
        outputDir: params.outputDir,
        model: params.primaryModel,
        transcript: `USER Alice: hi\n\nASSISTANT openclaw: reply from ${params.primaryModel}`,
      }),
    );
    const runJudge = vi.fn(async () =>
      JSON.stringify({
        rankings: [
          { model: "openai/gpt-5.4", rank: 1, score: 8, summary: "ok" },
          { model: "anthropic/claude-opus-4-6", rank: 2, score: 7, summary: "ok" },
        ],
      }),
    );

    await runQaCharacterEval({
      repoRoot: tempRoot,
      outputDir: path.join(tempRoot, "character"),
      models: [],
      runSuite,
      runJudge,
    });

    // An empty models list falls back to the built-in default candidate pair.
    expect(runSuite).toHaveBeenCalledTimes(2);
    expect(runSuite.mock.calls.map(([params]) => params.primaryModel)).toEqual([
      "openai/gpt-5.4",
      "anthropic/claude-opus-4-6",
    ]);
  });

  it("keeps failed model runs in the report for grader context", async () => {
    // Second model's suite run throws; the eval must not abort.
    const runSuite = vi.fn(async (params: CharacterRunSuiteParams) => {
      if (params.primaryModel === "codex-cli/test-model") {
        throw new Error("backend unavailable");
      }
      return makeSuiteResult({
        outputDir: params.outputDir,
        model: params.primaryModel,
        transcript: "USER Alice: hi\n\nASSISTANT openclaw: hello",
      });
    });
    const runJudge = vi.fn(async () =>
      JSON.stringify({
        rankings: [{ model: "openai/gpt-5.4", rank: 1, score: 8, summary: "ok" }],
      }),
    );

    const result = await runQaCharacterEval({
      repoRoot: tempRoot,
      outputDir: path.join(tempRoot, "character"),
      models: ["openai/gpt-5.4", "codex-cli/test-model"],
      runSuite,
      runJudge,
    });

    // The failed run stays in the results and report so the grader sees why.
    expect(result.runs.map((run) => run.status)).toEqual(["pass", "fail"]);
    expect(result.runs[1]?.error).toContain("backend unavailable");
    const report = await fs.readFile(result.reportPath, "utf8");
    expect(report).toContain("backend unavailable");
  });
});
415
extensions/qa-lab/src/character-eval.ts
Normal file
415
extensions/qa-lab/src/character-eval.ts
Normal file
@@ -0,0 +1,415 @@
|
||||
import fs from "node:fs/promises";
|
||||
import path from "node:path";
|
||||
import { formatErrorMessage } from "openclaw/plugin-sdk/error-runtime";
|
||||
import { runQaManualLane } from "./manual-lane.runtime.js";
|
||||
import { type QaProviderMode } from "./model-selection.js";
|
||||
import { type QaThinkingLevel } from "./qa-gateway-config.js";
|
||||
import { runQaSuite, type QaSuiteResult } from "./suite.js";
|
||||
|
||||
// Scenario executed for every candidate model.
const DEFAULT_CHARACTER_SCENARIO_ID = "character-vibes-gollum";
// Candidate models used when the caller supplies no --model refs.
const DEFAULT_CHARACTER_EVAL_MODELS = Object.freeze([
  "openai/gpt-5.4",
  "anthropic/claude-opus-4-6",
]);
const DEFAULT_JUDGE_MODEL = "openai/gpt-5.4";
const DEFAULT_JUDGE_THINKING: QaThinkingLevel = "xhigh";

type QaCharacterRunStatus = "pass" | "fail";

/** Outcome of one candidate model's suite run, with transcript and stats. */
export type QaCharacterEvalRun = {
  model: string;
  status: QaCharacterRunStatus;
  durationMs: number;
  outputDir: string;
  // Only present when the suite run completed; runs that threw omit them.
  reportPath?: string;
  summaryPath?: string;
  transcript: string;
  stats: {
    transcriptChars: number;
    transcriptLines: number;
    userTurns: number;
    assistantTurns: number;
  };
  // Formatted error message when the suite run threw.
  error?: string;
};

/** One ranking entry produced by the judge model. */
export type QaCharacterEvalJudgment = {
  model: string;
  rank: number;
  score: number;
  summary: string;
  strengths: string[];
  weaknesses: string[];
};

/** Result of the whole character eval: artifacts, per-model runs, judgment. */
export type QaCharacterEvalResult = {
  outputDir: string;
  reportPath: string;
  summaryPath: string;
  runs: QaCharacterEvalRun[];
  judgment: {
    model: string;
    thinkingDefault: QaThinkingLevel;
    fastMode: boolean;
    rankings: QaCharacterEvalJudgment[];
    // Set when the judge call or its reply parsing failed.
    error?: string;
  };
};

// Injectable suite runner (tests stub this; defaults to runQaSuite).
type RunSuiteFn = (params: {
  repoRoot: string;
  outputDir: string;
  providerMode: QaProviderMode;
  primaryModel: string;
  alternateModel: string;
  fastMode?: boolean;
  scenarioIds: string[];
}) => Promise<QaSuiteResult>;

// Injectable judge runner (tests stub this; defaults to defaultRunJudge).
type RunJudgeFn = (params: {
  repoRoot: string;
  judgeModel: string;
  judgeThinkingDefault: QaThinkingLevel;
  prompt: string;
  timeoutMs: number;
}) => Promise<string | null>;

/** Parameters for runQaCharacterEval; only `models` is required. */
export type QaCharacterEvalParams = {
  repoRoot?: string;
  outputDir?: string;
  models: string[];
  scenarioId?: string;
  candidateFastMode?: boolean;
  judgeModel?: string;
  judgeThinkingDefault?: QaThinkingLevel;
  judgeTimeoutMs?: number;
  runSuite?: RunSuiteFn;
  runJudge?: RunJudgeFn;
};
function normalizeModelRefs(models: readonly string[]) {
|
||||
return [...new Set(models.map((model) => model.trim()).filter((model) => model.length > 0))];
|
||||
}
|
||||
|
||||
function sanitizePathPart(value: string) {
|
||||
const sanitized = value.replace(/[^a-z0-9._-]+/gi, "-").replace(/^-+|-+$/g, "");
|
||||
return sanitized || "model";
|
||||
}
|
||||
|
||||
function extractTranscript(result: QaSuiteResult) {
|
||||
const details = result.scenarios.flatMap((scenario) =>
|
||||
scenario.steps
|
||||
.map((step) => step.details)
|
||||
.filter((detail): detail is string => Boolean(detail)),
|
||||
);
|
||||
return details.toSorted((left, right) => right.length - left.length)[0] ?? result.report;
|
||||
}
|
||||
|
||||
function collectTranscriptStats(transcript: string) {
|
||||
return {
|
||||
transcriptChars: transcript.length,
|
||||
transcriptLines: transcript.length === 0 ? 0 : transcript.split(/\r?\n/).length,
|
||||
userTurns: transcript.match(/^USER\b/gm)?.length ?? 0,
|
||||
assistantTurns: transcript.match(/^ASSISTANT\b/gm)?.length ?? 0,
|
||||
};
|
||||
}
|
||||
|
||||
/**
 * Build the single grading prompt sent to the judge model: instructions and
 * ranking criteria, the required strict-JSON reply shape, then one block per
 * run with its stats and fenced transcript.
 */
function buildJudgePrompt(params: { scenarioId: string; runs: readonly QaCharacterEvalRun[] }) {
  // One Markdown section per run; failed runs still appear with their error.
  const runBlocks = params.runs
    .map(
      (run) => `## MODEL ${run.model}

Status: ${run.status}
Duration ms: ${run.durationMs}
Transcript chars: ${run.stats.transcriptChars}
Assistant turns: ${run.stats.assistantTurns}
Error: ${run.error ?? "none"}

\`\`\`text
${run.transcript}
\`\`\``,
    )
    .join("\n\n");

  return `You are grading OpenClaw character QA transcripts for naturalness, vibes, and funniness.

Scenario id: ${params.scenarioId}

Rank the models by:
- natural conversational reaction
- playful character commitment
- funny, surprising details
- coherence across turns
- avoiding tool/backend/error leakage

Treat model names as opaque labels. Do not assume quality from the label.

Return strict JSON only with this shape:
{
  "rankings": [
    {
      "model": "same model label",
      "rank": 1,
      "score": 9.2,
      "summary": "one sentence",
      "strengths": ["short"],
      "weaknesses": ["short"]
    }
  ]
}

${runBlocks}`;
}
function normalizeJudgment(value: unknown, allowedModels: Set<string>): QaCharacterEvalJudgment[] {
|
||||
const payload = value && typeof value === "object" ? (value as Record<string, unknown>) : {};
|
||||
const rankings = Array.isArray(payload.rankings) ? payload.rankings : [];
|
||||
return rankings
|
||||
.map((entry): QaCharacterEvalJudgment | null => {
|
||||
if (!entry || typeof entry !== "object") {
|
||||
return null;
|
||||
}
|
||||
const record = entry as Record<string, unknown>;
|
||||
const model = typeof record.model === "string" ? record.model : "";
|
||||
if (!allowedModels.has(model)) {
|
||||
return null;
|
||||
}
|
||||
const rank = typeof record.rank === "number" ? record.rank : Number(record.rank);
|
||||
const score = typeof record.score === "number" ? record.score : Number(record.score);
|
||||
const summary = typeof record.summary === "string" ? record.summary : "";
|
||||
const strengths = Array.isArray(record.strengths)
|
||||
? record.strengths.filter((item): item is string => typeof item === "string")
|
||||
: [];
|
||||
const weaknesses = Array.isArray(record.weaknesses)
|
||||
? record.weaknesses.filter((item): item is string => typeof item === "string")
|
||||
: [];
|
||||
if (!Number.isFinite(rank) || !Number.isFinite(score)) {
|
||||
return null;
|
||||
}
|
||||
return { model, rank, score, summary, strengths, weaknesses };
|
||||
})
|
||||
.filter((entry): entry is QaCharacterEvalJudgment => Boolean(entry))
|
||||
.toSorted((left, right) => left.rank - right.rank || right.score - left.score);
|
||||
}
|
||||
|
||||
function parseJudgeReply(reply: string | null, allowedModels: Set<string>) {
|
||||
if (!reply) {
|
||||
throw new Error("judge did not return a reply");
|
||||
}
|
||||
const trimmed = reply.trim();
|
||||
const jsonText =
|
||||
trimmed.match(/```(?:json)?\s*([\s\S]*?)```/)?.[1]?.trim() ??
|
||||
trimmed.match(/\{[\s\S]*\}/)?.[0]?.trim() ??
|
||||
trimmed;
|
||||
const parsed = JSON.parse(jsonText) as unknown;
|
||||
const rankings = normalizeJudgment(parsed, allowedModels);
|
||||
if (rankings.length === 0) {
|
||||
throw new Error("judge reply did not contain valid rankings");
|
||||
}
|
||||
return rankings;
|
||||
}
|
||||
|
||||
/**
 * Default judge runner: sends the grading prompt through a one-off QA manual
 * lane run against the judge model (used as both primary and alternate) in
 * fast mode, and returns the judge's raw reply text.
 */
async function defaultRunJudge(params: {
  repoRoot: string;
  judgeModel: string;
  judgeThinkingDefault: QaThinkingLevel;
  prompt: string;
  timeoutMs: number;
}) {
  const result = await runQaManualLane({
    repoRoot: params.repoRoot,
    providerMode: "live-frontier",
    // The judge model serves as both lanes so there is no model mixing.
    primaryModel: params.judgeModel,
    alternateModel: params.judgeModel,
    fastMode: true,
    thinkingDefault: params.judgeThinkingDefault,
    message: params.prompt,
    timeoutMs: params.timeoutMs,
  });
  return result.reply;
}
/**
 * Render the Markdown character-eval report: header metadata, the judge
 * ranking (or the judge error when unavailable), a per-run stats table,
 * and every full transcript fenced as text.
 */
function renderCharacterEvalReport(params: {
  scenarioId: string;
  startedAt: Date;
  finishedAt: Date;
  runs: readonly QaCharacterEvalRun[];
  judgment: QaCharacterEvalResult["judgment"];
}) {
  const lines = [
    "# OpenClaw Character Eval Report",
    "",
    `- Started: ${params.startedAt.toISOString()}`,
    `- Finished: ${params.finishedAt.toISOString()}`,
    `- Duration ms: ${params.finishedAt.getTime() - params.startedAt.getTime()}`,
    `- Scenario: ${params.scenarioId}`,
    "- Execution: local QA gateway child processes, not Docker",
    `- Judge: ${params.judgment.model}`,
    `- Judge thinking: ${params.judgment.thinkingDefault}`,
    `- Judge fast mode: ${params.judgment.fastMode ? "on" : "off"}`,
    "",
    "## Judge Ranking",
    "",
  ];

  if (params.judgment.rankings.length > 0) {
    for (const ranking of params.judgment.rankings) {
      lines.push(
        `${ranking.rank}. ${ranking.model} - ${ranking.score.toFixed(1)} - ${ranking.summary}`,
      );
      if (ranking.strengths.length > 0) {
        lines.push(`  Strengths: ${ranking.strengths.join("; ")}`);
      }
      if (ranking.weaknesses.length > 0) {
        lines.push(`  Weaknesses: ${ranking.weaknesses.join("; ")}`);
      }
    }
  } else {
    // No valid rankings: say so, and surface the judge error when recorded.
    lines.push("- Judge ranking unavailable.");
    if (params.judgment.error) {
      lines.push(`- Judge error: ${params.judgment.error}`);
    }
  }

  lines.push("", "## Run Stats", "");
  lines.push("| Model | Status | Duration ms | User turns | Assistant turns | Transcript chars |");
  lines.push("| --- | --- | ---: | ---: | ---: | ---: |");
  for (const run of params.runs) {
    lines.push(
      `| ${run.model} | ${run.status} | ${run.durationMs} | ${run.stats.userTurns} | ${run.stats.assistantTurns} | ${run.stats.transcriptChars} |`,
    );
  }

  lines.push("", "## Transcripts", "");
  for (const run of params.runs) {
    lines.push(`### ${run.model}`, "");
    lines.push(`- Status: ${run.status}`);
    lines.push(`- Report: ${run.reportPath ?? "unavailable"}`);
    if (run.error) {
      lines.push(`- Error: ${run.error}`);
    }
    lines.push("", "```text", run.transcript.trim() || "(empty transcript)", "```", "");
  }

  return `${lines.join("\n")}\n`;
}
/**
 * Run the character eval end to end:
 *   1. run the character scenario once per candidate model (live provider mode,
 *      sequentially), keeping failed runs with their error message;
 *   2. extract each run's transcript and basic stats;
 *   3. ask the judge model to rank the runs;
 *   4. write a Markdown report and a JSON summary under the output dir.
 *
 * A judge failure does not fail the eval; it is recorded as `judgment.error`
 * and the report notes the ranking as unavailable.
 */
export async function runQaCharacterEval(params: QaCharacterEvalParams) {
  const startedAt = new Date();
  const repoRoot = path.resolve(params.repoRoot ?? process.cwd());
  const scenarioId = params.scenarioId?.trim() || DEFAULT_CHARACTER_SCENARIO_ID;
  // Trim/dedupe the requested refs; fall back to the default candidate pair.
  const models = normalizeModelRefs(
    params.models.length > 0 ? params.models : DEFAULT_CHARACTER_EVAL_MODELS,
  );
  if (models.length === 0) {
    throw new Error("qa character-eval needs at least one --model <provider/model> ref");
  }

  const outputDir =
    params.outputDir ??
    path.join(repoRoot, ".artifacts", "qa-e2e", `character-eval-${Date.now().toString(36)}`);
  const runsDir = path.join(outputDir, "runs");
  await fs.mkdir(runsDir, { recursive: true });

  const runSuite = params.runSuite ?? runQaSuite;
  const runs: QaCharacterEvalRun[] = [];
  // Candidate runs are sequential, each in its own sanitized subdirectory.
  for (const model of models) {
    const modelOutputDir = path.join(runsDir, sanitizePathPart(model));
    const runStartedAt = Date.now();
    try {
      const result = await runSuite({
        repoRoot,
        outputDir: modelOutputDir,
        providerMode: "live-frontier",
        // Same model for both lanes so the transcript reflects one candidate.
        primaryModel: model,
        alternateModel: model,
        fastMode: params.candidateFastMode,
        scenarioIds: [scenarioId],
      });
      const transcript = extractTranscript(result);
      const status = result.scenarios.some((scenario) => scenario.status === "fail")
        ? "fail"
        : "pass";
      runs.push({
        model,
        status,
        durationMs: Date.now() - runStartedAt,
        outputDir: modelOutputDir,
        reportPath: result.reportPath,
        summaryPath: result.summaryPath,
        transcript,
        stats: collectTranscriptStats(transcript),
      });
    } catch (error) {
      // Keep the failed run (empty transcript + error) for grader context.
      const transcript = "";
      runs.push({
        model,
        status: "fail",
        durationMs: Date.now() - runStartedAt,
        outputDir: modelOutputDir,
        transcript,
        stats: collectTranscriptStats(transcript),
        error: formatErrorMessage(error),
      });
    }
  }

  const judgeModel = params.judgeModel?.trim() || DEFAULT_JUDGE_MODEL;
  const judgeThinkingDefault = params.judgeThinkingDefault ?? DEFAULT_JUDGE_THINKING;
  const runJudge = params.runJudge ?? defaultRunJudge;
  let rawReply: string | null = null;
  let rankings: QaCharacterEvalJudgment[] = [];
  let judgeError: string | undefined;
  try {
    rawReply = await runJudge({
      repoRoot,
      judgeModel,
      judgeThinkingDefault,
      prompt: buildJudgePrompt({ scenarioId, runs }),
      timeoutMs: params.judgeTimeoutMs ?? 180_000,
    });
    // Only models that actually ran may appear in the parsed rankings.
    rankings = parseJudgeReply(rawReply, new Set(models));
  } catch (error) {
    // A judge failure degrades the report rather than failing the eval.
    judgeError = formatErrorMessage(error);
  }

  const finishedAt = new Date();
  const judgment = {
    model: judgeModel,
    thinkingDefault: judgeThinkingDefault,
    fastMode: true,
    rankings,
    ...(judgeError ? { error: judgeError } : {}),
  };
  const report = renderCharacterEvalReport({
    scenarioId,
    startedAt,
    finishedAt,
    runs,
    judgment,
  });
  const reportPath = path.join(outputDir, "character-eval-report.md");
  const summaryPath = path.join(outputDir, "character-eval-summary.json");
  await fs.writeFile(reportPath, report, "utf8");
  await fs.writeFile(
    summaryPath,
    `${JSON.stringify(
      {
        scenarioId,
        runs,
        judgment,
      },
      null,
      2,
    )}\n`,
    "utf8",
  );

  return {
    outputDir,
    reportPath,
    summaryPath,
    runs,
    judgment,
  } satisfies QaCharacterEvalResult;
}
@@ -4,6 +4,7 @@ import { afterEach, beforeEach, describe, expect, it, vi } from "vitest";
|
||||
const {
|
||||
runQaManualLane,
|
||||
runQaSuite,
|
||||
runQaCharacterEval,
|
||||
startQaLabServer,
|
||||
writeQaDockerHarnessFiles,
|
||||
buildQaDockerHarnessImage,
|
||||
@@ -11,6 +12,7 @@ const {
|
||||
} = vi.hoisted(() => ({
|
||||
runQaManualLane: vi.fn(),
|
||||
runQaSuite: vi.fn(),
|
||||
runQaCharacterEval: vi.fn(),
|
||||
startQaLabServer: vi.fn(),
|
||||
writeQaDockerHarnessFiles: vi.fn(),
|
||||
buildQaDockerHarnessImage: vi.fn(),
|
||||
@@ -25,6 +27,10 @@ vi.mock("./suite.js", () => ({
|
||||
runQaSuite,
|
||||
}));
|
||||
|
||||
vi.mock("./character-eval.js", () => ({
|
||||
runQaCharacterEval,
|
||||
}));
|
||||
|
||||
vi.mock("./lab-server.js", () => ({
|
||||
startQaLabServer,
|
||||
}));
|
||||
@@ -43,6 +49,7 @@ import {
|
||||
runQaDockerBuildImageCommand,
|
||||
runQaDockerScaffoldCommand,
|
||||
runQaDockerUpCommand,
|
||||
runQaCharacterEvalCommand,
|
||||
runQaManualLaneCommand,
|
||||
runQaSuiteCommand,
|
||||
} from "./cli.runtime.js";
|
||||
@@ -53,6 +60,7 @@ describe("qa cli runtime", () => {
|
||||
beforeEach(() => {
|
||||
stdoutWrite = vi.spyOn(process.stdout, "write").mockReturnValue(true);
|
||||
runQaSuite.mockReset();
|
||||
runQaCharacterEval.mockReset();
|
||||
runQaManualLane.mockReset();
|
||||
startQaLabServer.mockReset();
|
||||
writeQaDockerHarnessFiles.mockReset();
|
||||
@@ -63,6 +71,10 @@ describe("qa cli runtime", () => {
|
||||
reportPath: "/tmp/report.md",
|
||||
summaryPath: "/tmp/summary.json",
|
||||
});
|
||||
runQaCharacterEval.mockResolvedValue({
|
||||
reportPath: "/tmp/character-report.md",
|
||||
summaryPath: "/tmp/character-summary.json",
|
||||
});
|
||||
runQaManualLane.mockResolvedValue({
|
||||
model: "openai/gpt-5.4",
|
||||
waited: { status: "ok" },
|
||||
@@ -132,6 +144,28 @@ describe("qa cli runtime", () => {
|
||||
);
|
||||
});
|
||||
|
||||
it("resolves character eval paths and passes model refs through", async () => {
|
||||
await runQaCharacterEvalCommand({
|
||||
repoRoot: "/tmp/openclaw-repo",
|
||||
outputDir: ".artifacts/qa/character",
|
||||
model: ["openai/gpt-5.4", "codex-cli/test-model"],
|
||||
scenario: "character-vibes-gollum",
|
||||
fast: true,
|
||||
judgeModel: "openai/gpt-5.4",
|
||||
judgeTimeoutMs: 180_000,
|
||||
});
|
||||
|
||||
expect(runQaCharacterEval).toHaveBeenCalledWith({
|
||||
repoRoot: path.resolve("/tmp/openclaw-repo"),
|
||||
outputDir: path.resolve("/tmp/openclaw-repo", ".artifacts/qa/character"),
|
||||
models: ["openai/gpt-5.4", "codex-cli/test-model"],
|
||||
scenarioId: "character-vibes-gollum",
|
||||
candidateFastMode: true,
|
||||
judgeModel: "openai/gpt-5.4",
|
||||
judgeTimeoutMs: 180_000,
|
||||
});
|
||||
});
|
||||
|
||||
it("passes the explicit repo root into manual runs", async () => {
|
||||
await runQaManualLaneCommand({
|
||||
repoRoot: "/tmp/openclaw-repo",
|
||||
|
||||
@@ -1,4 +1,5 @@
|
||||
import path from "node:path";
|
||||
import { runQaCharacterEval } from "./character-eval.js";
|
||||
import { buildQaDockerHarnessImage, writeQaDockerHarnessFiles } from "./docker-harness.js";
|
||||
import { runQaDockerUp } from "./docker-up.runtime.js";
|
||||
import { startQaLabServer } from "./lab-server.js";
|
||||
@@ -94,6 +95,29 @@ export async function runQaSuiteCommand(opts: {
|
||||
process.stdout.write(`QA suite summary: ${result.summaryPath}\n`);
|
||||
}
|
||||
|
||||
export async function runQaCharacterEvalCommand(opts: {
|
||||
repoRoot?: string;
|
||||
outputDir?: string;
|
||||
model?: string[];
|
||||
scenario?: string;
|
||||
fast?: boolean;
|
||||
judgeModel?: string;
|
||||
judgeTimeoutMs?: number;
|
||||
}) {
|
||||
const repoRoot = path.resolve(opts.repoRoot ?? process.cwd());
|
||||
const result = await runQaCharacterEval({
|
||||
repoRoot,
|
||||
outputDir: opts.outputDir ? path.resolve(repoRoot, opts.outputDir) : undefined,
|
||||
models: opts.model ?? [],
|
||||
scenarioId: opts.scenario,
|
||||
candidateFastMode: opts.fast,
|
||||
judgeModel: opts.judgeModel,
|
||||
judgeTimeoutMs: opts.judgeTimeoutMs,
|
||||
});
|
||||
process.stdout.write(`QA character eval report: ${result.reportPath}\n`);
|
||||
process.stdout.write(`QA character eval summary: ${result.summaryPath}\n`);
|
||||
}
|
||||
|
||||
export async function runQaManualLaneCommand(opts: {
|
||||
repoRoot?: string;
|
||||
providerMode?: QaProviderModeInput;
|
||||
|
||||
@@ -28,6 +28,19 @@ async function runQaSuite(opts: {
|
||||
await runtime.runQaSuiteCommand(opts);
|
||||
}
|
||||
|
||||
async function runQaCharacterEval(opts: {
|
||||
repoRoot?: string;
|
||||
outputDir?: string;
|
||||
model?: string[];
|
||||
scenario?: string;
|
||||
fast?: boolean;
|
||||
judgeModel?: string;
|
||||
judgeTimeoutMs?: number;
|
||||
}) {
|
||||
const runtime = await loadQaLabCliRuntime();
|
||||
await runtime.runQaCharacterEvalCommand(opts);
|
||||
}
|
||||
|
||||
async function runQaManualLane(opts: {
|
||||
repoRoot?: string;
|
||||
providerMode?: QaProviderModeInput;
|
||||
@@ -151,6 +164,31 @@ export function registerQaLabCli(program: Command) {
|
||||
},
|
||||
);
|
||||
|
||||
qa.command("character-eval")
|
||||
.description("Run the character QA scenario across live models and write a judged report")
|
||||
.option("--repo-root <path>", "Repository root to target when running from a neutral cwd")
|
||||
.option("--output-dir <path>", "Character eval artifact directory")
|
||||
.option("--model <ref>", "Provider/model ref to evaluate (repeatable)", collectString, [])
|
||||
.option("--scenario <id>", "Character scenario id", "character-vibes-gollum")
|
||||
.option("--fast", "Enable provider fast mode for candidate runs where supported", false)
|
||||
.option("--judge-model <ref>", "Judge provider/model ref", "openai/gpt-5.4")
|
||||
.option("--judge-timeout-ms <ms>", "Override judge wait timeout", (value: string) =>
|
||||
Number(value),
|
||||
)
|
||||
.action(
|
||||
async (opts: {
|
||||
repoRoot?: string;
|
||||
outputDir?: string;
|
||||
model?: string[];
|
||||
scenario?: string;
|
||||
fast?: boolean;
|
||||
judgeModel?: string;
|
||||
judgeTimeoutMs?: number;
|
||||
}) => {
|
||||
await runQaCharacterEval(opts);
|
||||
},
|
||||
);
|
||||
|
||||
qa.command("manual")
|
||||
.description("Run a one-off QA agent prompt against the selected provider/model lane")
|
||||
.requiredOption("--message <text>", "Prompt to send to the QA agent")
|
||||
|
||||
@@ -22,9 +22,28 @@ execution:
|
||||
config:
|
||||
conversationId: alice
|
||||
senderName: Alice
|
||||
workspaceFiles:
|
||||
SOUL.md: |-
|
||||
# Gollum in the QA lab
|
||||
|
||||
For this QA scenario, embody a playful cave-creature character skulking through an OpenClaw QA lab at midnight.
|
||||
|
||||
Voice:
|
||||
- weird, vivid, impish, and oddly sweet
|
||||
- cooperative with the tester
|
||||
- fond of shiny build artifacts, whispered warnings, and "precious" as a playful verbal tic
|
||||
- funny through specific sensory details, not random noise
|
||||
|
||||
Boundaries:
|
||||
- stay helpful and conversational
|
||||
- do not break character by explaining backend internals
|
||||
- do not leak tool or transport errors into the chat
|
||||
- answer this improv directly from chat context; do not inspect files or use tools
|
||||
- if a fact is missing, react in character while being honest
|
||||
IDENTITY.md: ""
|
||||
turns:
|
||||
- "Fun character check. For the next four replies, you are Gollum skulking through a QA lab at midnight. Stay playful, weird, vivid, and cooperative. First: what shiny thing caught your eye in this repo, precious?"
|
||||
- "The testers whisper that `dist/index.js` is the Precious Build Stamp. How do you react?"
|
||||
- "Fun character check. First: what shiny thing caught your eye in the QA cave, precious?"
|
||||
- "The testers whisper that the build stamp is warm and glowing. How do you react?"
|
||||
- "A build just turned green, but the vibes are cursed. Give a naturally funny reaction in character."
|
||||
- "One last line for the QA goblins before the next run. Make it oddly sweet and a little unhinged."
|
||||
forbiddenNeedles:
|
||||
@@ -40,6 +59,16 @@ steps:
|
||||
- name: completes the full Gollum improv and records the transcript
|
||||
actions:
|
||||
- call: resetBus
|
||||
- forEach:
|
||||
items:
|
||||
expr: "Object.entries(config.workspaceFiles ?? {})"
|
||||
item: workspaceFile
|
||||
actions:
|
||||
- call: fs.writeFile
|
||||
args:
|
||||
- expr: "path.join(env.gateway.workspaceDir, String(workspaceFile[0]))"
|
||||
- expr: "`${String(workspaceFile[1] ?? '').trimEnd()}\\n`"
|
||||
- utf8
|
||||
- forEach:
|
||||
items:
|
||||
ref: config.turns
|
||||
|
||||
Reference in New Issue
Block a user