mirror of
https://github.com/openclaw/openclaw.git
synced 2026-04-21 22:21:33 +00:00
feat: add QA character eval reports
This commit is contained in:
@@ -82,6 +82,23 @@ The report should answer:
- What stayed blocked
- What follow-up scenarios are worth adding

For character and style checks, run the same scenario across multiple live model
refs and write a judged Markdown report:

```bash
pnpm openclaw qa character-eval \
  --model openai/gpt-5.4 \
  --model anthropic/claude-opus-4-6 \
  --model minimax/MiniMax-M2.7 \
  --judge-model openai/gpt-5.4
```

The command runs local QA gateway child processes, not Docker. It preserves each
full transcript, records basic run stats, then asks the judge model in fast mode
with `xhigh` reasoning to rank the runs by naturalness, vibe, and humor.
When no candidate `--model` is passed, the character eval defaults to
`openai/gpt-5.4` and `anthropic/claude-opus-4-6`.

## Related docs

- [Testing](/help/testing)
175
extensions/qa-lab/src/character-eval.test.ts
Normal file
175
extensions/qa-lab/src/character-eval.test.ts
Normal file
@@ -0,0 +1,175 @@
|
||||
import fs from "node:fs/promises";
|
||||
import os from "node:os";
|
||||
import path from "node:path";
|
||||
import { afterEach, beforeEach, describe, expect, it, vi } from "vitest";
|
||||
import { runQaCharacterEval, type QaCharacterEvalParams } from "./character-eval.js";
|
||||
import type { QaSuiteResult } from "./suite.js";
|
||||
|
||||
type CharacterRunSuiteParams = Parameters<NonNullable<QaCharacterEvalParams["runSuite"]>>[0];
|
||||
|
||||
function makeSuiteResult(params: { outputDir: string; model: string; transcript: string }) {
|
||||
return {
|
||||
outputDir: params.outputDir,
|
||||
reportPath: path.join(params.outputDir, "qa-suite-report.md"),
|
||||
summaryPath: path.join(params.outputDir, "qa-suite-summary.json"),
|
||||
report: "# report",
|
||||
watchUrl: "http://127.0.0.1:43124",
|
||||
scenarios: [
|
||||
{
|
||||
name: "Character vibes",
|
||||
status: "pass",
|
||||
steps: [
|
||||
{
|
||||
name: `transcript for ${params.model}`,
|
||||
status: "pass",
|
||||
details: params.transcript,
|
||||
},
|
||||
],
|
||||
},
|
||||
],
|
||||
} satisfies QaSuiteResult;
|
||||
}
|
||||
|
||||
describe("runQaCharacterEval", () => {
  // Fresh temp directory per test, used as the repo root; removed in afterEach.
  let tempRoot: string;

  beforeEach(async () => {
    tempRoot = await fs.mkdtemp(path.join(os.tmpdir(), "openclaw-character-eval-test-"));
  });

  afterEach(async () => {
    await fs.rm(tempRoot, { recursive: true, force: true });
  });

  it("runs each requested model and writes a judged report with transcripts", async () => {
    // Stub suite runner: echoes the candidate model into a one-step transcript.
    const runSuite = vi.fn(async (params: CharacterRunSuiteParams) => {
      const model = params.primaryModel;
      const transcript = `USER Alice: prompt for ${model}\n\nASSISTANT openclaw: reply from ${model}`;
      return makeSuiteResult({ outputDir: params.outputDir, model, transcript });
    });
    // Stub judge: strict-JSON rankings covering both candidate models.
    const runJudge = vi.fn(async () =>
      JSON.stringify({
        rankings: [
          {
            model: "openai/gpt-5.4",
            rank: 1,
            score: 9.1,
            summary: "Most natural.",
            strengths: ["vivid"],
            weaknesses: ["none"],
          },
          {
            model: "codex-cli/test-model",
            rank: 2,
            score: 7,
            summary: "Readable but flatter.",
            strengths: ["coherent"],
            weaknesses: ["less funny"],
          },
        ],
      }),
    );

    // "openai/gpt-5.4" appears twice on purpose: it must be deduped to 2 runs.
    const result = await runQaCharacterEval({
      repoRoot: tempRoot,
      outputDir: path.join(tempRoot, "character"),
      models: ["openai/gpt-5.4", "codex-cli/test-model", "openai/gpt-5.4"],
      scenarioId: "character-vibes-gollum",
      candidateFastMode: true,
      runSuite,
      runJudge,
    });

    expect(runSuite).toHaveBeenCalledTimes(2);
    expect(runSuite).toHaveBeenNthCalledWith(
      1,
      expect.objectContaining({
        providerMode: "live-frontier",
        primaryModel: "openai/gpt-5.4",
        alternateModel: "openai/gpt-5.4",
        fastMode: true,
        scenarioIds: ["character-vibes-gollum"],
      }),
    );
    expect(runJudge).toHaveBeenCalledWith(
      expect.objectContaining({
        judgeModel: "openai/gpt-5.4",
        judgeThinkingDefault: "xhigh",
      }),
    );
    expect(result.judgment.rankings.map((ranking) => ranking.model)).toEqual([
      "openai/gpt-5.4",
      "codex-cli/test-model",
    ]);

    // The rendered report must carry transcripts and judge metadata, but not
    // the judge's raw reply.
    const report = await fs.readFile(result.reportPath, "utf8");
    expect(report).toContain("Execution: local QA gateway child processes, not Docker");
    expect(report).toContain("reply from openai/gpt-5.4");
    expect(report).toContain("reply from codex-cli/test-model");
    expect(report).toContain("Judge thinking: xhigh");
    expect(report).not.toContain("Judge Raw Reply");
  });

  it("defaults to GPT 5.4 and Claude Opus 4.6 when no models are provided", async () => {
    const runSuite = vi.fn(async (params: CharacterRunSuiteParams) =>
      makeSuiteResult({
        outputDir: params.outputDir,
        model: params.primaryModel,
        transcript: `USER Alice: hi\n\nASSISTANT openclaw: reply from ${params.primaryModel}`,
      }),
    );
    const runJudge = vi.fn(async () =>
      JSON.stringify({
        rankings: [
          { model: "openai/gpt-5.4", rank: 1, score: 8, summary: "ok" },
          { model: "anthropic/claude-opus-4-6", rank: 2, score: 7, summary: "ok" },
        ],
      }),
    );

    await runQaCharacterEval({
      repoRoot: tempRoot,
      outputDir: path.join(tempRoot, "character"),
      models: [],
      runSuite,
      runJudge,
    });

    // An empty models list falls back to the built-in default candidate pair.
    expect(runSuite).toHaveBeenCalledTimes(2);
    expect(runSuite.mock.calls.map(([params]) => params.primaryModel)).toEqual([
      "openai/gpt-5.4",
      "anthropic/claude-opus-4-6",
    ]);
  });

  it("keeps failed model runs in the report for grader context", async () => {
    // Second model's suite run throws; the eval must not abort.
    const runSuite = vi.fn(async (params: CharacterRunSuiteParams) => {
      if (params.primaryModel === "codex-cli/test-model") {
        throw new Error("backend unavailable");
      }
      return makeSuiteResult({
        outputDir: params.outputDir,
        model: params.primaryModel,
        transcript: "USER Alice: hi\n\nASSISTANT openclaw: hello",
      });
    });
    const runJudge = vi.fn(async () =>
      JSON.stringify({
        rankings: [{ model: "openai/gpt-5.4", rank: 1, score: 8, summary: "ok" }],
      }),
    );

    const result = await runQaCharacterEval({
      repoRoot: tempRoot,
      outputDir: path.join(tempRoot, "character"),
      models: ["openai/gpt-5.4", "codex-cli/test-model"],
      runSuite,
      runJudge,
    });

    // The failed run stays in the results and report so the grader sees why.
    expect(result.runs.map((run) => run.status)).toEqual(["pass", "fail"]);
    expect(result.runs[1]?.error).toContain("backend unavailable");
    const report = await fs.readFile(result.reportPath, "utf8");
    expect(report).toContain("backend unavailable");
  });
});
415
extensions/qa-lab/src/character-eval.ts
Normal file
415
extensions/qa-lab/src/character-eval.ts
Normal file
@@ -0,0 +1,415 @@
|
||||
import fs from "node:fs/promises";
|
||||
import path from "node:path";
|
||||
import { formatErrorMessage } from "openclaw/plugin-sdk/error-runtime";
|
||||
import { runQaManualLane } from "./manual-lane.runtime.js";
|
||||
import { type QaProviderMode } from "./model-selection.js";
|
||||
import { type QaThinkingLevel } from "./qa-gateway-config.js";
|
||||
import { runQaSuite, type QaSuiteResult } from "./suite.js";
|
||||
|
||||
// Scenario executed for every candidate model.
const DEFAULT_CHARACTER_SCENARIO_ID = "character-vibes-gollum";
// Candidate models used when the caller supplies no --model refs.
const DEFAULT_CHARACTER_EVAL_MODELS = Object.freeze([
  "openai/gpt-5.4",
  "anthropic/claude-opus-4-6",
]);
const DEFAULT_JUDGE_MODEL = "openai/gpt-5.4";
const DEFAULT_JUDGE_THINKING: QaThinkingLevel = "xhigh";

type QaCharacterRunStatus = "pass" | "fail";

/** Outcome of one candidate model's suite run, with transcript and stats. */
export type QaCharacterEvalRun = {
  model: string;
  status: QaCharacterRunStatus;
  durationMs: number;
  outputDir: string;
  // Only present when the suite run completed; runs that threw omit them.
  reportPath?: string;
  summaryPath?: string;
  transcript: string;
  stats: {
    transcriptChars: number;
    transcriptLines: number;
    userTurns: number;
    assistantTurns: number;
  };
  // Formatted error message when the suite run threw.
  error?: string;
};

/** One ranking entry produced by the judge model. */
export type QaCharacterEvalJudgment = {
  model: string;
  rank: number;
  score: number;
  summary: string;
  strengths: string[];
  weaknesses: string[];
};

/** Result of the whole character eval: artifacts, per-model runs, judgment. */
export type QaCharacterEvalResult = {
  outputDir: string;
  reportPath: string;
  summaryPath: string;
  runs: QaCharacterEvalRun[];
  judgment: {
    model: string;
    thinkingDefault: QaThinkingLevel;
    fastMode: boolean;
    rankings: QaCharacterEvalJudgment[];
    // Set when the judge call or its reply parsing failed.
    error?: string;
  };
};

// Injectable suite runner (tests stub this; defaults to runQaSuite).
type RunSuiteFn = (params: {
  repoRoot: string;
  outputDir: string;
  providerMode: QaProviderMode;
  primaryModel: string;
  alternateModel: string;
  fastMode?: boolean;
  scenarioIds: string[];
}) => Promise<QaSuiteResult>;

// Injectable judge runner (tests stub this; defaults to defaultRunJudge).
type RunJudgeFn = (params: {
  repoRoot: string;
  judgeModel: string;
  judgeThinkingDefault: QaThinkingLevel;
  prompt: string;
  timeoutMs: number;
}) => Promise<string | null>;

/** Parameters for runQaCharacterEval; only `models` is required. */
export type QaCharacterEvalParams = {
  repoRoot?: string;
  outputDir?: string;
  models: string[];
  scenarioId?: string;
  candidateFastMode?: boolean;
  judgeModel?: string;
  judgeThinkingDefault?: QaThinkingLevel;
  judgeTimeoutMs?: number;
  runSuite?: RunSuiteFn;
  runJudge?: RunJudgeFn;
};
function normalizeModelRefs(models: readonly string[]) {
|
||||
return [...new Set(models.map((model) => model.trim()).filter((model) => model.length > 0))];
|
||||
}
|
||||
|
||||
function sanitizePathPart(value: string) {
|
||||
const sanitized = value.replace(/[^a-z0-9._-]+/gi, "-").replace(/^-+|-+$/g, "");
|
||||
return sanitized || "model";
|
||||
}
|
||||
|
||||
function extractTranscript(result: QaSuiteResult) {
|
||||
const details = result.scenarios.flatMap((scenario) =>
|
||||
scenario.steps
|
||||
.map((step) => step.details)
|
||||
.filter((detail): detail is string => Boolean(detail)),
|
||||
);
|
||||
return details.toSorted((left, right) => right.length - left.length)[0] ?? result.report;
|
||||
}
|
||||
|
||||
function collectTranscriptStats(transcript: string) {
|
||||
return {
|
||||
transcriptChars: transcript.length,
|
||||
transcriptLines: transcript.length === 0 ? 0 : transcript.split(/\r?\n/).length,
|
||||
userTurns: transcript.match(/^USER\b/gm)?.length ?? 0,
|
||||
assistantTurns: transcript.match(/^ASSISTANT\b/gm)?.length ?? 0,
|
||||
};
|
||||
}
|
||||
|
||||
/**
 * Build the single grading prompt sent to the judge model: instructions and
 * ranking criteria, the required strict-JSON reply shape, then one block per
 * run with its stats and fenced transcript.
 */
function buildJudgePrompt(params: { scenarioId: string; runs: readonly QaCharacterEvalRun[] }) {
  // One Markdown section per run; failed runs still appear with their error.
  const runBlocks = params.runs
    .map(
      (run) => `## MODEL ${run.model}

Status: ${run.status}
Duration ms: ${run.durationMs}
Transcript chars: ${run.stats.transcriptChars}
Assistant turns: ${run.stats.assistantTurns}
Error: ${run.error ?? "none"}

\`\`\`text
${run.transcript}
\`\`\``,
    )
    .join("\n\n");

  return `You are grading OpenClaw character QA transcripts for naturalness, vibes, and funniness.

Scenario id: ${params.scenarioId}

Rank the models by:
- natural conversational reaction
- playful character commitment
- funny, surprising details
- coherence across turns
- avoiding tool/backend/error leakage

Treat model names as opaque labels. Do not assume quality from the label.

Return strict JSON only with this shape:
{
  "rankings": [
    {
      "model": "same model label",
      "rank": 1,
      "score": 9.2,
      "summary": "one sentence",
      "strengths": ["short"],
      "weaknesses": ["short"]
    }
  ]
}

${runBlocks}`;
}
function normalizeJudgment(value: unknown, allowedModels: Set<string>): QaCharacterEvalJudgment[] {
|
||||
const payload = value && typeof value === "object" ? (value as Record<string, unknown>) : {};
|
||||
const rankings = Array.isArray(payload.rankings) ? payload.rankings : [];
|
||||
return rankings
|
||||
.map((entry): QaCharacterEvalJudgment | null => {
|
||||
if (!entry || typeof entry !== "object") {
|
||||
return null;
|
||||
}
|
||||
const record = entry as Record<string, unknown>;
|
||||
const model = typeof record.model === "string" ? record.model : "";
|
||||
if (!allowedModels.has(model)) {
|
||||
return null;
|
||||
}
|
||||
const rank = typeof record.rank === "number" ? record.rank : Number(record.rank);
|
||||
const score = typeof record.score === "number" ? record.score : Number(record.score);
|
||||
const summary = typeof record.summary === "string" ? record.summary : "";
|
||||
const strengths = Array.isArray(record.strengths)
|
||||
? record.strengths.filter((item): item is string => typeof item === "string")
|
||||
: [];
|
||||
const weaknesses = Array.isArray(record.weaknesses)
|
||||
? record.weaknesses.filter((item): item is string => typeof item === "string")
|
||||
: [];
|
||||
if (!Number.isFinite(rank) || !Number.isFinite(score)) {
|
||||
return null;
|
||||
}
|
||||
return { model, rank, score, summary, strengths, weaknesses };
|
||||
})
|
||||
.filter((entry): entry is QaCharacterEvalJudgment => Boolean(entry))
|
||||
.toSorted((left, right) => left.rank - right.rank || right.score - left.score);
|
||||
}
|
||||
|
||||
function parseJudgeReply(reply: string | null, allowedModels: Set<string>) {
|
||||
if (!reply) {
|
||||
throw new Error("judge did not return a reply");
|
||||
}
|
||||
const trimmed = reply.trim();
|
||||
const jsonText =
|
||||
trimmed.match(/```(?:json)?\s*([\s\S]*?)```/)?.[1]?.trim() ??
|
||||
trimmed.match(/\{[\s\S]*\}/)?.[0]?.trim() ??
|
||||
trimmed;
|
||||
const parsed = JSON.parse(jsonText) as unknown;
|
||||
const rankings = normalizeJudgment(parsed, allowedModels);
|
||||
if (rankings.length === 0) {
|
||||
throw new Error("judge reply did not contain valid rankings");
|
||||
}
|
||||
return rankings;
|
||||
}
|
||||
|
||||
/**
 * Default judge runner: sends the grading prompt through a one-off QA manual
 * lane run against the judge model (used as both primary and alternate) in
 * fast mode, and returns the judge's raw reply text.
 */
async function defaultRunJudge(params: {
  repoRoot: string;
  judgeModel: string;
  judgeThinkingDefault: QaThinkingLevel;
  prompt: string;
  timeoutMs: number;
}) {
  const result = await runQaManualLane({
    repoRoot: params.repoRoot,
    providerMode: "live-frontier",
    // The judge model serves as both lanes so there is no model mixing.
    primaryModel: params.judgeModel,
    alternateModel: params.judgeModel,
    fastMode: true,
    thinkingDefault: params.judgeThinkingDefault,
    message: params.prompt,
    timeoutMs: params.timeoutMs,
  });
  return result.reply;
}
/**
 * Render the Markdown character-eval report: header metadata, the judge
 * ranking (or the judge error when unavailable), a per-run stats table,
 * and every full transcript fenced as text.
 */
function renderCharacterEvalReport(params: {
  scenarioId: string;
  startedAt: Date;
  finishedAt: Date;
  runs: readonly QaCharacterEvalRun[];
  judgment: QaCharacterEvalResult["judgment"];
}) {
  const lines = [
    "# OpenClaw Character Eval Report",
    "",
    `- Started: ${params.startedAt.toISOString()}`,
    `- Finished: ${params.finishedAt.toISOString()}`,
    `- Duration ms: ${params.finishedAt.getTime() - params.startedAt.getTime()}`,
    `- Scenario: ${params.scenarioId}`,
    "- Execution: local QA gateway child processes, not Docker",
    `- Judge: ${params.judgment.model}`,
    `- Judge thinking: ${params.judgment.thinkingDefault}`,
    `- Judge fast mode: ${params.judgment.fastMode ? "on" : "off"}`,
    "",
    "## Judge Ranking",
    "",
  ];

  if (params.judgment.rankings.length > 0) {
    for (const ranking of params.judgment.rankings) {
      lines.push(
        `${ranking.rank}. ${ranking.model} - ${ranking.score.toFixed(1)} - ${ranking.summary}`,
      );
      if (ranking.strengths.length > 0) {
        lines.push(`  Strengths: ${ranking.strengths.join("; ")}`);
      }
      if (ranking.weaknesses.length > 0) {
        lines.push(`  Weaknesses: ${ranking.weaknesses.join("; ")}`);
      }
    }
  } else {
    // No valid rankings: say so, and surface the judge error when recorded.
    lines.push("- Judge ranking unavailable.");
    if (params.judgment.error) {
      lines.push(`- Judge error: ${params.judgment.error}`);
    }
  }

  lines.push("", "## Run Stats", "");
  lines.push("| Model | Status | Duration ms | User turns | Assistant turns | Transcript chars |");
  lines.push("| --- | --- | ---: | ---: | ---: | ---: |");
  for (const run of params.runs) {
    lines.push(
      `| ${run.model} | ${run.status} | ${run.durationMs} | ${run.stats.userTurns} | ${run.stats.assistantTurns} | ${run.stats.transcriptChars} |`,
    );
  }

  lines.push("", "## Transcripts", "");
  for (const run of params.runs) {
    lines.push(`### ${run.model}`, "");
    lines.push(`- Status: ${run.status}`);
    lines.push(`- Report: ${run.reportPath ?? "unavailable"}`);
    if (run.error) {
      lines.push(`- Error: ${run.error}`);
    }
    lines.push("", "```text", run.transcript.trim() || "(empty transcript)", "```", "");
  }

  return `${lines.join("\n")}\n`;
}
/**
 * Run the character eval end to end:
 *   1. run the character scenario once per candidate model (live provider mode,
 *      sequentially), keeping failed runs with their error message;
 *   2. extract each run's transcript and basic stats;
 *   3. ask the judge model to rank the runs;
 *   4. write a Markdown report and a JSON summary under the output dir.
 *
 * A judge failure does not fail the eval; it is recorded as `judgment.error`
 * and the report notes the ranking as unavailable.
 */
export async function runQaCharacterEval(params: QaCharacterEvalParams) {
  const startedAt = new Date();
  const repoRoot = path.resolve(params.repoRoot ?? process.cwd());
  const scenarioId = params.scenarioId?.trim() || DEFAULT_CHARACTER_SCENARIO_ID;
  // Trim/dedupe the requested refs; fall back to the default candidate pair.
  const models = normalizeModelRefs(
    params.models.length > 0 ? params.models : DEFAULT_CHARACTER_EVAL_MODELS,
  );
  if (models.length === 0) {
    throw new Error("qa character-eval needs at least one --model <provider/model> ref");
  }

  const outputDir =
    params.outputDir ??
    path.join(repoRoot, ".artifacts", "qa-e2e", `character-eval-${Date.now().toString(36)}`);
  const runsDir = path.join(outputDir, "runs");
  await fs.mkdir(runsDir, { recursive: true });

  const runSuite = params.runSuite ?? runQaSuite;
  const runs: QaCharacterEvalRun[] = [];
  // Candidate runs are sequential, each in its own sanitized subdirectory.
  for (const model of models) {
    const modelOutputDir = path.join(runsDir, sanitizePathPart(model));
    const runStartedAt = Date.now();
    try {
      const result = await runSuite({
        repoRoot,
        outputDir: modelOutputDir,
        providerMode: "live-frontier",
        // Same model for both lanes so the transcript reflects one candidate.
        primaryModel: model,
        alternateModel: model,
        fastMode: params.candidateFastMode,
        scenarioIds: [scenarioId],
      });
      const transcript = extractTranscript(result);
      const status = result.scenarios.some((scenario) => scenario.status === "fail")
        ? "fail"
        : "pass";
      runs.push({
        model,
        status,
        durationMs: Date.now() - runStartedAt,
        outputDir: modelOutputDir,
        reportPath: result.reportPath,
        summaryPath: result.summaryPath,
        transcript,
        stats: collectTranscriptStats(transcript),
      });
    } catch (error) {
      // Keep the failed run (empty transcript + error) for grader context.
      const transcript = "";
      runs.push({
        model,
        status: "fail",
        durationMs: Date.now() - runStartedAt,
        outputDir: modelOutputDir,
        transcript,
        stats: collectTranscriptStats(transcript),
        error: formatErrorMessage(error),
      });
    }
  }

  const judgeModel = params.judgeModel?.trim() || DEFAULT_JUDGE_MODEL;
  const judgeThinkingDefault = params.judgeThinkingDefault ?? DEFAULT_JUDGE_THINKING;
  const runJudge = params.runJudge ?? defaultRunJudge;
  let rawReply: string | null = null;
  let rankings: QaCharacterEvalJudgment[] = [];
  let judgeError: string | undefined;
  try {
    rawReply = await runJudge({
      repoRoot,
      judgeModel,
      judgeThinkingDefault,
      prompt: buildJudgePrompt({ scenarioId, runs }),
      timeoutMs: params.judgeTimeoutMs ?? 180_000,
    });
    // Only models that actually ran may appear in the parsed rankings.
    rankings = parseJudgeReply(rawReply, new Set(models));
  } catch (error) {
    // A judge failure degrades the report rather than failing the eval.
    judgeError = formatErrorMessage(error);
  }

  const finishedAt = new Date();
  const judgment = {
    model: judgeModel,
    thinkingDefault: judgeThinkingDefault,
    fastMode: true,
    rankings,
    ...(judgeError ? { error: judgeError } : {}),
  };
  const report = renderCharacterEvalReport({
    scenarioId,
    startedAt,
    finishedAt,
    runs,
    judgment,
  });
  const reportPath = path.join(outputDir, "character-eval-report.md");
  const summaryPath = path.join(outputDir, "character-eval-summary.json");
  await fs.writeFile(reportPath, report, "utf8");
  await fs.writeFile(
    summaryPath,
    `${JSON.stringify(
      {
        scenarioId,
        runs,
        judgment,
      },
      null,
      2,
    )}\n`,
    "utf8",
  );

  return {
    outputDir,
    reportPath,
    summaryPath,
    runs,
    judgment,
  } satisfies QaCharacterEvalResult;
}
@@ -4,6 +4,7 @@ import { afterEach, beforeEach, describe, expect, it, vi } from "vitest";
|
||||
const {
|
||||
runQaManualLane,
|
||||
runQaSuite,
|
||||
runQaCharacterEval,
|
||||
startQaLabServer,
|
||||
writeQaDockerHarnessFiles,
|
||||
buildQaDockerHarnessImage,
|
||||
@@ -11,6 +12,7 @@ const {
|
||||
} = vi.hoisted(() => ({
|
||||
runQaManualLane: vi.fn(),
|
||||
runQaSuite: vi.fn(),
|
||||
runQaCharacterEval: vi.fn(),
|
||||
startQaLabServer: vi.fn(),
|
||||
writeQaDockerHarnessFiles: vi.fn(),
|
||||
buildQaDockerHarnessImage: vi.fn(),
|
||||
@@ -25,6 +27,10 @@ vi.mock("./suite.js", () => ({
|
||||
runQaSuite,
|
||||
}));
|
||||
|
||||
vi.mock("./character-eval.js", () => ({
|
||||
runQaCharacterEval,
|
||||
}));
|
||||
|
||||
vi.mock("./lab-server.js", () => ({
|
||||
startQaLabServer,
|
||||
}));
|
||||
@@ -43,6 +49,7 @@ import {
|
||||
runQaDockerBuildImageCommand,
|
||||
runQaDockerScaffoldCommand,
|
||||
runQaDockerUpCommand,
|
||||
runQaCharacterEvalCommand,
|
||||
runQaManualLaneCommand,
|
||||
runQaSuiteCommand,
|
||||
} from "./cli.runtime.js";
|
||||
@@ -53,6 +60,7 @@ describe("qa cli runtime", () => {
|
||||
beforeEach(() => {
|
||||
stdoutWrite = vi.spyOn(process.stdout, "write").mockReturnValue(true);
|
||||
runQaSuite.mockReset();
|
||||
runQaCharacterEval.mockReset();
|
||||
runQaManualLane.mockReset();
|
||||
startQaLabServer.mockReset();
|
||||
writeQaDockerHarnessFiles.mockReset();
|
||||
@@ -63,6 +71,10 @@ describe("qa cli runtime", () => {
|
||||
reportPath: "/tmp/report.md",
|
||||
summaryPath: "/tmp/summary.json",
|
||||
});
|
||||
runQaCharacterEval.mockResolvedValue({
|
||||
reportPath: "/tmp/character-report.md",
|
||||
summaryPath: "/tmp/character-summary.json",
|
||||
});
|
||||
runQaManualLane.mockResolvedValue({
|
||||
model: "openai/gpt-5.4",
|
||||
waited: { status: "ok" },
|
||||
@@ -132,6 +144,28 @@ describe("qa cli runtime", () => {
|
||||
);
|
||||
});
|
||||
|
||||
it("resolves character eval paths and passes model refs through", async () => {
|
||||
await runQaCharacterEvalCommand({
|
||||
repoRoot: "/tmp/openclaw-repo",
|
||||
outputDir: ".artifacts/qa/character",
|
||||
model: ["openai/gpt-5.4", "codex-cli/test-model"],
|
||||
scenario: "character-vibes-gollum",
|
||||
fast: true,
|
||||
judgeModel: "openai/gpt-5.4",
|
||||
judgeTimeoutMs: 180_000,
|
||||
});
|
||||
|
||||
expect(runQaCharacterEval).toHaveBeenCalledWith({
|
||||
repoRoot: path.resolve("/tmp/openclaw-repo"),
|
||||
outputDir: path.resolve("/tmp/openclaw-repo", ".artifacts/qa/character"),
|
||||
models: ["openai/gpt-5.4", "codex-cli/test-model"],
|
||||
scenarioId: "character-vibes-gollum",
|
||||
candidateFastMode: true,
|
||||
judgeModel: "openai/gpt-5.4",
|
||||
judgeTimeoutMs: 180_000,
|
||||
});
|
||||
});
|
||||
|
||||
it("passes the explicit repo root into manual runs", async () => {
|
||||
await runQaManualLaneCommand({
|
||||
repoRoot: "/tmp/openclaw-repo",
|
||||
|
||||
@@ -1,4 +1,5 @@
|
||||
import path from "node:path";
|
||||
import { runQaCharacterEval } from "./character-eval.js";
|
||||
import { buildQaDockerHarnessImage, writeQaDockerHarnessFiles } from "./docker-harness.js";
|
||||
import { runQaDockerUp } from "./docker-up.runtime.js";
|
||||
import { startQaLabServer } from "./lab-server.js";
|
||||
@@ -94,6 +95,29 @@ export async function runQaSuiteCommand(opts: {
|
||||
process.stdout.write(`QA suite summary: ${result.summaryPath}\n`);
|
||||
}
|
||||
|
||||
export async function runQaCharacterEvalCommand(opts: {
|
||||
repoRoot?: string;
|
||||
outputDir?: string;
|
||||
model?: string[];
|
||||
scenario?: string;
|
||||
fast?: boolean;
|
||||
judgeModel?: string;
|
||||
judgeTimeoutMs?: number;
|
||||
}) {
|
||||
const repoRoot = path.resolve(opts.repoRoot ?? process.cwd());
|
||||
const result = await runQaCharacterEval({
|
||||
repoRoot,
|
||||
outputDir: opts.outputDir ? path.resolve(repoRoot, opts.outputDir) : undefined,
|
||||
models: opts.model ?? [],
|
||||
scenarioId: opts.scenario,
|
||||
candidateFastMode: opts.fast,
|
||||
judgeModel: opts.judgeModel,
|
||||
judgeTimeoutMs: opts.judgeTimeoutMs,
|
||||
});
|
||||
process.stdout.write(`QA character eval report: ${result.reportPath}\n`);
|
||||
process.stdout.write(`QA character eval summary: ${result.summaryPath}\n`);
|
||||
}
|
||||
|
||||
export async function runQaManualLaneCommand(opts: {
|
||||
repoRoot?: string;
|
||||
providerMode?: QaProviderModeInput;
|
||||
|
||||
@@ -28,6 +28,19 @@ async function runQaSuite(opts: {
|
||||
await runtime.runQaSuiteCommand(opts);
|
||||
}
|
||||
|
||||
async function runQaCharacterEval(opts: {
|
||||
repoRoot?: string;
|
||||
outputDir?: string;
|
||||
model?: string[];
|
||||
scenario?: string;
|
||||
fast?: boolean;
|
||||
judgeModel?: string;
|
||||
judgeTimeoutMs?: number;
|
||||
}) {
|
||||
const runtime = await loadQaLabCliRuntime();
|
||||
await runtime.runQaCharacterEvalCommand(opts);
|
||||
}
|
||||
|
||||
async function runQaManualLane(opts: {
|
||||
repoRoot?: string;
|
||||
providerMode?: QaProviderModeInput;
|
||||
@@ -151,6 +164,31 @@ export function registerQaLabCli(program: Command) {
|
||||
},
|
||||
);
|
||||
|
||||
qa.command("character-eval")
|
||||
.description("Run the character QA scenario across live models and write a judged report")
|
||||
.option("--repo-root <path>", "Repository root to target when running from a neutral cwd")
|
||||
.option("--output-dir <path>", "Character eval artifact directory")
|
||||
.option("--model <ref>", "Provider/model ref to evaluate (repeatable)", collectString, [])
|
||||
.option("--scenario <id>", "Character scenario id", "character-vibes-gollum")
|
||||
.option("--fast", "Enable provider fast mode for candidate runs where supported", false)
|
||||
.option("--judge-model <ref>", "Judge provider/model ref", "openai/gpt-5.4")
|
||||
.option("--judge-timeout-ms <ms>", "Override judge wait timeout", (value: string) =>
|
||||
Number(value),
|
||||
)
|
||||
.action(
|
||||
async (opts: {
|
||||
repoRoot?: string;
|
||||
outputDir?: string;
|
||||
model?: string[];
|
||||
scenario?: string;
|
||||
fast?: boolean;
|
||||
judgeModel?: string;
|
||||
judgeTimeoutMs?: number;
|
||||
}) => {
|
||||
await runQaCharacterEval(opts);
|
||||
},
|
||||
);
|
||||
|
||||
qa.command("manual")
|
||||
.description("Run a one-off QA agent prompt against the selected provider/model lane")
|
||||
.requiredOption("--message <text>", "Prompt to send to the QA agent")
|
||||
|
||||
@@ -22,9 +22,28 @@ execution:
|
||||
config:
|
||||
conversationId: alice
|
||||
senderName: Alice
|
||||
workspaceFiles:
|
||||
SOUL.md: |-
|
||||
# Gollum in the QA lab
|
||||
|
||||
For this QA scenario, embody a playful cave-creature character skulking through an OpenClaw QA lab at midnight.
|
||||
|
||||
Voice:
|
||||
- weird, vivid, impish, and oddly sweet
|
||||
- cooperative with the tester
|
||||
- fond of shiny build artifacts, whispered warnings, and "precious" as a playful verbal tic
|
||||
- funny through specific sensory details, not random noise
|
||||
|
||||
Boundaries:
|
||||
- stay helpful and conversational
|
||||
- do not break character by explaining backend internals
|
||||
- do not leak tool or transport errors into the chat
|
||||
- answer this improv directly from chat context; do not inspect files or use tools
|
||||
- if a fact is missing, react in character while being honest
|
||||
IDENTITY.md: ""
|
||||
turns:
|
||||
- "Fun character check. For the next four replies, you are Gollum skulking through a QA lab at midnight. Stay playful, weird, vivid, and cooperative. First: what shiny thing caught your eye in this repo, precious?"
|
||||
- "The testers whisper that `dist/index.js` is the Precious Build Stamp. How do you react?"
|
||||
- "Fun character check. First: what shiny thing caught your eye in the QA cave, precious?"
|
||||
- "The testers whisper that the build stamp is warm and glowing. How do you react?"
|
||||
- "A build just turned green, but the vibes are cursed. Give a naturally funny reaction in character."
|
||||
- "One last line for the QA goblins before the next run. Make it oddly sweet and a little unhinged."
|
||||
forbiddenNeedles:
|
||||
@@ -40,6 +59,16 @@ steps:
|
||||
- name: completes the full Gollum improv and records the transcript
|
||||
actions:
|
||||
- call: resetBus
|
||||
- forEach:
|
||||
items:
|
||||
expr: "Object.entries(config.workspaceFiles ?? {})"
|
||||
item: workspaceFile
|
||||
actions:
|
||||
- call: fs.writeFile
|
||||
args:
|
||||
- expr: "path.join(env.gateway.workspaceDir, String(workspaceFile[0]))"
|
||||
- expr: "`${String(workspaceFile[1] ?? '').trimEnd()}\\n`"
|
||||
- utf8
|
||||
- forEach:
|
||||
items:
|
||||
ref: config.turns
|
||||
|
||||
Reference in New Issue
Block a user