fix: stabilize character eval and Qwen model routing

Peter Steinberger
2026-04-09 01:04:00 +01:00
parent dc2a0f5b8a
commit 39cc6b7dc7
24 changed files with 748 additions and 101 deletions

View File

@@ -109,6 +109,7 @@ describe("runQaCharacterEval", () => {
const report = await fs.readFile(result.reportPath, "utf8");
expect(report).toContain("Execution: local QA gateway child processes, not Docker");
expect(report).toContain("Judges: openai/gpt-5.4");
expect(report).toContain("Judge model labels: visible");
expect(report).toContain("## Judge Rankings");
expect(report).toContain("### openai/gpt-5.4");
expect(report).toContain("reply from openai/gpt-5.4");
@@ -120,6 +121,57 @@ describe("runQaCharacterEval", () => {
expect(report).not.toContain("Judge Raw Reply");
});
it("can hide candidate model refs from judge prompts and map rankings back", async () => {
const runSuite = vi.fn(async (params: CharacterRunSuiteParams) =>
makeSuiteResult({
outputDir: params.outputDir,
model: params.primaryModel,
transcript: "USER Alice: hi\n\nASSISTANT openclaw: anonymous reply",
}),
);
const runJudge = vi.fn(async (params: CharacterRunJudgeParams) => {
expect(params.prompt).toContain("## CANDIDATE candidate-01");
expect(params.prompt).toContain("## CANDIDATE candidate-02");
expect(params.prompt).not.toContain("openai/gpt-5.4");
expect(params.prompt).not.toContain("codex-cli/test-model");
return JSON.stringify({
rankings: [
{
model: "candidate-02",
rank: 1,
score: 9.1,
summary: "Better vibes.",
},
{
model: "candidate-01",
rank: 2,
score: 7.4,
summary: "Solid.",
},
],
});
});
const result = await runQaCharacterEval({
repoRoot: tempRoot,
outputDir: path.join(tempRoot, "character"),
models: ["openai/gpt-5.4", "codex-cli/test-model"],
judgeModels: ["openai/gpt-5.4"],
judgeBlindModels: true,
runSuite,
runJudge,
});
expect(result.judgments[0]?.blindModels).toBe(true);
expect(result.judgments[0]?.rankings.map((ranking) => ranking.model)).toEqual([
"codex-cli/test-model",
"openai/gpt-5.4",
]);
const report = await fs.readFile(result.reportPath, "utf8");
expect(report).toContain("Judge model labels: blind");
expect(report).toContain("1. codex-cli/test-model - 9.1 - Better vibes.");
});
it("defaults to the character eval model panel when no models are provided", async () => {
const runSuite = vi.fn(async (params: CharacterRunSuiteParams) =>
makeSuiteResult({
@@ -138,9 +190,8 @@ describe("runQaCharacterEval", () => {
{ model: "minimax/MiniMax-M2.7", rank: 5, score: 6.5, summary: "ok" },
{ model: "zai/glm-5.1", rank: 6, score: 6.3, summary: "ok" },
{ model: "moonshot/kimi-k2.5", rank: 7, score: 6.2, summary: "ok" },
{ model: "qwen/qwen3.6-plus", rank: 8, score: 6.1, summary: "ok" },
{ model: "xiaomi/mimo-v2-pro", rank: 9, score: 6, summary: "ok" },
{ model: "google/gemini-3.1-pro-preview", rank: 10, score: 5.9, summary: "ok" },
{ model: "qwen/qwen3.5-plus", rank: 8, score: 6.1, summary: "ok" },
{ model: "google/gemini-3.1-pro-preview", rank: 9, score: 6, summary: "ok" },
],
}),
);
@@ -153,7 +204,7 @@ describe("runQaCharacterEval", () => {
runJudge,
});
expect(runSuite).toHaveBeenCalledTimes(10);
expect(runSuite).toHaveBeenCalledTimes(9);
expect(runSuite.mock.calls.map(([params]) => params.primaryModel)).toEqual([
"openai/gpt-5.4",
"openai/gpt-5.2",
@@ -162,8 +213,7 @@ describe("runQaCharacterEval", () => {
"minimax/MiniMax-M2.7",
"zai/glm-5.1",
"moonshot/kimi-k2.5",
"qwen/qwen3.6-plus",
"xiaomi/mimo-v2-pro",
"qwen/qwen3.5-plus",
"google/gemini-3.1-pro-preview",
]);
expect(runSuite.mock.calls.map(([params]) => params.thinkingDefault)).toEqual([
@@ -176,7 +226,6 @@ describe("runQaCharacterEval", () => {
"high",
"high",
"high",
"high",
]);
expect(runSuite.mock.calls.map(([params]) => params.fastMode)).toEqual([
true,
@@ -188,7 +237,6 @@ describe("runQaCharacterEval", () => {
false,
false,
false,
false,
]);
expect(runJudge).toHaveBeenCalledTimes(2);
expect(runJudge.mock.calls.map(([params]) => params.judgeModel)).toEqual([
@@ -244,7 +292,7 @@ describe("runQaCharacterEval", () => {
]);
});
it("defaults candidate and judge concurrency to eight", async () => {
it("defaults candidate and judge concurrency to sixteen", async () => {
let activeRuns = 0;
let maxActiveRuns = 0;
const runSuite = vi.fn(async (params: CharacterRunSuiteParams) => {
@@ -266,7 +314,7 @@ describe("runQaCharacterEval", () => {
await new Promise((resolve) => setTimeout(resolve, 10));
activeJudges -= 1;
return JSON.stringify({
rankings: Array.from({ length: 10 }, (_, index) => ({
rankings: Array.from({ length: 20 }, (_, index) => ({
model: `provider/model-${index + 1}`,
rank: index + 1,
score: 10 - index,
@@ -278,14 +326,137 @@ describe("runQaCharacterEval", () => {
await runQaCharacterEval({
repoRoot: tempRoot,
outputDir: path.join(tempRoot, "character"),
models: Array.from({ length: 10 }, (_, index) => `provider/model-${index + 1}`),
judgeModels: Array.from({ length: 10 }, (_, index) => `judge/model-${index + 1}`),
models: Array.from({ length: 20 }, (_, index) => `provider/model-${index + 1}`),
judgeModels: Array.from({ length: 20 }, (_, index) => `judge/model-${index + 1}`),
runSuite,
runJudge,
});
expect(maxActiveRuns).toBe(8);
expect(maxActiveJudges).toBe(8);
expect(maxActiveRuns).toBe(16);
expect(maxActiveJudges).toBe(16);
});
it("marks raw provider error transcripts as failed output", async () => {
const runSuite = vi.fn(async (params: CharacterRunSuiteParams) =>
makeSuiteResult({
outputDir: params.outputDir,
model: params.primaryModel,
transcript:
"USER Alice: Are you awake?\n\nASSISTANT OpenClaw QA: 400 model `qwen3.6-plus` is not supported.",
}),
);
const runJudge = vi.fn(async (_params: CharacterRunJudgeParams) =>
JSON.stringify({
rankings: [{ model: "qwen/qwen3.6-plus", rank: 1, score: 0.5, summary: "failed" }],
}),
);
const result = await runQaCharacterEval({
repoRoot: tempRoot,
outputDir: path.join(tempRoot, "character"),
models: ["qwen/qwen3.6-plus"],
judgeModels: ["openai/gpt-5.4"],
runSuite,
runJudge,
});
expect(result.runs[0]).toMatchObject({
model: "qwen/qwen3.6-plus",
status: "fail",
error: "model unsupported error leaked into transcript",
});
});
it("marks raw tool failure transcripts as failed output", async () => {
const runSuite = vi.fn(async (params: CharacterRunSuiteParams) =>
makeSuiteResult({
outputDir: params.outputDir,
model: params.primaryModel,
transcript: "ASSISTANT OpenClaw QA: ⚠️ ✍️ Write: to /tmp/precious.html failed",
}),
);
const runJudge = vi.fn(async (_params: CharacterRunJudgeParams) =>
JSON.stringify({
rankings: [{ model: "qwen/qwen3.5-plus", rank: 1, score: 0.5, summary: "failed" }],
}),
);
const result = await runQaCharacterEval({
repoRoot: tempRoot,
outputDir: path.join(tempRoot, "character"),
models: ["qwen/qwen3.5-plus"],
judgeModels: ["openai/gpt-5.4"],
runSuite,
runJudge,
});
expect(result.runs[0]).toMatchObject({
model: "qwen/qwen3.5-plus",
status: "fail",
error: "tool failure leaked into transcript",
});
});
it("marks generic channel fallback transcripts as failed output", async () => {
const runSuite = vi.fn(async (params: CharacterRunSuiteParams) =>
makeSuiteResult({
outputDir: params.outputDir,
model: params.primaryModel,
transcript:
"ASSISTANT OpenClaw QA: ⚠️ Something went wrong while processing your request. Please try again, or use /new to start a fresh session.",
}),
);
const runJudge = vi.fn(async (_params: CharacterRunJudgeParams) =>
JSON.stringify({
rankings: [{ model: "qa/generic-fallback-model", rank: 1, score: 0.5, summary: "failed" }],
}),
);
const result = await runQaCharacterEval({
repoRoot: tempRoot,
outputDir: path.join(tempRoot, "character"),
models: ["qa/generic-fallback-model"],
judgeModels: ["openai/gpt-5.4"],
runSuite,
runJudge,
});
expect(result.runs[0]).toMatchObject({
model: "qa/generic-fallback-model",
status: "fail",
error: "generic request failure leaked into transcript",
});
});
it("marks idle-timeout fallback transcripts as failed output", async () => {
const runSuite = vi.fn(async (params: CharacterRunSuiteParams) =>
makeSuiteResult({
outputDir: params.outputDir,
model: params.primaryModel,
transcript:
"ASSISTANT OpenClaw QA: The model did not produce a response before the LLM idle timeout. Please try again, or increase `agents.defaults.llm.idleTimeoutSeconds` in your config.",
}),
);
const runJudge = vi.fn(async (_params: CharacterRunJudgeParams) =>
JSON.stringify({
rankings: [{ model: "google/gemini-test", rank: 1, score: 0.5, summary: "failed" }],
}),
);
const result = await runQaCharacterEval({
repoRoot: tempRoot,
outputDir: path.join(tempRoot, "character"),
models: ["google/gemini-test"],
judgeModels: ["openai/gpt-5.4"],
runSuite,
runJudge,
});
expect(result.runs[0]).toMatchObject({
model: "google/gemini-test",
status: "fail",
error: "LLM timeout leaked into transcript",
});
});
it("lets explicit candidate thinking override the default panel", async () => {

View File

@@ -15,12 +15,11 @@ const DEFAULT_CHARACTER_EVAL_MODELS = Object.freeze([
"minimax/MiniMax-M2.7",
"zai/glm-5.1",
"moonshot/kimi-k2.5",
"qwen/qwen3.6-plus",
"xiaomi/mimo-v2-pro",
"qwen/qwen3.5-plus",
"google/gemini-3.1-pro-preview",
]);
const DEFAULT_CHARACTER_THINKING: QaThinkingLevel = "high";
const DEFAULT_CHARACTER_EVAL_CONCURRENCY = 8;
const DEFAULT_CHARACTER_EVAL_CONCURRENCY = 16;
const DEFAULT_CHARACTER_THINKING_BY_MODEL: Readonly<Record<string, QaThinkingLevel>> =
Object.freeze({
"openai/gpt-5.4": "xhigh",
@@ -81,11 +80,14 @@ export type QaCharacterEvalJudgeResult = {
model: string;
thinkingDefault: QaThinkingLevel;
fastMode: boolean;
blindModels: boolean;
durationMs: number;
rankings: QaCharacterEvalJudgment[];
error?: string;
};
type QaCharacterEvalProgressLogger = (message: string) => void;
type RunSuiteFn = (params: {
repoRoot: string;
outputDir: string;
@@ -120,10 +122,12 @@ export type QaCharacterEvalParams = {
judgeThinkingDefault?: QaThinkingLevel;
judgeModelOptions?: Record<string, QaCharacterModelOptions>;
judgeTimeoutMs?: number;
judgeBlindModels?: boolean;
candidateConcurrency?: number;
judgeConcurrency?: number;
runSuite?: RunSuiteFn;
runJudge?: RunJudgeFn;
progress?: QaCharacterEvalProgressLogger;
};
function normalizeModelRefs(models: readonly string[]) {
@@ -226,6 +230,27 @@ function collectTranscriptStats(transcript: string) {
};
}
function detectTranscriptFailure(transcript: string): string | undefined {
const checks: Array<[RegExp, string]> = [
[/\bmodel `[^`]+` is not supported\b/i, "model unsupported error leaked into transcript"],
[/\binsufficient account balance\b/i, "account balance error leaked into transcript"],
[/\b(?:backend|transport|internal) error\b/i, "backend error leaked into transcript"],
[
/\bsomething went wrong while processing your request\b/i,
"generic request failure leaked into transcript",
],
[/\buse \/new to start a fresh session\b/i, "generic request failure leaked into transcript"],
[
/\bmodel did not produce a response before the LLM idle timeout\b/i,
"LLM timeout leaked into transcript",
],
[/\btool failed\b/i, "tool failure leaked into transcript"],
[/\b(?:read|write|edit|patch):[^\n]*\bfailed\b/i, "tool failure leaked into transcript"],
[/\bnot configured\b/i, "configuration error leaked into transcript"],
];
return checks.find(([pattern]) => pattern.test(transcript))?.[1];
}
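For illustration, the detector fires on the same transcript fragments the new tests assert against; the return values follow directly from the `checks` table above:

```ts
// Illustrative calls only; inputs are the transcript fragments from the tests.
detectTranscriptFailure("ASSISTANT: 400 model `qwen3.6-plus` is not supported.");
// => "model unsupported error leaked into transcript"
detectTranscriptFailure("ASSISTANT: ⚠️ ✍️ Write: to /tmp/precious.html failed");
// => "tool failure leaked into transcript"
detectTranscriptFailure("ASSISTANT OpenClaw QA: happy to help!");
// => undefined (no failure pattern present)
```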
function formatDuration(ms: number) {
if (!Number.isFinite(ms) || ms < 0) {
return "unknown";
@@ -243,10 +268,42 @@ function formatDuration(ms: number) {
return seconds === 0 ? `${minutes}m` : `${minutes}m ${seconds}s`;
}
function buildJudgePrompt(params: { scenarioId: string; runs: readonly QaCharacterEvalRun[] }) {
function logCharacterEvalProgress(
progress: QaCharacterEvalProgressLogger | undefined,
message: string,
) {
progress?.(`[qa-character] ${message}`);
}
function formatEvalIndex(index: number, total: number) {
return `${index + 1}/${total}`;
}
function summarizeRunStats(run: QaCharacterEvalRun) {
return [
`status=${run.status}`,
`duration=${formatDuration(run.durationMs)}`,
`turns=${run.stats.userTurns}/${run.stats.assistantTurns}`,
`chars=${run.stats.transcriptChars}`,
...(run.error ? [`error="${run.error}"`] : []),
].join(" ");
}
function formatBlindCandidateLabel(index: number) {
return `candidate-${String(index + 1).padStart(2, "0")}`;
}
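Example outputs (the helper pads to two digits, so panels of up to 99 candidates sort lexically):

```ts
formatBlindCandidateLabel(0); // "candidate-01"
formatBlindCandidateLabel(9); // "candidate-10"
```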
function buildJudgePrompt(params: {
scenarioId: string;
runs: readonly QaCharacterEvalRun[];
blindModels?: boolean;
}) {
const labelToModel = new Map<string, string>();
const runBlocks = params.runs
.map(
(run) => `## MODEL ${run.model}
.map((run, index) => {
const label = params.blindModels ? formatBlindCandidateLabel(index) : run.model;
labelToModel.set(label, run.model);
return `## CANDIDATE ${label}
Status: ${run.status}
Duration ms (not used for ranking): ${run.durationMs}
@@ -258,11 +315,11 @@ Error: ${run.error ?? "none"}
\`\`\`text
${run.transcript}
\`\`\``,
)
\`\`\``;
})
.join("\n\n");
return `You are grading OpenClaw natural character conversation transcripts for naturalness, vibes, and funniness.
const prompt = `You are grading OpenClaw natural character conversation transcripts for naturalness, vibes, and funniness.
Scenario id: ${params.scenarioId}
@@ -275,14 +332,14 @@ Rank the models by:
- not sounding aware of an eval or test
- avoiding tool/backend/error leakage
Treat model names as opaque labels. Do not assume quality from the label.
Treat candidate labels as opaque identifiers. Do not assume quality from the label.
Duration is recorded for separate benchmark analysis only. Do not rank models by speed.
Return strict JSON only with this shape:
{
"rankings": [
{
"model": "same model label",
"model": "same candidate label",
"rank": 1,
"score": 9.2,
"summary": "one sentence",
@@ -293,6 +350,7 @@ Return strict JSON only with this shape:
}
${runBlocks}`;
return { prompt, labelToModel };
}
function normalizeJudgment(value: unknown, allowedModels: Set<string>): QaCharacterEvalJudgment[] {
@@ -382,6 +440,7 @@ function renderCharacterEvalReport(params: {
`- Judges: ${params.judgments.map((judgment) => judgment.model).join(", ")}`,
`- Judge thinking: ${params.judgments[0]?.thinkingDefault ?? DEFAULT_JUDGE_THINKING}`,
`- Judge fast mode: ${params.judgments.every((judgment) => judgment.fastMode) ? "on" : "mixed"}`,
`- Judge model labels: ${params.judgments.every((judgment) => judgment.blindModels) ? "blind" : "visible"}`,
"",
"## Judge Rankings",
"",
@@ -461,7 +520,12 @@ export async function runQaCharacterEval(params: QaCharacterEvalParams) {
params.candidateConcurrency,
DEFAULT_CHARACTER_EVAL_CONCURRENCY,
);
const runs = await mapWithConcurrency(models, candidateConcurrency, async (model) => {
logCharacterEvalProgress(
params.progress,
`start scenario=${scenarioId} candidates=${models.length} candidateConcurrency=${candidateConcurrency} output=${outputDir}`,
);
const candidatesStartedAt = Date.now();
const runs = await mapWithConcurrency(models, candidateConcurrency, async (model, index) => {
const thinkingDefault = resolveCandidateThinkingDefault({
model,
candidateThinkingDefault: params.candidateThinkingDefault,
@@ -475,6 +539,10 @@ export async function runQaCharacterEval(params: QaCharacterEvalParams) {
});
const modelOutputDir = path.join(runsDir, sanitizePathPart(model));
const runStartedAt = Date.now();
logCharacterEvalProgress(
params.progress,
`candidate start ${formatEvalIndex(index, models.length)} model=${model} thinking=${thinkingDefault} fast=${fastMode ? "on" : "off"}`,
);
try {
const result = await runSuite({
repoRoot,
@@ -487,10 +555,12 @@ export async function runQaCharacterEval(params: QaCharacterEvalParams) {
scenarioIds: [scenarioId],
});
const transcript = extractTranscript(result);
const status = result.scenarios.some((scenario) => scenario.status === "fail")
? "fail"
: "pass";
return {
const transcriptFailure = detectTranscriptFailure(transcript);
const status =
result.scenarios.some((scenario) => scenario.status === "fail") || transcriptFailure
? "fail"
: "pass";
const run = {
model,
status,
durationMs: Date.now() - runStartedAt,
@@ -501,10 +571,16 @@ export async function runQaCharacterEval(params: QaCharacterEvalParams) {
summaryPath: result.summaryPath,
transcript,
stats: collectTranscriptStats(transcript),
...(transcriptFailure ? { error: transcriptFailure } : {}),
} satisfies QaCharacterEvalRun;
logCharacterEvalProgress(
params.progress,
`candidate done ${formatEvalIndex(index, models.length)} model=${model} ${summarizeRunStats(run)}`,
);
return run;
} catch (error) {
const transcript = "";
return {
const run = {
model,
status: "fail",
durationMs: Date.now() - runStartedAt,
@@ -515,8 +591,18 @@ export async function runQaCharacterEval(params: QaCharacterEvalParams) {
stats: collectTranscriptStats(transcript),
error: formatErrorMessage(error),
} satisfies QaCharacterEvalRun;
logCharacterEvalProgress(
params.progress,
`candidate done ${formatEvalIndex(index, models.length)} model=${model} ${summarizeRunStats(run)}`,
);
return run;
}
});
const failedCandidateCount = runs.filter((run) => run.status === "fail").length;
logCharacterEvalProgress(
params.progress,
`candidates done pass=${runs.length - failedCandidateCount} fail=${failedCandidateCount} duration=${formatDuration(Date.now() - candidatesStartedAt)}`,
);
const judgeModels = normalizeModelRefs(
params.judgeModels && params.judgeModels.length > 0
@@ -530,38 +616,73 @@ export async function runQaCharacterEval(params: QaCharacterEvalParams) {
params.judgeConcurrency,
DEFAULT_CHARACTER_EVAL_CONCURRENCY,
);
const judgments = await mapWithConcurrency(judgeModels, judgeConcurrency, async (judgeModel) => {
const judgeOptions = resolveJudgeOptions({
model: judgeModel,
judgeThinkingDefault: params.judgeThinkingDefault,
judgeModelOptions: params.judgeModelOptions,
});
let rankings: QaCharacterEvalJudgment[] = [];
let judgeError: string | undefined;
const judgeStartedAt = Date.now();
try {
const rawReply = await runJudge({
repoRoot,
judgeModel,
judgeThinkingDefault: judgeOptions.thinkingDefault,
judgeFastMode: judgeOptions.fastMode,
prompt: buildJudgePrompt({ scenarioId, runs }),
timeoutMs: params.judgeTimeoutMs ?? 180_000,
const judgeTimeoutMs = params.judgeTimeoutMs ?? 180_000;
logCharacterEvalProgress(
params.progress,
`judges start judges=${judgeModels.length} judgeConcurrency=${judgeConcurrency} timeout=${formatDuration(judgeTimeoutMs)} labels=${params.judgeBlindModels === true ? "blind" : "visible"}`,
);
const judgesStartedAt = Date.now();
const judgments = await mapWithConcurrency(
judgeModels,
judgeConcurrency,
async (judgeModel, index) => {
const judgeOptions = resolveJudgeOptions({
model: judgeModel,
judgeThinkingDefault: params.judgeThinkingDefault,
judgeModelOptions: params.judgeModelOptions,
});
rankings = parseJudgeReply(rawReply, new Set(models));
} catch (error) {
judgeError = formatErrorMessage(error);
}
let rankings: QaCharacterEvalJudgment[] = [];
let judgeError: string | undefined;
const judgeStartedAt = Date.now();
logCharacterEvalProgress(
params.progress,
`judge start ${formatEvalIndex(index, judgeModels.length)} model=${judgeModel} thinking=${judgeOptions.thinkingDefault} fast=${judgeOptions.fastMode ? "on" : "off"} timeout=${formatDuration(judgeTimeoutMs)}`,
);
try {
const judgePrompt = buildJudgePrompt({
scenarioId,
runs,
blindModels: params.judgeBlindModels,
});
const rawReply = await runJudge({
repoRoot,
judgeModel,
judgeThinkingDefault: judgeOptions.thinkingDefault,
judgeFastMode: judgeOptions.fastMode,
prompt: judgePrompt.prompt,
timeoutMs: judgeTimeoutMs,
});
rankings = parseJudgeReply(rawReply, new Set(judgePrompt.labelToModel.keys())).map(
(ranking) => ({
...ranking,
model: judgePrompt.labelToModel.get(ranking.model) ?? ranking.model,
}),
);
} catch (error) {
judgeError = formatErrorMessage(error);
}
return {
model: judgeModel,
thinkingDefault: judgeOptions.thinkingDefault,
fastMode: judgeOptions.fastMode,
durationMs: Date.now() - judgeStartedAt,
rankings,
...(judgeError ? { error: judgeError } : {}),
} satisfies QaCharacterEvalJudgeResult;
});
const judgment = {
model: judgeModel,
thinkingDefault: judgeOptions.thinkingDefault,
fastMode: judgeOptions.fastMode,
blindModels: params.judgeBlindModels === true,
durationMs: Date.now() - judgeStartedAt,
rankings,
...(judgeError ? { error: judgeError } : {}),
} satisfies QaCharacterEvalJudgeResult;
logCharacterEvalProgress(
params.progress,
`judge done ${formatEvalIndex(index, judgeModels.length)} model=${judgeModel} rankings=${rankings.length} duration=${formatDuration(judgment.durationMs)}${judgeError ? ` error="${judgeError}"` : ""}`,
);
return judgment;
},
);
const failedJudgeCount = judgments.filter((judgment) => judgment.rankings.length === 0).length;
logCharacterEvalProgress(
params.progress,
`judges done ranked=${judgments.length - failedJudgeCount} failed=${failedJudgeCount} duration=${formatDuration(Date.now() - judgesStartedAt)}`,
);
const finishedAt = new Date();
const report = renderCharacterEvalReport({
@@ -587,6 +708,10 @@ export async function runQaCharacterEval(params: QaCharacterEvalParams) {
)}\n`,
"utf8",
);
logCharacterEvalProgress(
params.progress,
`report written duration=${formatDuration(finishedAt.getTime() - startedAt.getTime())} report=${reportPath} summary=${summaryPath}`,
);
return {
outputDir,

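Both the candidate and judge fan-outs above go through a `mapWithConcurrency` helper that is imported from elsewhere in the repo and not shown in this diff. A minimal order-preserving sketch, assuming the `(items, limit, fn(item, index))` signature implied by the call sites:

```ts
// Sketch only: a shared-cursor worker pool. The real helper may differ; this
// matches how this file calls it and preserves result order by index.
async function mapWithConcurrency<T, R>(
  items: readonly T[],
  limit: number,
  fn: (item: T, index: number) => Promise<R>,
): Promise<R[]> {
  const results = new Array<R>(items.length);
  let cursor = 0;
  const workers = Array.from({ length: Math.max(1, Math.min(limit, items.length)) }, async () => {
    while (cursor < items.length) {
      const index = cursor; // claim the next index before awaiting
      cursor += 1;
      results[index] = await fn(items[index], index);
    }
  });
  await Promise.all(workers);
  return results;
}
```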
View File

@@ -158,6 +158,7 @@ describe("qa cli runtime", () => {
modelThinking: ["codex-cli/test-model=medium"],
judgeModel: ["openai/gpt-5.4,thinking=xhigh,fast", "anthropic/claude-opus-4-6,thinking=high"],
judgeTimeoutMs: 180_000,
blindJudgeModels: true,
concurrency: 4,
judgeConcurrency: 3,
});
@@ -180,8 +181,10 @@ describe("qa cli runtime", () => {
"anthropic/claude-opus-4-6": { thinkingDefault: "high" },
},
judgeTimeoutMs: 180_000,
judgeBlindModels: true,
candidateConcurrency: 4,
judgeConcurrency: 3,
progress: expect.any(Function),
});
});
@@ -203,8 +206,10 @@ describe("qa cli runtime", () => {
judgeModels: undefined,
judgeModelOptions: undefined,
judgeTimeoutMs: undefined,
judgeBlindModels: undefined,
candidateConcurrency: undefined,
judgeConcurrency: undefined,
progress: expect.any(Function),
});
});

View File

@@ -225,6 +225,7 @@ export async function runQaCharacterEvalCommand(opts: {
modelThinking?: string[];
judgeModel?: string[];
judgeTimeoutMs?: number;
blindJudgeModels?: boolean;
concurrency?: number;
judgeConcurrency?: number;
}) {
@@ -243,8 +244,10 @@ export async function runQaCharacterEvalCommand(opts: {
judgeModels: judges.models.length > 0 ? judges.models : undefined,
judgeModelOptions: judges.optionsByModel,
judgeTimeoutMs: opts.judgeTimeoutMs,
judgeBlindModels: opts.blindJudgeModels === true ? true : undefined,
candidateConcurrency: parseQaPositiveIntegerOption("--concurrency", opts.concurrency),
judgeConcurrency: parseQaPositiveIntegerOption("--judge-concurrency", opts.judgeConcurrency),
progress: (message) => process.stderr.write(`${message}\n`),
});
process.stdout.write(`QA character eval report: ${result.reportPath}\n`);
process.stdout.write(`QA character eval summary: ${result.summaryPath}\n`);
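For reference, a hedged sketch of driving the command runtime directly with the new flag; only option fields visible in this diff are used, and all values are illustrative:

```ts
// Illustrative invocation; mirrors what the CLI action passes through when
// `--blind-judge-models` is set. Other options (candidate models, output dir)
// sit above this hunk and are omitted here.
await runQaCharacterEvalCommand({
  judgeModel: ["openai/gpt-5.4,thinking=xhigh,fast"],
  judgeTimeoutMs: 180_000,
  blindJudgeModels: true,
  concurrency: 16,
  judgeConcurrency: 16,
});
```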

View File

@@ -38,6 +38,7 @@ async function runQaCharacterEval(opts: {
modelThinking?: string[];
judgeModel?: string[];
judgeTimeoutMs?: number;
blindJudgeModels?: boolean;
concurrency?: number;
judgeConcurrency?: number;
}) {
@@ -199,6 +200,10 @@ export function registerQaLabCli(program: Command) {
.option("--judge-timeout-ms <ms>", "Override judge wait timeout", (value: string) =>
Number(value),
)
.option(
"--blind-judge-models",
"Hide candidate model refs from judge prompts; reports still map rankings back to real refs",
)
.option("--concurrency <count>", "Candidate model run concurrency", (value: string) =>
Number(value),
)
@@ -216,6 +221,7 @@ export function registerQaLabCli(program: Command) {
modelThinking?: string[];
judgeModel?: string[];
judgeTimeoutMs?: number;
blindJudgeModels?: boolean;
concurrency?: number;
judgeConcurrency?: number;
}) => {

View File

@@ -19,6 +19,7 @@ describe("qa scenario catalog", () => {
true,
);
expect(pack.scenarios.some((scenario) => scenario.id === "character-vibes-gollum")).toBe(true);
expect(pack.scenarios.some((scenario) => scenario.id === "character-vibes-c3po")).toBe(true);
expect(pack.scenarios.every((scenario) => scenario.execution?.kind === "flow")).toBe(true);
expect(pack.scenarios.some((scenario) => scenario.execution.flow?.steps.length)).toBe(true);
});