feat: parallelize character eval runs

This commit is contained in:
Peter Steinberger
2026-04-08 20:05:24 +01:00
parent f1e75d3259
commit 21ef1bf8de
8 changed files with 219 additions and 56 deletions

View File

@@ -115,7 +115,8 @@ describe("runQaCharacterEval", () => {
expect(report).toContain("reply from codex-cli/test-model");
expect(report).toContain("Judge thinking: xhigh");
expect(report).toContain("Fast mode: on");
expect(report).toContain("Duration ms:");
expect(report).toContain("Duration:");
expect(report).not.toContain("Duration ms:");
expect(report).not.toContain("Judge Raw Reply");
});
@@ -201,6 +202,92 @@ describe("runQaCharacterEval", () => {
expect(runJudge.mock.calls.map(([params]) => params.judgeFastMode)).toEqual([true, false]);
});
it("runs candidate models with bounded concurrency while preserving result order", async () => {
let activeRuns = 0;
let maxActiveRuns = 0;
const runSuite = vi.fn(async (params: CharacterRunSuiteParams) => {
activeRuns += 1;
maxActiveRuns = Math.max(maxActiveRuns, activeRuns);
await new Promise((resolve) => setTimeout(resolve, 10));
activeRuns -= 1;
return makeSuiteResult({
outputDir: params.outputDir,
model: params.primaryModel,
transcript: `USER Alice: hi\n\nASSISTANT openclaw: reply from ${params.primaryModel}`,
});
});
const runJudge = vi.fn(async (_params: CharacterRunJudgeParams) =>
JSON.stringify({
rankings: [
{ model: "openai/gpt-5.4", rank: 1, score: 8, summary: "ok" },
{ model: "anthropic/claude-sonnet-4-6", rank: 2, score: 7, summary: "ok" },
{ model: "moonshot/kimi-k2.5", rank: 3, score: 6, summary: "ok" },
],
}),
);
const result = await runQaCharacterEval({
repoRoot: tempRoot,
outputDir: path.join(tempRoot, "character"),
models: ["openai/gpt-5.4", "anthropic/claude-sonnet-4-6", "moonshot/kimi-k2.5"],
candidateConcurrency: 2,
judgeModels: ["openai/gpt-5.4"],
runSuite,
runJudge,
});
expect(maxActiveRuns).toBe(2);
expect(result.runs.map((run) => run.model)).toEqual([
"openai/gpt-5.4",
"anthropic/claude-sonnet-4-6",
"moonshot/kimi-k2.5",
]);
});
it("defaults candidate and judge concurrency to eight", async () => {
let activeRuns = 0;
let maxActiveRuns = 0;
const runSuite = vi.fn(async (params: CharacterRunSuiteParams) => {
activeRuns += 1;
maxActiveRuns = Math.max(maxActiveRuns, activeRuns);
await new Promise((resolve) => setTimeout(resolve, 10));
activeRuns -= 1;
return makeSuiteResult({
outputDir: params.outputDir,
model: params.primaryModel,
transcript: `USER Alice: hi\n\nASSISTANT openclaw: reply from ${params.primaryModel}`,
});
});
let activeJudges = 0;
let maxActiveJudges = 0;
const runJudge = vi.fn(async (_params: CharacterRunJudgeParams) => {
activeJudges += 1;
maxActiveJudges = Math.max(maxActiveJudges, activeJudges);
await new Promise((resolve) => setTimeout(resolve, 10));
activeJudges -= 1;
return JSON.stringify({
rankings: Array.from({ length: 10 }, (_, index) => ({
model: `provider/model-${index + 1}`,
rank: index + 1,
score: 10 - index,
summary: "ok",
})),
});
});
await runQaCharacterEval({
repoRoot: tempRoot,
outputDir: path.join(tempRoot, "character"),
models: Array.from({ length: 10 }, (_, index) => `provider/model-${index + 1}`),
judgeModels: Array.from({ length: 10 }, (_, index) => `judge/model-${index + 1}`),
runSuite,
runJudge,
});
expect(maxActiveRuns).toBe(8);
expect(maxActiveJudges).toBe(8);
});
it("lets explicit candidate thinking override the default panel", async () => {
const runSuite = vi.fn(async (params: CharacterRunSuiteParams) =>
makeSuiteResult({

View File

@@ -20,6 +20,7 @@ const DEFAULT_CHARACTER_EVAL_MODELS = Object.freeze([
"google/gemini-3.1-pro-preview",
]);
const DEFAULT_CHARACTER_THINKING: QaThinkingLevel = "high";
const DEFAULT_CHARACTER_EVAL_CONCURRENCY = 8;
const DEFAULT_CHARACTER_THINKING_BY_MODEL: Readonly<Record<string, QaThinkingLevel>> =
Object.freeze({
"openai/gpt-5.4": "xhigh",
@@ -119,6 +120,8 @@ export type QaCharacterEvalParams = {
judgeThinkingDefault?: QaThinkingLevel;
judgeModelOptions?: Record<string, QaCharacterModelOptions>;
judgeTimeoutMs?: number;
candidateConcurrency?: number;
judgeConcurrency?: number;
runSuite?: RunSuiteFn;
runJudge?: RunJudgeFn;
};
@@ -176,6 +179,35 @@ function sanitizePathPart(value: string) {
return sanitized || "model";
}
/**
 * Clamps a user-supplied concurrency value to a usable positive integer.
 *
 * Returns `fallback` when the value is absent or not a finite number;
 * otherwise floors it and enforces a minimum of 1 so a worker pool always
 * has at least one worker.
 */
function normalizeConcurrency(value: number | undefined, fallback = 1) {
  if (value === undefined || !Number.isFinite(value)) {
    return fallback;
  }
  return Math.max(1, Math.floor(value));
}
/**
 * Runs `mapper` over `items` with at most `concurrency` tasks in flight,
 * returning the results in the original item order.
 *
 * Each worker repeatedly claims the next unprocessed index from a shared
 * cursor. The cursor is advanced synchronously before any `await`, so two
 * workers can never claim the same slot. If any mapper rejects, the whole
 * call rejects via `Promise.all`.
 */
async function mapWithConcurrency<T, U>(
  items: readonly T[],
  concurrency: number,
  mapper: (item: T, index: number) => Promise<U>,
) {
  const results = Array.from<U>({ length: items.length });
  let cursor = 0;
  // Never spin up more workers than there are items.
  const poolSize = Math.min(normalizeConcurrency(concurrency), items.length);
  const drain = async () => {
    for (;;) {
      if (cursor >= items.length) {
        return;
      }
      const claimed = cursor;
      cursor += 1;
      results[claimed] = await mapper(items[claimed], claimed);
    }
  };
  await Promise.all(Array.from({ length: poolSize }, drain));
  return results;
}
function extractTranscript(result: QaSuiteResult) {
const details = result.scenarios.flatMap((scenario) =>
scenario.steps
@@ -194,6 +226,23 @@ function collectTranscriptStats(transcript: string) {
};
}
/**
 * Formats a millisecond duration as a compact human-readable string:
 * "499ms", "1.5s", "42s", "1m", "2m 5s".
 *
 * @param ms - Duration in milliseconds; non-finite or negative values
 *   yield "unknown".
 * @returns The formatted duration string.
 */
function formatDuration(ms: number) {
  if (!Number.isFinite(ms) || ms < 0) {
    return "unknown";
  }
  // Round before choosing a unit so boundary values roll over cleanly
  // (e.g. 999.6ms -> "1s" rather than "1000ms", 59.9s -> "1m" not "60s").
  const wholeMs = Math.round(ms);
  if (wholeMs < 1_000) {
    return `${wholeMs}ms`;
  }
  const seconds = ms / 1_000;
  // One decimal of precision under 10s, whole seconds above.
  const roundedSeconds = seconds >= 10 ? Math.round(seconds) : Number(seconds.toFixed(1));
  if (roundedSeconds < 60) {
    return `${roundedSeconds}s`;
  }
  const totalSeconds = Math.round(ms / 1_000);
  const minutes = Math.floor(totalSeconds / 60);
  const remainder = totalSeconds % 60;
  return remainder === 0 ? `${minutes}m` : `${minutes}m ${remainder}s`;
}
function buildJudgePrompt(params: { scenarioId: string; runs: readonly QaCharacterEvalRun[] }) {
const runBlocks = params.runs
.map(
@@ -327,7 +376,7 @@ function renderCharacterEvalReport(params: {
"",
`- Started: ${params.startedAt.toISOString()}`,
`- Finished: ${params.finishedAt.toISOString()}`,
`- Duration ms: ${params.finishedAt.getTime() - params.startedAt.getTime()}`,
`- Duration: ${formatDuration(params.finishedAt.getTime() - params.startedAt.getTime())}`,
`- Scenario: ${params.scenarioId}`,
"- Execution: local QA gateway child processes, not Docker",
`- Judges: ${params.judgments.map((judgment) => judgment.model).join(", ")}`,
@@ -340,7 +389,7 @@ function renderCharacterEvalReport(params: {
for (const judgment of params.judgments) {
lines.push(`### ${judgment.model}`, "");
lines.push(`- Duration ms: ${judgment.durationMs}`, "");
lines.push(`- Duration: ${formatDuration(judgment.durationMs)}`, "");
if (judgment.rankings.length > 0) {
for (const ranking of judgment.rankings) {
lines.push(
@@ -364,12 +413,12 @@ function renderCharacterEvalReport(params: {
lines.push("## Run Stats", "");
lines.push(
"| Model | Thinking | Fast mode | Status | Duration ms | User turns | Assistant turns | Transcript chars |",
"| Model | Thinking | Fast mode | Status | Duration | User turns | Assistant turns | Transcript chars |",
);
lines.push("| --- | --- | --- | --- | ---: | ---: | ---: | ---: |");
for (const run of params.runs) {
lines.push(
`| ${run.model} | ${run.thinkingDefault} | ${run.fastMode ? "on" : "off"} | ${run.status} | ${run.durationMs} | ${run.stats.userTurns} | ${run.stats.assistantTurns} | ${run.stats.transcriptChars} |`,
`| ${run.model} | ${run.thinkingDefault} | ${run.fastMode ? "on" : "off"} | ${run.status} | ${formatDuration(run.durationMs)} | ${run.stats.userTurns} | ${run.stats.assistantTurns} | ${run.stats.transcriptChars} |`,
);
}
@@ -379,7 +428,7 @@ function renderCharacterEvalReport(params: {
lines.push(`- Status: ${run.status}`);
lines.push(`- Thinking: ${run.thinkingDefault}`);
lines.push(`- Fast mode: ${run.fastMode ? "on" : "off"}`);
lines.push(`- Duration ms: ${run.durationMs}`);
lines.push(`- Duration: ${formatDuration(run.durationMs)}`);
lines.push(`- Report: ${run.reportPath ?? "unavailable"}`);
if (run.error) {
lines.push(`- Error: ${run.error}`);
@@ -408,8 +457,11 @@ export async function runQaCharacterEval(params: QaCharacterEvalParams) {
await fs.mkdir(runsDir, { recursive: true });
const runSuite = params.runSuite ?? runQaSuite;
const runs: QaCharacterEvalRun[] = [];
for (const model of models) {
const candidateConcurrency = normalizeConcurrency(
params.candidateConcurrency,
DEFAULT_CHARACTER_EVAL_CONCURRENCY,
);
const runs = await mapWithConcurrency(models, candidateConcurrency, async (model) => {
const thinkingDefault = resolveCandidateThinkingDefault({
model,
candidateThinkingDefault: params.candidateThinkingDefault,
@@ -438,7 +490,7 @@ export async function runQaCharacterEval(params: QaCharacterEvalParams) {
const status = result.scenarios.some((scenario) => scenario.status === "fail")
? "fail"
: "pass";
runs.push({
return {
model,
status,
durationMs: Date.now() - runStartedAt,
@@ -449,10 +501,10 @@ export async function runQaCharacterEval(params: QaCharacterEvalParams) {
summaryPath: result.summaryPath,
transcript,
stats: collectTranscriptStats(transcript),
});
} satisfies QaCharacterEvalRun;
} catch (error) {
const transcript = "";
runs.push({
return {
model,
status: "fail",
durationMs: Date.now() - runStartedAt,
@@ -462,9 +514,9 @@ export async function runQaCharacterEval(params: QaCharacterEvalParams) {
transcript,
stats: collectTranscriptStats(transcript),
error: formatErrorMessage(error),
});
} satisfies QaCharacterEvalRun;
}
}
});
const judgeModels = normalizeModelRefs(
params.judgeModels && params.judgeModels.length > 0
@@ -474,8 +526,11 @@ export async function runQaCharacterEval(params: QaCharacterEvalParams) {
: DEFAULT_JUDGE_MODELS,
);
const runJudge = params.runJudge ?? defaultRunJudge;
const judgments: QaCharacterEvalJudgeResult[] = [];
for (const judgeModel of judgeModels) {
const judgeConcurrency = normalizeConcurrency(
params.judgeConcurrency,
DEFAULT_CHARACTER_EVAL_CONCURRENCY,
);
const judgments = await mapWithConcurrency(judgeModels, judgeConcurrency, async (judgeModel) => {
const judgeOptions = resolveJudgeOptions({
model: judgeModel,
judgeThinkingDefault: params.judgeThinkingDefault,
@@ -498,15 +553,15 @@ export async function runQaCharacterEval(params: QaCharacterEvalParams) {
judgeError = formatErrorMessage(error);
}
judgments.push({
return {
model: judgeModel,
thinkingDefault: judgeOptions.thinkingDefault,
fastMode: judgeOptions.fastMode,
durationMs: Date.now() - judgeStartedAt,
rankings,
...(judgeError ? { error: judgeError } : {}),
});
}
} satisfies QaCharacterEvalJudgeResult;
});
const finishedAt = new Date();
const report = renderCharacterEvalReport({

View File

@@ -158,6 +158,8 @@ describe("qa cli runtime", () => {
modelThinking: ["codex-cli/test-model=medium"],
judgeModel: ["openai/gpt-5.4,thinking=xhigh,fast", "anthropic/claude-opus-4-6,thinking=high"],
judgeTimeoutMs: 180_000,
concurrency: 4,
judgeConcurrency: 3,
});
expect(runQaCharacterEval).toHaveBeenCalledWith({
@@ -178,6 +180,8 @@ describe("qa cli runtime", () => {
"anthropic/claude-opus-4-6": { thinkingDefault: "high" },
},
judgeTimeoutMs: 180_000,
candidateConcurrency: 4,
judgeConcurrency: 3,
});
});
@@ -199,6 +203,8 @@ describe("qa cli runtime", () => {
judgeModels: undefined,
judgeModelOptions: undefined,
judgeTimeoutMs: undefined,
candidateConcurrency: undefined,
judgeConcurrency: undefined,
});
});

View File

@@ -85,6 +85,16 @@ function parseQaBooleanModelOption(label: string, value: string) {
}
}
/**
 * Validates an optional numeric CLI option that must be a positive integer.
 *
 * @param label - Flag name used in the error message (e.g. "--concurrency").
 * @param value - Parsed numeric value, or undefined when the flag was omitted.
 * @returns The validated integer, or undefined when the flag was omitted.
 * @throws Error when the value is not a positive integer (NaN, Infinity,
 *   zero, negative, or fractional input such as 2.5).
 */
function parseQaPositiveIntegerOption(label: string, value: number | undefined) {
  if (value === undefined) {
    return undefined;
  }
  // Reject fractional values instead of silently flooring them: the error
  // message promises a positive integer, so "2.5" should fail loudly.
  // Number.isInteger also rejects NaN and Infinity.
  if (!Number.isInteger(value) || value < 1) {
    throw new Error(`${label} must be a positive integer`);
  }
  return value;
}
function parseQaModelSpecs(label: string, entries: readonly string[] | undefined) {
const models: string[] = [];
const optionsByModel: Record<string, QaCharacterModelOptions> = {};
@@ -215,6 +225,8 @@ export async function runQaCharacterEvalCommand(opts: {
modelThinking?: string[];
judgeModel?: string[];
judgeTimeoutMs?: number;
concurrency?: number;
judgeConcurrency?: number;
}) {
const repoRoot = path.resolve(opts.repoRoot ?? process.cwd());
const candidates = parseQaModelSpecs("--model", opts.model);
@@ -231,6 +243,8 @@ export async function runQaCharacterEvalCommand(opts: {
judgeModels: judges.models.length > 0 ? judges.models : undefined,
judgeModelOptions: judges.optionsByModel,
judgeTimeoutMs: opts.judgeTimeoutMs,
candidateConcurrency: parseQaPositiveIntegerOption("--concurrency", opts.concurrency),
judgeConcurrency: parseQaPositiveIntegerOption("--judge-concurrency", opts.judgeConcurrency),
});
process.stdout.write(`QA character eval report: ${result.reportPath}\n`);
process.stdout.write(`QA character eval summary: ${result.summaryPath}\n`);

View File

@@ -38,6 +38,8 @@ async function runQaCharacterEval(opts: {
modelThinking?: string[];
judgeModel?: string[];
judgeTimeoutMs?: number;
concurrency?: number;
judgeConcurrency?: number;
}) {
const runtime = await loadQaLabCliRuntime();
await runtime.runQaCharacterEvalCommand(opts);
@@ -197,6 +199,12 @@ export function registerQaLabCli(program: Command) {
.option("--judge-timeout-ms <ms>", "Override judge wait timeout", (value: string) =>
Number(value),
)
.option("--concurrency <count>", "Candidate model run concurrency", (value: string) =>
Number(value),
)
.option("--judge-concurrency <count>", "Judge model run concurrency", (value: string) =>
Number(value),
)
.action(
async (opts: {
repoRoot?: string;
@@ -208,6 +216,8 @@ export function registerQaLabCli(program: Command) {
modelThinking?: string[];
judgeModel?: string[];
judgeTimeoutMs?: number;
concurrency?: number;
judgeConcurrency?: number;
}) => {
await runQaCharacterEval(opts);
},