feat: parallelize character eval runs

This commit is contained in:
Peter Steinberger
2026-04-08 20:05:24 +01:00
parent f1e75d3259
commit 21ef1bf8de
8 changed files with 219 additions and 56 deletions

View File

@@ -115,7 +115,8 @@ describe("runQaCharacterEval", () => {
expect(report).toContain("reply from codex-cli/test-model");
expect(report).toContain("Judge thinking: xhigh");
expect(report).toContain("Fast mode: on");
expect(report).toContain("Duration ms:");
expect(report).toContain("Duration:");
expect(report).not.toContain("Duration ms:");
expect(report).not.toContain("Judge Raw Reply");
});
@@ -201,6 +202,92 @@ describe("runQaCharacterEval", () => {
expect(runJudge.mock.calls.map(([params]) => params.judgeFastMode)).toEqual([true, false]);
});
it("runs candidate models with bounded concurrency while preserving result order", async () => {
let activeRuns = 0;
let maxActiveRuns = 0;
const runSuite = vi.fn(async (params: CharacterRunSuiteParams) => {
activeRuns += 1;
maxActiveRuns = Math.max(maxActiveRuns, activeRuns);
await new Promise((resolve) => setTimeout(resolve, 10));
activeRuns -= 1;
return makeSuiteResult({
outputDir: params.outputDir,
model: params.primaryModel,
transcript: `USER Alice: hi\n\nASSISTANT openclaw: reply from ${params.primaryModel}`,
});
});
const runJudge = vi.fn(async (_params: CharacterRunJudgeParams) =>
JSON.stringify({
rankings: [
{ model: "openai/gpt-5.4", rank: 1, score: 8, summary: "ok" },
{ model: "anthropic/claude-sonnet-4-6", rank: 2, score: 7, summary: "ok" },
{ model: "moonshot/kimi-k2.5", rank: 3, score: 6, summary: "ok" },
],
}),
);
const result = await runQaCharacterEval({
repoRoot: tempRoot,
outputDir: path.join(tempRoot, "character"),
models: ["openai/gpt-5.4", "anthropic/claude-sonnet-4-6", "moonshot/kimi-k2.5"],
candidateConcurrency: 2,
judgeModels: ["openai/gpt-5.4"],
runSuite,
runJudge,
});
expect(maxActiveRuns).toBe(2);
expect(result.runs.map((run) => run.model)).toEqual([
"openai/gpt-5.4",
"anthropic/claude-sonnet-4-6",
"moonshot/kimi-k2.5",
]);
});
it("defaults candidate and judge concurrency to eight", async () => {
let activeRuns = 0;
let maxActiveRuns = 0;
const runSuite = vi.fn(async (params: CharacterRunSuiteParams) => {
activeRuns += 1;
maxActiveRuns = Math.max(maxActiveRuns, activeRuns);
await new Promise((resolve) => setTimeout(resolve, 10));
activeRuns -= 1;
return makeSuiteResult({
outputDir: params.outputDir,
model: params.primaryModel,
transcript: `USER Alice: hi\n\nASSISTANT openclaw: reply from ${params.primaryModel}`,
});
});
let activeJudges = 0;
let maxActiveJudges = 0;
const runJudge = vi.fn(async (_params: CharacterRunJudgeParams) => {
activeJudges += 1;
maxActiveJudges = Math.max(maxActiveJudges, activeJudges);
await new Promise((resolve) => setTimeout(resolve, 10));
activeJudges -= 1;
return JSON.stringify({
rankings: Array.from({ length: 10 }, (_, index) => ({
model: `provider/model-${index + 1}`,
rank: index + 1,
score: 10 - index,
summary: "ok",
})),
});
});
await runQaCharacterEval({
repoRoot: tempRoot,
outputDir: path.join(tempRoot, "character"),
models: Array.from({ length: 10 }, (_, index) => `provider/model-${index + 1}`),
judgeModels: Array.from({ length: 10 }, (_, index) => `judge/model-${index + 1}`),
runSuite,
runJudge,
});
expect(maxActiveRuns).toBe(8);
expect(maxActiveJudges).toBe(8);
});
it("lets explicit candidate thinking override the default panel", async () => {
const runSuite = vi.fn(async (params: CharacterRunSuiteParams) =>
makeSuiteResult({

View File

@@ -20,6 +20,7 @@ const DEFAULT_CHARACTER_EVAL_MODELS = Object.freeze([
"google/gemini-3.1-pro-preview",
]);
const DEFAULT_CHARACTER_THINKING: QaThinkingLevel = "high";
const DEFAULT_CHARACTER_EVAL_CONCURRENCY = 8;
const DEFAULT_CHARACTER_THINKING_BY_MODEL: Readonly<Record<string, QaThinkingLevel>> =
Object.freeze({
"openai/gpt-5.4": "xhigh",
@@ -119,6 +120,8 @@ export type QaCharacterEvalParams = {
judgeThinkingDefault?: QaThinkingLevel;
judgeModelOptions?: Record<string, QaCharacterModelOptions>;
judgeTimeoutMs?: number;
candidateConcurrency?: number;
judgeConcurrency?: number;
runSuite?: RunSuiteFn;
runJudge?: RunJudgeFn;
};
@@ -176,6 +179,35 @@ function sanitizePathPart(value: string) {
return sanitized || "model";
}
/**
 * Clamps a user-supplied concurrency value to a usable positive integer.
 *
 * Returns `fallback` when the value is absent or not a finite number;
 * otherwise floors it and enforces a minimum of 1 so a worker pool always
 * has at least one worker.
 */
function normalizeConcurrency(value: number | undefined, fallback = 1) {
  if (value === undefined || !Number.isFinite(value)) {
    return fallback;
  }
  return Math.max(1, Math.floor(value));
}
/**
 * Runs `mapper` over `items` with at most `concurrency` tasks in flight,
 * returning the results in the original item order.
 *
 * Each worker repeatedly claims the next unprocessed index from a shared
 * cursor. The cursor is advanced synchronously before any `await`, so two
 * workers can never claim the same slot. If any mapper rejects, the whole
 * call rejects via `Promise.all`.
 */
async function mapWithConcurrency<T, U>(
  items: readonly T[],
  concurrency: number,
  mapper: (item: T, index: number) => Promise<U>,
) {
  const results = Array.from<U>({ length: items.length });
  let cursor = 0;
  // Never spin up more workers than there are items.
  const poolSize = Math.min(normalizeConcurrency(concurrency), items.length);
  const drain = async () => {
    for (;;) {
      if (cursor >= items.length) {
        return;
      }
      const claimed = cursor;
      cursor += 1;
      results[claimed] = await mapper(items[claimed], claimed);
    }
  };
  await Promise.all(Array.from({ length: poolSize }, drain));
  return results;
}
function extractTranscript(result: QaSuiteResult) {
const details = result.scenarios.flatMap((scenario) =>
scenario.steps
@@ -194,6 +226,23 @@ function collectTranscriptStats(transcript: string) {
};
}
/**
 * Formats a millisecond duration as a compact human-readable string:
 * "499ms", "1.5s", "42s", "1m", "2m 5s".
 *
 * @param ms - Duration in milliseconds; non-finite or negative values
 *   yield "unknown".
 * @returns The formatted duration string.
 */
function formatDuration(ms: number) {
  if (!Number.isFinite(ms) || ms < 0) {
    return "unknown";
  }
  // Round before choosing a unit so boundary values roll over cleanly
  // (e.g. 999.6ms -> "1s" rather than "1000ms", 59.9s -> "1m" not "60s").
  const wholeMs = Math.round(ms);
  if (wholeMs < 1_000) {
    return `${wholeMs}ms`;
  }
  const seconds = ms / 1_000;
  // One decimal of precision under 10s, whole seconds above.
  const roundedSeconds = seconds >= 10 ? Math.round(seconds) : Number(seconds.toFixed(1));
  if (roundedSeconds < 60) {
    return `${roundedSeconds}s`;
  }
  const totalSeconds = Math.round(ms / 1_000);
  const minutes = Math.floor(totalSeconds / 60);
  const remainder = totalSeconds % 60;
  return remainder === 0 ? `${minutes}m` : `${minutes}m ${remainder}s`;
}
function buildJudgePrompt(params: { scenarioId: string; runs: readonly QaCharacterEvalRun[] }) {
const runBlocks = params.runs
.map(
@@ -327,7 +376,7 @@ function renderCharacterEvalReport(params: {
"",
`- Started: ${params.startedAt.toISOString()}`,
`- Finished: ${params.finishedAt.toISOString()}`,
`- Duration ms: ${params.finishedAt.getTime() - params.startedAt.getTime()}`,
`- Duration: ${formatDuration(params.finishedAt.getTime() - params.startedAt.getTime())}`,
`- Scenario: ${params.scenarioId}`,
"- Execution: local QA gateway child processes, not Docker",
`- Judges: ${params.judgments.map((judgment) => judgment.model).join(", ")}`,
@@ -340,7 +389,7 @@ function renderCharacterEvalReport(params: {
for (const judgment of params.judgments) {
lines.push(`### ${judgment.model}`, "");
lines.push(`- Duration ms: ${judgment.durationMs}`, "");
lines.push(`- Duration: ${formatDuration(judgment.durationMs)}`, "");
if (judgment.rankings.length > 0) {
for (const ranking of judgment.rankings) {
lines.push(
@@ -364,12 +413,12 @@ function renderCharacterEvalReport(params: {
lines.push("## Run Stats", "");
lines.push(
"| Model | Thinking | Fast mode | Status | Duration ms | User turns | Assistant turns | Transcript chars |",
"| Model | Thinking | Fast mode | Status | Duration | User turns | Assistant turns | Transcript chars |",
);
lines.push("| --- | --- | --- | --- | ---: | ---: | ---: | ---: |");
for (const run of params.runs) {
lines.push(
`| ${run.model} | ${run.thinkingDefault} | ${run.fastMode ? "on" : "off"} | ${run.status} | ${run.durationMs} | ${run.stats.userTurns} | ${run.stats.assistantTurns} | ${run.stats.transcriptChars} |`,
`| ${run.model} | ${run.thinkingDefault} | ${run.fastMode ? "on" : "off"} | ${run.status} | ${formatDuration(run.durationMs)} | ${run.stats.userTurns} | ${run.stats.assistantTurns} | ${run.stats.transcriptChars} |`,
);
}
@@ -379,7 +428,7 @@ function renderCharacterEvalReport(params: {
lines.push(`- Status: ${run.status}`);
lines.push(`- Thinking: ${run.thinkingDefault}`);
lines.push(`- Fast mode: ${run.fastMode ? "on" : "off"}`);
lines.push(`- Duration ms: ${run.durationMs}`);
lines.push(`- Duration: ${formatDuration(run.durationMs)}`);
lines.push(`- Report: ${run.reportPath ?? "unavailable"}`);
if (run.error) {
lines.push(`- Error: ${run.error}`);
@@ -408,8 +457,11 @@ export async function runQaCharacterEval(params: QaCharacterEvalParams) {
await fs.mkdir(runsDir, { recursive: true });
const runSuite = params.runSuite ?? runQaSuite;
const runs: QaCharacterEvalRun[] = [];
for (const model of models) {
const candidateConcurrency = normalizeConcurrency(
params.candidateConcurrency,
DEFAULT_CHARACTER_EVAL_CONCURRENCY,
);
const runs = await mapWithConcurrency(models, candidateConcurrency, async (model) => {
const thinkingDefault = resolveCandidateThinkingDefault({
model,
candidateThinkingDefault: params.candidateThinkingDefault,
@@ -438,7 +490,7 @@ export async function runQaCharacterEval(params: QaCharacterEvalParams) {
const status = result.scenarios.some((scenario) => scenario.status === "fail")
? "fail"
: "pass";
runs.push({
return {
model,
status,
durationMs: Date.now() - runStartedAt,
@@ -449,10 +501,10 @@ export async function runQaCharacterEval(params: QaCharacterEvalParams) {
summaryPath: result.summaryPath,
transcript,
stats: collectTranscriptStats(transcript),
});
} satisfies QaCharacterEvalRun;
} catch (error) {
const transcript = "";
runs.push({
return {
model,
status: "fail",
durationMs: Date.now() - runStartedAt,
@@ -462,9 +514,9 @@ export async function runQaCharacterEval(params: QaCharacterEvalParams) {
transcript,
stats: collectTranscriptStats(transcript),
error: formatErrorMessage(error),
});
} satisfies QaCharacterEvalRun;
}
}
});
const judgeModels = normalizeModelRefs(
params.judgeModels && params.judgeModels.length > 0
@@ -474,8 +526,11 @@ export async function runQaCharacterEval(params: QaCharacterEvalParams) {
: DEFAULT_JUDGE_MODELS,
);
const runJudge = params.runJudge ?? defaultRunJudge;
const judgments: QaCharacterEvalJudgeResult[] = [];
for (const judgeModel of judgeModels) {
const judgeConcurrency = normalizeConcurrency(
params.judgeConcurrency,
DEFAULT_CHARACTER_EVAL_CONCURRENCY,
);
const judgments = await mapWithConcurrency(judgeModels, judgeConcurrency, async (judgeModel) => {
const judgeOptions = resolveJudgeOptions({
model: judgeModel,
judgeThinkingDefault: params.judgeThinkingDefault,
@@ -498,15 +553,15 @@ export async function runQaCharacterEval(params: QaCharacterEvalParams) {
judgeError = formatErrorMessage(error);
}
judgments.push({
return {
model: judgeModel,
thinkingDefault: judgeOptions.thinkingDefault,
fastMode: judgeOptions.fastMode,
durationMs: Date.now() - judgeStartedAt,
rankings,
...(judgeError ? { error: judgeError } : {}),
});
}
} satisfies QaCharacterEvalJudgeResult;
});
const finishedAt = new Date();
const report = renderCharacterEvalReport({

View File

@@ -158,6 +158,8 @@ describe("qa cli runtime", () => {
modelThinking: ["codex-cli/test-model=medium"],
judgeModel: ["openai/gpt-5.4,thinking=xhigh,fast", "anthropic/claude-opus-4-6,thinking=high"],
judgeTimeoutMs: 180_000,
concurrency: 4,
judgeConcurrency: 3,
});
expect(runQaCharacterEval).toHaveBeenCalledWith({
@@ -178,6 +180,8 @@ describe("qa cli runtime", () => {
"anthropic/claude-opus-4-6": { thinkingDefault: "high" },
},
judgeTimeoutMs: 180_000,
candidateConcurrency: 4,
judgeConcurrency: 3,
});
});
@@ -199,6 +203,8 @@ describe("qa cli runtime", () => {
judgeModels: undefined,
judgeModelOptions: undefined,
judgeTimeoutMs: undefined,
candidateConcurrency: undefined,
judgeConcurrency: undefined,
});
});

View File

@@ -85,6 +85,16 @@ function parseQaBooleanModelOption(label: string, value: string) {
}
}
/**
 * Validates an optional numeric CLI option that must be a positive integer.
 *
 * @param label - Flag name used in the error message (e.g. "--concurrency").
 * @param value - Parsed numeric value, or undefined when the flag was omitted.
 * @returns The validated integer, or undefined when the flag was omitted.
 * @throws Error when the value is not a positive integer (NaN, Infinity,
 *   zero, negative, or fractional input such as 2.5).
 */
function parseQaPositiveIntegerOption(label: string, value: number | undefined) {
  if (value === undefined) {
    return undefined;
  }
  // Reject fractional values instead of silently flooring them: the error
  // message promises a positive integer, so "2.5" should fail loudly.
  // Number.isInteger also rejects NaN and Infinity.
  if (!Number.isInteger(value) || value < 1) {
    throw new Error(`${label} must be a positive integer`);
  }
  return value;
}
function parseQaModelSpecs(label: string, entries: readonly string[] | undefined) {
const models: string[] = [];
const optionsByModel: Record<string, QaCharacterModelOptions> = {};
@@ -215,6 +225,8 @@ export async function runQaCharacterEvalCommand(opts: {
modelThinking?: string[];
judgeModel?: string[];
judgeTimeoutMs?: number;
concurrency?: number;
judgeConcurrency?: number;
}) {
const repoRoot = path.resolve(opts.repoRoot ?? process.cwd());
const candidates = parseQaModelSpecs("--model", opts.model);
@@ -231,6 +243,8 @@ export async function runQaCharacterEvalCommand(opts: {
judgeModels: judges.models.length > 0 ? judges.models : undefined,
judgeModelOptions: judges.optionsByModel,
judgeTimeoutMs: opts.judgeTimeoutMs,
candidateConcurrency: parseQaPositiveIntegerOption("--concurrency", opts.concurrency),
judgeConcurrency: parseQaPositiveIntegerOption("--judge-concurrency", opts.judgeConcurrency),
});
process.stdout.write(`QA character eval report: ${result.reportPath}\n`);
process.stdout.write(`QA character eval summary: ${result.summaryPath}\n`);

View File

@@ -38,6 +38,8 @@ async function runQaCharacterEval(opts: {
modelThinking?: string[];
judgeModel?: string[];
judgeTimeoutMs?: number;
concurrency?: number;
judgeConcurrency?: number;
}) {
const runtime = await loadQaLabCliRuntime();
await runtime.runQaCharacterEvalCommand(opts);
@@ -197,6 +199,12 @@ export function registerQaLabCli(program: Command) {
.option("--judge-timeout-ms <ms>", "Override judge wait timeout", (value: string) =>
Number(value),
)
.option("--concurrency <count>", "Candidate model run concurrency", (value: string) =>
Number(value),
)
.option("--judge-concurrency <count>", "Judge model run concurrency", (value: string) =>
Number(value),
)
.action(
async (opts: {
repoRoot?: string;
@@ -208,6 +216,8 @@ export function registerQaLabCli(program: Command) {
modelThinking?: string[];
judgeModel?: string[];
judgeTimeoutMs?: number;
concurrency?: number;
judgeConcurrency?: number;
}) => {
await runQaCharacterEval(opts);
},