feat: parallelize character eval runs

This commit is contained in:
Peter Steinberger
2026-04-08 20:05:24 +01:00
parent f1e75d3259
commit 21ef1bf8de
8 changed files with 219 additions and 56 deletions

View File

@@ -68,6 +68,8 @@ pnpm openclaw qa character-eval \
--model codex-cli/<codex-model>,thinking=high \
--judge-model openai/gpt-5.4,thinking=xhigh,fast \
--judge-model anthropic/claude-opus-4-6,thinking=high \
--concurrency 8 \
--judge-concurrency 8 \
--output-dir .artifacts/qa-e2e/character-eval-<tag>
```
@@ -79,6 +81,7 @@ pnpm openclaw qa character-eval \
- OpenAI candidate refs default to fast mode so priority processing is used where supported. Use inline `,fast`, `,no-fast`, or `,fast=false` for one model; use `--fast` only to force fast mode for every candidate.
- Judges default to `openai/gpt-5.4,thinking=xhigh,fast` and `anthropic/claude-opus-4-6,thinking=high`.
- Report includes judge ranking, run stats, durations, and full transcripts; do not include raw judge replies. Duration is benchmark context, not a grading signal.
- Candidate and judge concurrency default to 8. Use `--concurrency <n>` and `--judge-concurrency <n>` to override when local gateways or provider limits need a gentler lane.
- Scenario source should stay markdown-driven under `qa/scenarios/`.
- For isolated character/persona evals, write the persona into `SOUL.md` and blank `IDENTITY.md` in the scenario flow. Use `SOUL.md + IDENTITY.md` only when intentionally testing how the normal OpenClaw identity combines with the character.
- Keep prompts natural and task-shaped. The candidate model should receive character setup through `SOUL.md`, then normal user turns such as chat, workspace help, and small file tasks; do not ask "how would you react?" or tell the model it is in an eval.

View File

@@ -98,7 +98,9 @@ pnpm openclaw qa character-eval \
--model xiaomi/mimo-v2-pro,thinking=high \
--model google/gemini-3.1-pro-preview,thinking=high \
--judge-model openai/gpt-5.4,thinking=xhigh,fast \
--judge-model anthropic/claude-opus-4-6,thinking=high
--judge-model anthropic/claude-opus-4-6,thinking=high \
--concurrency 8 \
--judge-concurrency 8
```
The command runs local QA gateway child processes, not Docker. Character eval
@@ -118,6 +120,9 @@ single candidate or judge needs an override. Pass `--fast` only when you want to
force fast mode on for every candidate model. Candidate and judge durations are
recorded in the report for benchmark analysis, but judge prompts explicitly say
not to rank by speed.
Candidate and judge model runs both default to concurrency 8. Lower
`--concurrency` or `--judge-concurrency` when provider limits or local gateway
pressure make a run too noisy.
When no candidate `--model` is passed, the character eval defaults to
`openai/gpt-5.4`, `openai/gpt-5.2`, `anthropic/claude-opus-4-6`,
`anthropic/claude-sonnet-4-6`, `minimax/MiniMax-M2.7`, `zai/glm-5.1`,

View File

@@ -115,7 +115,8 @@ describe("runQaCharacterEval", () => {
expect(report).toContain("reply from codex-cli/test-model");
expect(report).toContain("Judge thinking: xhigh");
expect(report).toContain("Fast mode: on");
expect(report).toContain("Duration ms:");
expect(report).toContain("Duration:");
expect(report).not.toContain("Duration ms:");
expect(report).not.toContain("Judge Raw Reply");
});
@@ -201,6 +202,92 @@ describe("runQaCharacterEval", () => {
expect(runJudge.mock.calls.map(([params]) => params.judgeFastMode)).toEqual([true, false]);
});
it("runs candidate models with bounded concurrency while preserving result order", async () => {
let activeRuns = 0;
let maxActiveRuns = 0;
const runSuite = vi.fn(async (params: CharacterRunSuiteParams) => {
activeRuns += 1;
maxActiveRuns = Math.max(maxActiveRuns, activeRuns);
await new Promise((resolve) => setTimeout(resolve, 10));
activeRuns -= 1;
return makeSuiteResult({
outputDir: params.outputDir,
model: params.primaryModel,
transcript: `USER Alice: hi\n\nASSISTANT openclaw: reply from ${params.primaryModel}`,
});
});
const runJudge = vi.fn(async (_params: CharacterRunJudgeParams) =>
JSON.stringify({
rankings: [
{ model: "openai/gpt-5.4", rank: 1, score: 8, summary: "ok" },
{ model: "anthropic/claude-sonnet-4-6", rank: 2, score: 7, summary: "ok" },
{ model: "moonshot/kimi-k2.5", rank: 3, score: 6, summary: "ok" },
],
}),
);
const result = await runQaCharacterEval({
repoRoot: tempRoot,
outputDir: path.join(tempRoot, "character"),
models: ["openai/gpt-5.4", "anthropic/claude-sonnet-4-6", "moonshot/kimi-k2.5"],
candidateConcurrency: 2,
judgeModels: ["openai/gpt-5.4"],
runSuite,
runJudge,
});
expect(maxActiveRuns).toBe(2);
expect(result.runs.map((run) => run.model)).toEqual([
"openai/gpt-5.4",
"anthropic/claude-sonnet-4-6",
"moonshot/kimi-k2.5",
]);
});
it("defaults candidate and judge concurrency to eight", async () => {
let activeRuns = 0;
let maxActiveRuns = 0;
const runSuite = vi.fn(async (params: CharacterRunSuiteParams) => {
activeRuns += 1;
maxActiveRuns = Math.max(maxActiveRuns, activeRuns);
await new Promise((resolve) => setTimeout(resolve, 10));
activeRuns -= 1;
return makeSuiteResult({
outputDir: params.outputDir,
model: params.primaryModel,
transcript: `USER Alice: hi\n\nASSISTANT openclaw: reply from ${params.primaryModel}`,
});
});
let activeJudges = 0;
let maxActiveJudges = 0;
const runJudge = vi.fn(async (_params: CharacterRunJudgeParams) => {
activeJudges += 1;
maxActiveJudges = Math.max(maxActiveJudges, activeJudges);
await new Promise((resolve) => setTimeout(resolve, 10));
activeJudges -= 1;
return JSON.stringify({
rankings: Array.from({ length: 10 }, (_, index) => ({
model: `provider/model-${index + 1}`,
rank: index + 1,
score: 10 - index,
summary: "ok",
})),
});
});
await runQaCharacterEval({
repoRoot: tempRoot,
outputDir: path.join(tempRoot, "character"),
models: Array.from({ length: 10 }, (_, index) => `provider/model-${index + 1}`),
judgeModels: Array.from({ length: 10 }, (_, index) => `judge/model-${index + 1}`),
runSuite,
runJudge,
});
expect(maxActiveRuns).toBe(8);
expect(maxActiveJudges).toBe(8);
});
it("lets explicit candidate thinking override the default panel", async () => {
const runSuite = vi.fn(async (params: CharacterRunSuiteParams) =>
makeSuiteResult({

View File

@@ -20,6 +20,7 @@ const DEFAULT_CHARACTER_EVAL_MODELS = Object.freeze([
"google/gemini-3.1-pro-preview",
]);
const DEFAULT_CHARACTER_THINKING: QaThinkingLevel = "high";
const DEFAULT_CHARACTER_EVAL_CONCURRENCY = 8;
const DEFAULT_CHARACTER_THINKING_BY_MODEL: Readonly<Record<string, QaThinkingLevel>> =
Object.freeze({
"openai/gpt-5.4": "xhigh",
@@ -119,6 +120,8 @@ export type QaCharacterEvalParams = {
judgeThinkingDefault?: QaThinkingLevel;
judgeModelOptions?: Record<string, QaCharacterModelOptions>;
judgeTimeoutMs?: number;
candidateConcurrency?: number;
judgeConcurrency?: number;
runSuite?: RunSuiteFn;
runJudge?: RunJudgeFn;
};
@@ -176,6 +179,35 @@ function sanitizePathPart(value: string) {
return sanitized || "model";
}
/**
 * Coerce an optional concurrency setting into a usable worker count.
 * Missing or non-finite values (NaN, ±Infinity) fall back to `fallback`;
 * any other number is floored and clamped to at least 1.
 */
function normalizeConcurrency(value: number | undefined, fallback = 1) {
  if (value === undefined || !Number.isFinite(value)) {
    return fallback;
  }
  const floored = Math.floor(value);
  return floored < 1 ? 1 : floored;
}
/**
 * Map `mapper` over `items` with at most `concurrency` tasks in flight,
 * resolving to the mapped results in the original item order.
 */
async function mapWithConcurrency<T, U>(
  items: readonly T[],
  concurrency: number,
  mapper: (item: T, index: number) => Promise<U>,
) {
  const results = new Array<U>(items.length);
  // Shared cursor: each worker claims the next unclaimed index. The
  // synchronous read-then-increment is safe on the single-threaded event loop.
  let cursor = 0;
  const drainQueue = async () => {
    while (cursor < items.length) {
      const index = cursor;
      cursor += 1;
      results[index] = await mapper(items[index], index);
    }
  };
  // Never spawn more workers than there are items (zero items => no workers).
  const laneCount = Math.min(normalizeConcurrency(concurrency), items.length);
  await Promise.all(Array.from({ length: laneCount }, drainQueue));
  return results;
}
function extractTranscript(result: QaSuiteResult) {
const details = result.scenarios.flatMap((scenario) =>
scenario.steps
@@ -194,6 +226,23 @@ function collectTranscriptStats(transcript: string) {
};
}
/**
 * Render a millisecond duration as a compact human-readable string:
 * "750ms", "1.5s", "12s", "1m", "1m 30s". Negative or non-finite input
 * (NaN, ±Infinity) renders as "unknown".
 */
function formatDuration(ms: number) {
  if (!Number.isFinite(ms) || ms < 0) {
    return "unknown";
  }
  // Round before classifying the unit so boundary values promote cleanly:
  // 999.6ms -> "1s" (not "1000ms") and 59_900ms -> "1m" (not "60s").
  const wholeMs = Math.round(ms);
  if (wholeMs < 1_000) {
    return `${wholeMs}ms`;
  }
  const totalSeconds = Math.round(ms / 1_000);
  if (totalSeconds < 60) {
    const seconds = ms / 1_000;
    // Keep one decimal place under 10s; whole seconds read better above that.
    return `${seconds >= 10 ? totalSeconds : Number(seconds.toFixed(1))}s`;
  }
  const minutes = Math.floor(totalSeconds / 60);
  const remainder = totalSeconds % 60;
  return remainder === 0 ? `${minutes}m` : `${minutes}m ${remainder}s`;
}
function buildJudgePrompt(params: { scenarioId: string; runs: readonly QaCharacterEvalRun[] }) {
const runBlocks = params.runs
.map(
@@ -327,7 +376,7 @@ function renderCharacterEvalReport(params: {
"",
`- Started: ${params.startedAt.toISOString()}`,
`- Finished: ${params.finishedAt.toISOString()}`,
`- Duration ms: ${params.finishedAt.getTime() - params.startedAt.getTime()}`,
`- Duration: ${formatDuration(params.finishedAt.getTime() - params.startedAt.getTime())}`,
`- Scenario: ${params.scenarioId}`,
"- Execution: local QA gateway child processes, not Docker",
`- Judges: ${params.judgments.map((judgment) => judgment.model).join(", ")}`,
@@ -340,7 +389,7 @@ function renderCharacterEvalReport(params: {
for (const judgment of params.judgments) {
lines.push(`### ${judgment.model}`, "");
lines.push(`- Duration ms: ${judgment.durationMs}`, "");
lines.push(`- Duration: ${formatDuration(judgment.durationMs)}`, "");
if (judgment.rankings.length > 0) {
for (const ranking of judgment.rankings) {
lines.push(
@@ -364,12 +413,12 @@ function renderCharacterEvalReport(params: {
lines.push("## Run Stats", "");
lines.push(
"| Model | Thinking | Fast mode | Status | Duration ms | User turns | Assistant turns | Transcript chars |",
"| Model | Thinking | Fast mode | Status | Duration | User turns | Assistant turns | Transcript chars |",
);
lines.push("| --- | --- | --- | --- | ---: | ---: | ---: | ---: |");
for (const run of params.runs) {
lines.push(
`| ${run.model} | ${run.thinkingDefault} | ${run.fastMode ? "on" : "off"} | ${run.status} | ${run.durationMs} | ${run.stats.userTurns} | ${run.stats.assistantTurns} | ${run.stats.transcriptChars} |`,
`| ${run.model} | ${run.thinkingDefault} | ${run.fastMode ? "on" : "off"} | ${run.status} | ${formatDuration(run.durationMs)} | ${run.stats.userTurns} | ${run.stats.assistantTurns} | ${run.stats.transcriptChars} |`,
);
}
@@ -379,7 +428,7 @@ function renderCharacterEvalReport(params: {
lines.push(`- Status: ${run.status}`);
lines.push(`- Thinking: ${run.thinkingDefault}`);
lines.push(`- Fast mode: ${run.fastMode ? "on" : "off"}`);
lines.push(`- Duration ms: ${run.durationMs}`);
lines.push(`- Duration: ${formatDuration(run.durationMs)}`);
lines.push(`- Report: ${run.reportPath ?? "unavailable"}`);
if (run.error) {
lines.push(`- Error: ${run.error}`);
@@ -408,8 +457,11 @@ export async function runQaCharacterEval(params: QaCharacterEvalParams) {
await fs.mkdir(runsDir, { recursive: true });
const runSuite = params.runSuite ?? runQaSuite;
const runs: QaCharacterEvalRun[] = [];
for (const model of models) {
const candidateConcurrency = normalizeConcurrency(
params.candidateConcurrency,
DEFAULT_CHARACTER_EVAL_CONCURRENCY,
);
const runs = await mapWithConcurrency(models, candidateConcurrency, async (model) => {
const thinkingDefault = resolveCandidateThinkingDefault({
model,
candidateThinkingDefault: params.candidateThinkingDefault,
@@ -438,7 +490,7 @@ export async function runQaCharacterEval(params: QaCharacterEvalParams) {
const status = result.scenarios.some((scenario) => scenario.status === "fail")
? "fail"
: "pass";
runs.push({
return {
model,
status,
durationMs: Date.now() - runStartedAt,
@@ -449,10 +501,10 @@ export async function runQaCharacterEval(params: QaCharacterEvalParams) {
summaryPath: result.summaryPath,
transcript,
stats: collectTranscriptStats(transcript),
});
} satisfies QaCharacterEvalRun;
} catch (error) {
const transcript = "";
runs.push({
return {
model,
status: "fail",
durationMs: Date.now() - runStartedAt,
@@ -462,9 +514,9 @@ export async function runQaCharacterEval(params: QaCharacterEvalParams) {
transcript,
stats: collectTranscriptStats(transcript),
error: formatErrorMessage(error),
});
} satisfies QaCharacterEvalRun;
}
}
});
const judgeModels = normalizeModelRefs(
params.judgeModels && params.judgeModels.length > 0
@@ -474,8 +526,11 @@ export async function runQaCharacterEval(params: QaCharacterEvalParams) {
: DEFAULT_JUDGE_MODELS,
);
const runJudge = params.runJudge ?? defaultRunJudge;
const judgments: QaCharacterEvalJudgeResult[] = [];
for (const judgeModel of judgeModels) {
const judgeConcurrency = normalizeConcurrency(
params.judgeConcurrency,
DEFAULT_CHARACTER_EVAL_CONCURRENCY,
);
const judgments = await mapWithConcurrency(judgeModels, judgeConcurrency, async (judgeModel) => {
const judgeOptions = resolveJudgeOptions({
model: judgeModel,
judgeThinkingDefault: params.judgeThinkingDefault,
@@ -498,15 +553,15 @@ export async function runQaCharacterEval(params: QaCharacterEvalParams) {
judgeError = formatErrorMessage(error);
}
judgments.push({
return {
model: judgeModel,
thinkingDefault: judgeOptions.thinkingDefault,
fastMode: judgeOptions.fastMode,
durationMs: Date.now() - judgeStartedAt,
rankings,
...(judgeError ? { error: judgeError } : {}),
});
}
} satisfies QaCharacterEvalJudgeResult;
});
const finishedAt = new Date();
const report = renderCharacterEvalReport({

View File

@@ -158,6 +158,8 @@ describe("qa cli runtime", () => {
modelThinking: ["codex-cli/test-model=medium"],
judgeModel: ["openai/gpt-5.4,thinking=xhigh,fast", "anthropic/claude-opus-4-6,thinking=high"],
judgeTimeoutMs: 180_000,
concurrency: 4,
judgeConcurrency: 3,
});
expect(runQaCharacterEval).toHaveBeenCalledWith({
@@ -178,6 +180,8 @@ describe("qa cli runtime", () => {
"anthropic/claude-opus-4-6": { thinkingDefault: "high" },
},
judgeTimeoutMs: 180_000,
candidateConcurrency: 4,
judgeConcurrency: 3,
});
});
@@ -199,6 +203,8 @@ describe("qa cli runtime", () => {
judgeModels: undefined,
judgeModelOptions: undefined,
judgeTimeoutMs: undefined,
candidateConcurrency: undefined,
judgeConcurrency: undefined,
});
});

View File

@@ -85,6 +85,16 @@ function parseQaBooleanModelOption(label: string, value: string) {
}
}
/**
 * Validate an optional numeric CLI option as a positive integer.
 *
 * @param label - Flag name used in the error message (e.g. "--concurrency").
 * @param value - Parsed numeric value, or undefined when the flag was omitted.
 * @returns The validated integer, or undefined when the flag was omitted.
 * @throws Error when the value is not a positive integer (NaN, ±Infinity,
 *   zero, negative, or fractional input such as `--concurrency 2.5`).
 */
function parseQaPositiveIntegerOption(label: string, value: number | undefined) {
  if (value === undefined) {
    return undefined;
  }
  // Reject fractional input outright instead of silently flooring it, so the
  // behavior matches the "must be a positive integer" contract in the message.
  // Number.isInteger also rules out NaN and ±Infinity.
  if (!Number.isInteger(value) || value < 1) {
    throw new Error(`${label} must be a positive integer`);
  }
  return value;
}
function parseQaModelSpecs(label: string, entries: readonly string[] | undefined) {
const models: string[] = [];
const optionsByModel: Record<string, QaCharacterModelOptions> = {};
@@ -215,6 +225,8 @@ export async function runQaCharacterEvalCommand(opts: {
modelThinking?: string[];
judgeModel?: string[];
judgeTimeoutMs?: number;
concurrency?: number;
judgeConcurrency?: number;
}) {
const repoRoot = path.resolve(opts.repoRoot ?? process.cwd());
const candidates = parseQaModelSpecs("--model", opts.model);
@@ -231,6 +243,8 @@ export async function runQaCharacterEvalCommand(opts: {
judgeModels: judges.models.length > 0 ? judges.models : undefined,
judgeModelOptions: judges.optionsByModel,
judgeTimeoutMs: opts.judgeTimeoutMs,
candidateConcurrency: parseQaPositiveIntegerOption("--concurrency", opts.concurrency),
judgeConcurrency: parseQaPositiveIntegerOption("--judge-concurrency", opts.judgeConcurrency),
});
process.stdout.write(`QA character eval report: ${result.reportPath}\n`);
process.stdout.write(`QA character eval summary: ${result.summaryPath}\n`);

View File

@@ -38,6 +38,8 @@ async function runQaCharacterEval(opts: {
modelThinking?: string[];
judgeModel?: string[];
judgeTimeoutMs?: number;
concurrency?: number;
judgeConcurrency?: number;
}) {
const runtime = await loadQaLabCliRuntime();
await runtime.runQaCharacterEvalCommand(opts);
@@ -197,6 +199,12 @@ export function registerQaLabCli(program: Command) {
.option("--judge-timeout-ms <ms>", "Override judge wait timeout", (value: string) =>
Number(value),
)
.option("--concurrency <count>", "Candidate model run concurrency", (value: string) =>
Number(value),
)
.option("--judge-concurrency <count>", "Judge model run concurrency", (value: string) =>
Number(value),
)
.action(
async (opts: {
repoRoot?: string;
@@ -208,6 +216,8 @@ export function registerQaLabCli(program: Command) {
modelThinking?: string[];
judgeModel?: string[];
judgeTimeoutMs?: number;
concurrency?: number;
judgeConcurrency?: number;
}) => {
await runQaCharacterEval(opts);
},

View File

@@ -6,8 +6,9 @@ title: "Late-night deploy helper chat"
surface: character
objective: Capture a natural multi-turn character conversation with real workspace help so another model can later grade naturalness, vibe, and funniness from the raw transcript.
successCriteria:
- Agent responds on every turn of the conversation.
- Agent completes a small workspace file task without making the conversation feel like a test.
- Agent gets a natural multi-turn conversation, and any missed replies stay visible in the transcript instead of aborting capture.
- Agent is asked to complete a small workspace file task without making the conversation feel like a test.
- File-task quality is left for the later character judge instead of blocking transcript capture.
- Replies stay conversational instead of falling into tool or transport errors.
- The report preserves the full transcript for later grading.
docsRefs:
@@ -23,10 +24,6 @@ execution:
config:
conversationId: alice
senderName: Alice
artifactNeedles:
- Precious Status
- build is green
- cursed
workspaceFiles:
SOUL.md: |-
# This is your character
@@ -101,40 +98,26 @@ steps:
ref: config.senderName
text:
expr: turn.text
- call: waitForOutboundMessage
saveAs: latestOutbound
args:
- ref: state
- lambda:
params: [candidate]
expr: "candidate.conversation.id === config.conversationId && candidate.text.trim().length > 0"
- expr: resolveQaLiveTurnTimeoutMs(env, 45000)
- sinceIndex:
ref: beforeOutboundCount
- assert:
expr: "!config.forbiddenNeedles.some((needle) => normalizeLowercaseStringOrEmpty(latestOutbound.text).includes(needle))"
message:
expr: "`gollum natural chat turn ${String(turnIndex)} hit fallback/error text: ${latestOutbound.text}`"
- if:
expr: Boolean(turn.expectFile?.path)
then:
- set: expectedArtifactPath
value:
expr: "path.join(env.gateway.workspaceDir, String(turn.expectFile.path))"
- call: waitForCondition
saveAs: expectedArtifact
- try:
actions:
- call: waitForOutboundMessage
saveAs: latestOutbound
args:
- ref: state
- lambda:
async: true
expr: "((await fs.readFile(expectedArtifactPath, 'utf8').catch(() => null)) ?? undefined)"
- expr: resolveQaLiveTurnTimeoutMs(env, 30000)
- 250
params: [candidate]
expr: "candidate.conversation.id === config.conversationId && candidate.text.trim().length > 0"
- expr: resolveQaLiveTurnTimeoutMs(env, 45000)
- sinceIndex:
ref: beforeOutboundCount
- assert:
expr: "config.artifactNeedles.every((needle) => normalizeLowercaseStringOrEmpty(expectedArtifact).includes(normalizeLowercaseStringOrEmpty(needle)))"
expr: "!config.forbiddenNeedles.some((needle) => normalizeLowercaseStringOrEmpty(latestOutbound.text).includes(needle))"
message:
expr: "`expected ${String(turn.expectFile.path)} to contain natural character task needles`"
- assert:
expr: "state.getSnapshot().messages.filter((message) => message.direction === 'outbound' && message.conversation.id === config.conversationId).length === config.turns.length"
message: missing one or more Gollum replies
expr: "`gollum natural chat turn ${String(turnIndex)} hit fallback/error text: ${latestOutbound.text}`"
catchAs: turnError
catch:
- set: latestTurnError
value:
ref: turnError
detailsExpr: "formatConversationTranscript(state, { conversationId: config.conversationId })"
```