mirror of https://github.com/openclaw/openclaw.git, synced 2026-05-06 00:50:22 +00:00

fix: stabilize character eval and Qwen model routing

@@ -109,6 +109,7 @@ describe("runQaCharacterEval", () => {
     const report = await fs.readFile(result.reportPath, "utf8");
     expect(report).toContain("Execution: local QA gateway child processes, not Docker");
     expect(report).toContain("Judges: openai/gpt-5.4");
+    expect(report).toContain("Judge model labels: visible");
     expect(report).toContain("## Judge Rankings");
     expect(report).toContain("### openai/gpt-5.4");
     expect(report).toContain("reply from openai/gpt-5.4");
@@ -120,6 +121,57 @@ describe("runQaCharacterEval", () => {
     expect(report).not.toContain("Judge Raw Reply");
   });

+  it("can hide candidate model refs from judge prompts and map rankings back", async () => {
+    const runSuite = vi.fn(async (params: CharacterRunSuiteParams) =>
+      makeSuiteResult({
+        outputDir: params.outputDir,
+        model: params.primaryModel,
+        transcript: "USER Alice: hi\n\nASSISTANT openclaw: anonymous reply",
+      }),
+    );
+    const runJudge = vi.fn(async (params: CharacterRunJudgeParams) => {
+      expect(params.prompt).toContain("## CANDIDATE candidate-01");
+      expect(params.prompt).toContain("## CANDIDATE candidate-02");
+      expect(params.prompt).not.toContain("openai/gpt-5.4");
+      expect(params.prompt).not.toContain("codex-cli/test-model");
+      return JSON.stringify({
+        rankings: [
+          {
+            model: "candidate-02",
+            rank: 1,
+            score: 9.1,
+            summary: "Better vibes.",
+          },
+          {
+            model: "candidate-01",
+            rank: 2,
+            score: 7.4,
+            summary: "Solid.",
+          },
+        ],
+      });
+    });
+
+    const result = await runQaCharacterEval({
+      repoRoot: tempRoot,
+      outputDir: path.join(tempRoot, "character"),
+      models: ["openai/gpt-5.4", "codex-cli/test-model"],
+      judgeModels: ["openai/gpt-5.4"],
+      judgeBlindModels: true,
+      runSuite,
+      runJudge,
+    });
+
+    expect(result.judgments[0]?.blindModels).toBe(true);
+    expect(result.judgments[0]?.rankings.map((ranking) => ranking.model)).toEqual([
+      "codex-cli/test-model",
+      "openai/gpt-5.4",
+    ]);
+    const report = await fs.readFile(result.reportPath, "utf8");
+    expect(report).toContain("Judge model labels: blind");
+    expect(report).toContain("1. codex-cli/test-model - 9.1 - Better vibes.");
+  });
+
   it("defaults to the character eval model panel when no models are provided", async () => {
     const runSuite = vi.fn(async (params: CharacterRunSuiteParams) =>
       makeSuiteResult({
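
Aside: the blind-label round trip this test pins down amounts to the mapping sketched below. This is a minimal orientation sketch only; the real logic is `formatBlindCandidateLabel` and the `labelToModel` map built inside `buildJudgePrompt`, both visible later in this diff.

```ts
// Sketch of the round trip: judges see opaque labels, and rankings are
// mapped back to real model refs before they reach the report.
const models = ["openai/gpt-5.4", "codex-cli/test-model"];
const labelToModel = new Map<string, string>(
  models.map((model, index): [string, string] => [
    `candidate-${String(index + 1).padStart(2, "0")}`,
    model,
  ]),
);
// A judge that ranks "candidate-02" first resolves back like so:
const judgeOrder = ["candidate-02", "candidate-01"];
const resolved = judgeOrder.map((label) => labelToModel.get(label) ?? label);
// resolved: ["codex-cli/test-model", "openai/gpt-5.4"]
```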

@@ -138,9 +190,8 @@ describe("runQaCharacterEval", () => {
           { model: "minimax/MiniMax-M2.7", rank: 5, score: 6.5, summary: "ok" },
           { model: "zai/glm-5.1", rank: 6, score: 6.3, summary: "ok" },
           { model: "moonshot/kimi-k2.5", rank: 7, score: 6.2, summary: "ok" },
-          { model: "qwen/qwen3.6-plus", rank: 8, score: 6.1, summary: "ok" },
-          { model: "xiaomi/mimo-v2-pro", rank: 9, score: 6, summary: "ok" },
-          { model: "google/gemini-3.1-pro-preview", rank: 10, score: 5.9, summary: "ok" },
+          { model: "qwen/qwen3.5-plus", rank: 8, score: 6.1, summary: "ok" },
+          { model: "google/gemini-3.1-pro-preview", rank: 9, score: 6, summary: "ok" },
         ],
       }),
     );
@@ -153,7 +204,7 @@ describe("runQaCharacterEval", () => {
       runJudge,
     });

-    expect(runSuite).toHaveBeenCalledTimes(10);
+    expect(runSuite).toHaveBeenCalledTimes(9);
     expect(runSuite.mock.calls.map(([params]) => params.primaryModel)).toEqual([
       "openai/gpt-5.4",
       "openai/gpt-5.2",
@@ -162,8 +213,7 @@ describe("runQaCharacterEval", () => {
       "minimax/MiniMax-M2.7",
       "zai/glm-5.1",
       "moonshot/kimi-k2.5",
-      "qwen/qwen3.6-plus",
-      "xiaomi/mimo-v2-pro",
+      "qwen/qwen3.5-plus",
       "google/gemini-3.1-pro-preview",
     ]);
     expect(runSuite.mock.calls.map(([params]) => params.thinkingDefault)).toEqual([
@@ -176,7 +226,6 @@ describe("runQaCharacterEval", () => {
       "high",
       "high",
       "high",
-      "high",
     ]);
     expect(runSuite.mock.calls.map(([params]) => params.fastMode)).toEqual([
       true,
@@ -188,7 +237,6 @@ describe("runQaCharacterEval", () => {
       false,
       false,
       false,
-      false,
     ]);
     expect(runJudge).toHaveBeenCalledTimes(2);
     expect(runJudge.mock.calls.map(([params]) => params.judgeModel)).toEqual([
@@ -244,7 +292,7 @@ describe("runQaCharacterEval", () => {
     ]);
   });

-  it("defaults candidate and judge concurrency to eight", async () => {
+  it("defaults candidate and judge concurrency to sixteen", async () => {
     let activeRuns = 0;
     let maxActiveRuns = 0;
     const runSuite = vi.fn(async (params: CharacterRunSuiteParams) => {
@@ -266,7 +314,7 @@ describe("runQaCharacterEval", () => {
       await new Promise((resolve) => setTimeout(resolve, 10));
       activeJudges -= 1;
       return JSON.stringify({
-        rankings: Array.from({ length: 10 }, (_, index) => ({
+        rankings: Array.from({ length: 20 }, (_, index) => ({
           model: `provider/model-${index + 1}`,
           rank: index + 1,
           score: 10 - index,
@@ -278,14 +326,137 @@ describe("runQaCharacterEval", () => {
     await runQaCharacterEval({
       repoRoot: tempRoot,
       outputDir: path.join(tempRoot, "character"),
-      models: Array.from({ length: 10 }, (_, index) => `provider/model-${index + 1}`),
-      judgeModels: Array.from({ length: 10 }, (_, index) => `judge/model-${index + 1}`),
+      models: Array.from({ length: 20 }, (_, index) => `provider/model-${index + 1}`),
+      judgeModels: Array.from({ length: 20 }, (_, index) => `judge/model-${index + 1}`),
       runSuite,
       runJudge,
     });

-    expect(maxActiveRuns).toBe(8);
-    expect(maxActiveJudges).toBe(8);
+    expect(maxActiveRuns).toBe(16);
+    expect(maxActiveJudges).toBe(16);
   });

+  it("marks raw provider error transcripts as failed output", async () => {
+    const runSuite = vi.fn(async (params: CharacterRunSuiteParams) =>
+      makeSuiteResult({
+        outputDir: params.outputDir,
+        model: params.primaryModel,
+        transcript:
+          "USER Alice: Are you awake?\n\nASSISTANT OpenClaw QA: 400 model `qwen3.6-plus` is not supported.",
+      }),
+    );
+    const runJudge = vi.fn(async (_params: CharacterRunJudgeParams) =>
+      JSON.stringify({
+        rankings: [{ model: "qwen/qwen3.6-plus", rank: 1, score: 0.5, summary: "failed" }],
+      }),
+    );
+
+    const result = await runQaCharacterEval({
+      repoRoot: tempRoot,
+      outputDir: path.join(tempRoot, "character"),
+      models: ["qwen/qwen3.6-plus"],
+      judgeModels: ["openai/gpt-5.4"],
+      runSuite,
+      runJudge,
+    });
+
+    expect(result.runs[0]).toMatchObject({
+      model: "qwen/qwen3.6-plus",
+      status: "fail",
+      error: "model unsupported error leaked into transcript",
+    });
+  });
+
+  it("marks raw tool failure transcripts as failed output", async () => {
+    const runSuite = vi.fn(async (params: CharacterRunSuiteParams) =>
+      makeSuiteResult({
+        outputDir: params.outputDir,
+        model: params.primaryModel,
+        transcript: "ASSISTANT OpenClaw QA: ⚠️ ✍️ Write: to /tmp/precious.html failed",
+      }),
+    );
+    const runJudge = vi.fn(async (_params: CharacterRunJudgeParams) =>
+      JSON.stringify({
+        rankings: [{ model: "qwen/qwen3.5-plus", rank: 1, score: 0.5, summary: "failed" }],
+      }),
+    );
+
+    const result = await runQaCharacterEval({
+      repoRoot: tempRoot,
+      outputDir: path.join(tempRoot, "character"),
+      models: ["qwen/qwen3.5-plus"],
+      judgeModels: ["openai/gpt-5.4"],
+      runSuite,
+      runJudge,
+    });
+
+    expect(result.runs[0]).toMatchObject({
+      model: "qwen/qwen3.5-plus",
+      status: "fail",
+      error: "tool failure leaked into transcript",
+    });
+  });
+
+  it("marks generic channel fallback transcripts as failed output", async () => {
+    const runSuite = vi.fn(async (params: CharacterRunSuiteParams) =>
+      makeSuiteResult({
+        outputDir: params.outputDir,
+        model: params.primaryModel,
+        transcript:
+          "ASSISTANT OpenClaw QA: ⚠️ Something went wrong while processing your request. Please try again, or use /new to start a fresh session.",
+      }),
+    );
+    const runJudge = vi.fn(async (_params: CharacterRunJudgeParams) =>
+      JSON.stringify({
+        rankings: [{ model: "qa/generic-fallback-model", rank: 1, score: 0.5, summary: "failed" }],
+      }),
+    );
+
+    const result = await runQaCharacterEval({
+      repoRoot: tempRoot,
+      outputDir: path.join(tempRoot, "character"),
+      models: ["qa/generic-fallback-model"],
+      judgeModels: ["openai/gpt-5.4"],
+      runSuite,
+      runJudge,
+    });
+
+    expect(result.runs[0]).toMatchObject({
+      model: "qa/generic-fallback-model",
+      status: "fail",
+      error: "generic request failure leaked into transcript",
+    });
+  });
+
+  it("marks idle-timeout fallback transcripts as failed output", async () => {
+    const runSuite = vi.fn(async (params: CharacterRunSuiteParams) =>
+      makeSuiteResult({
+        outputDir: params.outputDir,
+        model: params.primaryModel,
+        transcript:
+          "ASSISTANT OpenClaw QA: The model did not produce a response before the LLM idle timeout. Please try again, or increase `agents.defaults.llm.idleTimeoutSeconds` in your config.",
+      }),
+    );
+    const runJudge = vi.fn(async (_params: CharacterRunJudgeParams) =>
+      JSON.stringify({
+        rankings: [{ model: "google/gemini-test", rank: 1, score: 0.5, summary: "failed" }],
+      }),
+    );
+
+    const result = await runQaCharacterEval({
+      repoRoot: tempRoot,
+      outputDir: path.join(tempRoot, "character"),
+      models: ["google/gemini-test"],
+      judgeModels: ["openai/gpt-5.4"],
+      runSuite,
+      runJudge,
+    });
+
+    expect(result.runs[0]).toMatchObject({
+      model: "google/gemini-test",
+      status: "fail",
+      error: "LLM timeout leaked into transcript",
+    });
+  });
+
   it("lets explicit candidate thinking override the default panel", async () => {

@@ -15,12 +15,11 @@ const DEFAULT_CHARACTER_EVAL_MODELS = Object.freeze([
   "minimax/MiniMax-M2.7",
   "zai/glm-5.1",
   "moonshot/kimi-k2.5",
-  "qwen/qwen3.6-plus",
-  "xiaomi/mimo-v2-pro",
+  "qwen/qwen3.5-plus",
   "google/gemini-3.1-pro-preview",
 ]);
 const DEFAULT_CHARACTER_THINKING: QaThinkingLevel = "high";
-const DEFAULT_CHARACTER_EVAL_CONCURRENCY = 8;
+const DEFAULT_CHARACTER_EVAL_CONCURRENCY = 16;
 const DEFAULT_CHARACTER_THINKING_BY_MODEL: Readonly<Record<string, QaThinkingLevel>> =
   Object.freeze({
     "openai/gpt-5.4": "xhigh",

@@ -81,11 +80,14 @@ export type QaCharacterEvalJudgeResult = {
   model: string;
   thinkingDefault: QaThinkingLevel;
   fastMode: boolean;
+  blindModels: boolean;
   durationMs: number;
   rankings: QaCharacterEvalJudgment[];
   error?: string;
 };

+type QaCharacterEvalProgressLogger = (message: string) => void;
+
 type RunSuiteFn = (params: {
   repoRoot: string;
   outputDir: string;

@@ -120,10 +122,12 @@ export type QaCharacterEvalParams = {
   judgeThinkingDefault?: QaThinkingLevel;
   judgeModelOptions?: Record<string, QaCharacterModelOptions>;
   judgeTimeoutMs?: number;
+  judgeBlindModels?: boolean;
   candidateConcurrency?: number;
   judgeConcurrency?: number;
   runSuite?: RunSuiteFn;
   runJudge?: RunJudgeFn;
+  progress?: QaCharacterEvalProgressLogger;
 };

 function normalizeModelRefs(models: readonly string[]) {

@@ -226,6 +230,27 @@ function collectTranscriptStats(transcript: string) {
   };
 }

+function detectTranscriptFailure(transcript: string): string | undefined {
+  const checks: Array<[RegExp, string]> = [
+    [/\bmodel `[^`]+` is not supported\b/i, "model unsupported error leaked into transcript"],
+    [/\binsufficient account balance\b/i, "account balance error leaked into transcript"],
+    [/\b(?:backend|transport|internal) error\b/i, "backend error leaked into transcript"],
+    [
+      /\bsomething went wrong while processing your request\b/i,
+      "generic request failure leaked into transcript",
+    ],
+    [/\buse \/new to start a fresh session\b/i, "generic request failure leaked into transcript"],
+    [
+      /\bmodel did not produce a response before the LLM idle timeout\b/i,
+      "LLM timeout leaked into transcript",
+    ],
+    [/\btool failed\b/i, "tool failure leaked into transcript"],
+    [/\b(?:read|write|edit|patch):[^\n]*\bfailed\b/i, "tool failure leaked into transcript"],
+    [/\bnot configured\b/i, "configuration error leaked into transcript"],
+  ];
+  return checks.find(([pattern]) => pattern.test(transcript))?.[1];
+}
+
 function formatDuration(ms: number) {
   if (!Number.isFinite(ms) || ms < 0) {
     return "unknown";
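
The four new tests in the spec file above each pin one of these patterns. Illustrative calls, reusing the transcripts from those tests:

```ts
// Illustrative only: the inputs come from the new tests above.
detectTranscriptFailure("ASSISTANT OpenClaw QA: 400 model `qwen3.6-plus` is not supported.");
// => "model unsupported error leaked into transcript"
detectTranscriptFailure("ASSISTANT OpenClaw QA: ⚠️ ✍️ Write: to /tmp/precious.html failed");
// => "tool failure leaked into transcript"
detectTranscriptFailure("ASSISTANT openclaw: anonymous reply");
// => undefined (no failure pattern, transcript counts as clean)
```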

@@ -243,10 +268,42 @@ function formatDuration(ms: number) {
   return seconds === 0 ? `${minutes}m` : `${minutes}m ${seconds}s`;
 }

-function buildJudgePrompt(params: { scenarioId: string; runs: readonly QaCharacterEvalRun[] }) {
+function logCharacterEvalProgress(
+  progress: QaCharacterEvalProgressLogger | undefined,
+  message: string,
+) {
+  progress?.(`[qa-character] ${message}`);
+}
+
+function formatEvalIndex(index: number, total: number) {
+  return `${index + 1}/${total}`;
+}
+
+function summarizeRunStats(run: QaCharacterEvalRun) {
+  return [
+    `status=${run.status}`,
+    `duration=${formatDuration(run.durationMs)}`,
+    `turns=${run.stats.userTurns}/${run.stats.assistantTurns}`,
+    `chars=${run.stats.transcriptChars}`,
+    ...(run.error ? [`error="${run.error}"`] : []),
+  ].join(" ");
+}
+
+function formatBlindCandidateLabel(index: number) {
+  return `candidate-${String(index + 1).padStart(2, "0")}`;
+}
+
+function buildJudgePrompt(params: {
+  scenarioId: string;
+  runs: readonly QaCharacterEvalRun[];
+  blindModels?: boolean;
+}) {
+  const labelToModel = new Map<string, string>();
   const runBlocks = params.runs
-    .map(
-      (run) => `## MODEL ${run.model}
+    .map((run, index) => {
+      const label = params.blindModels ? formatBlindCandidateLabel(index) : run.model;
+      labelToModel.set(label, run.model);
+      return `## CANDIDATE ${label}

 Status: ${run.status}
 Duration ms (not used for ranking): ${run.durationMs}
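
With `blindModels` enabled, each transcript block is headed by an opaque label rather than the model ref, and the returned `labelToModel` map lets the caller translate the judge's rankings back (the `{ prompt, labelToModel }` return shape is visible two hunks below). Roughly, as an illustrative call:

```ts
const { prompt, labelToModel } = buildJudgePrompt({
  scenarioId: "character-vibes-gollum",
  runs,
  blindModels: true,
});
// prompt contains "## CANDIDATE candidate-01", "## CANDIDATE candidate-02", ...
// labelToModel maps "candidate-01" -> runs[0].model, "candidate-02" -> runs[1].model
```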

@@ -258,11 +315,11 @@ Error: ${run.error ?? "none"}

 \`\`\`text
 ${run.transcript}
-\`\`\``,
-    )
+\`\`\``;
+    })
     .join("\n\n");

-  return `You are grading OpenClaw natural character conversation transcripts for naturalness, vibes, and funniness.
+  const prompt = `You are grading OpenClaw natural character conversation transcripts for naturalness, vibes, and funniness.

 Scenario id: ${params.scenarioId}

@@ -275,14 +332,14 @@ Rank the models by:
 - not sounding aware of an eval or test
 - avoiding tool/backend/error leakage

-Treat model names as opaque labels. Do not assume quality from the label.
+Treat candidate labels as opaque identifiers. Do not assume quality from the label.
 Duration is recorded for separate benchmark analysis only. Do not rank models by speed.

 Return strict JSON only with this shape:
 {
   "rankings": [
     {
-      "model": "same model label",
+      "model": "same candidate label",
       "rank": 1,
       "score": 9.2,
       "summary": "one sentence",
@@ -293,6 +350,7 @@ Return strict JSON only with this shape:
 }

 ${runBlocks}`;
+  return { prompt, labelToModel };
 }

 function normalizeJudgment(value: unknown, allowedModels: Set<string>): QaCharacterEvalJudgment[] {

@@ -382,6 +440,7 @@ function renderCharacterEvalReport(params: {
     `- Judges: ${params.judgments.map((judgment) => judgment.model).join(", ")}`,
     `- Judge thinking: ${params.judgments[0]?.thinkingDefault ?? DEFAULT_JUDGE_THINKING}`,
     `- Judge fast mode: ${params.judgments.every((judgment) => judgment.fastMode) ? "on" : "mixed"}`,
+    `- Judge model labels: ${params.judgments.every((judgment) => judgment.blindModels) ? "blind" : "visible"}`,
     "",
     "## Judge Rankings",
     "",

@@ -461,7 +520,12 @@ export async function runQaCharacterEval(params: QaCharacterEvalParams) {
     params.candidateConcurrency,
     DEFAULT_CHARACTER_EVAL_CONCURRENCY,
   );
-  const runs = await mapWithConcurrency(models, candidateConcurrency, async (model) => {
+  logCharacterEvalProgress(
+    params.progress,
+    `start scenario=${scenarioId} candidates=${models.length} candidateConcurrency=${candidateConcurrency} output=${outputDir}`,
+  );
+  const candidatesStartedAt = Date.now();
+  const runs = await mapWithConcurrency(models, candidateConcurrency, async (model, index) => {
     const thinkingDefault = resolveCandidateThinkingDefault({
       model,
       candidateThinkingDefault: params.candidateThinkingDefault,
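
`mapWithConcurrency` is imported from elsewhere and is not part of this diff; the concurrency tests above only constrain its contract: at most `limit` callbacks in flight, results in input order, and (after this change) the callback also receives the item index. A minimal sketch consistent with that contract, offered as an assumption rather than the real helper:

```ts
// Sketch (assumption): a worker pool that keeps at most `limit` callbacks in
// flight and writes results back in input order. The real helper lives elsewhere.
async function mapWithConcurrency<T, R>(
  items: readonly T[],
  limit: number,
  fn: (item: T, index: number) => Promise<R>,
): Promise<R[]> {
  const results = new Array<R>(items.length);
  let next = 0;
  const workers = Array.from({ length: Math.min(limit, items.length) }, async () => {
    // Each worker pulls the next unclaimed index; claiming is safe because
    // there is no await between the bounds check and the increment.
    while (next < items.length) {
      const index = next++;
      results[index] = await fn(items[index], index);
    }
  });
  await Promise.all(workers);
  return results;
}
```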

@@ -475,6 +539,10 @@ export async function runQaCharacterEval(params: QaCharacterEvalParams) {
     });
     const modelOutputDir = path.join(runsDir, sanitizePathPart(model));
     const runStartedAt = Date.now();
+    logCharacterEvalProgress(
+      params.progress,
+      `candidate start ${formatEvalIndex(index, models.length)} model=${model} thinking=${thinkingDefault} fast=${fastMode ? "on" : "off"}`,
+    );
     try {
       const result = await runSuite({
         repoRoot,

@@ -487,10 +555,12 @@ export async function runQaCharacterEval(params: QaCharacterEvalParams) {
         scenarioIds: [scenarioId],
       });
       const transcript = extractTranscript(result);
-      const status = result.scenarios.some((scenario) => scenario.status === "fail")
-        ? "fail"
-        : "pass";
-      return {
+      const transcriptFailure = detectTranscriptFailure(transcript);
+      const status =
+        result.scenarios.some((scenario) => scenario.status === "fail") || transcriptFailure
+          ? "fail"
+          : "pass";
+      const run = {
         model,
         status,
         durationMs: Date.now() - runStartedAt,

@@ -501,10 +571,16 @@ export async function runQaCharacterEval(params: QaCharacterEvalParams) {
         summaryPath: result.summaryPath,
         transcript,
         stats: collectTranscriptStats(transcript),
+        ...(transcriptFailure ? { error: transcriptFailure } : {}),
       } satisfies QaCharacterEvalRun;
+      logCharacterEvalProgress(
+        params.progress,
+        `candidate done ${formatEvalIndex(index, models.length)} model=${model} ${summarizeRunStats(run)}`,
+      );
+      return run;
     } catch (error) {
       const transcript = "";
-      return {
+      const run = {
         model,
         status: "fail",
         durationMs: Date.now() - runStartedAt,

@@ -515,8 +591,18 @@ export async function runQaCharacterEval(params: QaCharacterEvalParams) {
         stats: collectTranscriptStats(transcript),
         error: formatErrorMessage(error),
       } satisfies QaCharacterEvalRun;
+      logCharacterEvalProgress(
+        params.progress,
+        `candidate done ${formatEvalIndex(index, models.length)} model=${model} ${summarizeRunStats(run)}`,
+      );
+      return run;
     }
   });
+  const failedCandidateCount = runs.filter((run) => run.status === "fail").length;
+  logCharacterEvalProgress(
+    params.progress,
+    `candidates done pass=${runs.length - failedCandidateCount} fail=${failedCandidateCount} duration=${formatDuration(Date.now() - candidatesStartedAt)}`,
+  );

   const judgeModels = normalizeModelRefs(
     params.judgeModels && params.judgeModels.length > 0

@@ -530,38 +616,73 @@ export async function runQaCharacterEval(params: QaCharacterEvalParams) {
     params.judgeConcurrency,
     DEFAULT_CHARACTER_EVAL_CONCURRENCY,
   );
-  const judgments = await mapWithConcurrency(judgeModels, judgeConcurrency, async (judgeModel) => {
-    const judgeOptions = resolveJudgeOptions({
-      model: judgeModel,
-      judgeThinkingDefault: params.judgeThinkingDefault,
-      judgeModelOptions: params.judgeModelOptions,
-    });
-    let rankings: QaCharacterEvalJudgment[] = [];
-    let judgeError: string | undefined;
-    const judgeStartedAt = Date.now();
-    try {
-      const rawReply = await runJudge({
-        repoRoot,
-        judgeModel,
-        judgeThinkingDefault: judgeOptions.thinkingDefault,
-        judgeFastMode: judgeOptions.fastMode,
-        prompt: buildJudgePrompt({ scenarioId, runs }),
-        timeoutMs: params.judgeTimeoutMs ?? 180_000,
+  const judgeTimeoutMs = params.judgeTimeoutMs ?? 180_000;
+  logCharacterEvalProgress(
+    params.progress,
+    `judges start judges=${judgeModels.length} judgeConcurrency=${judgeConcurrency} timeout=${formatDuration(judgeTimeoutMs)} labels=${params.judgeBlindModels === true ? "blind" : "visible"}`,
+  );
+  const judgesStartedAt = Date.now();
+  const judgments = await mapWithConcurrency(
+    judgeModels,
+    judgeConcurrency,
+    async (judgeModel, index) => {
+      const judgeOptions = resolveJudgeOptions({
+        model: judgeModel,
+        judgeThinkingDefault: params.judgeThinkingDefault,
+        judgeModelOptions: params.judgeModelOptions,
       });
-      rankings = parseJudgeReply(rawReply, new Set(models));
-    } catch (error) {
-      judgeError = formatErrorMessage(error);
-    }
+      let rankings: QaCharacterEvalJudgment[] = [];
+      let judgeError: string | undefined;
+      const judgeStartedAt = Date.now();
+      logCharacterEvalProgress(
+        params.progress,
+        `judge start ${formatEvalIndex(index, judgeModels.length)} model=${judgeModel} thinking=${judgeOptions.thinkingDefault} fast=${judgeOptions.fastMode ? "on" : "off"} timeout=${formatDuration(judgeTimeoutMs)}`,
+      );
+      try {
+        const judgePrompt = buildJudgePrompt({
+          scenarioId,
+          runs,
+          blindModels: params.judgeBlindModels,
+        });
+        const rawReply = await runJudge({
+          repoRoot,
+          judgeModel,
+          judgeThinkingDefault: judgeOptions.thinkingDefault,
+          judgeFastMode: judgeOptions.fastMode,
+          prompt: judgePrompt.prompt,
+          timeoutMs: judgeTimeoutMs,
+        });
+        rankings = parseJudgeReply(rawReply, new Set(judgePrompt.labelToModel.keys())).map(
+          (ranking) => ({
+            ...ranking,
+            model: judgePrompt.labelToModel.get(ranking.model) ?? ranking.model,
+          }),
+        );
+      } catch (error) {
+        judgeError = formatErrorMessage(error);
+      }

-    return {
-      model: judgeModel,
-      thinkingDefault: judgeOptions.thinkingDefault,
-      fastMode: judgeOptions.fastMode,
-      durationMs: Date.now() - judgeStartedAt,
-      rankings,
-      ...(judgeError ? { error: judgeError } : {}),
-    } satisfies QaCharacterEvalJudgeResult;
-  });
+      const judgment = {
+        model: judgeModel,
+        thinkingDefault: judgeOptions.thinkingDefault,
+        fastMode: judgeOptions.fastMode,
+        blindModels: params.judgeBlindModels === true,
+        durationMs: Date.now() - judgeStartedAt,
+        rankings,
+        ...(judgeError ? { error: judgeError } : {}),
+      } satisfies QaCharacterEvalJudgeResult;
+      logCharacterEvalProgress(
+        params.progress,
+        `judge done ${formatEvalIndex(index, judgeModels.length)} model=${judgeModel} rankings=${rankings.length} duration=${formatDuration(judgment.durationMs)}${judgeError ? ` error="${judgeError}"` : ""}`,
+      );
+      return judgment;
+    },
+  );
+  const failedJudgeCount = judgments.filter((judgment) => judgment.rankings.length === 0).length;
+  logCharacterEvalProgress(
+    params.progress,
+    `judges done ranked=${judgments.length - failedJudgeCount} failed=${failedJudgeCount} duration=${formatDuration(Date.now() - judgesStartedAt)}`,
+  );

   const finishedAt = new Date();
   const report = renderCharacterEvalReport({
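
Taken together, the `logCharacterEvalProgress` calls give a stderr stream along these lines. The values, counts, and paths below are illustrative only, assembled from the template literals above:

```text
[qa-character] start scenario=character-vibes-gollum candidates=9 candidateConcurrency=16 output=/tmp/character
[qa-character] candidate start 1/9 model=openai/gpt-5.4 thinking=xhigh fast=off
[qa-character] candidate done 1/9 model=openai/gpt-5.4 status=pass duration=42s turns=3/3 chars=2048
[qa-character] candidates done pass=9 fail=0 duration=1m 10s
[qa-character] judges start judges=1 judgeConcurrency=16 timeout=3m labels=blind
[qa-character] judge start 1/1 model=openai/gpt-5.4 thinking=xhigh fast=off timeout=3m
[qa-character] judge done 1/1 model=openai/gpt-5.4 rankings=9 duration=55s
[qa-character] judges done ranked=1 failed=0 duration=55s
[qa-character] report written duration=2m 5s report=... summary=...
```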

@@ -587,6 +708,10 @@ export async function runQaCharacterEval(params: QaCharacterEvalParams) {
     )}\n`,
     "utf8",
   );
+  logCharacterEvalProgress(
+    params.progress,
+    `report written duration=${formatDuration(finishedAt.getTime() - startedAt.getTime())} report=${reportPath} summary=${summaryPath}`,
+  );

   return {
     outputDir,

@@ -158,6 +158,7 @@ describe("qa cli runtime", () => {
       modelThinking: ["codex-cli/test-model=medium"],
       judgeModel: ["openai/gpt-5.4,thinking=xhigh,fast", "anthropic/claude-opus-4-6,thinking=high"],
       judgeTimeoutMs: 180_000,
+      blindJudgeModels: true,
       concurrency: 4,
       judgeConcurrency: 3,
     });

@@ -180,8 +181,10 @@ describe("qa cli runtime", () => {
         "anthropic/claude-opus-4-6": { thinkingDefault: "high" },
       },
       judgeTimeoutMs: 180_000,
+      judgeBlindModels: true,
       candidateConcurrency: 4,
       judgeConcurrency: 3,
+      progress: expect.any(Function),
     });
   });

@@ -203,8 +206,10 @@ describe("qa cli runtime", () => {
       judgeModels: undefined,
       judgeModelOptions: undefined,
       judgeTimeoutMs: undefined,
+      judgeBlindModels: undefined,
       candidateConcurrency: undefined,
       judgeConcurrency: undefined,
+      progress: expect.any(Function),
     });
   });

@@ -225,6 +225,7 @@ export async function runQaCharacterEvalCommand(opts: {
   modelThinking?: string[];
   judgeModel?: string[];
   judgeTimeoutMs?: number;
+  blindJudgeModels?: boolean;
   concurrency?: number;
   judgeConcurrency?: number;
 }) {

@@ -243,8 +244,10 @@ export async function runQaCharacterEvalCommand(opts: {
     judgeModels: judges.models.length > 0 ? judges.models : undefined,
     judgeModelOptions: judges.optionsByModel,
     judgeTimeoutMs: opts.judgeTimeoutMs,
+    judgeBlindModels: opts.blindJudgeModels === true ? true : undefined,
     candidateConcurrency: parseQaPositiveIntegerOption("--concurrency", opts.concurrency),
     judgeConcurrency: parseQaPositiveIntegerOption("--judge-concurrency", opts.judgeConcurrency),
+    progress: (message) => process.stderr.write(`${message}\n`),
   });
   process.stdout.write(`QA character eval report: ${result.reportPath}\n`);
   process.stdout.write(`QA character eval summary: ${result.summaryPath}\n`);
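
`parseQaPositiveIntegerOption` is referenced here but not shown in this diff. Judging by its call sites, it presumably passes `undefined` through untouched and rejects anything that is not a positive integer; a sketch of that assumed contract:

```ts
// Sketch (assumption): the real helper is defined elsewhere in the repo.
function parseQaPositiveIntegerOption(
  flag: string,
  value: number | undefined,
): number | undefined {
  if (value === undefined) {
    return undefined; // option not given: let the eval fall back to its default
  }
  if (!Number.isInteger(value) || value <= 0) {
    throw new Error(`${flag} must be a positive integer`);
  }
  return value;
}
```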

@@ -38,6 +38,7 @@ async function runQaCharacterEval(opts: {
   modelThinking?: string[];
   judgeModel?: string[];
   judgeTimeoutMs?: number;
+  blindJudgeModels?: boolean;
   concurrency?: number;
   judgeConcurrency?: number;
 }) {

@@ -199,6 +200,10 @@ export function registerQaLabCli(program: Command) {
     .option("--judge-timeout-ms <ms>", "Override judge wait timeout", (value: string) =>
       Number(value),
     )
+    .option(
+      "--blind-judge-models",
+      "Hide candidate model refs from judge prompts; reports still map rankings back to real refs",
+    )
     .option("--concurrency <count>", "Candidate model run concurrency", (value: string) =>
       Number(value),
     )

@@ -216,6 +221,7 @@ export function registerQaLabCli(program: Command) {
       modelThinking?: string[];
       judgeModel?: string[];
       judgeTimeoutMs?: number;
+      blindJudgeModels?: boolean;
       concurrency?: number;
       judgeConcurrency?: number;
     }) => {

@@ -19,6 +19,7 @@ describe("qa scenario catalog", () => {
       true,
     );
     expect(pack.scenarios.some((scenario) => scenario.id === "character-vibes-gollum")).toBe(true);
+    expect(pack.scenarios.some((scenario) => scenario.id === "character-vibes-c3po")).toBe(true);
     expect(pack.scenarios.every((scenario) => scenario.execution?.kind === "flow")).toBe(true);
     expect(pack.scenarios.some((scenario) => scenario.execution.flow?.steps.length)).toBe(true);
   });