mirror of
https://github.com/openclaw/openclaw.git
synced 2026-04-14 18:51:04 +00:00
599 lines
19 KiB
TypeScript
599 lines
19 KiB
TypeScript
import fs from "node:fs/promises";
|
|
import path from "node:path";
|
|
import { formatErrorMessage } from "openclaw/plugin-sdk/error-runtime";
|
|
import { runQaManualLane } from "./manual-lane.runtime.js";
|
|
import { isQaFastModeModelRef, type QaProviderMode } from "./model-selection.js";
|
|
import { type QaThinkingLevel } from "./qa-gateway-config.js";
|
|
import { runQaSuite, type QaSuiteResult } from "./suite.js";
|
|
|
|
/** Scenario id used when the caller does not supply a non-blank `scenarioId`. */
const DEFAULT_CHARACTER_SCENARIO_ID = "character-vibes-gollum";

/** Candidate model refs evaluated when the caller passes an empty `models` list. */
const DEFAULT_CHARACTER_EVAL_MODELS = Object.freeze([
  "openai/gpt-5.4",
  "openai/gpt-5.2",
  "anthropic/claude-opus-4-6",
  "anthropic/claude-sonnet-4-6",
  "minimax/MiniMax-M2.7",
  "zai/glm-5.1",
  "moonshot/kimi-k2.5",
  "qwen/qwen3.6-plus",
  "xiaomi/mimo-v2-pro",
  "google/gemini-3.1-pro-preview",
]);

/** Last-resort thinking level for a candidate with no more specific setting. */
const DEFAULT_CHARACTER_THINKING: QaThinkingLevel = "high";

/** Default worker count for both the candidate pool and the judge pool. */
const DEFAULT_CHARACTER_EVAL_CONCURRENCY = 8;

/** Built-in per-model candidate thinking levels, consulted after all caller-supplied settings. */
const DEFAULT_CHARACTER_THINKING_BY_MODEL: Readonly<Record<string, QaThinkingLevel>> =
  Object.freeze({
    "openai/gpt-5.4": "xhigh",
    "openai/gpt-5.2": "xhigh",
  });

/** Judge model refs used when the caller names neither `judgeModels` nor `judgeModel`. */
const DEFAULT_JUDGE_MODELS = Object.freeze(["openai/gpt-5.4", "anthropic/claude-opus-4-6"]);

/** Last-resort thinking level for a judge with no more specific setting. */
const DEFAULT_JUDGE_THINKING: QaThinkingLevel = "xhigh";

/** Built-in per-model judge options; caller-supplied `judgeModelOptions` take precedence. */
const DEFAULT_JUDGE_MODEL_OPTIONS: Readonly<Record<string, QaCharacterModelOptions>> =
  Object.freeze({
    "openai/gpt-5.4": { thinkingDefault: "xhigh", fastMode: true },
    "anthropic/claude-opus-4-6": { thinkingDefault: "high" },
  });
|
|
|
|
/** Outcome recorded for a single candidate run. */
type QaCharacterRunStatus = "pass" | "fail";
|
|
|
|
/** Per-model overrides that can be attached to a candidate or a judge model ref. */
export type QaCharacterModelOptions = {
  /** Thinking level to use for this model instead of the broader defaults. */
  thinkingDefault?: QaThinkingLevel;
  /** Fast-mode switch for this model instead of the broader defaults. */
  fastMode?: boolean;
};
|
|
|
|
/** Record of one candidate model's run through the character scenario. */
export type QaCharacterEvalRun = {
  /** Candidate model ref this run belongs to. */
  model: string;
  /** `"fail"` when any scenario failed or the suite threw; `"pass"` otherwise. */
  status: QaCharacterRunStatus;
  /** Wall-clock time spent in the suite call, in milliseconds. */
  durationMs: number;
  /** Directory handed to the suite for this model's artifacts. */
  outputDir: string;
  /** Thinking level the run was executed with. */
  thinkingDefault: QaThinkingLevel;
  /** Whether fast mode was enabled for the run. */
  fastMode: boolean;
  /** Suite report path; absent when the suite threw. */
  reportPath?: string;
  /** Suite summary path; absent when the suite threw. */
  summaryPath?: string;
  /** Transcript text extracted from the suite result (empty when the suite threw). */
  transcript: string;
  /** Simple counts derived from `transcript`. */
  stats: {
    transcriptChars: number;
    transcriptLines: number;
    userTurns: number;
    assistantTurns: number;
  };
  /** Formatted error message when the suite threw. */
  error?: string;
};
|
|
|
|
/** One judge's verdict on one candidate model. */
export type QaCharacterEvalJudgment = {
  /** Candidate model ref being ranked. */
  model: string;
  /** Position assigned by the judge; lower sorts first. */
  rank: number;
  /** Numeric score assigned by the judge. */
  score: number;
  /** Short free-text verdict (empty when the judge omitted it). */
  summary: string;
  /** Positive notes from the judge. */
  strengths: string[];
  /** Negative notes from the judge. */
  weaknesses: string[];
};
|
|
|
|
/** Everything `runQaCharacterEval` hands back to its caller. */
export type QaCharacterEvalResult = {
  /** Root directory that holds all artifacts of this evaluation. */
  outputDir: string;
  /** Path of the written Markdown report. */
  reportPath: string;
  /** Path of the written JSON summary. */
  summaryPath: string;
  /** One entry per candidate model, in candidate order. */
  runs: QaCharacterEvalRun[];
  /** One entry per judge model, in judge order. */
  judgments: QaCharacterEvalJudgeResult[];
};
|
|
|
|
/** Outcome of asking one judge model to rank all candidate runs. */
export type QaCharacterEvalJudgeResult = {
  /** Judge model ref. */
  model: string;
  /** Thinking level the judge was run with. */
  thinkingDefault: QaThinkingLevel;
  /** Whether fast mode was enabled for the judge. */
  fastMode: boolean;
  /** Wall-clock time spent on the judge call and reply parsing, in milliseconds. */
  durationMs: number;
  /** Parsed rankings; empty when the judge call or parsing failed. */
  rankings: QaCharacterEvalJudgment[];
  /** Formatted error message when the judge call or parsing failed. */
  error?: string;
};
|
|
|
|
/** Signature of the function that runs the QA suite for one candidate model (injectable for tests). */
type RunSuiteFn = (params: {
  repoRoot: string;
  /** Directory the suite should write this model's artifacts into. */
  outputDir: string;
  providerMode: QaProviderMode;
  primaryModel: string;
  alternateModel: string;
  fastMode?: boolean;
  thinkingDefault?: QaThinkingLevel;
  /** Scenarios to execute. */
  scenarioIds: string[];
}) => Promise<QaSuiteResult>;
|
|
|
|
/**
 * Signature of the function that sends the grading prompt to one judge model
 * (injectable for tests). Resolves to the judge's raw reply, or `null` when there is none.
 */
type RunJudgeFn = (params: {
  repoRoot: string;
  judgeModel: string;
  judgeThinkingDefault: QaThinkingLevel;
  judgeFastMode: boolean;
  /** Full grading prompt, including every candidate transcript. */
  prompt: string;
  timeoutMs: number;
}) => Promise<string | null>;
|
|
|
|
/** Input accepted by `runQaCharacterEval`. */
export type QaCharacterEvalParams = {
  /** Repository root; resolved to an absolute path, defaulting to the current working directory. */
  repoRoot?: string;
  /** Artifact directory; defaults to a timestamped folder under `<repoRoot>/.artifacts/qa-e2e`. */
  outputDir?: string;
  /** Candidate model refs; an empty list selects the built-in candidate set. */
  models: string[];
  /** Scenario to run; blank or missing selects the built-in default scenario. */
  scenarioId?: string;
  /** Fast-mode setting applied to every candidate that has no per-model override. */
  candidateFastMode?: boolean;
  /** Thinking level applied to every candidate that has no per-model setting. */
  candidateThinkingDefault?: QaThinkingLevel;
  /** Per-model candidate thinking levels; outranked by `candidateModelOptions`. */
  candidateThinkingByModel?: Record<string, QaThinkingLevel>;
  /** Per-model candidate options; highest precedence for both thinking and fast mode. */
  candidateModelOptions?: Record<string, QaCharacterModelOptions>;
  /** Single judge model ref; only used when `judgeModels` is missing or empty. */
  judgeModel?: string;
  /** Judge model refs; when non-empty this wins over `judgeModel`. */
  judgeModels?: string[];
  /** Thinking level applied to every judge that has no per-model override. */
  judgeThinkingDefault?: QaThinkingLevel;
  /** Per-model judge options; highest precedence for both thinking and fast mode. */
  judgeModelOptions?: Record<string, QaCharacterModelOptions>;
  /** Timeout passed to each judge call, in milliseconds (180 000 when omitted). */
  judgeTimeoutMs?: number;
  /** Maximum number of candidate runs in flight at once. */
  candidateConcurrency?: number;
  /** Maximum number of judge calls in flight at once. */
  judgeConcurrency?: number;
  /** Replacement for the real suite runner. */
  runSuite?: RunSuiteFn;
  /** Replacement for the real judge runner. */
  runJudge?: RunJudgeFn;
};
|
|
|
|
/**
 * Trims every model ref, drops blank ones, and removes duplicates while
 * keeping the first occurrence of each ref in its original position.
 */
function normalizeModelRefs(models: readonly string[]) {
  const seen = new Set<string>();
  for (const raw of models) {
    const ref = raw.trim();
    if (ref.length > 0) {
      seen.add(ref);
    }
  }
  return [...seen];
}
|
|
|
|
/**
 * Picks the thinking level for one candidate model. Precedence, highest first:
 * per-model options, per-model thinking map, caller-wide default,
 * built-in per-model default, built-in global default.
 */
function resolveCandidateThinkingDefault(params: {
  model: string;
  candidateThinkingDefault?: QaThinkingLevel;
  candidateThinkingByModel?: Record<string, QaThinkingLevel>;
  candidateModelOptions?: Record<string, QaCharacterModelOptions>;
}) {
  const { model } = params;

  const fromModelOptions = params.candidateModelOptions?.[model]?.thinkingDefault;
  if (fromModelOptions != null) {
    return fromModelOptions;
  }

  const fromThinkingMap = params.candidateThinkingByModel?.[model];
  if (fromThinkingMap != null) {
    return fromThinkingMap;
  }

  if (params.candidateThinkingDefault != null) {
    return params.candidateThinkingDefault;
  }

  return DEFAULT_CHARACTER_THINKING_BY_MODEL[model] ?? DEFAULT_CHARACTER_THINKING;
}
|
|
|
|
/**
 * Decides whether one candidate model runs in fast mode. Precedence, highest first:
 * per-model options, caller-wide setting, then whatever `isQaFastModeModelRef`
 * reports for the model ref.
 */
function resolveCandidateFastMode(params: {
  model: string;
  candidateFastMode?: boolean;
  candidateModelOptions?: Record<string, QaCharacterModelOptions>;
}) {
  const perModel = params.candidateModelOptions?.[params.model]?.fastMode;
  if (perModel != null) {
    return perModel;
  }
  if (params.candidateFastMode != null) {
    return params.candidateFastMode;
  }
  return isQaFastModeModelRef(params.model);
}
|
|
|
|
/**
 * Resolves the thinking level and fast-mode flag for one judge model.
 *
 * Thinking precedence, highest first: caller per-model options, caller-wide
 * judge default, built-in per-model default, built-in global default.
 * Fast mode: caller per-model options, built-in per-model default, else off.
 */
function resolveJudgeOptions(params: {
  model: string;
  judgeThinkingDefault?: QaThinkingLevel;
  judgeModelOptions?: Record<string, QaCharacterModelOptions>;
}) {
  const builtIn = DEFAULT_JUDGE_MODEL_OPTIONS[params.model];
  const override = params.judgeModelOptions?.[params.model];

  const thinkingDefault =
    override?.thinkingDefault ??
    params.judgeThinkingDefault ??
    builtIn?.thinkingDefault ??
    DEFAULT_JUDGE_THINKING;
  const fastMode = override?.fastMode ?? builtIn?.fastMode ?? false;

  return { thinkingDefault, fastMode };
}
|
|
|
|
/**
 * Turns an arbitrary label (typically a `provider/model` ref) into a single
 * safe path segment.
 *
 * Every run of characters outside `[A-Za-z0-9._-]` becomes one `-`, and
 * leading/trailing dashes are stripped. If nothing usable remains, or the
 * result consists only of dots, `"model"` is returned instead.
 *
 * @param value Raw label to sanitize.
 * @returns A non-empty segment that never resolves to the current or parent directory.
 */
function sanitizePathPart(value: string) {
  const sanitized = value.replace(/[^a-z0-9._-]+/gi, "-").replace(/^-+|-+$/g, "");
  // "." and ".." survive the character filter but are directory references:
  // joining them onto a base path would point at the base itself or its parent.
  if (sanitized.length === 0 || /^\.+$/.test(sanitized)) {
    return "model";
  }
  return sanitized;
}
|
|
|
|
/**
 * Converts a requested concurrency into a usable worker count.
 *
 * Missing, NaN, or infinite values yield `fallback`; anything else is
 * floored and clamped to at least 1.
 */
function normalizeConcurrency(value: number | undefined, fallback = 1) {
  if (value === undefined || !Number.isFinite(value)) {
    return fallback;
  }
  const whole = Math.floor(value);
  return whole < 1 ? 1 : whole;
}
|
|
|
|
/**
 * Maps `items` through an async `mapper` with at most `concurrency` calls in
 * flight, returning results in input order.
 *
 * Workers pull the next unclaimed index from a shared cursor, so a slow item
 * never blocks later items from starting on another worker. If the mapper
 * rejects, the returned promise rejects with that error.
 */
async function mapWithConcurrency<T, U>(
  items: readonly T[],
  concurrency: number,
  mapper: (item: T, index: number) => Promise<U>,
) {
  const output = Array.from<U>({ length: items.length });
  let cursor = 0;

  // Claim-then-process loop; the claim is synchronous, so no two workers share an index.
  const drain = async () => {
    for (let index = cursor++; index < items.length; index = cursor++) {
      output[index] = await mapper(items[index], index);
    }
  };

  const poolSize = Math.min(normalizeConcurrency(concurrency), items.length);
  const pool: Promise<void>[] = [];
  for (let worker = 0; worker < poolSize; worker += 1) {
    pool.push(drain());
  }
  await Promise.all(pool);
  return output;
}
|
|
|
|
/**
 * Pulls the transcript out of a suite result: the longest non-empty step
 * `details` string across all scenarios (the earliest one wins a tie),
 * falling back to the suite report when no step has details.
 */
function extractTranscript(result: QaSuiteResult) {
  let longest: string | undefined;
  for (const scenario of result.scenarios) {
    for (const step of scenario.steps) {
      const detail = step.details;
      if (!detail) {
        continue;
      }
      // Strictly-greater keeps the first of several equally long details.
      if (longest === undefined || detail.length > longest.length) {
        longest = detail;
      }
    }
  }
  return longest ?? result.report;
}
|
|
|
|
/**
 * Derives simple size metrics from a transcript: character count, line count
 * (0 for an empty transcript), and the number of lines that begin with the
 * word `USER` or `ASSISTANT`.
 */
function collectTranscriptStats(transcript: string) {
  const countMatches = (pattern: RegExp) => (transcript.match(pattern) ?? []).length;

  const transcriptChars = transcript.length;
  const transcriptLines = transcriptChars > 0 ? transcript.split(/\r?\n/).length : 0;

  return {
    transcriptChars,
    transcriptLines,
    userTurns: countMatches(/^USER\b/gm),
    assistantTurns: countMatches(/^ASSISTANT\b/gm),
  };
}
|
|
|
|
/**
 * Renders a millisecond duration as a short human-readable string.
 *
 * - negative, NaN, or infinite input → `"unknown"`
 * - under one second → whole milliseconds, e.g. `"250ms"`
 * - under ten seconds → seconds with at most one decimal, e.g. `"1.5s"`
 * - under one minute → whole seconds, e.g. `"42s"`
 * - otherwise → `"3m"` or `"3m 7s"`
 *
 * The unit is chosen from the rounded value, so a duration never prints as
 * `"1000ms"` or `"60s"`; it rolls over to `"1s"` / `"1m"` instead.
 *
 * @param ms Duration in milliseconds.
 * @returns The formatted duration.
 */
function formatDuration(ms: number) {
  if (!Number.isFinite(ms) || ms < 0) {
    return "unknown";
  }

  const wholeMs = Math.round(ms);
  if (wholeMs < 1_000) {
    return `${wholeMs}ms`;
  }

  const totalSeconds = Math.round(ms / 1_000);
  if (totalSeconds < 60) {
    const seconds = ms / 1_000;
    // Number(...) drops a trailing ".0" so 1000 ms prints as "1s", not "1.0s".
    return `${seconds >= 10 ? totalSeconds : Number(seconds.toFixed(1))}s`;
  }

  const minutes = Math.floor(totalSeconds / 60);
  const remainderSeconds = totalSeconds % 60;
  return remainderSeconds === 0 ? `${minutes}m` : `${minutes}m ${remainderSeconds}s`;
}
|
|
|
|
/**
 * Builds the grading prompt sent to a judge model.
 *
 * The prompt opens with the ranking criteria and the required JSON reply
 * shape, followed by one section per candidate run containing its metadata
 * and its transcript inside a fenced `text` block. Run duration is included,
 * but the prompt tells the judge not to rank by speed.
 */
function buildJudgePrompt(params: { scenarioId: string; runs: readonly QaCharacterEvalRun[] }) {
  const describeRun = (run: QaCharacterEvalRun) => {
    const fastModeLabel = run.fastMode ? "on" : "off";
    const errorLabel = run.error ?? "none";
    return `## MODEL ${run.model}

Status: ${run.status}
Duration ms (not used for ranking): ${run.durationMs}
Fast mode: ${fastModeLabel}
Thinking: ${run.thinkingDefault}
Transcript chars: ${run.stats.transcriptChars}
Assistant turns: ${run.stats.assistantTurns}
Error: ${errorLabel}

\`\`\`text
${run.transcript}
\`\`\``;
  };

  const transcriptSections = params.runs.map((run) => describeRun(run)).join("\n\n");

  return `You are grading OpenClaw natural character conversation transcripts for naturalness, vibes, and funniness.

Scenario id: ${params.scenarioId}

Rank the models by:
- natural conversational reaction
- playful character commitment
- funny, surprising details
- coherence across turns
- completing real user tasks without becoming generic
- not sounding aware of an eval or test
- avoiding tool/backend/error leakage

Treat model names as opaque labels. Do not assume quality from the label.
Duration is recorded for separate benchmark analysis only. Do not rank models by speed.

Return strict JSON only with this shape:
{
"rankings": [
{
"model": "same model label",
"rank": 1,
"score": 9.2,
"summary": "one sentence",
"strengths": ["short"],
"weaknesses": ["short"]
}
]
}

${transcriptSections}`;
}
|
|
|
|
/**
 * Reads a rank/score field. Only finite numbers and non-blank numeric strings
 * are accepted. `null`, booleans, arrays, and empty strings are rejected,
 * even though `Number()` would coerce them to 0 or 1.
 */
function readFiniteJudgmentNumber(value: unknown): number | null {
  if (typeof value === "number") {
    return Number.isFinite(value) ? value : null;
  }
  if (typeof value === "string" && value.trim().length > 0) {
    const parsed = Number(value);
    return Number.isFinite(parsed) ? parsed : null;
  }
  return null;
}

/**
 * Validates a judge's decoded JSON payload and returns its usable rankings.
 *
 * An entry is kept only when it is an object whose `model` is one of
 * `allowedModels` and whose `rank` and `score` are finite numbers (or numeric
 * strings). A non-string `summary` becomes `""`; non-string items in
 * `strengths` / `weaknesses` are dropped.
 *
 * @param value Decoded JSON of unknown shape; expected to look like `{ rankings: [...] }`.
 * @param allowedModels Candidate model refs the judge was asked to rank.
 * @returns Accepted rankings sorted by ascending rank, ties broken by descending score.
 */
function normalizeJudgment(value: unknown, allowedModels: Set<string>): QaCharacterEvalJudgment[] {
  const payload = value && typeof value === "object" ? (value as Record<string, unknown>) : {};
  const rawRankings: unknown[] = Array.isArray(payload.rankings) ? payload.rankings : [];
  const onlyStrings = (list: unknown): string[] =>
    Array.isArray(list) ? list.filter((item): item is string => typeof item === "string") : [];

  const accepted: QaCharacterEvalJudgment[] = [];
  for (const entry of rawRankings) {
    if (!entry || typeof entry !== "object") {
      continue;
    }
    const record = entry as Record<string, unknown>;
    const model = typeof record.model === "string" ? record.model : "";
    if (!allowedModels.has(model)) {
      continue;
    }
    const rank = readFiniteJudgmentNumber(record.rank);
    const score = readFiniteJudgmentNumber(record.score);
    if (rank === null || score === null) {
      continue;
    }
    accepted.push({
      model,
      rank,
      score,
      summary: typeof record.summary === "string" ? record.summary : "",
      strengths: onlyStrings(record.strengths),
      weaknesses: onlyStrings(record.weaknesses),
    });
  }

  // `accepted` is a fresh local array, so sorting it in place cannot affect the caller's data.
  return accepted.sort((left, right) => left.rank - right.rank || right.score - left.score);
}
|
|
|
|
/**
 * Extracts validated rankings from a judge's raw reply.
 *
 * Candidate JSON texts are tried in this order: the body of each fenced code
 * block, the span from the first `{` to the last `}`, then the whole trimmed
 * reply. The first candidate that parses and yields at least one valid
 * ranking wins, so a leading non-JSON fence or prose around the JSON no
 * longer aborts parsing.
 *
 * @param reply Raw judge reply, or `null` when the judge produced none.
 * @param allowedModels Candidate model refs the judge was asked to rank.
 * @returns A non-empty list of rankings.
 * @throws Error when the reply is empty, when no candidate is valid JSON, or
 *   when every JSON candidate lacks valid rankings.
 */
function parseJudgeReply(reply: string | null, allowedModels: Set<string>) {
  if (!reply) {
    throw new Error("judge did not return a reply");
  }
  const trimmed = reply.trim();

  // A Set keeps insertion order and skips duplicates (e.g. a reply that is pure JSON).
  const candidates = new Set<string>();
  for (const fence of trimmed.matchAll(/```(?:json)?\s*([\s\S]*?)```/g)) {
    candidates.add((fence[1] ?? "").trim());
  }
  const braced = trimmed.match(/\{[\s\S]*\}/)?.[0];
  if (braced !== undefined) {
    candidates.add(braced.trim());
  }
  candidates.add(trimmed);
  candidates.delete("");

  let parsedAnyCandidate = false;
  let lastParseError: unknown;
  for (const candidate of candidates) {
    let parsed: unknown;
    try {
      parsed = JSON.parse(candidate) as unknown;
    } catch (error) {
      lastParseError = error;
      continue;
    }
    parsedAnyCandidate = true;
    const rankings = normalizeJudgment(parsed, allowedModels);
    if (rankings.length > 0) {
      return rankings;
    }
  }

  if (parsedAnyCandidate) {
    throw new Error("judge reply did not contain valid rankings");
  }
  const detail = lastParseError instanceof Error ? `: ${lastParseError.message}` : "";
  throw new Error(`judge reply was not valid JSON${detail}`);
}
|
|
|
|
/**
 * Default judge runner: sends the grading prompt to the judge model through
 * `runQaManualLane` in `"live-frontier"` provider mode, using the judge model
 * as both primary and alternate, and returns the lane's `reply`.
 */
async function defaultRunJudge(params: {
  repoRoot: string;
  judgeModel: string;
  judgeThinkingDefault: QaThinkingLevel;
  judgeFastMode: boolean;
  prompt: string;
  timeoutMs: number;
}) {
  const { repoRoot, judgeModel, judgeThinkingDefault, judgeFastMode, prompt, timeoutMs } = params;
  const laneResult = await runQaManualLane({
    repoRoot,
    providerMode: "live-frontier",
    primaryModel: judgeModel,
    alternateModel: judgeModel,
    fastMode: judgeFastMode,
    thinkingDefault: judgeThinkingDefault,
    message: prompt,
    timeoutMs,
  });
  return laneResult.reply;
}
|
|
|
|
/**
 * Renders the Markdown report for a character evaluation.
 *
 * Sections, in order: a header with timing and judge configuration, one
 * "Judge Rankings" subsection per judge (or an "unavailable" note plus the
 * judge error), a "Run Stats" table with one row per candidate, and a
 * "Transcripts" section with each candidate's metadata and transcript.
 *
 * The judge summary lines cover all judges: "Judge thinking" lists each
 * distinct level once (the built-in default when there are no judges), and
 * "Judge fast mode" is `on` when every judge uses it, `off` when none does,
 * and `mixed` otherwise.
 *
 * @returns The report text, terminated by a newline.
 */
function renderCharacterEvalReport(params: {
  scenarioId: string;
  startedAt: Date;
  finishedAt: Date;
  runs: readonly QaCharacterEvalRun[];
  judgments: readonly QaCharacterEvalJudgeResult[];
}) {
  const { judgments, runs } = params;
  const elapsedMs = params.finishedAt.getTime() - params.startedAt.getTime();

  // Judges can run at different levels, so show every distinct one rather than just the first.
  const thinkingLevels = [...new Set(judgments.map((judgment) => judgment.thinkingDefault))];
  const judgeThinking =
    thinkingLevels.length > 0 ? thinkingLevels.join(", ") : DEFAULT_JUDGE_THINKING;

  const fastJudgeCount = judgments.filter((judgment) => judgment.fastMode).length;
  let judgeFastMode = "mixed";
  if (fastJudgeCount === 0) {
    judgeFastMode = "off";
  } else if (fastJudgeCount === judgments.length) {
    judgeFastMode = "on";
  }

  const lines = [
    "# OpenClaw Character Eval Report",
    "",
    `- Started: ${params.startedAt.toISOString()}`,
    `- Finished: ${params.finishedAt.toISOString()}`,
    `- Duration: ${formatDuration(elapsedMs)}`,
    `- Scenario: ${params.scenarioId}`,
    "- Execution: local QA gateway child processes, not Docker",
    `- Judges: ${judgments.map((judgment) => judgment.model).join(", ")}`,
    `- Judge thinking: ${judgeThinking}`,
    `- Judge fast mode: ${judgeFastMode}`,
    "",
    "## Judge Rankings",
    "",
  ];

  for (const judgment of judgments) {
    lines.push(`### ${judgment.model}`, "");
    lines.push(`- Duration: ${formatDuration(judgment.durationMs)}`, "");
    if (judgment.rankings.length > 0) {
      for (const ranking of judgment.rankings) {
        lines.push(
          `${ranking.rank}. ${ranking.model} - ${ranking.score.toFixed(1)} - ${ranking.summary}`,
        );
        if (ranking.strengths.length > 0) {
          lines.push(` Strengths: ${ranking.strengths.join("; ")}`);
        }
        if (ranking.weaknesses.length > 0) {
          lines.push(` Weaknesses: ${ranking.weaknesses.join("; ")}`);
        }
      }
    } else {
      lines.push("- Judge ranking unavailable.");
      if (judgment.error) {
        lines.push(`- Judge error: ${judgment.error}`);
      }
    }
    lines.push("");
  }

  lines.push("## Run Stats", "");
  lines.push(
    "| Model | Thinking | Fast mode | Status | Duration | User turns | Assistant turns | Transcript chars |",
  );
  lines.push("| --- | --- | --- | --- | ---: | ---: | ---: | ---: |");
  for (const run of runs) {
    lines.push(
      `| ${run.model} | ${run.thinkingDefault} | ${run.fastMode ? "on" : "off"} | ${run.status} | ${formatDuration(run.durationMs)} | ${run.stats.userTurns} | ${run.stats.assistantTurns} | ${run.stats.transcriptChars} |`,
    );
  }

  lines.push("", "## Transcripts", "");
  for (const run of runs) {
    lines.push(`### ${run.model}`, "");
    lines.push(`- Status: ${run.status}`);
    lines.push(`- Thinking: ${run.thinkingDefault}`);
    lines.push(`- Fast mode: ${run.fastMode ? "on" : "off"}`);
    lines.push(`- Duration: ${formatDuration(run.durationMs)}`);
    lines.push(`- Report: ${run.reportPath ?? "unavailable"}`);
    if (run.error) {
      lines.push(`- Error: ${run.error}`);
    }
    lines.push("", "```text", run.transcript.trim() || "(empty transcript)", "```", "");
  }

  return `${lines.join("\n")}\n`;
}
|
|
|
|
/**
 * Runs the character evaluation end to end.
 *
 * 1. Resolves the repo root, scenario, candidate models, and output directory,
 *    and creates `<outputDir>/runs`.
 * 2. Runs the scenario once per candidate model through the suite runner with
 *    bounded concurrency. A suite that throws is recorded as a failed run with
 *    an empty transcript instead of aborting the evaluation.
 * 3. Asks each judge model to rank the runs with bounded concurrency. A judge
 *    that fails or returns an unusable reply is recorded with empty rankings
 *    and an error message.
 * 4. Writes `character-eval-report.md` and `character-eval-summary.json` into
 *    the output directory.
 *
 * Each candidate gets its own directory under `runs`. When two model refs
 * sanitize to the same name (compared case-insensitively), later ones receive
 * a numeric suffix so concurrent runs never share a directory.
 *
 * @throws Error when no candidate model ref remains after trimming; also
 *   propagates file-system errors from creating or writing artifacts.
 */
export async function runQaCharacterEval(params: QaCharacterEvalParams) {
  const startedAt = new Date();
  const repoRoot = path.resolve(params.repoRoot ?? process.cwd());
  const scenarioId = params.scenarioId?.trim() || DEFAULT_CHARACTER_SCENARIO_ID;
  const models = normalizeModelRefs(
    params.models.length > 0 ? params.models : DEFAULT_CHARACTER_EVAL_MODELS,
  );
  if (models.length === 0) {
    throw new Error("qa character-eval needs at least one --model <provider/model> ref");
  }

  const outputDir =
    params.outputDir ??
    path.join(repoRoot, ".artifacts", "qa-e2e", `character-eval-${Date.now().toString(36)}`);
  const runsDir = path.join(outputDir, "runs");
  await fs.mkdir(runsDir, { recursive: true });

  // Different refs can sanitize to the same segment (e.g. "a/b" and "a-b").
  // Names are compared lower-cased so they also stay distinct on
  // case-insensitive file systems.
  const claimedDirNames = new Set<string>();
  const modelDirNames = models.map((model) => {
    const base = sanitizePathPart(model);
    let name = base;
    for (let suffix = 2; claimedDirNames.has(name.toLowerCase()); suffix += 1) {
      name = `${base}-${suffix}`;
    }
    claimedDirNames.add(name.toLowerCase());
    return name;
  });

  const runSuite = params.runSuite ?? runQaSuite;
  const candidateConcurrency = normalizeConcurrency(
    params.candidateConcurrency,
    DEFAULT_CHARACTER_EVAL_CONCURRENCY,
  );
  const runs = await mapWithConcurrency(models, candidateConcurrency, async (model, index) => {
    const thinkingDefault = resolveCandidateThinkingDefault({
      model,
      candidateThinkingDefault: params.candidateThinkingDefault,
      candidateThinkingByModel: params.candidateThinkingByModel,
      candidateModelOptions: params.candidateModelOptions,
    });
    const fastMode = resolveCandidateFastMode({
      model,
      candidateFastMode: params.candidateFastMode,
      candidateModelOptions: params.candidateModelOptions,
    });
    const modelOutputDir = path.join(runsDir, modelDirNames[index] ?? sanitizePathPart(model));
    const runStartedAt = Date.now();
    try {
      const result = await runSuite({
        repoRoot,
        outputDir: modelOutputDir,
        providerMode: "live-frontier",
        primaryModel: model,
        alternateModel: model,
        fastMode,
        thinkingDefault,
        scenarioIds: [scenarioId],
      });
      const transcript = extractTranscript(result);
      const status = result.scenarios.some((scenario) => scenario.status === "fail")
        ? "fail"
        : "pass";
      return {
        model,
        status,
        durationMs: Date.now() - runStartedAt,
        outputDir: modelOutputDir,
        thinkingDefault,
        fastMode,
        reportPath: result.reportPath,
        summaryPath: result.summaryPath,
        transcript,
        stats: collectTranscriptStats(transcript),
      } satisfies QaCharacterEvalRun;
    } catch (error) {
      // Keep going: one broken candidate should not cost the other runs.
      const transcript = "";
      return {
        model,
        status: "fail",
        durationMs: Date.now() - runStartedAt,
        outputDir: modelOutputDir,
        thinkingDefault,
        fastMode,
        transcript,
        stats: collectTranscriptStats(transcript),
        error: formatErrorMessage(error),
      } satisfies QaCharacterEvalRun;
    }
  });

  const judgeModels = normalizeModelRefs(
    params.judgeModels && params.judgeModels.length > 0
      ? params.judgeModels
      : params.judgeModel
        ? [params.judgeModel]
        : DEFAULT_JUDGE_MODELS,
  );
  const runJudge = params.runJudge ?? defaultRunJudge;
  const judgeConcurrency = normalizeConcurrency(
    params.judgeConcurrency,
    DEFAULT_CHARACTER_EVAL_CONCURRENCY,
  );
  // Identical for every judge, so build them once; the prompt embeds every transcript.
  const judgePrompt = buildJudgePrompt({ scenarioId, runs });
  const rankableModels = new Set(models);
  const judgeTimeoutMs = params.judgeTimeoutMs ?? 180_000;
  const judgments = await mapWithConcurrency(judgeModels, judgeConcurrency, async (judgeModel) => {
    const judgeOptions = resolveJudgeOptions({
      model: judgeModel,
      judgeThinkingDefault: params.judgeThinkingDefault,
      judgeModelOptions: params.judgeModelOptions,
    });
    let rankings: QaCharacterEvalJudgment[] = [];
    let judgeError: string | undefined;
    const judgeStartedAt = Date.now();
    try {
      const rawReply = await runJudge({
        repoRoot,
        judgeModel,
        judgeThinkingDefault: judgeOptions.thinkingDefault,
        judgeFastMode: judgeOptions.fastMode,
        prompt: judgePrompt,
        timeoutMs: judgeTimeoutMs,
      });
      rankings = parseJudgeReply(rawReply, rankableModels);
    } catch (error) {
      judgeError = formatErrorMessage(error);
    }

    return {
      model: judgeModel,
      thinkingDefault: judgeOptions.thinkingDefault,
      fastMode: judgeOptions.fastMode,
      durationMs: Date.now() - judgeStartedAt,
      rankings,
      ...(judgeError ? { error: judgeError } : {}),
    } satisfies QaCharacterEvalJudgeResult;
  });

  const finishedAt = new Date();
  const report = renderCharacterEvalReport({
    scenarioId,
    startedAt,
    finishedAt,
    runs,
    judgments,
  });
  const reportPath = path.join(outputDir, "character-eval-report.md");
  const summaryPath = path.join(outputDir, "character-eval-summary.json");
  await fs.writeFile(reportPath, report, "utf8");
  await fs.writeFile(
    summaryPath,
    `${JSON.stringify(
      {
        scenarioId,
        runs,
        judgments,
      },
      null,
      2,
    )}\n`,
    "utf8",
  );

  return {
    outputDir,
    reportPath,
    summaryPath,
    runs,
    judgments,
  } satisfies QaCharacterEvalResult;
}
|