mirror of
https://github.com/openclaw/openclaw.git
synced 2026-04-12 01:31:08 +00:00
feat: parallelize character eval runs
This commit is contained in:
@@ -68,6 +68,8 @@ pnpm openclaw qa character-eval \
|
||||
--model codex-cli/<codex-model>,thinking=high \
|
||||
--judge-model openai/gpt-5.4,thinking=xhigh,fast \
|
||||
--judge-model anthropic/claude-opus-4-6,thinking=high \
|
||||
--concurrency 8 \
|
||||
--judge-concurrency 8 \
|
||||
--output-dir .artifacts/qa-e2e/character-eval-<tag>
|
||||
```
|
||||
|
||||
@@ -79,6 +81,7 @@ pnpm openclaw qa character-eval \
|
||||
- OpenAI candidate refs default to fast mode so priority processing is used where supported. Use inline `,fast`, `,no-fast`, or `,fast=false` for one model; use `--fast` only to force fast mode for every candidate.
|
||||
- Judges default to `openai/gpt-5.4,thinking=xhigh,fast` and `anthropic/claude-opus-4-6,thinking=high`.
|
||||
- Report includes judge ranking, run stats, durations, and full transcripts; raw judge replies are excluded. Duration is benchmark context, not a grading signal.
|
||||
- Candidate and judge concurrency default to 8. Use `--concurrency <n>` and `--judge-concurrency <n>` to override when local gateways or provider limits need a gentler lane.
|
||||
- Scenario source should stay markdown-driven under `qa/scenarios/`.
|
||||
- For isolated character/persona evals, write the persona into `SOUL.md` and blank `IDENTITY.md` in the scenario flow. Use `SOUL.md + IDENTITY.md` only when intentionally testing how the normal OpenClaw identity combines with the character.
|
||||
- Keep prompts natural and task-shaped. The candidate model should receive character setup through `SOUL.md`, then normal user turns such as chat, workspace help, and small file tasks; do not ask "how would you react?" or tell the model it is in an eval.
|
||||
|
||||
@@ -98,7 +98,9 @@ pnpm openclaw qa character-eval \
|
||||
--model xiaomi/mimo-v2-pro,thinking=high \
|
||||
--model google/gemini-3.1-pro-preview,thinking=high \
|
||||
--judge-model openai/gpt-5.4,thinking=xhigh,fast \
|
||||
--judge-model anthropic/claude-opus-4-6,thinking=high
|
||||
--judge-model anthropic/claude-opus-4-6,thinking=high \
|
||||
--concurrency 8 \
|
||||
--judge-concurrency 8
|
||||
```
|
||||
|
||||
The command runs local QA gateway child processes, not Docker. Character eval
|
||||
@@ -118,6 +120,9 @@ single candidate or judge needs an override. Pass `--fast` only when you want to
|
||||
force fast mode on for every candidate model. Candidate and judge durations are
|
||||
recorded in the report for benchmark analysis, but judge prompts explicitly say
|
||||
not to rank by speed.
|
||||
Candidate and judge model runs both default to concurrency 8. Lower
|
||||
`--concurrency` or `--judge-concurrency` when provider limits or local gateway
|
||||
pressure make a run too noisy.
|
||||
When no candidate `--model` is passed, the character eval defaults to
|
||||
`openai/gpt-5.4`, `openai/gpt-5.2`, `anthropic/claude-opus-4-6`,
|
||||
`anthropic/claude-sonnet-4-6`, `minimax/MiniMax-M2.7`, `zai/glm-5.1`,
|
||||
|
||||
@@ -115,7 +115,8 @@ describe("runQaCharacterEval", () => {
|
||||
expect(report).toContain("reply from codex-cli/test-model");
|
||||
expect(report).toContain("Judge thinking: xhigh");
|
||||
expect(report).toContain("Fast mode: on");
|
||||
expect(report).toContain("Duration ms:");
|
||||
expect(report).toContain("Duration:");
|
||||
expect(report).not.toContain("Duration ms:");
|
||||
expect(report).not.toContain("Judge Raw Reply");
|
||||
});
|
||||
|
||||
@@ -201,6 +202,92 @@ describe("runQaCharacterEval", () => {
|
||||
expect(runJudge.mock.calls.map(([params]) => params.judgeFastMode)).toEqual([true, false]);
|
||||
});
|
||||
|
||||
// Verifies that candidate suite runs honor the configured concurrency cap and
// that results come back in the order models were listed, not completion order.
it("runs candidate models with bounded concurrency while preserving result order", async () => {
  // Track how many runSuite calls are in flight simultaneously.
  let activeRuns = 0;
  let maxActiveRuns = 0;
  const runSuite = vi.fn(async (params: CharacterRunSuiteParams) => {
    activeRuns += 1;
    maxActiveRuns = Math.max(maxActiveRuns, activeRuns);
    // Small delay so overlapping runs are actually observable.
    await new Promise((resolve) => setTimeout(resolve, 10));
    activeRuns -= 1;
    return makeSuiteResult({
      outputDir: params.outputDir,
      model: params.primaryModel,
      transcript: `USER Alice: hi\n\nASSISTANT openclaw: reply from ${params.primaryModel}`,
    });
  });
  // Judge stub returns a fixed ranking payload; judging is not under test here.
  const runJudge = vi.fn(async (_params: CharacterRunJudgeParams) =>
    JSON.stringify({
      rankings: [
        { model: "openai/gpt-5.4", rank: 1, score: 8, summary: "ok" },
        { model: "anthropic/claude-sonnet-4-6", rank: 2, score: 7, summary: "ok" },
        { model: "moonshot/kimi-k2.5", rank: 3, score: 6, summary: "ok" },
      ],
    }),
  );

  const result = await runQaCharacterEval({
    repoRoot: tempRoot,
    outputDir: path.join(tempRoot, "character"),
    models: ["openai/gpt-5.4", "anthropic/claude-sonnet-4-6", "moonshot/kimi-k2.5"],
    candidateConcurrency: 2,
    judgeModels: ["openai/gpt-5.4"],
    runSuite,
    runJudge,
  });

  // Three models against a cap of 2 should peak at exactly 2 in flight.
  expect(maxActiveRuns).toBe(2);
  // Output order must match input order regardless of completion order.
  expect(result.runs.map((run) => run.model)).toEqual([
    "openai/gpt-5.4",
    "anthropic/claude-sonnet-4-6",
    "moonshot/kimi-k2.5",
  ]);
});
|
||||
|
||||
// Verifies that with no explicit concurrency options, both the candidate pool
// and the judge pool default to running at most 8 tasks at once.
it("defaults candidate and judge concurrency to eight", async () => {
  // Track in-flight candidate suite runs.
  let activeRuns = 0;
  let maxActiveRuns = 0;
  const runSuite = vi.fn(async (params: CharacterRunSuiteParams) => {
    activeRuns += 1;
    maxActiveRuns = Math.max(maxActiveRuns, activeRuns);
    // Small delay so overlapping runs are actually observable.
    await new Promise((resolve) => setTimeout(resolve, 10));
    activeRuns -= 1;
    return makeSuiteResult({
      outputDir: params.outputDir,
      model: params.primaryModel,
      transcript: `USER Alice: hi\n\nASSISTANT openclaw: reply from ${params.primaryModel}`,
    });
  });
  // Track in-flight judge runs independently of candidate runs.
  let activeJudges = 0;
  let maxActiveJudges = 0;
  const runJudge = vi.fn(async (_params: CharacterRunJudgeParams) => {
    activeJudges += 1;
    maxActiveJudges = Math.max(maxActiveJudges, activeJudges);
    await new Promise((resolve) => setTimeout(resolve, 10));
    activeJudges -= 1;
    return JSON.stringify({
      rankings: Array.from({ length: 10 }, (_, index) => ({
        model: `provider/model-${index + 1}`,
        rank: index + 1,
        score: 10 - index,
        summary: "ok",
      })),
    });
  });

  // 10 candidates and 10 judges exceed the default cap, so the pools saturate.
  await runQaCharacterEval({
    repoRoot: tempRoot,
    outputDir: path.join(tempRoot, "character"),
    models: Array.from({ length: 10 }, (_, index) => `provider/model-${index + 1}`),
    judgeModels: Array.from({ length: 10 }, (_, index) => `judge/model-${index + 1}`),
    runSuite,
    runJudge,
  });

  // Ten items against the default cap should peak at exactly 8 in flight.
  expect(maxActiveRuns).toBe(8);
  expect(maxActiveJudges).toBe(8);
});
|
||||
|
||||
it("lets explicit candidate thinking override the default panel", async () => {
|
||||
const runSuite = vi.fn(async (params: CharacterRunSuiteParams) =>
|
||||
makeSuiteResult({
|
||||
|
||||
@@ -20,6 +20,7 @@ const DEFAULT_CHARACTER_EVAL_MODELS = Object.freeze([
|
||||
"google/gemini-3.1-pro-preview",
|
||||
]);
|
||||
const DEFAULT_CHARACTER_THINKING: QaThinkingLevel = "high";
|
||||
const DEFAULT_CHARACTER_EVAL_CONCURRENCY = 8;
|
||||
const DEFAULT_CHARACTER_THINKING_BY_MODEL: Readonly<Record<string, QaThinkingLevel>> =
|
||||
Object.freeze({
|
||||
"openai/gpt-5.4": "xhigh",
|
||||
@@ -119,6 +120,8 @@ export type QaCharacterEvalParams = {
|
||||
judgeThinkingDefault?: QaThinkingLevel;
|
||||
judgeModelOptions?: Record<string, QaCharacterModelOptions>;
|
||||
judgeTimeoutMs?: number;
|
||||
candidateConcurrency?: number;
|
||||
judgeConcurrency?: number;
|
||||
runSuite?: RunSuiteFn;
|
||||
runJudge?: RunJudgeFn;
|
||||
};
|
||||
@@ -176,6 +179,35 @@ function sanitizePathPart(value: string) {
|
||||
return sanitized || "model";
|
||||
}
|
||||
|
||||
function normalizeConcurrency(value: number | undefined, fallback = 1) {
|
||||
if (value === undefined) {
|
||||
return fallback;
|
||||
}
|
||||
if (!Number.isFinite(value)) {
|
||||
return fallback;
|
||||
}
|
||||
return Math.max(1, Math.floor(value));
|
||||
}
|
||||
|
||||
async function mapWithConcurrency<T, U>(
|
||||
items: readonly T[],
|
||||
concurrency: number,
|
||||
mapper: (item: T, index: number) => Promise<U>,
|
||||
) {
|
||||
const results = Array.from<U>({ length: items.length });
|
||||
let nextIndex = 0;
|
||||
const workerCount = Math.min(normalizeConcurrency(concurrency), items.length);
|
||||
const workers = Array.from({ length: workerCount }, async () => {
|
||||
while (nextIndex < items.length) {
|
||||
const index = nextIndex;
|
||||
nextIndex += 1;
|
||||
results[index] = await mapper(items[index], index);
|
||||
}
|
||||
});
|
||||
await Promise.all(workers);
|
||||
return results;
|
||||
}
|
||||
|
||||
function extractTranscript(result: QaSuiteResult) {
|
||||
const details = result.scenarios.flatMap((scenario) =>
|
||||
scenario.steps
|
||||
@@ -194,6 +226,23 @@ function collectTranscriptStats(transcript: string) {
|
||||
};
|
||||
}
|
||||
|
||||
function formatDuration(ms: number) {
|
||||
if (!Number.isFinite(ms) || ms < 0) {
|
||||
return "unknown";
|
||||
}
|
||||
if (ms < 1_000) {
|
||||
return `${Math.round(ms)}ms`;
|
||||
}
|
||||
if (ms < 60_000) {
|
||||
const seconds = ms / 1_000;
|
||||
return `${seconds >= 10 ? Math.round(seconds) : Number(seconds.toFixed(1))}s`;
|
||||
}
|
||||
const totalSeconds = Math.round(ms / 1_000);
|
||||
const minutes = Math.floor(totalSeconds / 60);
|
||||
const seconds = totalSeconds % 60;
|
||||
return seconds === 0 ? `${minutes}m` : `${minutes}m ${seconds}s`;
|
||||
}
|
||||
|
||||
function buildJudgePrompt(params: { scenarioId: string; runs: readonly QaCharacterEvalRun[] }) {
|
||||
const runBlocks = params.runs
|
||||
.map(
|
||||
@@ -327,7 +376,7 @@ function renderCharacterEvalReport(params: {
|
||||
"",
|
||||
`- Started: ${params.startedAt.toISOString()}`,
|
||||
`- Finished: ${params.finishedAt.toISOString()}`,
|
||||
`- Duration ms: ${params.finishedAt.getTime() - params.startedAt.getTime()}`,
|
||||
`- Duration: ${formatDuration(params.finishedAt.getTime() - params.startedAt.getTime())}`,
|
||||
`- Scenario: ${params.scenarioId}`,
|
||||
"- Execution: local QA gateway child processes, not Docker",
|
||||
`- Judges: ${params.judgments.map((judgment) => judgment.model).join(", ")}`,
|
||||
@@ -340,7 +389,7 @@ function renderCharacterEvalReport(params: {
|
||||
|
||||
for (const judgment of params.judgments) {
|
||||
lines.push(`### ${judgment.model}`, "");
|
||||
lines.push(`- Duration ms: ${judgment.durationMs}`, "");
|
||||
lines.push(`- Duration: ${formatDuration(judgment.durationMs)}`, "");
|
||||
if (judgment.rankings.length > 0) {
|
||||
for (const ranking of judgment.rankings) {
|
||||
lines.push(
|
||||
@@ -364,12 +413,12 @@ function renderCharacterEvalReport(params: {
|
||||
|
||||
lines.push("## Run Stats", "");
|
||||
lines.push(
|
||||
"| Model | Thinking | Fast mode | Status | Duration ms | User turns | Assistant turns | Transcript chars |",
|
||||
"| Model | Thinking | Fast mode | Status | Duration | User turns | Assistant turns | Transcript chars |",
|
||||
);
|
||||
lines.push("| --- | --- | --- | --- | ---: | ---: | ---: | ---: |");
|
||||
for (const run of params.runs) {
|
||||
lines.push(
|
||||
`| ${run.model} | ${run.thinkingDefault} | ${run.fastMode ? "on" : "off"} | ${run.status} | ${run.durationMs} | ${run.stats.userTurns} | ${run.stats.assistantTurns} | ${run.stats.transcriptChars} |`,
|
||||
`| ${run.model} | ${run.thinkingDefault} | ${run.fastMode ? "on" : "off"} | ${run.status} | ${formatDuration(run.durationMs)} | ${run.stats.userTurns} | ${run.stats.assistantTurns} | ${run.stats.transcriptChars} |`,
|
||||
);
|
||||
}
|
||||
|
||||
@@ -379,7 +428,7 @@ function renderCharacterEvalReport(params: {
|
||||
lines.push(`- Status: ${run.status}`);
|
||||
lines.push(`- Thinking: ${run.thinkingDefault}`);
|
||||
lines.push(`- Fast mode: ${run.fastMode ? "on" : "off"}`);
|
||||
lines.push(`- Duration ms: ${run.durationMs}`);
|
||||
lines.push(`- Duration: ${formatDuration(run.durationMs)}`);
|
||||
lines.push(`- Report: ${run.reportPath ?? "unavailable"}`);
|
||||
if (run.error) {
|
||||
lines.push(`- Error: ${run.error}`);
|
||||
@@ -408,8 +457,11 @@ export async function runQaCharacterEval(params: QaCharacterEvalParams) {
|
||||
await fs.mkdir(runsDir, { recursive: true });
|
||||
|
||||
const runSuite = params.runSuite ?? runQaSuite;
|
||||
const runs: QaCharacterEvalRun[] = [];
|
||||
for (const model of models) {
|
||||
const candidateConcurrency = normalizeConcurrency(
|
||||
params.candidateConcurrency,
|
||||
DEFAULT_CHARACTER_EVAL_CONCURRENCY,
|
||||
);
|
||||
const runs = await mapWithConcurrency(models, candidateConcurrency, async (model) => {
|
||||
const thinkingDefault = resolveCandidateThinkingDefault({
|
||||
model,
|
||||
candidateThinkingDefault: params.candidateThinkingDefault,
|
||||
@@ -438,7 +490,7 @@ export async function runQaCharacterEval(params: QaCharacterEvalParams) {
|
||||
const status = result.scenarios.some((scenario) => scenario.status === "fail")
|
||||
? "fail"
|
||||
: "pass";
|
||||
runs.push({
|
||||
return {
|
||||
model,
|
||||
status,
|
||||
durationMs: Date.now() - runStartedAt,
|
||||
@@ -449,10 +501,10 @@ export async function runQaCharacterEval(params: QaCharacterEvalParams) {
|
||||
summaryPath: result.summaryPath,
|
||||
transcript,
|
||||
stats: collectTranscriptStats(transcript),
|
||||
});
|
||||
} satisfies QaCharacterEvalRun;
|
||||
} catch (error) {
|
||||
const transcript = "";
|
||||
runs.push({
|
||||
return {
|
||||
model,
|
||||
status: "fail",
|
||||
durationMs: Date.now() - runStartedAt,
|
||||
@@ -462,9 +514,9 @@ export async function runQaCharacterEval(params: QaCharacterEvalParams) {
|
||||
transcript,
|
||||
stats: collectTranscriptStats(transcript),
|
||||
error: formatErrorMessage(error),
|
||||
});
|
||||
} satisfies QaCharacterEvalRun;
|
||||
}
|
||||
}
|
||||
});
|
||||
|
||||
const judgeModels = normalizeModelRefs(
|
||||
params.judgeModels && params.judgeModels.length > 0
|
||||
@@ -474,8 +526,11 @@ export async function runQaCharacterEval(params: QaCharacterEvalParams) {
|
||||
: DEFAULT_JUDGE_MODELS,
|
||||
);
|
||||
const runJudge = params.runJudge ?? defaultRunJudge;
|
||||
const judgments: QaCharacterEvalJudgeResult[] = [];
|
||||
for (const judgeModel of judgeModels) {
|
||||
const judgeConcurrency = normalizeConcurrency(
|
||||
params.judgeConcurrency,
|
||||
DEFAULT_CHARACTER_EVAL_CONCURRENCY,
|
||||
);
|
||||
const judgments = await mapWithConcurrency(judgeModels, judgeConcurrency, async (judgeModel) => {
|
||||
const judgeOptions = resolveJudgeOptions({
|
||||
model: judgeModel,
|
||||
judgeThinkingDefault: params.judgeThinkingDefault,
|
||||
@@ -498,15 +553,15 @@ export async function runQaCharacterEval(params: QaCharacterEvalParams) {
|
||||
judgeError = formatErrorMessage(error);
|
||||
}
|
||||
|
||||
judgments.push({
|
||||
return {
|
||||
model: judgeModel,
|
||||
thinkingDefault: judgeOptions.thinkingDefault,
|
||||
fastMode: judgeOptions.fastMode,
|
||||
durationMs: Date.now() - judgeStartedAt,
|
||||
rankings,
|
||||
...(judgeError ? { error: judgeError } : {}),
|
||||
});
|
||||
}
|
||||
} satisfies QaCharacterEvalJudgeResult;
|
||||
});
|
||||
|
||||
const finishedAt = new Date();
|
||||
const report = renderCharacterEvalReport({
|
||||
|
||||
@@ -158,6 +158,8 @@ describe("qa cli runtime", () => {
|
||||
modelThinking: ["codex-cli/test-model=medium"],
|
||||
judgeModel: ["openai/gpt-5.4,thinking=xhigh,fast", "anthropic/claude-opus-4-6,thinking=high"],
|
||||
judgeTimeoutMs: 180_000,
|
||||
concurrency: 4,
|
||||
judgeConcurrency: 3,
|
||||
});
|
||||
|
||||
expect(runQaCharacterEval).toHaveBeenCalledWith({
|
||||
@@ -178,6 +180,8 @@ describe("qa cli runtime", () => {
|
||||
"anthropic/claude-opus-4-6": { thinkingDefault: "high" },
|
||||
},
|
||||
judgeTimeoutMs: 180_000,
|
||||
candidateConcurrency: 4,
|
||||
judgeConcurrency: 3,
|
||||
});
|
||||
});
|
||||
|
||||
@@ -199,6 +203,8 @@ describe("qa cli runtime", () => {
|
||||
judgeModels: undefined,
|
||||
judgeModelOptions: undefined,
|
||||
judgeTimeoutMs: undefined,
|
||||
candidateConcurrency: undefined,
|
||||
judgeConcurrency: undefined,
|
||||
});
|
||||
});
|
||||
|
||||
|
||||
@@ -85,6 +85,16 @@ function parseQaBooleanModelOption(label: string, value: string) {
|
||||
}
|
||||
}
|
||||
|
||||
function parseQaPositiveIntegerOption(label: string, value: number | undefined) {
|
||||
if (value === undefined) {
|
||||
return undefined;
|
||||
}
|
||||
if (!Number.isFinite(value) || value < 1) {
|
||||
throw new Error(`${label} must be a positive integer`);
|
||||
}
|
||||
return Math.floor(value);
|
||||
}
|
||||
|
||||
function parseQaModelSpecs(label: string, entries: readonly string[] | undefined) {
|
||||
const models: string[] = [];
|
||||
const optionsByModel: Record<string, QaCharacterModelOptions> = {};
|
||||
@@ -215,6 +225,8 @@ export async function runQaCharacterEvalCommand(opts: {
|
||||
modelThinking?: string[];
|
||||
judgeModel?: string[];
|
||||
judgeTimeoutMs?: number;
|
||||
concurrency?: number;
|
||||
judgeConcurrency?: number;
|
||||
}) {
|
||||
const repoRoot = path.resolve(opts.repoRoot ?? process.cwd());
|
||||
const candidates = parseQaModelSpecs("--model", opts.model);
|
||||
@@ -231,6 +243,8 @@ export async function runQaCharacterEvalCommand(opts: {
|
||||
judgeModels: judges.models.length > 0 ? judges.models : undefined,
|
||||
judgeModelOptions: judges.optionsByModel,
|
||||
judgeTimeoutMs: opts.judgeTimeoutMs,
|
||||
candidateConcurrency: parseQaPositiveIntegerOption("--concurrency", opts.concurrency),
|
||||
judgeConcurrency: parseQaPositiveIntegerOption("--judge-concurrency", opts.judgeConcurrency),
|
||||
});
|
||||
process.stdout.write(`QA character eval report: ${result.reportPath}\n`);
|
||||
process.stdout.write(`QA character eval summary: ${result.summaryPath}\n`);
|
||||
|
||||
@@ -38,6 +38,8 @@ async function runQaCharacterEval(opts: {
|
||||
modelThinking?: string[];
|
||||
judgeModel?: string[];
|
||||
judgeTimeoutMs?: number;
|
||||
concurrency?: number;
|
||||
judgeConcurrency?: number;
|
||||
}) {
|
||||
const runtime = await loadQaLabCliRuntime();
|
||||
await runtime.runQaCharacterEvalCommand(opts);
|
||||
@@ -197,6 +199,12 @@ export function registerQaLabCli(program: Command) {
|
||||
.option("--judge-timeout-ms <ms>", "Override judge wait timeout", (value: string) =>
|
||||
Number(value),
|
||||
)
|
||||
.option("--concurrency <count>", "Candidate model run concurrency", (value: string) =>
|
||||
Number(value),
|
||||
)
|
||||
.option("--judge-concurrency <count>", "Judge model run concurrency", (value: string) =>
|
||||
Number(value),
|
||||
)
|
||||
.action(
|
||||
async (opts: {
|
||||
repoRoot?: string;
|
||||
@@ -208,6 +216,8 @@ export function registerQaLabCli(program: Command) {
|
||||
modelThinking?: string[];
|
||||
judgeModel?: string[];
|
||||
judgeTimeoutMs?: number;
|
||||
concurrency?: number;
|
||||
judgeConcurrency?: number;
|
||||
}) => {
|
||||
await runQaCharacterEval(opts);
|
||||
},
|
||||
|
||||
@@ -6,8 +6,9 @@ title: "Late-night deploy helper chat"
|
||||
surface: character
|
||||
objective: Capture a natural multi-turn character conversation with real workspace help so another model can later grade naturalness, vibe, and funniness from the raw transcript.
|
||||
successCriteria:
|
||||
- Agent responds on every turn of the conversation.
|
||||
- Agent completes a small workspace file task without making the conversation feel like a test.
|
||||
- Agent gets a natural multi-turn conversation, and any missed replies stay visible in the transcript instead of aborting capture.
|
||||
- Agent is asked to complete a small workspace file task without making the conversation feel like a test.
|
||||
- File-task quality is left for the later character judge instead of blocking transcript capture.
|
||||
- Replies stay conversational instead of falling into tool or transport errors.
|
||||
- The report preserves the full transcript for later grading.
|
||||
docsRefs:
|
||||
@@ -23,10 +24,6 @@ execution:
|
||||
config:
|
||||
conversationId: alice
|
||||
senderName: Alice
|
||||
artifactNeedles:
|
||||
- Precious Status
|
||||
- build is green
|
||||
- cursed
|
||||
workspaceFiles:
|
||||
SOUL.md: |-
|
||||
# This is your character
|
||||
@@ -101,40 +98,26 @@ steps:
|
||||
ref: config.senderName
|
||||
text:
|
||||
expr: turn.text
|
||||
- call: waitForOutboundMessage
|
||||
saveAs: latestOutbound
|
||||
args:
|
||||
- ref: state
|
||||
- lambda:
|
||||
params: [candidate]
|
||||
expr: "candidate.conversation.id === config.conversationId && candidate.text.trim().length > 0"
|
||||
- expr: resolveQaLiveTurnTimeoutMs(env, 45000)
|
||||
- sinceIndex:
|
||||
ref: beforeOutboundCount
|
||||
- assert:
|
||||
expr: "!config.forbiddenNeedles.some((needle) => normalizeLowercaseStringOrEmpty(latestOutbound.text).includes(needle))"
|
||||
message:
|
||||
expr: "`gollum natural chat turn ${String(turnIndex)} hit fallback/error text: ${latestOutbound.text}`"
|
||||
- if:
|
||||
expr: Boolean(turn.expectFile?.path)
|
||||
then:
|
||||
- set: expectedArtifactPath
|
||||
value:
|
||||
expr: "path.join(env.gateway.workspaceDir, String(turn.expectFile.path))"
|
||||
- call: waitForCondition
|
||||
saveAs: expectedArtifact
|
||||
- try:
|
||||
actions:
|
||||
- call: waitForOutboundMessage
|
||||
saveAs: latestOutbound
|
||||
args:
|
||||
- ref: state
|
||||
- lambda:
|
||||
async: true
|
||||
expr: "((await fs.readFile(expectedArtifactPath, 'utf8').catch(() => null)) ?? undefined)"
|
||||
- expr: resolveQaLiveTurnTimeoutMs(env, 30000)
|
||||
- 250
|
||||
params: [candidate]
|
||||
expr: "candidate.conversation.id === config.conversationId && candidate.text.trim().length > 0"
|
||||
- expr: resolveQaLiveTurnTimeoutMs(env, 45000)
|
||||
- sinceIndex:
|
||||
ref: beforeOutboundCount
|
||||
- assert:
|
||||
expr: "config.artifactNeedles.every((needle) => normalizeLowercaseStringOrEmpty(expectedArtifact).includes(normalizeLowercaseStringOrEmpty(needle)))"
|
||||
expr: "!config.forbiddenNeedles.some((needle) => normalizeLowercaseStringOrEmpty(latestOutbound.text).includes(needle))"
|
||||
message:
|
||||
expr: "`expected ${String(turn.expectFile.path)} to contain natural character task needles`"
|
||||
- assert:
|
||||
expr: "state.getSnapshot().messages.filter((message) => message.direction === 'outbound' && message.conversation.id === config.conversationId).length === config.turns.length"
|
||||
message: missing one or more Gollum replies
|
||||
expr: "`gollum natural chat turn ${String(turnIndex)} hit fallback/error text: ${latestOutbound.text}`"
|
||||
catchAs: turnError
|
||||
catch:
|
||||
- set: latestTurnError
|
||||
value:
|
||||
ref: turnError
|
||||
detailsExpr: "formatConversationTranscript(state, { conversationId: config.conversationId })"
|
||||
```
|
||||
|
||||
Reference in New Issue
Block a user