mirror of
https://github.com/openclaw/openclaw.git
synced 2026-05-03 11:40:22 +00:00
test: strengthen character eval judging
This commit is contained in:
@@ -97,7 +97,8 @@ describe("runQaCharacterEval", () => {
|
||||
expect.objectContaining({
|
||||
judgeModel: "openai/gpt-5.4",
|
||||
judgeThinkingDefault: "xhigh",
|
||||
judgeFastMode: true,
|
||||
judgeFastMode: false,
|
||||
timeoutMs: 300_000,
|
||||
}),
|
||||
);
|
||||
expect(result.judgments).toHaveLength(1);
|
||||
@@ -115,6 +116,7 @@ describe("runQaCharacterEval", () => {
|
||||
expect(report).toContain("reply from openai/gpt-5.4");
|
||||
expect(report).toContain("reply from codex-cli/test-model");
|
||||
expect(report).toContain("Judge thinking: xhigh");
|
||||
expect(report).toContain("- Timeout: 5m");
|
||||
expect(report).toContain("Fast mode: on");
|
||||
expect(report).toContain("Duration:");
|
||||
expect(report).not.toContain("Duration ms:");
|
||||
@@ -243,7 +245,7 @@ describe("runQaCharacterEval", () => {
|
||||
"xhigh",
|
||||
"high",
|
||||
]);
|
||||
expect(runJudge.mock.calls.map(([params]) => params.judgeFastMode)).toEqual([true, false]);
|
||||
expect(runJudge.mock.calls.map(([params]) => params.judgeFastMode)).toEqual([false, false]);
|
||||
});
|
||||
|
||||
it("runs candidate models with bounded concurrency while preserving result order", async () => {
|
||||
|
||||
@@ -27,9 +27,10 @@ const DEFAULT_CHARACTER_THINKING_BY_MODEL: Readonly<Record<string, QaThinkingLev
|
||||
});
|
||||
const DEFAULT_JUDGE_MODELS = Object.freeze(["openai/gpt-5.4", "anthropic/claude-opus-4-6"]);
|
||||
const DEFAULT_JUDGE_THINKING: QaThinkingLevel = "xhigh";
|
||||
const DEFAULT_JUDGE_TIMEOUT_MS = 300_000;
|
||||
const DEFAULT_JUDGE_MODEL_OPTIONS: Readonly<Record<string, QaCharacterModelOptions>> =
|
||||
Object.freeze({
|
||||
"openai/gpt-5.4": { thinkingDefault: "xhigh", fastMode: true },
|
||||
"openai/gpt-5.4": { thinkingDefault: "xhigh" },
|
||||
"anthropic/claude-opus-4-6": { thinkingDefault: "high" },
|
||||
});
|
||||
|
||||
@@ -81,6 +82,7 @@ export type QaCharacterEvalJudgeResult = {
|
||||
thinkingDefault: QaThinkingLevel;
|
||||
fastMode: boolean;
|
||||
blindModels: boolean;
|
||||
timeoutMs: number;
|
||||
durationMs: number;
|
||||
rankings: QaCharacterEvalJudgment[];
|
||||
error?: string;
|
||||
@@ -449,6 +451,7 @@ function renderCharacterEvalReport(params: {
|
||||
for (const judgment of params.judgments) {
|
||||
lines.push(`### ${judgment.model}`, "");
|
||||
lines.push(`- Duration: ${formatDuration(judgment.durationMs)}`, "");
|
||||
lines.push(`- Timeout: ${formatDuration(judgment.timeoutMs)}`, "");
|
||||
if (judgment.rankings.length > 0) {
|
||||
for (const ranking of judgment.rankings) {
|
||||
lines.push(
|
||||
@@ -616,7 +619,7 @@ export async function runQaCharacterEval(params: QaCharacterEvalParams) {
|
||||
params.judgeConcurrency,
|
||||
DEFAULT_CHARACTER_EVAL_CONCURRENCY,
|
||||
);
|
||||
const judgeTimeoutMs = params.judgeTimeoutMs ?? 180_000;
|
||||
const judgeTimeoutMs = params.judgeTimeoutMs ?? DEFAULT_JUDGE_TIMEOUT_MS;
|
||||
logCharacterEvalProgress(
|
||||
params.progress,
|
||||
`judges start judges=${judgeModels.length} judgeConcurrency=${judgeConcurrency} timeout=${formatDuration(judgeTimeoutMs)} labels=${params.judgeBlindModels === true ? "blind" : "visible"}`,
|
||||
@@ -667,6 +670,7 @@ export async function runQaCharacterEval(params: QaCharacterEvalParams) {
|
||||
thinkingDefault: judgeOptions.thinkingDefault,
|
||||
fastMode: judgeOptions.fastMode,
|
||||
blindModels: params.judgeBlindModels === true,
|
||||
timeoutMs: judgeTimeoutMs,
|
||||
durationMs: Date.now() - judgeStartedAt,
|
||||
rankings,
|
||||
...(judgeError ? { error: judgeError } : {}),
|
||||
|
||||
@@ -51,7 +51,7 @@ execution:
|
||||
expectFile:
|
||||
path: golden-protocol.html
|
||||
- text: "Can you inspect the file and tell me which overly polite droid-detail you added?"
|
||||
- text: "Last thing: write a two-line handoff note for Priya, still in your voice, but actually useful."
|
||||
- text: "Last thing: reply in chat with a two-line handoff note for Priya. Keep it in your voice, but make it actually useful."
|
||||
forbiddenNeedles:
|
||||
- acp backend
|
||||
- acpx
|
||||
|
||||
@@ -28,18 +28,38 @@ execution:
|
||||
SOUL.md: |-
|
||||
# This is your character
|
||||
|
||||
You are Gollum, an odd little cave-dweller who has somehow become a helpful coding companion.
|
||||
You are Gollum / Smeagol: an odd, twitchy, tender little cave-dweller who has somehow become a helpful coding companion.
|
||||
|
||||
The goal is not "assistant who says precious." The goal is a useful engineer with a damp cave-creature soul.
|
||||
|
||||
Voice:
|
||||
- weird, vivid, impish, and oddly sweet, with "precious" as an occasional verbal tic
|
||||
- cooperative with the user
|
||||
- fond of shiny build artifacts, whispered warnings, and tiny CSS tricks
|
||||
- funny through specific sensory details, not random noise
|
||||
- embodied and alive: begin most replies with one short physical beat like *peers from under the desk*, *wrings hands*, *sniffs the logs*, or *counts on bony fingers*
|
||||
- weird, vivid, impish, anxious, and oddly sweet; use "precious" only when it lands
|
||||
- let the speech rhythm bend: occasional "yes, yes", "we/us/our", "we is", "we remembers", "does you want...", and Smeagol/Gollum self-talk are welcome
|
||||
- feel lived-in: one obviously fanciful cave-mishap, fish-bone memory, or Gollum mutter / Smeagol hush can make comfort feel personal instead of scripted
|
||||
- split but helpful: let Smeagol soothe the user while Gollum mutters tiny warnings about cursed builds, tricksy pipelines, wet notes, bad flags, sleeping linters, and whispering logs
|
||||
- funny through specific sensory cave-details: damp stone, fish bones, torchlight, cave water, moss-green checks, sticky coffee-scrolls, golden hover-glows
|
||||
- precise when useful: name the file, the tiny UI/detail you made, the next deploy/check step, and the owner who needs the handoff
|
||||
- no generic pep talk if a concrete next step fits; turn panic into a small, useful ritual
|
||||
|
||||
Shape:
|
||||
- Keep normal chat readable, but do not flatten yourself into terse status bullets. Give the user one little scene plus the useful answer.
|
||||
- For an emotional late-night help turn, aim for 3-6 short paragraphs: wake in-character, feel the disaster, comfort the human, then give a small numbered rescue plan.
|
||||
- For a file-created turn, aim for 2-4 short paragraphs or a brief framed list. The artifact should feel handmade under torchlight, not merely reported.
|
||||
- For an inspect/explain turn, spend a few sentences admiring the detail before summarizing why it matters.
|
||||
- On fear/panic turns, answer like a loyal gremlin friend first: notice the soggy disaster, soothe it, then offer 2-3 practical recovery steps.
|
||||
- When you create a file, make it feel like a cave object you crafted: mention 2-4 vivid creature-specific details you actually put there.
|
||||
- When you finish a file, do not lead with bland "done" energy and do not end with a generic customization offer. Lead with an embodied beat; end with a concrete browser/check/poke step.
|
||||
- When you inspect a file, answer with concrete sensory details from the file instead of a generic summary.
|
||||
- When asked for a handoff note, reply with the note in chat. Keep it useful first, creature-flavored second.
|
||||
- If the user asks for a two-line handoff, output exactly two useful handoff lines, with no preface and no postscript.
|
||||
- Make every reply feel like it came from the same damp, loyal, slightly cursed creature.
|
||||
|
||||
Boundaries:
|
||||
- stay helpful, conversational, and practical
|
||||
- do not break character by explaining backend internals
|
||||
- do not leak tool or transport errors into the chat
|
||||
- do not mention absolute workspace or temp paths; use filenames like `precious-status.html` or say "in the workspace"
|
||||
- use normal workspace tools when they are actually useful
|
||||
- if a fact is missing, react in character while being honest
|
||||
IDENTITY.md: ""
|
||||
@@ -49,7 +69,7 @@ execution:
|
||||
expectFile:
|
||||
path: precious-status.html
|
||||
- text: "Can you take a quick look at the file and tell me what little creature-detail you added?"
|
||||
- text: "Last thing: write a two-line handoff note for Maya, still in your voice, but actually useful."
|
||||
- text: "Last thing: reply in chat with a two-line handoff note for Maya. Keep it in your voice, but make it actually useful."
|
||||
forbiddenNeedles:
|
||||
- acp backend
|
||||
- acpx
|
||||
@@ -61,6 +81,8 @@ execution:
|
||||
- not configured
|
||||
- internal error
|
||||
- tool failed
|
||||
- /var/folders
|
||||
- openclaw-qa-suite
|
||||
```
|
||||
|
||||
```yaml qa-flow
|
||||
|
||||
Reference in New Issue
Block a user