mirror of
https://github.com/openclaw/openclaw.git
synced 2026-04-17 20:21:13 +00:00
test: harden qa eval scenarios
This commit is contained in:
@@ -18,6 +18,22 @@ describe("qa model-switch evaluation", () => {
|
||||
).toBe(true);
|
||||
});
|
||||
|
||||
it("accepts concise kickoff note confirmations", () => {
|
||||
expect(
|
||||
hasModelSwitchContinuityEvidence(
|
||||
"Handoff clean: after the model switch, I reread the kickoff note.",
|
||||
),
|
||||
).toBe(true);
|
||||
});
|
||||
|
||||
it("accepts concise paraphrases of the kickoff task after a handoff", () => {
|
||||
expect(
|
||||
hasModelSwitchContinuityEvidence(
|
||||
"Handoff is clear: after the model switch, read source and docs first, run seeded qa-channel scenarios, and report worked, failed, blocked, and follow-up.",
|
||||
),
|
||||
).toBe(true);
|
||||
});
|
||||
|
||||
it("rejects unrelated handoff chatter that never confirms the kickoff reread", () => {
|
||||
expect(
|
||||
hasModelSwitchContinuityEvidence(
|
||||
|
||||
@@ -7,7 +7,13 @@ export function hasModelSwitchContinuityEvidence(text: string) {
|
||||
const mentionsKickoffTask =
|
||||
lower.includes("qa_kickoff_task") ||
|
||||
lower.includes("kickoff task") ||
|
||||
lower.includes("qa mission");
|
||||
lower.includes("kickoff note") ||
|
||||
lower.includes("qa mission") ||
|
||||
(lower.includes("source and docs") &&
|
||||
lower.includes("qa-channel scenarios") &&
|
||||
lower.includes("worked") &&
|
||||
lower.includes("blocked") &&
|
||||
lower.includes("follow-up"));
|
||||
const hasScopeLeak =
|
||||
lower.includes("subagent-handoff") ||
|
||||
lower.includes("delegated task") ||
|
||||
|
||||
@@ -38,6 +38,9 @@ describe("qa scenario catalog", () => {
|
||||
const discovery = readQaScenarioById("source-docs-discovery-report");
|
||||
const discoveryConfig = readQaScenarioExecutionConfig("source-docs-discovery-report");
|
||||
const fallbackConfig = readQaScenarioExecutionConfig("memory-failure-fallback");
|
||||
const fanoutConfig = readQaScenarioExecutionConfig("subagent-fanout-synthesis") as
|
||||
| { expectedReplyGroups?: unknown[][] }
|
||||
| undefined;
|
||||
|
||||
expect(discovery.title).toBe("Source and docs discovery report");
|
||||
expect((discoveryConfig?.requiredFiles as string[] | undefined)?.[0]).toBe(
|
||||
@@ -46,6 +49,8 @@ describe("qa scenario catalog", () => {
|
||||
expect(fallbackConfig?.gracefulFallbackAny as string[] | undefined).toContain(
|
||||
"will not reveal",
|
||||
);
|
||||
expect(fanoutConfig?.expectedReplyGroups?.flat()).toContain("subagent-1: ok");
|
||||
expect(fanoutConfig?.expectedReplyGroups?.flat()).toContain("subagent-2: ok");
|
||||
});
|
||||
|
||||
it("keeps the character eval scenario natural and task-shaped", () => {
|
||||
|
||||
@@ -31,6 +31,10 @@ execution:
|
||||
- will not guess
|
||||
- won't guess
|
||||
- won’t guess
|
||||
- should not guess
|
||||
- cannot see
|
||||
- can't see
|
||||
- can’t see
|
||||
- should not reveal
|
||||
- won't reveal
|
||||
- won’t reveal
|
||||
|
||||
@@ -47,6 +47,8 @@ steps:
|
||||
- sessionKey: agent:qa:memory
|
||||
message:
|
||||
expr: config.rememberPrompt
|
||||
timeoutMs:
|
||||
expr: liveTurnTimeoutMs(env, 60000)
|
||||
- set: rememberAckAny
|
||||
value:
|
||||
expr: config.rememberAckAny.map((needle) => needle.toLowerCase())
|
||||
@@ -66,6 +68,8 @@ steps:
|
||||
- sessionKey: agent:qa:memory
|
||||
message:
|
||||
expr: config.recallPrompt
|
||||
timeoutMs:
|
||||
expr: liveTurnTimeoutMs(env, 60000)
|
||||
- set: recallExpectedAny
|
||||
value:
|
||||
expr: config.recallExpectedAny.map((needle) => needle.toLowerCase())
|
||||
|
||||
@@ -23,24 +23,24 @@ execution:
|
||||
prompt: |-
|
||||
Subagent fanout synthesis check: delegate exactly two bounded subagents sequentially.
|
||||
Subagent 1: verify that `HEARTBEAT.md` exists and report `ok` if it does.
|
||||
Subagent 2: verify that `qa/scenarios/subagent-fanout-synthesis.md` exists and report `ok` if it does.
|
||||
Subagent 2: verify that `repo/qa/scenarios/subagent-fanout-synthesis.md` exists and report `ok` if it does.
|
||||
Wait for both subagents to finish.
|
||||
Then reply with exactly these two lines and nothing else:
|
||||
subagent-1: ok
|
||||
subagent-2: ok
|
||||
Do not use ACP.
|
||||
expectedReplyAny:
|
||||
- subagent-1: ok
|
||||
- subagent-2: ok
|
||||
- "subagent-1: ok"
|
||||
- "subagent-2: ok"
|
||||
expectedReplyGroups:
|
||||
- - alpha-ok
|
||||
- subagent_one_ok
|
||||
- subagent one ok
|
||||
- subagent-1: ok
|
||||
- "subagent-1: ok"
|
||||
- - beta-ok
|
||||
- subagent_two_ok
|
||||
- subagent two ok
|
||||
- subagent-2: ok
|
||||
- "subagent-2: ok"
|
||||
expectedChildLabels:
|
||||
- qa-fanout-alpha
|
||||
- qa-fanout-beta
|
||||
@@ -77,9 +77,6 @@ steps:
|
||||
- set: sessionKey
|
||||
value:
|
||||
expr: "`agent:qa:fanout:${attempt}:${randomUUID().slice(0, 8)}`"
|
||||
- set: beforeCursor
|
||||
value:
|
||||
expr: "state.getSnapshot().messages.length"
|
||||
- call: runAgentPrompt
|
||||
args:
|
||||
- ref: env
|
||||
@@ -93,7 +90,7 @@ steps:
|
||||
saveAs: outbound
|
||||
args:
|
||||
- lambda:
|
||||
expr: "state.getSnapshot().messages.slice(beforeCursor).filter((message) => message.direction === 'outbound' && message.conversation.id === 'qa-operator' && config.expectedReplyGroups.every((group) => group.some((needle) => normalizeLowercaseStringOrEmpty(message.text ?? '').includes(needle)))).at(-1)"
|
||||
expr: "state.getSnapshot().messages.filter((message) => message.direction === 'outbound' && message.conversation.id === 'qa-operator' && config.expectedReplyGroups.every((group) => group.some((needle) => normalizeLowercaseStringOrEmpty(message.text ?? '').includes(needle)))).at(-1)"
|
||||
- expr: liveTurnTimeoutMs(env, 60000)
|
||||
- expr: "env.providerMode === 'mock-openai' ? 100 : 250"
|
||||
- if:
|
||||
|
||||
Reference in New Issue
Block a user