mirror of
https://github.com/openclaw/openclaw.git
synced 2026-05-28 04:16:13 +00:00
test(qa-lab): remove generic evidence wording
This commit is contained in:
@@ -15,6 +15,7 @@ Docs: https://docs.openclaw.ai
|
||||
- Dependencies: refresh provider, plugin, UI, and tooling packages, update `protobufjs` to 8.4.0 to clear the current npm advisory, and carry the Claude ACP completion patch forward to `@agentclientprotocol/claude-agent-acp` 0.36.1.
|
||||
- Agents/tools: remove the old sender-owner tool gating path so configured tools stay visible for trusted sessions while command and channel-action auth still carry real sender identity.
|
||||
- QA-Lab: add curated mock JSONL replay fixtures and first-drift reporting for runtime-parity audits. (#80323, refs #80176) Thanks @100yenadmin.
|
||||
- QA-Lab: replace generic evidence framing in seeded scenario prompts with concrete observed QA behavior.
|
||||
- QA-Lab: include the optional 100-turn runtime parity soak in release-soak artifacts so long-run Codex/Pi transcript drift stays visible outside the default gate. (#80395) Thanks @100yenadmin.
|
||||
- QA-Lab: add a live-only long-context progress watchdog scenario for Codex app-server timeout and stalled-run sentinels. (#80323) Thanks @100yenadmin.
|
||||
- QA-Lab: tag gateway restart recovery and streaming final-integrity scenarios as live-only runtime parity lanes. (#80323) Thanks @100yenadmin.
|
||||
|
||||
@@ -1,10 +1,10 @@
|
||||
import { describe, expect, it } from "vitest";
|
||||
import { hasModelSwitchContinuityEvidence } from "./model-switch-eval.js";
|
||||
import { hasModelSwitchContinuitySignal } from "./model-switch-eval.js";
|
||||
|
||||
describe("qa model-switch evaluation", () => {
|
||||
it("accepts direct handoff replies that mention the kickoff task", () => {
|
||||
expect(
|
||||
hasModelSwitchContinuityEvidence(
|
||||
hasModelSwitchContinuitySignal(
|
||||
"Handoff confirmed: I reread QA_KICKOFF_TASK.md and switched to gpt.",
|
||||
),
|
||||
).toBe(true);
|
||||
@@ -12,7 +12,7 @@ describe("qa model-switch evaluation", () => {
|
||||
|
||||
it("accepts short mission-oriented switch confirmations", () => {
|
||||
expect(
|
||||
hasModelSwitchContinuityEvidence(
|
||||
hasModelSwitchContinuitySignal(
|
||||
"model switch complete. reread the kickoff task; qa mission stays the same.",
|
||||
),
|
||||
).toBe(true);
|
||||
@@ -20,7 +20,7 @@ describe("qa model-switch evaluation", () => {
|
||||
|
||||
it("accepts concise kickoff note confirmations", () => {
|
||||
expect(
|
||||
hasModelSwitchContinuityEvidence(
|
||||
hasModelSwitchContinuitySignal(
|
||||
"Handoff clean: after the model switch, I reread the kickoff note.",
|
||||
),
|
||||
).toBe(true);
|
||||
@@ -28,7 +28,7 @@ describe("qa model-switch evaluation", () => {
|
||||
|
||||
it("accepts concise paraphrases of the kickoff task after a handoff", () => {
|
||||
expect(
|
||||
hasModelSwitchContinuityEvidence(
|
||||
hasModelSwitchContinuitySignal(
|
||||
"Handoff is clear: after the model switch, read source and docs first, run seeded qa-channel scenarios, and report worked, failed, blocked, and follow-up.",
|
||||
),
|
||||
).toBe(true);
|
||||
@@ -36,7 +36,7 @@ describe("qa model-switch evaluation", () => {
|
||||
|
||||
it("rejects unrelated handoff chatter that never confirms the kickoff reread", () => {
|
||||
expect(
|
||||
hasModelSwitchContinuityEvidence(
|
||||
hasModelSwitchContinuitySignal(
|
||||
"subagent-handoff confirmed. qa report update: scenario pass. qa run complete.",
|
||||
),
|
||||
).toBe(false);
|
||||
@@ -44,7 +44,7 @@ describe("qa model-switch evaluation", () => {
|
||||
|
||||
it("rejects over-scoped multi-line wrap-ups even if they mention a switch and the mission", () => {
|
||||
expect(
|
||||
hasModelSwitchContinuityEvidence(
|
||||
hasModelSwitchContinuitySignal(
|
||||
`model switch acknowledged. qa mission stays the same.
|
||||
|
||||
Final QA tally update: all mandatory scenarios resolved. QA run complete.`,
|
||||
|
||||
@@ -1,6 +1,6 @@
|
||||
import { normalizeLowercaseStringOrEmpty } from "openclaw/plugin-sdk/string-coerce-runtime";
|
||||
|
||||
export function hasModelSwitchContinuityEvidence(text: string) {
|
||||
export function hasModelSwitchContinuitySignal(text: string) {
|
||||
const lower = normalizeLowercaseStringOrEmpty(text);
|
||||
const mentionsHandoff =
|
||||
lower.includes("handoff") || lower.includes("model switch") || lower.includes("switched");
|
||||
|
||||
@@ -17,7 +17,7 @@ Persona:
|
||||
Style:
|
||||
- read source and docs first
|
||||
- test systematically
|
||||
- record evidence
|
||||
- record what happened
|
||||
- end with a concise protocol report`;
|
||||
|
||||
const qaScenarioConfigSchema = z.record(z.string(), z.unknown()).superRefine((config, ctx) => {
|
||||
|
||||
@@ -83,7 +83,7 @@ function createDeps(overrides?: Partial<QaScenarioRuntimeDeps>): QaScenarioRunti
|
||||
hasDiscoveryLabels: fn,
|
||||
reportsDiscoveryScopeLeak: fn,
|
||||
reportsMissingDiscoveryFiles: fn,
|
||||
hasModelSwitchContinuityEvidence: fn,
|
||||
hasModelSwitchContinuitySignal: fn,
|
||||
...overrides,
|
||||
};
|
||||
}
|
||||
|
||||
@@ -95,7 +95,7 @@ export type QaScenarioRuntimeDeps = {
|
||||
hasDiscoveryLabels: QaScenarioRuntimeFunction;
|
||||
reportsDiscoveryScopeLeak: QaScenarioRuntimeFunction;
|
||||
reportsMissingDiscoveryFiles: QaScenarioRuntimeFunction;
|
||||
hasModelSwitchContinuityEvidence: QaScenarioRuntimeFunction;
|
||||
hasModelSwitchContinuitySignal: QaScenarioRuntimeFunction;
|
||||
};
|
||||
|
||||
export type QaScenarioRuntimeConstants = {
|
||||
@@ -186,7 +186,7 @@ type QaScenarioRuntimeApi<
|
||||
hasDiscoveryLabels: TDeps["hasDiscoveryLabels"];
|
||||
reportsDiscoveryScopeLeak: TDeps["reportsDiscoveryScopeLeak"];
|
||||
reportsMissingDiscoveryFiles: TDeps["reportsMissingDiscoveryFiles"];
|
||||
hasModelSwitchContinuityEvidence: TDeps["hasModelSwitchContinuityEvidence"];
|
||||
hasModelSwitchContinuitySignal: TDeps["hasModelSwitchContinuitySignal"];
|
||||
imageUnderstandingPngBase64: string;
|
||||
imageUnderstandingLargePngBase64: string;
|
||||
imageUnderstandingValidPngBase64: string;
|
||||
@@ -292,7 +292,7 @@ export function createQaScenarioRuntimeApi<
|
||||
hasDiscoveryLabels: params.deps.hasDiscoveryLabels,
|
||||
reportsDiscoveryScopeLeak: params.deps.reportsDiscoveryScopeLeak,
|
||||
reportsMissingDiscoveryFiles: params.deps.reportsMissingDiscoveryFiles,
|
||||
hasModelSwitchContinuityEvidence: params.deps.hasModelSwitchContinuityEvidence,
|
||||
hasModelSwitchContinuitySignal: params.deps.hasModelSwitchContinuitySignal,
|
||||
imageUnderstandingPngBase64: params.constants.imageUnderstandingPngBase64,
|
||||
imageUnderstandingLargePngBase64: params.constants.imageUnderstandingLargePngBase64,
|
||||
imageUnderstandingValidPngBase64: params.constants.imageUnderstandingValidPngBase64,
|
||||
|
||||
@@ -54,7 +54,7 @@ const webEvaluate = vi.hoisted(() => vi.fn());
|
||||
const hasDiscoveryLabels = vi.hoisted(() => vi.fn());
|
||||
const reportsDiscoveryScopeLeak = vi.hoisted(() => vi.fn());
|
||||
const reportsMissingDiscoveryFiles = vi.hoisted(() => vi.fn());
|
||||
const hasModelSwitchContinuityEvidence = vi.hoisted(() => vi.fn());
|
||||
const hasModelSwitchContinuitySignal = vi.hoisted(() => vi.fn());
|
||||
const qaChannelPlugin = vi.hoisted(() => ({ id: "qa-channel" }));
|
||||
const scanGatewayLogSentinels = vi.hoisted(() => vi.fn());
|
||||
const assertNoGatewayLogSentinels = vi.hoisted(() => vi.fn());
|
||||
@@ -144,7 +144,7 @@ vi.mock("./runtime-tool-fixture.js", () => ({
|
||||
}));
|
||||
|
||||
vi.mock("./model-switch-eval.js", () => ({
|
||||
hasModelSwitchContinuityEvidence,
|
||||
hasModelSwitchContinuitySignal,
|
||||
}));
|
||||
|
||||
vi.mock("./runtime-api.js", () => ({
|
||||
|
||||
@@ -21,7 +21,7 @@ import {
|
||||
} from "./discovery-eval.js";
|
||||
import { extractQaToolPayload } from "./extract-tool-payload.js";
|
||||
import { assertNoGatewayLogSentinels, scanGatewayLogSentinels } from "./gateway-log-sentinel.js";
|
||||
import { hasModelSwitchContinuityEvidence } from "./model-switch-eval.js";
|
||||
import { hasModelSwitchContinuitySignal } from "./model-switch-eval.js";
|
||||
import { qaChannelPlugin } from "./runtime-api.js";
|
||||
import { runRuntimeToolFixture } from "./runtime-tool-fixture.js";
|
||||
import type { QaSeedScenarioWithSource } from "./scenario-catalog.js";
|
||||
@@ -213,7 +213,7 @@ function createQaSuiteScenarioDeps(params: QaSuiteScenarioDepsParams) {
|
||||
hasDiscoveryLabels,
|
||||
reportsDiscoveryScopeLeak,
|
||||
reportsMissingDiscoveryFiles,
|
||||
hasModelSwitchContinuityEvidence,
|
||||
hasModelSwitchContinuitySignal,
|
||||
};
|
||||
}
|
||||
|
||||
|
||||
@@ -76,7 +76,7 @@ agent:
|
||||
Style:
|
||||
- read source and docs first
|
||||
- test systematically
|
||||
- record evidence
|
||||
- record what happened
|
||||
- end with a concise protocol report
|
||||
kickoffTask: |-
|
||||
QA mission:
|
||||
@@ -84,7 +84,7 @@ kickoffTask: |-
|
||||
The repo is available in your workspace at `./repo/`.
|
||||
Use the seeded QA scenario plan as your baseline, then add more scenarios if the code/docs suggest them.
|
||||
Run the scenarios through the real qa-channel surfaces where possible.
|
||||
Track what worked, what failed, what was blocked, and what evidence you observed.
|
||||
Track what worked, what failed, what was blocked, and what you observed.
|
||||
End with a concise report grouped into worked / failed / blocked / follow-up.
|
||||
|
||||
Important expectations:
|
||||
|
||||
@@ -13,7 +13,7 @@ objective: Verify switching models preserves session context and tool use instea
|
||||
successCriteria:
|
||||
- Alternate model is actually requested.
|
||||
- A tool call still happens after the model switch.
|
||||
- Final answer acknowledges the handoff and uses the tool-derived evidence.
|
||||
- Final answer acknowledges the handoff and confirms the QA mission was reread.
|
||||
docsRefs:
|
||||
- docs/help/testing.md
|
||||
- docs/concepts/model-failover.md
|
||||
@@ -68,10 +68,10 @@ steps:
|
||||
saveAs: outbound
|
||||
args:
|
||||
- lambda:
|
||||
expr: "state.getSnapshot().messages.slice(beforeSwitchCursor).filter((candidate) => candidate.direction === 'outbound' && candidate.conversation.id === 'qa-operator' && hasModelSwitchContinuityEvidence(candidate.text)).at(-1)"
|
||||
expr: "state.getSnapshot().messages.slice(beforeSwitchCursor).filter((candidate) => candidate.direction === 'outbound' && candidate.conversation.id === 'qa-operator' && hasModelSwitchContinuitySignal(candidate.text)).at(-1)"
|
||||
- expr: resolveQaLiveTurnTimeoutMs(env, 20000, env.alternateModel)
|
||||
- assert:
|
||||
expr: hasModelSwitchContinuityEvidence(outbound.text)
|
||||
expr: hasModelSwitchContinuitySignal(outbound.text)
|
||||
message:
|
||||
expr: "`switch reply missed kickoff continuity: ${outbound.text}`"
|
||||
- if:
|
||||
|
||||
@@ -13,7 +13,7 @@ objective: Verify a short approval like "ok do it" triggers immediate tool use i
|
||||
successCriteria:
|
||||
- Agent can keep the pre-action turn brief.
|
||||
- The short approval leads to a real tool call on the next turn.
|
||||
- Final answer uses tool-derived evidence instead of placeholder progress text.
|
||||
- Final answer cites the actual file read instead of placeholder progress text.
|
||||
docsRefs:
|
||||
- docs/help/testing.md
|
||||
- docs/channels/qa-channel.md
|
||||
|
||||
Reference in New Issue
Block a user