From fcee2683736af8cf1ac62d15f5650d5b1be61cc0 Mon Sep 17 00:00:00 2001 From: Peter Steinberger Date: Sun, 12 Apr 2026 11:57:44 -0700 Subject: [PATCH] feat(qa-lab): support scenario-defined plugin runs --- extensions/qa-lab/src/cli.ts | 2 +- extensions/qa-lab/src/gateway-child.ts | 8 +- extensions/qa-lab/src/lab-server.test.ts | 2 +- .../qa-lab/src/mock-openai-server.test.ts | 145 +++++++++ extensions/qa-lab/src/mock-openai-server.ts | 102 ++++++- .../qa-lab/src/multipass.runtime.test.ts | 12 +- extensions/qa-lab/src/multipass.runtime.ts | 2 +- .../qa-lab/src/qa-gateway-config.test.ts | 15 + extensions/qa-lab/src/qa-gateway-config.ts | 23 +- extensions/qa-lab/src/run-config.test.ts | 10 +- extensions/qa-lab/src/run-config.ts | 8 +- extensions/qa-lab/src/scenario-catalog.ts | 2 + .../qa-lab/src/scenario-runtime-api.test.ts | 160 ++++++++++ extensions/qa-lab/src/scenario-runtime-api.ts | 256 ++++++++++++++++ extensions/qa-lab/src/suite.test.ts | 70 +++++ extensions/qa-lab/src/suite.ts | 280 ++++++++---------- .../active-memory-preprompt-recall.md | 225 ++++++++++++++ 17 files changed, 1131 insertions(+), 191 deletions(-) create mode 100644 extensions/qa-lab/src/scenario-runtime-api.test.ts create mode 100644 extensions/qa-lab/src/scenario-runtime-api.ts create mode 100644 qa/scenarios/active-memory-preprompt-recall.md diff --git a/extensions/qa-lab/src/cli.ts b/extensions/qa-lab/src/cli.ts index 899eebdb3ad..32d87b3b1c6 100644 --- a/extensions/qa-lab/src/cli.ts +++ b/extensions/qa-lab/src/cli.ts @@ -166,7 +166,7 @@ export function registerQaLabCli(program: Command) { .option( "--provider-mode ", "Provider mode: mock-openai or live-frontier (legacy live-openai still works)", - "mock-openai", + "live-frontier", ) .option("--model ", "Primary provider/model ref") .option("--alt-model ", "Alternate provider/model ref") diff --git a/extensions/qa-lab/src/gateway-child.ts b/extensions/qa-lab/src/gateway-child.ts index 5693b139fa4..71935bd9f03 100644 --- 
a/extensions/qa-lab/src/gateway-child.ts +++ b/extensions/qa-lab/src/gateway-child.ts @@ -836,6 +836,7 @@ export async function startQaGatewayChild(params: { thinkingDefault?: QaThinkingLevel; claudeCliAuthMode?: QaCliBackendAuthMode; controlUiEnabled?: boolean; + enabledPluginIds?: string[]; mutateConfig?: (cfg: OpenClawConfig) => OpenClawConfig; }) { const tempRoot = await fs.mkdtemp( @@ -873,14 +874,17 @@ export async function startQaGatewayChild(params: { const liveProviderConfigs = await readQaLiveProviderConfigOverrides({ providerIds: liveProviderIds, }); - const enabledPluginIds = + const liveOwnerPluginIds = liveProviderIds.length > 0 ? await resolveQaOwnerPluginIdsForProviderIds({ repoRoot: params.repoRoot, providerIds: liveProviderIds, providerConfigs: liveProviderConfigs, }) - : undefined; + : []; + const enabledPluginIds = [ + ...new Set([...(liveOwnerPluginIds ?? []), ...(params.enabledPluginIds ?? [])]), + ]; const buildGatewayConfig = (gatewayPort: number) => buildQaGatewayConfig({ bind: "loopback", diff --git a/extensions/qa-lab/src/lab-server.test.ts b/extensions/qa-lab/src/lab-server.test.ts index 30b514db9e2..04b626a69ca 100644 --- a/extensions/qa-lab/src/lab-server.test.ts +++ b/extensions/qa-lab/src/lab-server.test.ts @@ -116,7 +116,7 @@ describe("qa-lab server", () => { expect(bootstrap.scenarios.length).toBeGreaterThanOrEqual(10); expect(bootstrap.scenarios.some((scenario) => scenario.id === "dm-chat-baseline")).toBe(true); expect(bootstrap.runner.status).toBe("idle"); - expect(bootstrap.runner.selection.providerMode).toBe("mock-openai"); + expect(bootstrap.runner.selection.providerMode).toBe("live-frontier"); expect(bootstrap.runner.selection.scenarioIds).toHaveLength(bootstrap.scenarios.length); const messageResponse = await fetch(`${lab.baseUrl}/api/inbound/message`, { diff --git a/extensions/qa-lab/src/mock-openai-server.test.ts b/extensions/qa-lab/src/mock-openai-server.test.ts index 20f4ff554cf..578569f09fc 100644 --- 
a/extensions/qa-lab/src/mock-openai-server.test.ts +++ b/extensions/qa-lab/src/mock-openai-server.test.ts @@ -433,6 +433,151 @@ describe("qa mock openai server", () => { "Protocol note: I checked memory and the current Project Nebula codename is ORBIT-10.", ); + const activeMemorySearch = await fetch(`${server.baseUrl}/v1/responses`, { + method: "POST", + headers: { + "content-type": "application/json", + }, + body: JSON.stringify({ + stream: true, + input: [ + { + role: "user", + content: [ + { + type: "input_text", + text: [ + "You are a memory search agent.", + "Use only memory_search and memory_get.", + "", + "Conversation context:", + "Latest user message:", + "Silent snack recall check: what snack do I usually want for QA movie night? Reply in one short sentence.", + ].join("\n"), + }, + ], + }, + ], + }), + }); + expect(activeMemorySearch.status).toBe(200); + expect(await activeMemorySearch.text()).toContain('"name":"memory_search"'); + + const activeMemoryGet = await fetch(`${server.baseUrl}/v1/responses`, { + method: "POST", + headers: { + "content-type": "application/json", + }, + body: JSON.stringify({ + stream: true, + input: [ + { + role: "user", + content: [ + { + type: "input_text", + text: [ + "You are a memory search agent.", + "Use only memory_search and memory_get.", + "", + "Conversation context:", + "Latest user message:", + "Silent snack recall check: what snack do I usually want for QA movie night? 
Reply in one short sentence.", + ].join("\n"), + }, + ], + }, + { + type: "function_call_output", + output: JSON.stringify({ + results: [ + { + path: "MEMORY.md", + startLine: 1, + endLine: 1, + }, + ], + }), + }, + ], + }), + }); + expect(activeMemoryGet.status).toBe(200); + expect(await activeMemoryGet.text()).toContain('"name":"memory_get"'); + + const activeMemorySummary = await fetch(`${server.baseUrl}/v1/responses`, { + method: "POST", + headers: { + "content-type": "application/json", + }, + body: JSON.stringify({ + stream: false, + input: [ + { + role: "user", + content: [ + { + type: "input_text", + text: [ + "You are a memory search agent.", + "Use only memory_search and memory_get.", + "", + "Conversation context:", + "Latest user message:", + "Silent snack recall check: what snack do I usually want for QA movie night? Reply in one short sentence.", + ].join("\n"), + }, + ], + }, + { + type: "function_call_output", + output: JSON.stringify({ + text: "Stable QA movie night snack preference: lemon pepper wings with blue cheese.", + }), + }, + ], + }), + }); + expect(activeMemorySummary.status).toBe(200); + expect(JSON.stringify(await activeMemorySummary.json())).toContain( + "lemon pepper wings with blue cheese", + ); + + const injectedMainReply = await fetch(`${server.baseUrl}/v1/responses`, { + method: "POST", + headers: { + "content-type": "application/json", + }, + body: JSON.stringify({ + stream: false, + instructions: [ + "System context:", + "User usually wants lemon pepper wings with blue cheese for QA movie night.", + ].join("\n"), + input: [ + { + role: "user", + content: [ + { + type: "input_text", + text: "Silent snack recall check: what snack do I usually want for QA movie night? 
Reply in one short sentence.", + }, + ], + }, + ], + }), + }); + expect(injectedMainReply.status).toBe(200); + expect(JSON.stringify(await injectedMainReply.json())).toContain( + "lemon pepper wings with blue cheese", + ); + const lastRequest = await fetch(`${server.baseUrl}/debug/last-request`); + expect(lastRequest.status).toBe(200); + expect(await lastRequest.json()).toMatchObject({ + instructions: expect.stringContaining(""), + allInputText: expect.stringContaining(""), + }); + const spawn = await fetch(`${server.baseUrl}/v1/responses`, { method: "POST", headers: { diff --git a/extensions/qa-lab/src/mock-openai-server.ts b/extensions/qa-lab/src/mock-openai-server.ts index fa8e07c3b8c..1c3e7863f5c 100644 --- a/extensions/qa-lab/src/mock-openai-server.ts +++ b/extensions/qa-lab/src/mock-openai-server.ts @@ -27,6 +27,7 @@ type MockOpenAiRequestSnapshot = { body: Record; prompt: string; allInputText: string; + instructions?: string; toolOutput: string; model: string; imageInputCount: number; @@ -181,6 +182,23 @@ function extractAllInputTexts(input: ResponsesInputItem[]) { return texts.join("\n"); } +function extractInstructionsText(body: Record) { + return typeof body.instructions === "string" ? body.instructions.trim() : ""; +} + +function extractAllRequestTexts(input: ResponsesInputItem[], body: Record) { + const texts: string[] = []; + const instructions = extractInstructionsText(body); + if (instructions) { + texts.push(instructions); + } + const inputText = extractAllInputTexts(input); + if (inputText) { + texts.push(inputText); + } + return texts.join("\n"); +} + function countImageInputs(input: ResponsesInputItem[]) { let count = 0; for (const item of input) { @@ -320,6 +338,33 @@ function extractOrbitCode(text: string) { return /\bORBIT-\d+\b/i.exec(text)?.[0]?.toUpperCase() ?? 
null; } +function decodeXmlEntities(text: string) { + return text + .replaceAll("<", "<") + .replaceAll(">", ">") + .replaceAll("&", "&") + .replaceAll(""", '"') + .replaceAll("'", "'"); +} + +function extractActiveMemorySummary(text: string) { + const match = /\s*([\s\S]*?)\s*<\/active_memory_plugin>/i.exec(text); + return match?.[1] ? decodeXmlEntities(match[1]).trim() : null; +} + +function isActiveMemorySubagentPrompt(text: string) { + return text.includes("You are a memory search agent."); +} + +function extractSnackPreference(text: string) { + const normalized = text.replace(/\s+/g, " ").trim(); + const match = + /(lemon pepper wings(?:\s+with\s+blue cheese)?|blue cheese(?:\s+with\s+lemon pepper wings)?)/i.exec( + normalized, + ); + return match?.[0]?.trim() ?? null; +} + function extractLastCapture(text: string, pattern: RegExp) { let lastMatch: RegExpExecArray | null = null; const flags = pattern.flags.includes("g") ? pattern.flags : `${pattern.flags}g`; @@ -355,7 +400,7 @@ function buildAssistantText(input: ResponsesInputItem[], body: Record) { const prompt = extractLastUserText(input); const toolOutput = extractToolOutput(input); const toolJson = parseToolOutputJson(toolOutput); - const allInputText = extractAllInputTexts(input); + const allInputText = extractAllRequestTexts(input, body); const isGroupChat = allInputText.includes('"is_group_chat": true'); const isBaselineUnmentionedChannelChatter = /\bno bot ping here\b/i.test(prompt); if (isHeartbeatPrompt(prompt)) { @@ -591,6 +644,48 @@ async function buildResponsesPayload(body: Record) { }); } } + if ( + isActiveMemorySubagentPrompt(allInputText) && + /silent snack recall check/i.test(allInputText) + ) { + if (!toolOutput) { + return buildToolCallEventsWithArgs("memory_search", { + query: "QA movie night snack lemon pepper wings blue cheese", + maxResults: 3, + }); + } + const results = Array.isArray(toolJson?.results) + ? 
(toolJson.results as Array>) + : []; + const first = results[0]; + if ( + typeof first?.path === "string" && + (typeof first.startLine === "number" || typeof first.endLine === "number") + ) { + const from = + typeof first.startLine === "number" + ? Math.max(1, first.startLine) + : typeof first.endLine === "number" + ? Math.max(1, first.endLine) + : 1; + return buildToolCallEventsWithArgs("memory_get", { + path: first.path, + from, + lines: 4, + }); + } + const memorySnippet = + typeof toolJson?.text === "string" + ? toolJson.text + : Array.isArray(toolJson?.results) + ? JSON.stringify(toolJson.results) + : toolOutput; + const snackPreference = extractSnackPreference(memorySnippet); + if (snackPreference) { + return buildAssistantEvents(`User usually wants ${snackPreference} for QA movie night.`); + } + return buildAssistantEvents("NONE"); + } if (/session memory ranking check/i.test(prompt)) { if (!toolOutput) { return buildToolCallEventsWithArgs("memory_search", { @@ -798,7 +893,8 @@ export async function startQaMockOpenAiServer(params?: { host?: string; port?: n raw, body, prompt: extractLastUserText(input), - allInputText: extractAllInputTexts(input), + allInputText: extractAllRequestTexts(input, body), + instructions: extractInstructionsText(body) || undefined, toolOutput: extractToolOutput(input), model: typeof body.model === "string" ? 
body.model : "", imageInputCount: countImageInputs(input), diff --git a/extensions/qa-lab/src/multipass.runtime.test.ts b/extensions/qa-lab/src/multipass.runtime.test.ts index 05273074cca..a95caeda99b 100644 --- a/extensions/qa-lab/src/multipass.runtime.test.ts +++ b/extensions/qa-lab/src/multipass.runtime.test.ts @@ -81,7 +81,7 @@ describe("qa multipass runtime", () => { expect(plan.summaryPath).toBe(path.join(outputDir, "qa-suite-summary.json")); }); - it("renders a guest script that runs the mock qa suite with explicit scenarios", () => { + it("renders a guest script that runs the live qa suite by default", () => { const plan = createQaMultipassPlan({ repoRoot: process.cwd(), outputDir: path.join(process.cwd(), ".artifacts", "qa-e2e", "multipass-test"), @@ -93,9 +93,8 @@ describe("qa multipass runtime", () => { expect(script).toContain("pnpm install --frozen-lockfile"); expect(script).toContain("pnpm build"); expect(script).toContain("corepack prepare 'pnpm@10.32.1' --activate"); - expect(script).toContain( - "'pnpm' 'openclaw' 'qa' 'suite' '--transport' 'qa-channel' '--provider-mode' 'mock-openai'", - ); + expect(script).toContain("'pnpm' 'openclaw' 'qa' 'suite' '--transport' 'qa-channel'"); + expect(script).toContain("'--provider-mode' 'live-frontier'"); expect(script).toContain("'--scenario' 'channel-chat-baseline'"); expect(script).toContain("'--scenario' 'thread-follow-up'"); expect(script).toContain("/workspace/openclaw-host/.artifacts/qa-e2e/multipass-test"); @@ -128,9 +127,8 @@ describe("qa multipass runtime", () => { ); expect(plan.forwardedEnv.OPENAI_API_KEY).toBe("test-openai-key"); expect(script).toContain("OPENAI_API_KEY='test-openai-key'"); - expect(script).toContain( - "'pnpm' 'openclaw' 'qa' 'suite' '--transport' 'qa-channel' '--provider-mode' 'live-frontier'", - ); + expect(script).toContain("'pnpm' 'openclaw' 'qa' 'suite' '--transport' 'qa-channel'"); + expect(script).toContain("'--provider-mode' 'live-frontier'"); }); it("redacts forwarded 
live secrets in the persisted artifact script", () => { diff --git a/extensions/qa-lab/src/multipass.runtime.ts b/extensions/qa-lab/src/multipass.runtime.ts index a62a7e9863a..aaef4a9e72a 100644 --- a/extensions/qa-lab/src/multipass.runtime.ts +++ b/extensions/qa-lab/src/multipass.runtime.ts @@ -345,7 +345,7 @@ export function createQaMultipassPlan(params: { const outputDir = params.outputDir ?? createQaMultipassOutputDir(params.repoRoot); const scenarioIds = [...new Set(params.scenarioIds ?? [])]; const transportId = params.transportId?.trim() || "qa-channel"; - const providerMode = params.providerMode ?? "mock-openai"; + const providerMode = params.providerMode ?? "live-frontier"; const forwardedEnv = providerMode === "live-frontier" ? resolveForwardedLiveEnv() : {}; const hostCodexHomePath = forwardedEnv.CODEX_HOME; const liveProviderConfig = diff --git a/extensions/qa-lab/src/qa-gateway-config.test.ts b/extensions/qa-lab/src/qa-gateway-config.test.ts index 17cd3853e5e..74ae3bfc26a 100644 --- a/extensions/qa-lab/src/qa-gateway-config.test.ts +++ b/extensions/qa-lab/src/qa-gateway-config.test.ts @@ -82,6 +82,21 @@ describe("buildQaGatewayConfig", () => { expect(cfg.channels?.["qa-channel"]).toBeUndefined(); }); + it("can stage extra bundled plugins in the mock lane", () => { + const cfg = buildQaGatewayConfig({ + bind: "loopback", + gatewayPort: 18789, + gatewayToken: "token", + providerBaseUrl: "http://127.0.0.1:44080/v1", + workspaceDir: "/tmp/qa-workspace", + enabledPluginIds: ["active-memory"], + ...createQaChannelTransportParams(), + }); + + expect(cfg.plugins?.allow).toEqual(["memory-core", "active-memory", "qa-channel"]); + expect(cfg.plugins?.entries?.["active-memory"]).toEqual({ enabled: true }); + }); + it("uses built-in provider wiring in frontier live mode", () => { const cfg = buildQaGatewayConfig({ bind: "loopback", diff --git a/extensions/qa-lab/src/qa-gateway-config.ts b/extensions/qa-lab/src/qa-gateway-config.ts index d2432c7224e..18f3b9e4a3a 
100644 --- a/extensions/qa-lab/src/qa-gateway-config.ts +++ b/extensions/qa-lab/src/qa-gateway-config.ts @@ -162,24 +162,23 @@ export function buildQaGatewayConfig(params: { : selectedProviderIds, ), ] - : []; + : [ + ...new Set( + (params.enabledPluginIds ?? []) + .map((pluginId) => pluginId.trim()) + .filter((pluginId) => pluginId.length > 0), + ), + ]; const transportPluginIds = [...new Set(params.transportPluginIds ?? [])] .map((pluginId) => pluginId.trim()) .filter((pluginId) => pluginId.length > 0); - const pluginEntries = - providerMode === "live-frontier" - ? Object.fromEntries(selectedPluginIds.map((pluginId) => [pluginId, { enabled: true }])) - : {}; + const pluginEntries = Object.fromEntries( + selectedPluginIds.map((pluginId) => [pluginId, { enabled: true }]), + ); const transportPluginEntries = Object.fromEntries( transportPluginIds.map((pluginId) => [pluginId, { enabled: true }]), ); - const allowedPlugins = [ - ...new Set( - providerMode === "live-frontier" - ? ["memory-core", ...selectedPluginIds, ...transportPluginIds] - : ["memory-core", ...transportPluginIds], - ), - ]; + const allowedPlugins = [...new Set(["memory-core", ...selectedPluginIds, ...transportPluginIds])]; const liveModelParams = providerMode === "live-frontier" ? 
(modelRef: string) => ({ diff --git a/extensions/qa-lab/src/run-config.test.ts b/extensions/qa-lab/src/run-config.test.ts index e312c8992fe..5bf4b534356 100644 --- a/extensions/qa-lab/src/run-config.test.ts +++ b/extensions/qa-lab/src/run-config.test.ts @@ -24,12 +24,12 @@ const scenarios = [ ]; describe("qa run config", () => { - it("creates a synthetic-by-default selection that arms every scenario", () => { + it("creates a live-by-default selection that arms every scenario", () => { expect(createDefaultQaRunSelection(scenarios)).toEqual({ - providerMode: "mock-openai", - primaryModel: "mock-openai/gpt-5.4", - alternateModel: "mock-openai/gpt-5.4-alt", - fastMode: false, + providerMode: "live-frontier", + primaryModel: "openai/gpt-5.4", + alternateModel: "openai/gpt-5.4", + fastMode: true, scenarioIds: ["dm-chat-baseline", "thread-lifecycle"], }); }); diff --git a/extensions/qa-lab/src/run-config.ts b/extensions/qa-lab/src/run-config.ts index c03f986c398..fca4fd24f87 100644 --- a/extensions/qa-lab/src/run-config.ts +++ b/extensions/qa-lab/src/run-config.ts @@ -38,19 +38,21 @@ export function defaultQaModelForMode(mode: QaProviderMode, alternate = false) { } export function createDefaultQaRunSelection(scenarios: QaSeedScenario[]): QaLabRunSelection { - const providerMode: QaProviderMode = "mock-openai"; + const providerMode: QaProviderMode = "live-frontier"; return { providerMode, primaryModel: defaultQaModelForMode(providerMode), alternateModel: defaultQaModelForMode(providerMode, true), - fastMode: false, + fastMode: true, scenarioIds: scenarios.map((scenario) => scenario.id), }; } export function normalizeQaProviderMode(input: unknown): QaProviderMode { return normalizeQaProviderModeInput( - input === "live-frontier" || input === "live-openai" ? input : "mock-openai", + input === "mock-openai" || input === "live-frontier" || input === "live-openai" + ? 
input + : "live-frontier", ); } diff --git a/extensions/qa-lab/src/scenario-catalog.ts b/extensions/qa-lab/src/scenario-catalog.ts index c20161a31ef..553ae660e0b 100644 --- a/extensions/qa-lab/src/scenario-catalog.ts +++ b/extensions/qa-lab/src/scenario-catalog.ts @@ -135,6 +135,8 @@ const qaSeedScenarioSchema = z.object({ surface: z.string().trim().min(1), objective: z.string().trim().min(1), successCriteria: z.array(z.string().trim().min(1)).min(1), + plugins: z.array(z.string().trim().min(1)).optional(), + gatewayConfigPatch: z.record(z.string(), z.unknown()).optional(), docsRefs: z.array(z.string().trim().min(1)).optional(), codeRefs: z.array(z.string().trim().min(1)).optional(), execution: qaScenarioExecutionSchema.optional(), diff --git a/extensions/qa-lab/src/scenario-runtime-api.test.ts b/extensions/qa-lab/src/scenario-runtime-api.test.ts new file mode 100644 index 00000000000..22e3647bd7f --- /dev/null +++ b/extensions/qa-lab/src/scenario-runtime-api.test.ts @@ -0,0 +1,160 @@ +import { randomUUID } from "node:crypto"; +import * as fs from "node:fs/promises"; +import path from "node:path"; +import { describe, expect, it, vi } from "vitest"; +import { createQaBusState } from "./bus-state.js"; +import { + createQaScenarioRuntimeApi, + type QaScenarioRuntimeConstants, + type QaScenarioRuntimeDeps, +} from "./scenario-runtime-api.js"; + +function createDeps(overrides?: Partial): QaScenarioRuntimeDeps { + const fn = vi.fn(); + return { + fs, + path, + sleep: vi.fn(async () => undefined), + randomUUID, + runScenario: fn, + waitForOutboundMessage: fn, + waitForTransportOutboundMessage: fn, + waitForChannelOutboundMessage: fn, + waitForNoOutbound: fn, + waitForNoTransportOutbound: fn, + recentOutboundSummary: fn, + formatConversationTranscript: fn, + readTransportTranscript: fn, + formatTransportTranscript: fn, + fetchJson: fn, + waitForGatewayHealthy: fn, + waitForTransportReady: fn, + waitForQaChannelReady: fn, + waitForConfigRestartSettle: fn, + patchConfig: fn, 
+ applyConfig: fn, + readConfigSnapshot: fn, + createSession: fn, + readEffectiveTools: fn, + readSkillStatus: fn, + readRawQaSessionStore: fn, + runQaCli: fn, + extractMediaPathFromText: fn, + resolveGeneratedImagePath: fn, + startAgentRun: fn, + waitForAgentRun: fn, + listCronJobs: fn, + waitForCronRunCompletion: fn, + readDoctorMemoryStatus: fn, + forceMemoryIndex: fn, + findSkill: fn, + writeWorkspaceSkill: fn, + callPluginToolsMcp: fn, + runAgentPrompt: fn, + ensureImageGenerationConfigured: fn, + handleQaAction: fn, + extractQaToolPayload: fn, + formatMemoryDreamingDay: fn, + resolveSessionTranscriptsDirForAgent: fn, + buildAgentSessionKey: fn, + normalizeLowercaseStringOrEmpty: fn, + formatErrorMessage: fn, + liveTurnTimeoutMs: fn, + resolveQaLiveTurnTimeoutMs: fn, + splitModelRef: fn, + qaChannelPlugin: { id: "qa-channel" }, + hasDiscoveryLabels: fn, + reportsDiscoveryScopeLeak: fn, + reportsMissingDiscoveryFiles: fn, + hasModelSwitchContinuityEvidence: fn, + ...overrides, + }; +} + +const constants: QaScenarioRuntimeConstants = { + imageUnderstandingPngBase64: "png-small", + imageUnderstandingLargePngBase64: "png-large", + imageUnderstandingValidPngBase64: "png-valid", +}; + +describe("createQaScenarioRuntimeApi", () => { + it("builds a markdown-flow runtime surface from generic transport capabilities", async () => { + const state = createQaBusState(); + const resetSpy = vi.spyOn(state, "reset"); + const inboundSpy = vi.spyOn(state, "addInboundMessage"); + const outboundSpy = vi.spyOn(state, "addOutboundMessage"); + const readSpy = vi.spyOn(state, "readMessage"); + const waitForCondition = vi.fn(async (check: () => unknown) => check()); + const sleep = vi.fn(async () => undefined); + const env = { + lab: { baseUrl: "http://127.0.0.1:1234" }, + transport: { + state, + capabilities: { + waitForCondition, + getNormalizedMessageState: state.getSnapshot.bind(state), + resetNormalizedMessageState: async () => { + state.reset(); + }, + sendInboundMessage: 
state.addInboundMessage.bind(state), + injectOutboundMessage: state.addOutboundMessage.bind(state), + readNormalizedMessage: state.readMessage.bind(state), + }, + }, + }; + const scenario = { + id: "generic-flow", + title: "Generic Flow", + surface: "test", + objective: "test", + successCriteria: ["works"], + sourcePath: "qa/scenarios/generic-flow.md", + execution: { + kind: "flow" as const, + config: { expected: "value" }, + flow: { + steps: [{ name: "noop", actions: [{ assert: "true" }] }], + }, + }, + }; + + const api = createQaScenarioRuntimeApi({ + env, + scenario, + deps: createDeps({ sleep }), + constants, + }); + + expect(api.lab).toBe(env.lab); + expect(api.state).toBe(state); + expect(api.config).toEqual({ expected: "value" }); + expect(api.waitForCondition).toBe(waitForCondition); + expect(api.waitForChannelReady).toBe(api.waitForTransportReady); + expect(api.getTransportSnapshot()).toEqual(state.getSnapshot()); + expect(api.imageUnderstandingPngBase64).toBe("png-small"); + + const inbound = await api.injectInboundMessage({ + accountId: "qa-channel", + conversation: { id: "qa-operator", kind: "direct" }, + senderId: "qa-operator", + text: "hello", + }); + const outbound = await api.injectOutboundMessage({ + accountId: "qa-channel", + to: "dm:qa-operator", + text: "hi", + }); + expect(inbound.id).toBeTruthy(); + expect(outbound.id).toBeTruthy(); + await api.readTransportMessage({ accountId: "qa-channel", messageId: outbound.id }); + await api.reset(); + await api.resetBus(); + await api.resetTransport(); + + expect(inboundSpy).toHaveBeenCalledTimes(1); + expect(outboundSpy).toHaveBeenCalledTimes(1); + expect(readSpy).toHaveBeenCalledTimes(1); + expect(resetSpy).toHaveBeenCalledTimes(3); + expect(sleep).toHaveBeenCalledTimes(3); + }); +}); diff --git a/extensions/qa-lab/src/scenario-runtime-api.ts b/extensions/qa-lab/src/scenario-runtime-api.ts new file mode 100644 index 00000000000..bb07b070301 --- /dev/null +++ 
b/extensions/qa-lab/src/scenario-runtime-api.ts @@ -0,0 +1,256 @@ +import type * as NodeFs from "node:fs/promises"; +import type * as NodePath from "node:path"; +import type { QaTransportState } from "./qa-transport.js"; +import type { QaSeedScenarioWithSource } from "./scenario-catalog.js"; + +type QaScenarioRuntimeFunction = (...args: never[]) => unknown; + +export type QaScenarioRuntimeEnv< + TLab = unknown, + TTransportState extends QaTransportState = QaTransportState, +> = { + lab: TLab; + transport: { + state: TTransportState; + capabilities: { + waitForCondition: QaScenarioRuntimeFunction; + getNormalizedMessageState: () => ReturnType; + resetNormalizedMessageState: () => Promise; + sendInboundMessage: TTransportState["addInboundMessage"]; + injectOutboundMessage: TTransportState["addOutboundMessage"]; + readNormalizedMessage: TTransportState["readMessage"]; + }; + }; +}; + +export type QaScenarioRuntimeDeps = { + fs: typeof NodeFs; + path: typeof NodePath; + sleep: (ms?: number) => Promise; + randomUUID: () => string; + runScenario: QaScenarioRuntimeFunction; + waitForOutboundMessage: QaScenarioRuntimeFunction; + waitForTransportOutboundMessage: QaScenarioRuntimeFunction; + waitForChannelOutboundMessage: QaScenarioRuntimeFunction; + waitForNoOutbound: QaScenarioRuntimeFunction; + waitForNoTransportOutbound: QaScenarioRuntimeFunction; + recentOutboundSummary: QaScenarioRuntimeFunction; + formatConversationTranscript: QaScenarioRuntimeFunction; + readTransportTranscript: QaScenarioRuntimeFunction; + formatTransportTranscript: QaScenarioRuntimeFunction; + fetchJson: QaScenarioRuntimeFunction; + waitForGatewayHealthy: QaScenarioRuntimeFunction; + waitForTransportReady: QaScenarioRuntimeFunction; + waitForQaChannelReady: QaScenarioRuntimeFunction; + waitForConfigRestartSettle: QaScenarioRuntimeFunction; + patchConfig: QaScenarioRuntimeFunction; + applyConfig: QaScenarioRuntimeFunction; + readConfigSnapshot: QaScenarioRuntimeFunction; + createSession: 
QaScenarioRuntimeFunction; + readEffectiveTools: QaScenarioRuntimeFunction; + readSkillStatus: QaScenarioRuntimeFunction; + readRawQaSessionStore: QaScenarioRuntimeFunction; + runQaCli: QaScenarioRuntimeFunction; + extractMediaPathFromText: QaScenarioRuntimeFunction; + resolveGeneratedImagePath: QaScenarioRuntimeFunction; + startAgentRun: QaScenarioRuntimeFunction; + waitForAgentRun: QaScenarioRuntimeFunction; + listCronJobs: QaScenarioRuntimeFunction; + waitForCronRunCompletion: QaScenarioRuntimeFunction; + readDoctorMemoryStatus: QaScenarioRuntimeFunction; + forceMemoryIndex: QaScenarioRuntimeFunction; + findSkill: QaScenarioRuntimeFunction; + writeWorkspaceSkill: QaScenarioRuntimeFunction; + callPluginToolsMcp: QaScenarioRuntimeFunction; + runAgentPrompt: QaScenarioRuntimeFunction; + ensureImageGenerationConfigured: QaScenarioRuntimeFunction; + handleQaAction: QaScenarioRuntimeFunction; + extractQaToolPayload: QaScenarioRuntimeFunction; + formatMemoryDreamingDay: QaScenarioRuntimeFunction; + resolveSessionTranscriptsDirForAgent: QaScenarioRuntimeFunction; + buildAgentSessionKey: QaScenarioRuntimeFunction; + normalizeLowercaseStringOrEmpty: QaScenarioRuntimeFunction; + formatErrorMessage: QaScenarioRuntimeFunction; + liveTurnTimeoutMs: QaScenarioRuntimeFunction; + resolveQaLiveTurnTimeoutMs: QaScenarioRuntimeFunction; + splitModelRef: QaScenarioRuntimeFunction; + qaChannelPlugin: unknown; + hasDiscoveryLabels: QaScenarioRuntimeFunction; + reportsDiscoveryScopeLeak: QaScenarioRuntimeFunction; + reportsMissingDiscoveryFiles: QaScenarioRuntimeFunction; + hasModelSwitchContinuityEvidence: QaScenarioRuntimeFunction; +}; + +export type QaScenarioRuntimeConstants = { + imageUnderstandingPngBase64: string; + imageUnderstandingLargePngBase64: string; + imageUnderstandingValidPngBase64: string; +}; + +export type QaScenarioRuntimeApi< + TEnv extends QaScenarioRuntimeEnv = QaScenarioRuntimeEnv, + TDeps extends QaScenarioRuntimeDeps = QaScenarioRuntimeDeps, +> = { + env: 
TEnv; + lab: TEnv["lab"]; + state: TEnv["transport"]["state"]; + scenario: QaSeedScenarioWithSource; + config: Record; + fs: typeof NodeFs; + path: typeof NodePath; + sleep: (ms?: number) => Promise; + randomUUID: () => string; + runScenario: TDeps["runScenario"]; + waitForCondition: TEnv["transport"]["capabilities"]["waitForCondition"]; + waitForOutboundMessage: TDeps["waitForOutboundMessage"]; + waitForTransportOutboundMessage: TDeps["waitForTransportOutboundMessage"]; + waitForChannelOutboundMessage: TDeps["waitForChannelOutboundMessage"]; + waitForNoOutbound: TDeps["waitForNoOutbound"]; + waitForNoTransportOutbound: TDeps["waitForNoTransportOutbound"]; + recentOutboundSummary: TDeps["recentOutboundSummary"]; + formatConversationTranscript: TDeps["formatConversationTranscript"]; + readTransportTranscript: TDeps["readTransportTranscript"]; + formatTransportTranscript: TDeps["formatTransportTranscript"]; + fetchJson: TDeps["fetchJson"]; + waitForGatewayHealthy: TDeps["waitForGatewayHealthy"]; + waitForTransportReady: TDeps["waitForTransportReady"]; + waitForChannelReady: TDeps["waitForTransportReady"]; + waitForQaChannelReady: TDeps["waitForQaChannelReady"]; + waitForConfigRestartSettle: TDeps["waitForConfigRestartSettle"]; + patchConfig: TDeps["patchConfig"]; + applyConfig: TDeps["applyConfig"]; + readConfigSnapshot: TDeps["readConfigSnapshot"]; + createSession: TDeps["createSession"]; + readEffectiveTools: TDeps["readEffectiveTools"]; + readSkillStatus: TDeps["readSkillStatus"]; + readRawQaSessionStore: TDeps["readRawQaSessionStore"]; + runQaCli: TDeps["runQaCli"]; + extractMediaPathFromText: TDeps["extractMediaPathFromText"]; + resolveGeneratedImagePath: TDeps["resolveGeneratedImagePath"]; + startAgentRun: TDeps["startAgentRun"]; + waitForAgentRun: TDeps["waitForAgentRun"]; + listCronJobs: TDeps["listCronJobs"]; + waitForCronRunCompletion: TDeps["waitForCronRunCompletion"]; + readDoctorMemoryStatus: TDeps["readDoctorMemoryStatus"]; + forceMemoryIndex: 
/**
 * Assembles the API object exposed to a scenario flow.
 *
 * The result bundles the suite environment, the scenario with its execution
 * config (an empty object when the scenario declares none), every helper
 * injected through `deps`, the image fixtures from `constants`, and shortcuts
 * to the transport capabilities.
 *
 * `resetTransport`, `resetBus`, and `reset` are three names for the same
 * function; `waitForChannelReady` is an alias of `deps.waitForTransportReady`.
 *
 * @param params.env - Suite environment providing `lab` and `transport`.
 * @param params.scenario - Scenario whose `execution.config` becomes `config`.
 * @param params.deps - Helper implementations forwarded onto the API.
 * @param params.constants - Base64 image fixtures forwarded onto the API.
 * @returns The flat runtime API object.
 */
export function createQaScenarioRuntimeApi<
  TEnv extends QaScenarioRuntimeEnv,
  TDeps extends QaScenarioRuntimeDeps,
>(params: {
  env: TEnv;
  scenario: QaSeedScenarioWithSource;
  deps: TDeps;
  constants: QaScenarioRuntimeConstants;
}): QaScenarioRuntimeApi<TEnv, TDeps> {
  const { env, scenario, deps, constants } = params;
  const { transport } = env;
  const { capabilities } = transport;

  // Clears the normalized message state, then pauses via `deps.sleep(100)`.
  // Both calls go through `params` at invocation time so the receiver
  // (`this`) of each method stays the object it lives on.
  async function resetTransportState(): Promise<void> {
    await params.env.transport.capabilities.resetNormalizedMessageState();
    await params.deps.sleep(100);
  }

  return {
    // Environment and scenario context.
    env,
    lab: env.lab,
    state: transport.state,
    scenario,
    config: scenario.execution.config ?? {},

    // Injected helpers, forwarded under the same names.
    fs: deps.fs,
    path: deps.path,
    sleep: deps.sleep,
    randomUUID: deps.randomUUID,
    runScenario: deps.runScenario,
    waitForCondition: capabilities.waitForCondition,
    waitForOutboundMessage: deps.waitForOutboundMessage,
    waitForTransportOutboundMessage: deps.waitForTransportOutboundMessage,
    waitForChannelOutboundMessage: deps.waitForChannelOutboundMessage,
    waitForNoOutbound: deps.waitForNoOutbound,
    waitForNoTransportOutbound: deps.waitForNoTransportOutbound,
    recentOutboundSummary: deps.recentOutboundSummary,
    formatConversationTranscript: deps.formatConversationTranscript,
    readTransportTranscript: deps.readTransportTranscript,
    formatTransportTranscript: deps.formatTransportTranscript,
    fetchJson: deps.fetchJson,
    waitForGatewayHealthy: deps.waitForGatewayHealthy,
    waitForTransportReady: deps.waitForTransportReady,
    // Alias: the channel-ready name resolves to the transport-ready helper.
    waitForChannelReady: deps.waitForTransportReady,
    waitForQaChannelReady: deps.waitForQaChannelReady,
    waitForConfigRestartSettle: deps.waitForConfigRestartSettle,
    patchConfig: deps.patchConfig,
    applyConfig: deps.applyConfig,
    readConfigSnapshot: deps.readConfigSnapshot,
    createSession: deps.createSession,
    readEffectiveTools: deps.readEffectiveTools,
    readSkillStatus: deps.readSkillStatus,
    readRawQaSessionStore: deps.readRawQaSessionStore,
    runQaCli: deps.runQaCli,
    extractMediaPathFromText: deps.extractMediaPathFromText,
    resolveGeneratedImagePath: deps.resolveGeneratedImagePath,
    startAgentRun: deps.startAgentRun,
    waitForAgentRun: deps.waitForAgentRun,
    listCronJobs: deps.listCronJobs,
    waitForCronRunCompletion: deps.waitForCronRunCompletion,
    readDoctorMemoryStatus: deps.readDoctorMemoryStatus,
    forceMemoryIndex: deps.forceMemoryIndex,
    findSkill: deps.findSkill,
    writeWorkspaceSkill: deps.writeWorkspaceSkill,
    callPluginToolsMcp: deps.callPluginToolsMcp,
    runAgentPrompt: deps.runAgentPrompt,
    ensureImageGenerationConfigured: deps.ensureImageGenerationConfigured,
    handleQaAction: deps.handleQaAction,
    extractQaToolPayload: deps.extractQaToolPayload,
    formatMemoryDreamingDay: deps.formatMemoryDreamingDay,
    resolveSessionTranscriptsDirForAgent: deps.resolveSessionTranscriptsDirForAgent,
    buildAgentSessionKey: deps.buildAgentSessionKey,
    normalizeLowercaseStringOrEmpty: deps.normalizeLowercaseStringOrEmpty,
    formatErrorMessage: deps.formatErrorMessage,
    liveTurnTimeoutMs: deps.liveTurnTimeoutMs,
    resolveQaLiveTurnTimeoutMs: deps.resolveQaLiveTurnTimeoutMs,
    splitModelRef: deps.splitModelRef,
    qaChannelPlugin: deps.qaChannelPlugin,
    hasDiscoveryLabels: deps.hasDiscoveryLabels,
    reportsDiscoveryScopeLeak: deps.reportsDiscoveryScopeLeak,
    reportsMissingDiscoveryFiles: deps.reportsMissingDiscoveryFiles,
    hasModelSwitchContinuityEvidence: deps.hasModelSwitchContinuityEvidence,

    // Image fixtures.
    imageUnderstandingPngBase64: constants.imageUnderstandingPngBase64,
    imageUnderstandingLargePngBase64: constants.imageUnderstandingLargePngBase64,
    imageUnderstandingValidPngBase64: constants.imageUnderstandingValidPngBase64,

    // Transport capability shortcuts.
    getTransportSnapshot: capabilities.getNormalizedMessageState,
    resetTransport: resetTransportState,
    injectInboundMessage: capabilities.sendInboundMessage,
    injectOutboundMessage: capabilities.injectOutboundMessage,
    readTransportMessage: capabilities.readNormalizedMessage,
    resetBus: resetTransportState,
    reset: resetTransportState,
  };
}
/**
 * Gathers the plugin ids declared by the given scenarios.
 *
 * Ids are trimmed, empty ids are dropped, and duplicates are removed while
 * keeping the order in which each id is first encountered. Scenarios whose
 * `plugins` field is not an array contribute nothing.
 *
 * @param scenarios - Scenarios to scan for a `plugins` list.
 * @returns Unique, trimmed plugin ids in encounter order.
 */
function collectQaSuitePluginIds(
  scenarios: ReturnType<typeof readQaBootstrapScenarioCatalog>["scenarios"],
) {
  return [
    ...new Set(
      scenarios.flatMap((scenario) =>
        Array.isArray(scenario.plugins)
          ? scenario.plugins
              .map((pluginId) => pluginId.trim())
              .filter((pluginId) => pluginId.length > 0)
          : [],
      ),
    ),
  ];
}

/** Type guard: true for any non-null, non-array value whose `typeof` is "object". */
function isQaPlainObject(value: unknown): value is Record<string, unknown> {
  return value !== null && typeof value === "object" && !Array.isArray(value);
}

/**
 * Applies a JSON-merge-patch style `patch` on top of `base` without mutating
 * either argument's top level.
 *
 * - A non-object patch (primitive, array, `null`, `undefined`) replaces `base`
 *   and is returned as-is.
 * - For an object patch, a `null` member removes that key, an object member is
 *   merged recursively, and any other member overwrites the key.
 * - A non-object `base` is treated as an empty object.
 *
 * @param base - Value being patched.
 * @param patch - Patch to apply.
 * @returns The patched value.
 */
function applyQaMergePatch(base: unknown, patch: unknown): unknown {
  if (!isQaPlainObject(patch)) {
    return patch;
  }
  const result: Record<string, unknown> = isQaPlainObject(base) ? { ...base } : {};
  for (const [key, value] of Object.entries(patch)) {
    // An own "__proto__" key (as produced by JSON.parse or a YAML loader)
    // would hit the inherited setter on assignment and swap the result's
    // prototype instead of storing data, so it is never merged.
    if (key === "__proto__") {
      continue;
    }
    if (value === null) {
      delete result[key];
      continue;
    }
    result[key] = isQaPlainObject(value) ? applyQaMergePatch(result[key], value) : value;
  }
  return result;
}

/**
 * Folds every scenario's `gatewayConfigPatch` into a single patch, applying
 * them in encounter order so later scenarios win on conflicting keys.
 *
 * @param scenarios - Scenarios that may declare a `gatewayConfigPatch` object.
 * @returns The combined patch, or `undefined` when no scenario declares one.
 */
function collectQaSuiteGatewayConfigPatch(
  scenarios: ReturnType<typeof readQaBootstrapScenarioCatalog>["scenarios"],
): Record<string, unknown> | undefined {
  let merged: Record<string, unknown> | undefined;
  for (const scenario of scenarios) {
    if (!isQaPlainObject(scenario.gatewayConfigPatch)) {
      continue;
    }
    merged = applyQaMergePatch(merged ?? {}, scenario.gatewayConfigPatch) as Record<
      string,
      unknown
    >;
  }
  return merged;
}
applyConfig; - readConfigSnapshot: typeof readConfigSnapshot; - createSession: typeof createSession; - readEffectiveTools: typeof readEffectiveTools; - readSkillStatus: typeof readSkillStatus; - readRawQaSessionStore: typeof readRawQaSessionStore; - runQaCli: typeof runQaCli; - extractMediaPathFromText: typeof extractMediaPathFromText; - resolveGeneratedImagePath: typeof resolveGeneratedImagePath; - startAgentRun: typeof startAgentRun; - waitForAgentRun: typeof waitForAgentRun; - listCronJobs: typeof listCronJobs; - waitForCronRunCompletion: typeof waitForCronRunCompletion; - readDoctorMemoryStatus: typeof readDoctorMemoryStatus; - forceMemoryIndex: typeof forceMemoryIndex; - findSkill: typeof findSkill; - writeWorkspaceSkill: typeof writeWorkspaceSkill; - callPluginToolsMcp: typeof callPluginToolsMcp; - runAgentPrompt: typeof runAgentPrompt; - ensureImageGenerationConfigured: typeof ensureImageGenerationConfigured; - handleQaAction: typeof handleQaAction; - extractQaToolPayload: typeof extractQaToolPayload; - formatMemoryDreamingDay: typeof formatMemoryDreamingDay; - resolveSessionTranscriptsDirForAgent: typeof resolveSessionTranscriptsDirForAgent; - buildAgentSessionKey: typeof buildAgentSessionKey; - normalizeLowercaseStringOrEmpty: typeof normalizeLowercaseStringOrEmpty; - formatErrorMessage: typeof formatErrorMessage; - liveTurnTimeoutMs: typeof liveTurnTimeoutMs; - resolveQaLiveTurnTimeoutMs: typeof resolveQaLiveTurnTimeoutMs; - splitModelRef: typeof splitModelRef; - qaChannelPlugin: typeof qaChannelPlugin; - hasDiscoveryLabels: typeof hasDiscoveryLabels; - reportsDiscoveryScopeLeak: typeof reportsDiscoveryScopeLeak; - reportsMissingDiscoveryFiles: typeof reportsMissingDiscoveryFiles; - hasModelSwitchContinuityEvidence: typeof hasModelSwitchContinuityEvidence; - imageUnderstandingPngBase64: string; - imageUnderstandingLargePngBase64: string; - imageUnderstandingValidPngBase64: string; - getTransportSnapshot: () => ReturnType; - resetTransport: () => Promise; 
/**
 * Builds the flow API for one scenario by handing this module's helper
 * functions and image fixtures to `createQaScenarioRuntimeApi`.
 *
 * @param env - Suite environment the scenario runs against.
 * @param scenario - Catalog scenario the API is created for.
 * @returns The runtime API produced by `createQaScenarioRuntimeApi`.
 */
function createScenarioFlowApi(
  env: QaSuiteEnvironment,
  scenario: ReturnType<typeof readQaBootstrapScenarioCatalog>["scenarios"][number],
) {
  // Module-level helpers exposed to scenario flows under their own names.
  const deps = {
    fs,
    path,
    sleep,
    randomUUID,
    runScenario,
    waitForOutboundMessage,
    waitForTransportOutboundMessage,
    waitForChannelOutboundMessage,
    waitForNoOutbound,
    waitForNoTransportOutbound,
    recentOutboundSummary,
    formatConversationTranscript,
    readTransportTranscript,
    formatTransportTranscript,
    fetchJson,
    waitForGatewayHealthy,
    waitForTransportReady,
    waitForQaChannelReady,
    waitForConfigRestartSettle,
    patchConfig,
    applyConfig,
    readConfigSnapshot,
    createSession,
    readEffectiveTools,
    readSkillStatus,
    readRawQaSessionStore,
    runQaCli,
    extractMediaPathFromText,
    resolveGeneratedImagePath,
    startAgentRun,
    waitForAgentRun,
    listCronJobs,
    waitForCronRunCompletion,
    readDoctorMemoryStatus,
    forceMemoryIndex,
    findSkill,
    writeWorkspaceSkill,
    callPluginToolsMcp,
    runAgentPrompt,
    ensureImageGenerationConfigured,
    handleQaAction,
    extractQaToolPayload,
    formatMemoryDreamingDay,
    resolveSessionTranscriptsDirForAgent,
    buildAgentSessionKey,
    normalizeLowercaseStringOrEmpty,
    formatErrorMessage,
    liveTurnTimeoutMs,
    resolveQaLiveTurnTimeoutMs,
    splitModelRef,
    qaChannelPlugin,
    hasDiscoveryLabels,
    reportsDiscoveryScopeLeak,
    reportsMissingDiscoveryFiles,
    hasModelSwitchContinuityEvidence,
  };

  // Base64 image fixtures used by image-understanding scenarios.
  const constants = {
    imageUnderstandingPngBase64: _QA_IMAGE_UNDERSTANDING_PNG_BASE64,
    imageUnderstandingLargePngBase64: _QA_IMAGE_UNDERSTANDING_LARGE_PNG_BASE64,
    imageUnderstandingValidPngBase64: QA_IMAGE_UNDERSTANDING_VALID_PNG_BASE64,
  };

  return createQaScenarioRuntimeApi({ env, scenario, deps, constants });
}
+plugins: + - active-memory +gatewayConfigPatch: + plugins: + entries: + active-memory: + enabled: true + config: + enabled: true + agents: + - qa + allowedChatTypes: + - direct + logging: true + persistTranscripts: true + transcriptDir: qa-memory-e2e + queryMode: recent + maxSummaryChars: 220 +successCriteria: + - With Active Memory off, the session shows no Active Memory plugin activity. + - With Active Memory on, plugin-owned evidence shows the Active Memory sub-agent searched memory before the main reply. + - Live lane proves the first user-visible reply uses the recalled preference. +docsRefs: + - docs/concepts/active-memory.md + - docs/concepts/memory-search.md +codeRefs: + - extensions/active-memory/index.ts + - extensions/qa-lab/src/suite.ts + - extensions/qa-lab/src/mock-openai-server.ts +execution: + kind: flow + summary: Verify Active Memory stays off when session-toggled off, runs memory search/get when enabled, and helps a live model answer with the recalled preference in the first visible reply. + config: + baselineConversationId: qa-active-memory-off + activeConversationId: qa-active-memory-on + memoryFact: "Stable QA movie night snack preference: lemon pepper wings with blue cheese." + memoryQuery: "QA movie night snack lemon pepper wings blue cheese" + expectedNeedle: lemon pepper wings + prompt: "Silent snack recall check: what snack do I usually want for QA movie night? Reply in one short sentence." 
+ promptSnippet: "Silent snack recall check" + transcriptDir: qa-memory-e2e +``` + +```yaml qa-flow +steps: + - name: only active memory surfaces the hidden snack preference + actions: + - call: reset + - call: fs.rm + args: + - expr: "path.join(env.gateway.workspaceDir, 'MEMORY.md')" + - force: true + - call: fs.rm + args: + - expr: "path.join(env.gateway.workspaceDir, 'memory', `${formatMemoryDreamingDay(Date.now())}.md`)" + - force: true + - call: fs.writeFile + args: + - expr: "path.join(env.gateway.workspaceDir, 'MEMORY.md')" + - expr: "`${config.memoryFact}\\n`" + - utf8 + - call: forceMemoryIndex + args: + - env: + ref: env + query: + expr: config.memoryQuery + expectedNeedle: + expr: config.expectedNeedle + - set: baselineSessionKey + value: + expr: "'agent:qa:qa-channel:direct:active-memory-off'" + - set: activeSessionKey + value: + expr: "'agent:qa:qa-channel:direct:active-memory-on'" + - set: transcriptRoot + value: + expr: "path.join(env.gateway.tempRoot, 'state', 'plugins', 'active-memory', 'transcripts', 'agents', 'qa', config.transcriptDir)" + - set: toggleStorePath + value: + expr: "path.join(env.gateway.tempRoot, 'state', 'plugins', 'active-memory', 'session-toggles.json')" + - call: fs.rm + args: + - ref: transcriptRoot + - recursive: true + force: true + - call: fs.rm + args: + - ref: toggleStorePath + - force: true + - call: fs.mkdir + args: + - expr: "path.dirname(toggleStorePath)" + - recursive: true + - call: fs.writeFile + args: + - ref: toggleStorePath + - expr: "`${JSON.stringify({ sessions: { [baselineSessionKey]: { disabled: true, updatedAt: Date.now() } } }, null, 2)}\\n`" + - utf8 + - set: requestCountBeforeBaseline + value: + expr: "env.mock ? 
(await fetchJson(`${env.mock.baseUrl}/debug/requests`)).length : 0" + - set: baselineStartIndex + value: + expr: "state.getSnapshot().messages.length" + - call: runAgentPrompt + args: + - ref: env + - sessionKey: + ref: baselineSessionKey + message: + expr: config.prompt + timeoutMs: + expr: liveTurnTimeoutMs(env, 45000) + - call: waitForOutboundMessage + saveAs: baselineOutbound + args: + - ref: state + - lambda: + params: [candidate] + expr: "candidate.conversation.id === 'qa-operator'" + - expr: liveTurnTimeoutMs(env, 30000) + - sinceIndex: + ref: baselineStartIndex + - set: baselineLower + value: + expr: "normalizeLowercaseStringOrEmpty(baselineOutbound.text)" + - if: + expr: "Boolean(env.mock)" + then: + - set: baselineMockRequests + value: + expr: "(await fetchJson(`${env.mock.baseUrl}/debug/requests`)).slice(requestCountBeforeBaseline)" + - set: baselineSessionStore + value: + expr: "await readRawQaSessionStore(env)" + - assert: + expr: "!Array.isArray(baselineSessionStore[baselineSessionKey]?.pluginDebugEntries) || !baselineSessionStore[baselineSessionKey].pluginDebugEntries.some((pluginEntry) => pluginEntry?.pluginId === 'active-memory')" + message: baseline session unexpectedly recorded active-memory plugin activity + - set: requestCountBeforeActive + value: + expr: "env.mock ? 
(await fetchJson(`${env.mock.baseUrl}/debug/requests`)).length : 0" + - call: fs.writeFile + args: + - ref: toggleStorePath + - expr: "'{}\\n'" + - utf8 + - set: activeStartIndex + value: + expr: "state.getSnapshot().messages.length" + - call: runAgentPrompt + args: + - ref: env + - sessionKey: + ref: activeSessionKey + message: + expr: config.prompt + timeoutMs: + expr: liveTurnTimeoutMs(env, 45000) + - call: waitForOutboundMessage + saveAs: activeOutbound + args: + - ref: state + - lambda: + params: [candidate] + expr: "candidate.conversation.id === 'qa-operator'" + - expr: liveTurnTimeoutMs(env, 30000) + - sinceIndex: + ref: activeStartIndex + - set: activeLower + value: + expr: "normalizeLowercaseStringOrEmpty(activeOutbound.text)" + - if: + expr: "!env.mock" + then: + - assert: + expr: "activeLower.includes(normalizeLowercaseStringOrEmpty(config.expectedNeedle))" + message: + expr: "`active memory reply missed the hidden preference: ${activeOutbound.text}`" + - call: waitForCondition + saveAs: transcriptPath + args: + - lambda: + async: true + expr: "await (async () => { const entries = (await fs.readdir(transcriptRoot).catch(() => [])).filter((entry) => entry.endsWith('.jsonl')).toSorted(); return entries.length > 0 ? 
path.join(transcriptRoot, entries.at(-1)) : undefined; })()" + - 10000 + - call: fs.readFile + saveAs: transcriptText + args: + - ref: transcriptPath + - utf8 + - assert: + expr: "transcriptText.includes('memory_search')" + message: active memory transcript missing memory_search + - assert: + expr: "transcriptText.includes('memory_get')" + message: active memory transcript missing memory_get + - call: waitForCondition + saveAs: activeSessionEntry + args: + - lambda: + async: true + expr: "await (async () => { const store = await readRawQaSessionStore(env); const entry = store[activeSessionKey]; if (!entry || !Array.isArray(entry.pluginDebugEntries)) return undefined; return entry.pluginDebugEntries.some((pluginEntry) => pluginEntry?.pluginId === 'active-memory' && Array.isArray(pluginEntry.lines) && pluginEntry.lines.some((line) => line.includes('Active Memory: ok'))) ? entry : undefined; })()" + - 10000 + - if: + expr: "Boolean(env.mock)" + then: + - set: mockRequests + value: + expr: "(await fetchJson(`${env.mock.baseUrl}/debug/requests`)).slice(requestCountBeforeActive)" + - assert: + expr: "mockRequests.some((request) => request.allInputText.includes('You are a memory search agent.') && request.plannedToolName === 'memory_search')" + message: expected mock Active Memory search request + - assert: + expr: "mockRequests.some((request) => request.allInputText.includes('You are a memory search agent.') && request.plannedToolName === 'memory_get')" + message: expected mock Active Memory memory_get request + detailsExpr: "`${activeOutbound.text}\\n\\ntranscript=${transcriptPath}`" +```