fix(qa): make matrix block streaming deterministic

This commit is contained in:
Vincent Koc
2026-05-27 20:01:08 +02:00
parent c0f16460d7
commit 140cede2e2
8 changed files with 265 additions and 59 deletions

View File

@@ -1,19 +1,45 @@
#!/usr/bin/env node
import { spawnSync } from "node:child_process";
import fs from "node:fs/promises";
import path from "node:path";
import { fileURLToPath } from "node:url";
import { build } from "esbuild";
const scriptPath = path.resolve(
path.dirname(fileURLToPath(import.meta.url)),
"../../../scripts/build-diffs-viewer-runtime.mjs",
);
const result = spawnSync(process.execPath, [scriptPath, "full"], { stdio: "inherit" });
if (result.error) {
throw result.error;
const extensionRoot = path.resolve(path.dirname(fileURLToPath(import.meta.url)), "..");
const repoRoot = path.resolve(extensionRoot, "../..");
const outputPath = path.join(extensionRoot, "assets/viewer-runtime.js");
await fs.mkdir(path.dirname(outputPath), { recursive: true });
const result = await build({
entryPoints: [path.join(repoRoot, "extensions/diffs/src/viewer-client.ts")],
bundle: true,
platform: "browser",
target: "es2020",
format: "esm",
minify: true,
legalComments: "none",
outfile: outputPath,
write: false,
});
const outputFile = result.outputFiles?.[0];
if (!outputFile) {
throw new Error(
"esbuild did not produce extensions/diffs-language-pack/assets/viewer-runtime.js",
);
}
if (result.signal) {
console.error(`build-diffs-viewer-runtime exited with signal ${result.signal}`);
process.exit(1);
const runtime = outputFile.text.replace(/[ \t]+$/gm, "");
let previousRuntime = null;
try {
previousRuntime = await fs.readFile(outputPath, "utf8");
} catch (error) {
if (error?.code !== "ENOENT") {
throw error;
}
}
if (previousRuntime !== runtime) {
await fs.writeFile(outputPath, runtime);
}
process.exit(result.status ?? 0);

View File

@@ -1,19 +1,53 @@
#!/usr/bin/env node
import { spawnSync } from "node:child_process";
import fs from "node:fs/promises";
import path from "node:path";
import { fileURLToPath } from "node:url";
import { build } from "esbuild";
const scriptPath = path.resolve(
path.dirname(fileURLToPath(import.meta.url)),
"../../../scripts/build-diffs-viewer-runtime.mjs",
);
const result = spawnSync(process.execPath, [scriptPath, "curated"], { stdio: "inherit" });
if (result.error) {
throw result.error;
const extensionRoot = path.resolve(path.dirname(fileURLToPath(import.meta.url)), "..");
const repoRoot = path.resolve(extensionRoot, "../..");
const outputPath = path.join(extensionRoot, "assets/viewer-runtime.js");
await fs.mkdir(path.dirname(outputPath), { recursive: true });
const result = await build({
entryPoints: [path.join(extensionRoot, "src/viewer-client.ts")],
bundle: true,
platform: "browser",
target: "es2020",
format: "esm",
minify: true,
legalComments: "none",
outfile: outputPath,
write: false,
plugins: [
{
name: "openclaw-diffs-curated-shiki",
setup(buildContext) {
buildContext.onResolve({ filter: /^shiki$/ }, () => ({
path: path.join(repoRoot, "scripts/diffs-shiki-curated.ts"),
}));
},
},
],
});
const outputFile = result.outputFiles?.[0];
if (!outputFile) {
throw new Error("esbuild did not produce extensions/diffs/assets/viewer-runtime.js");
}
if (result.signal) {
console.error(`build-diffs-viewer-runtime exited with signal ${result.signal}`);
process.exit(1);
const runtime = outputFile.text.replace(/[ \t]+$/gm, "");
let previousRuntime = null;
try {
previousRuntime = await fs.readFile(outputPath, "utf8");
} catch (error) {
if (error?.code !== "ENOENT") {
throw error;
}
}
if (previousRuntime !== runtime) {
await fs.writeFile(outputPath, runtime);
}
process.exit(result.status ?? 0);

View File

@@ -323,7 +323,14 @@ describe("qa mock openai server", () => {
stream: true,
input: [
makeUserInput(
"Block streaming QA check: emit exactly two assistant message blocks in order. First exact marker: `BLOCK_ONE_OK`. Second exact marker: `BLOCK_TWO_OK`.",
[
"Block streaming QA check: complete this whole sequence in one turn.",
"Step 1: send an assistant text block containing only this exact marker: `BLOCK_ONE_OK`.",
"That first marker block must be emitted before any tool call.",
"Step 2: after the first marker block, use the read tool exactly once on `QA_KICKOFF_TASK.md`.",
"Step 3: after that read completes, send a final assistant text block containing only this exact marker: `BLOCK_TWO_OK`.",
"Never put both markers in the same assistant text block.",
].join("\n"),
),
],
}),

View File

@@ -820,6 +820,12 @@ function extractLastCapture(text: string, pattern: RegExp) {
return lastMatch?.[1]?.trim() || null;
}
/**
 * Collects the first capture group of every match of `pattern` in `text`.
 *
 * The pattern is always cloned, so the caller's RegExp (and its `lastIndex`)
 * is never touched. Captures are trimmed; matches whose first group is
 * missing or blank after trimming are dropped.
 *
 * @param text - Text to scan.
 * @param pattern - Pattern with at least one capture group; the `g` flag is optional.
 * @returns Trimmed, non-empty first-group captures in match order.
 */
function extractCaptures(text: string, pattern: RegExp): string[] {
  // String.prototype.matchAll throws a TypeError for a non-global RegExp,
  // so add "g" when the caller did not supply it.
  const flags = pattern.global ? pattern.flags : `${pattern.flags}g`;
  const globalPattern = new RegExp(pattern.source, flags);
  const captures: string[] = [];
  for (const match of text.matchAll(globalPattern)) {
    const capture = match[1]?.trim();
    // Collect explicitly instead of `.filter(Boolean)`: that call does not
    // narrow the element type, which would leak `undefined` into the result type.
    if (capture) {
      captures.push(capture);
    }
  }
  return captures;
}
function extractLastMatchingUserText(texts: string[], pattern: RegExp) {
for (let index = texts.length - 1; index >= 0; index -= 1) {
const text = texts[index] ?? "";
@@ -872,6 +878,29 @@ function extractLabeledMarkerDirective(text: string, label: string) {
);
}
/**
 * Resolves the two markers a block-streaming QA prompt asks for.
 *
 * Explicit "first exact marker" / "second exact marker" labels win when both
 * are present. Otherwise the last two backtick-quoted values that follow an
 * "exact marker ...:" phrase are used, in prompt order. Returns null when two
 * markers cannot be found.
 */
function extractBlockStreamingMarkerDirectives(text: string) {
  const labeledFirst = extractLabeledMarkerDirective(text, "first exact marker");
  const labeledSecond = extractLabeledMarkerDirective(text, "second exact marker");
  if (labeledFirst && labeledSecond) {
    return { first: labeledFirst, second: labeledSecond };
  }

  // Fallback for prompts that describe the markers step by step.
  const found = extractCaptures(text, /exact marker\b[^:\n]{0,120}:\s*`([^`]+)`/i);
  if (found.length < 2) {
    return null;
  }

  const first = found[found.length - 2];
  const second = found[found.length - 1];
  if (!first || !second) {
    return null;
  }
  return { first, second };
}
function extractQuotedToolArg(text: string, name: string) {
const escapedName = escapeRegExp(name);
return extractLastCapture(text, new RegExp(`\\b${escapedName}\\s*=\\s*"([^"]+)"`, "i"));
@@ -1604,15 +1633,14 @@ async function buildResponsesPayload(
extractExactReplyDirective(prompt) ?? extractExactReplyDirective(allInputText);
const exactMarkerDirective =
extractExactMarkerDirective(prompt) ?? extractExactMarkerDirective(allInputText);
const blockStreamingPrompt =
extractLastMatchingUserText(extractAllUserTexts(input), QA_BLOCK_STREAMING_PROMPT_RE) ||
prompt ||
allInputText;
const blockStreamingMarkers =
extractBlockStreamingMarkerDirectives(blockStreamingPrompt) ??
extractBlockStreamingMarkerDirectives(allInputText);
const latestImageUserTurn = extractLatestImageUserTurn(input);
const firstExactMarkerDirective = extractLabeledMarkerDirective(
allInputText,
"first exact marker",
);
const secondExactMarkerDirective = extractLabeledMarkerDirective(
allInputText,
"second exact marker",
);
const isGroupChat = allInputText.includes('"is_group_chat": true');
const isBaselineUnmentionedChannelChatter = /\bno bot ping here\b/i.test(prompt);
const hasReasoningOnlyRetryInstruction = allInputText.includes(QA_REASONING_ONLY_RETRY_NEEDLE);
@@ -1832,23 +1860,19 @@ async function buildResponsesPayload(
}
return buildAssistantEvents(toolProgressReplyDirective);
}
if (
QA_BLOCK_STREAMING_PROMPT_RE.test(allInputText) &&
firstExactMarkerDirective &&
secondExactMarkerDirective
) {
if (QA_BLOCK_STREAMING_PROMPT_RE.test(allInputText) && blockStreamingMarkers) {
return buildAssistantEvents([
{
id: "msg_mock_block_1",
phase: "final_answer",
streamDeltas: splitMockStreamingText(firstExactMarkerDirective),
text: firstExactMarkerDirective,
streamDeltas: splitMockStreamingText(blockStreamingMarkers.first),
text: blockStreamingMarkers.first,
},
{
id: "msg_mock_block_2",
phase: "final_answer",
streamDeltas: splitMockStreamingText(secondExactMarkerDirective),
text: secondExactMarkerDirective,
streamDeltas: splitMockStreamingText(blockStreamingMarkers.second),
text: blockStreamingMarkers.second,
},
]);
}

View File

@@ -156,6 +156,43 @@ describe("matrix live qa runtime", () => {
}
});
// Checks selectMatrixQaCanaryProviderMode against real catalog entries: a schedule
// containing only the block-streaming scenario yields "mock-openai", while a
// schedule that also contains the thread follow-up scenario yields undefined.
it("uses a scenario provider override for the canary only when the whole run is pinned", () => {
const blockStreamingScenario = liveTesting.MATRIX_QA_SCENARIOS.find(
(scenario) => scenario.id === "matrix-room-block-streaming",
);
// NOTE(review): presumably this scenario declares no providerMode — confirm in the catalog.
const threadScenario = liveTesting.MATRIX_QA_SCENARIOS.find(
(scenario) => scenario.id === "matrix-thread-follow-up",
);
expect(blockStreamingScenario).toBeDefined();
expect(threadScenario).toBeDefined();
// Every scheduled scenario is pinned, so the pin is returned.
const pinnedSchedule = liveTesting.scheduleMatrixQaScenariosInCatalogOrder([
blockStreamingScenario!,
]);
expect(liveTesting.selectMatrixQaCanaryProviderMode(pinnedSchedule)).toBe("mock-openai");
// Mixed schedule: no single provider mode is selected.
const mixedSchedule = liveTesting.scheduleMatrixQaScenariosInCatalogOrder([
threadScenario!,
blockStreamingScenario!,
]);
expect(liveTesting.selectMatrixQaCanaryProviderMode(mixedSchedule)).toBeUndefined();
});
// When the requested provider mode matches the one already in defaultModels,
// resolveMatrixQaGatewayModels must hand back defaultModels unchanged, so the
// custom primary/alternate model names below survive.
it("preserves explicit model pins when a scenario keeps the suite provider", () => {
const defaultModels = {
alternateModel: "mock-openai/custom-alt",
primaryModel: "mock-openai/custom",
providerMode: "mock-openai" as const,
};
expect(
liveTesting.resolveMatrixQaGatewayModels({
defaultModels,
providerMode: "mock-openai",
}),
).toEqual(defaultModels);
});
it("injects a temporary Matrix account into the QA gateway config", () => {
const baseCfg: OpenClawConfig = {
plugins: {

View File

@@ -7,7 +7,7 @@ import { formatErrorMessage } from "openclaw/plugin-sdk/error-runtime";
import { loadQaRuntimeModule } from "openclaw/plugin-sdk/qa-runner-runtime";
import type { QaReportCheck } from "../../report.js";
import { renderQaMarkdownReport } from "../../report.js";
import { type QaProviderModeInput } from "../../run-config.js";
import { normalizeQaProviderMode, type QaProviderModeInput } from "../../run-config.js";
import {
appendLiveLaneIssue,
buildLiveLaneArtifactsError,
@@ -23,7 +23,7 @@ import {
} from "../../substrate/config.js";
import type { MatrixQaObservedEvent } from "../../substrate/events.js";
import { startMatrixQaHarness } from "../../substrate/harness.runtime.js";
import { resolveMatrixQaModels } from "./model-selection.js";
import { resolveMatrixQaModels, type ResolvedMatrixQaModels } from "./model-selection.js";
import type { MatrixQaSyncStreams } from "./scenario-runtime-shared.js";
import {
MATRIX_QA_SCENARIOS,
@@ -59,8 +59,22 @@ type MatrixQaLiveLaneGatewayHarness = {
stop(opts?: { keepTemp?: boolean; preserveToDir?: string }): Promise<void>;
};
function buildMatrixQaGatewayConfigKey(overrides?: MatrixQaConfigOverrides) {
return JSON.stringify(overrides ?? null);
/**
 * Serializes a gateway configuration into a string key for equality checks.
 *
 * The scheduler uses the key to group scenarios, and the harness loader compares
 * it against the key of the running gateway. Only the three model fields listed
 * below take part in the key; missing overrides are encoded as `null`, and
 * absent `models` / `providerModeKey` are omitted by JSON.stringify.
 */
function buildMatrixQaGatewayConfigKey(params: {
  models?: ResolvedMatrixQaModels;
  overrides?: MatrixQaConfigOverrides;
  providerModeKey?: string;
}) {
  const { models, overrides, providerModeKey } = params;
  // Copy just the identifying fields so unrelated model properties cannot change the key.
  const modelsKey = models
    ? {
        alternateModel: models.alternateModel,
        primaryModel: models.primaryModel,
        providerMode: models.providerMode,
      }
    : undefined;
  // Property order is part of the key; keep models, overrides, providerModeKey.
  return JSON.stringify({
    models: modelsKey,
    overrides: overrides ?? null,
    providerModeKey,
  });
}
const MATRIX_QA_EXECUTION_TAIL_SCENARIO_IDS = new Set(["matrix-e2ee-wrong-account-recovery-key"]);
@@ -78,6 +92,11 @@ type MatrixQaScheduledScenario = {
scenario: (typeof MATRIX_QA_SCENARIOS)[number];
};
/**
 * Argument accepted by `ensureGatewayHarness` describing which gateway a caller needs.
 * Both fields are optional.
 */
type MatrixQaGatewaySelection = {
// Config overrides forwarded into the gateway config key.
overrides?: MatrixQaConfigOverrides;
// Provider mode pin passed to resolveMatrixQaGatewayModels; when unset the suite's default models are used.
providerMode?: QaProviderModeInput;
};
type MatrixQaScenarioConfigEntry = MatrixQaSummary["config"]["scenarios"][number];
type MatrixQaSummary = {
@@ -297,16 +316,20 @@ function buildMatrixQaScenarioConfigEntry(params: {
...params.gatewayConfigParams,
overrides: params.scenario.configOverrides,
});
const providerSummary = params.scenario.providerMode
? `providerMode=${params.scenario.providerMode}`
: undefined;
const configSummary =
params.scenario.configOverrides === undefined
? undefined
: summarizeMatrixQaConfigSnapshot(snapshot);
return {
entry: {
config: snapshot,
id: params.scenario.id,
title: params.scenario.title,
},
summary:
params.scenario.configOverrides === undefined
? undefined
: summarizeMatrixQaConfigSnapshot(snapshot),
summary: [providerSummary, configSummary].filter(Boolean).join(", ") || undefined,
};
}
@@ -345,7 +368,10 @@ function scheduleMatrixQaScenariosInCatalogOrder(
tailEntries.push(entry);
continue;
}
const key = buildMatrixQaGatewayConfigKey(entry.scenario.configOverrides);
const key = buildMatrixQaGatewayConfigKey({
overrides: entry.scenario.configOverrides,
providerModeKey: entry.scenario.providerMode ?? "suite",
});
const existingIndex = groupIndexes.get(key);
if (existingIndex !== undefined) {
groupedEntries[existingIndex]?.push(entry);
@@ -358,6 +384,38 @@ function scheduleMatrixQaScenariosInCatalogOrder(
return [...groupedEntries.flat(), ...tailEntries];
}
/**
 * Chooses the provider mode for the initial gateway boot.
 *
 * A mode is returned only when every scheduled scenario pins the same
 * `providerMode`. An empty schedule, any unpinned scenario, or two different
 * pins all yield `undefined`.
 */
function selectMatrixQaCanaryProviderMode(
  scheduledScenarios: readonly MatrixQaScheduledScenario[],
): QaProviderModeInput | undefined {
  const pinnedModes = scheduledScenarios.map(({ scenario }) => scenario.providerMode);
  const candidate = pinnedModes[0];
  if (!candidate) {
    return undefined;
  }
  // An unpinned entry can never equal the truthy candidate, so this also rejects partial pins.
  return pinnedModes.every((mode) => mode === candidate) ? candidate : undefined;
}
/**
 * Resolves the models a gateway should boot with for an optional provider pin.
 *
 * The suite's `defaultModels` are returned as-is when no provider mode is
 * requested, or when the requested mode normalizes to the suite's own mode.
 * Only a different provider mode triggers a fresh model resolution for it.
 */
function resolveMatrixQaGatewayModels(params: {
  defaultModels: ResolvedMatrixQaModels;
  providerMode?: QaProviderModeInput;
}): ResolvedMatrixQaModels {
  const { defaultModels, providerMode: requestedMode } = params;
  if (!requestedMode) {
    return defaultModels;
  }

  const normalizedMode = normalizeQaProviderMode(requestedMode);
  if (normalizedMode === defaultModels.providerMode) {
    return defaultModels;
  }
  return resolveMatrixQaModels({ providerMode: normalizedMode });
}
/** Returns the restart-ready timeout for a scenario, which is its own `timeoutMs`. */
function getMatrixQaScenarioRestartReadyTimeoutMs(scenario: { timeoutMs: number }): number {
  const { timeoutMs } = scenario;
  return timeoutMs;
}
@@ -559,11 +617,12 @@ export async function runMatrixQaLive(params: {
path.join(repoRoot, ".artifacts", "qa-e2e", `matrix-${Date.now().toString(36)}`);
await fs.mkdir(outputDir, { recursive: true });
const { providerMode, primaryModel, alternateModel } = resolveMatrixQaModels({
const defaultModels = resolveMatrixQaModels({
providerMode: params.providerMode,
primaryModel: params.primaryModel,
alternateModel: params.alternateModel,
});
const { providerMode } = defaultModels;
const sutAccountId = params.sutAccountId?.trim() || "sut";
const scenarios = findMatrixQaScenarios(params.scenarioIds, params.profile);
const runSuffix = randomUUID().slice(0, 8);
@@ -668,8 +727,16 @@ export async function runMatrixQaLive(params: {
const scheduledScenarios = scheduleMatrixQaScenariosInCatalogOrder(scenarios);
try {
const ensureGatewayHarness = async (overrides?: MatrixQaConfigOverrides) => {
const nextKey = buildMatrixQaGatewayConfigKey(overrides);
const ensureGatewayHarness = async (selection: MatrixQaGatewaySelection = {}) => {
const models = resolveMatrixQaGatewayModels({
defaultModels,
providerMode: selection.providerMode,
});
const overrides = selection.overrides;
const nextKey = buildMatrixQaGatewayConfigKey({
models,
overrides,
});
if (gatewayHarness && gatewayHarnessKey === nextKey) {
return {
durationMs: 0,
@@ -694,9 +761,9 @@ export async function runMatrixQaLive(params: {
createGatewayConfig: () => ({}),
},
transportBaseUrl: "http://127.0.0.1:43123",
providerMode,
primaryModel,
alternateModel,
providerMode: models.providerMode,
primaryModel: models.primaryModel,
alternateModel: models.alternateModel,
fastMode: params.fastMode,
controlUiEnabled: false,
mutateConfig: (cfg) =>
@@ -719,7 +786,9 @@ export async function runMatrixQaLive(params: {
};
{
const ensured = await ensureGatewayHarness();
const ensured = await ensureGatewayHarness({
providerMode: selectMatrixQaCanaryProviderMode(scheduledScenarios),
});
gatewayHarness = ensured.harness;
initialGatewayBootMs = ensured.durationMs;
}
@@ -781,7 +850,10 @@ export async function runMatrixQaLive(params: {
let transportInterruptMs = 0;
try {
writeMatrixQaProgress(`scenario start ${scenario.id}`);
const scenarioGateway = await ensureGatewayHarness(scenario.configOverrides);
const scenarioGateway = await ensureGatewayHarness({
overrides: scenario.configOverrides,
providerMode: scenario.providerMode,
});
gatewayBootMs = scenarioGateway.durationMs;
scenarioGatewayBootMs += gatewayBootMs;
const measuredScenario = await measureMatrixQaStep(() =>
@@ -1142,6 +1214,8 @@ export const testing = {
buildMatrixQaSummary,
getMatrixQaScenarioRestartReadyTimeoutMs,
scheduleMatrixQaScenariosInCatalogOrder,
selectMatrixQaCanaryProviderMode,
resolveMatrixQaGatewayModels,
MATRIX_QA_SCENARIOS,
buildMatrixQaConfig,
buildMatrixQaConfigSnapshot,

View File

@@ -1,3 +1,4 @@
import { type QaProviderModeInput } from "../../run-config.js";
import {
collectLiveTransportStandardScenarioCoverage,
selectLiveTransportScenarios,
@@ -113,6 +114,7 @@ export type MatrixQaE2eeScenarioId = Extract<MatrixQaScenarioId, `matrix-e2ee-${
export type MatrixQaScenarioDefinition = LiveTransportScenarioDefinition<MatrixQaScenarioId> & {
configOverrides?: MatrixQaConfigOverrides;
providerMode?: QaProviderModeInput;
topology?: MatrixQaTopologySpec;
};
@@ -454,6 +456,7 @@ export const MATRIX_QA_SCENARIOS: MatrixQaScenarioDefinition[] = [
timeoutMs: 75_000,
title: "Matrix block streaming preserves completed quiet preview blocks",
topology: MATRIX_QA_BLOCK_ROOM_TOPOLOGY,
providerMode: "mock-openai",
configOverrides: {
agentDefaults: {
blockStreamingChunk: {

View File

@@ -4173,6 +4173,7 @@ describe("matrix live qa scenarios", () => {
);
expect(body).toContain("Never put both markers in the same assistant text block.");
expect(scenario.configOverrides?.toolProfile).toBe("coding");
expect(scenario.providerMode).toBe("mock-openai");
expect(mockObjectArg(waitForRoomEvent, "waitForRoomEvent", 1).since).toBe(
"driver-sync-block-one",
);