fix(qa): force Matrix mention progress search

This commit is contained in:
Vincent Koc
2026-05-27 14:30:27 +02:00
parent c9d4f7e35c
commit e2f6734dac
5 changed files with 105 additions and 82 deletions

View File

@@ -397,6 +397,29 @@ describe("qa mock openai server", () => {
expect(final.output[0]?.content?.[0]?.text).toBe("TOOL_PROGRESS_MARKER_OK");
});
it("plans deterministic tool-progress exec commands from exact command prompts", async () => {
  // Command quoted in backticks inside the prompt; the streamed response is
  // expected to contain this exact text.
  const execCommand =
    "rg -n 'matrix-progress-@room-@alice:matrix-qa.test-!room:matrix-qa.test.txt' . ; sleep 2";
  const userPrompt = `Tool progress QA check: call the exec tool exactly once with this exact command before answering: \`${execCommand}\`. After that exec command completes or fails, reply exactly \`TOOL_PROGRESS_EXEC_OK\`.`;
  const { baseUrl } = await startMockServer();

  // Single streaming request carrying only the user prompt (no tool output yet).
  const requestBody = JSON.stringify({
    stream: true,
    input: [makeUserInput(userPrompt)],
  });
  const response = await fetch(`${baseUrl}/v1/responses`, {
    method: "POST",
    headers: { "content-type": "application/json" },
    body: requestBody,
  });

  expect(response.status).toBe(200);
  // The raw stream text must name the exec tool and carry the command.
  const streamed = await response.text();
  expect(streamed).toContain('"name":"exec"');
  expect(streamed).toContain(execCommand);
});
it("honors exact replies after QA kickoff reads without marker wording", async () => {
const server = await startMockServer();
const prompt =

View File

@@ -635,6 +635,14 @@ function readTargetFromPrompt(prompt: string) {
return "repo/package.json";
}
/**
 * Pulls the backtick-quoted command out of a tool-progress exec prompt.
 *
 * The prompt must contain the phrase "call the exec tool exactly once with
 * this exact command before answering:" (matched case-insensitively),
 * optionally followed by whitespace and then a backtick-delimited command.
 *
 * @param prompt - Prompt text to scan.
 * @returns The captured command with surrounding whitespace trimmed, or
 *   `null` when the phrase is absent or the captured text is blank.
 */
function execCommandFromToolProgressPrompt(prompt: string) {
  const match = prompt.match(
    /call the exec tool exactly once with this exact command before answering:\s*`([^`]+)`/i,
  );
  const command = match?.[1]?.trim();
  // A whitespace-only capture trims to "" and is treated as "no command".
  return command ? command : null;
}
function buildToolCallEventsWithArgs(name: string, args: Record<string, unknown>): StreamEvent[] {
const serialized = JSON.stringify(args);
const callSuffix = createHash("sha1")
@@ -1626,6 +1634,11 @@ async function buildResponsesPayload(
path: readTargetFromPrompt(toolProgressPrompt || prompt || allInputText),
});
};
// Builds an `exec` tool-call event list from the command quoted in the
// tool-progress prompt. Yields null when no command can be extracted, which
// lets the caller fall back to another plan via `??`.
const buildToolProgressExecEvents = (pattern: RegExp) => {
  const userTexts = extractAllUserTexts(input);
  const matchedPrompt = extractLastMatchingUserText(userTexts, pattern);
  // Prefer the user text selected for `pattern`; otherwise scan the current
  // prompt, then the combined input text.
  const execCommand = execCommandFromToolProgressPrompt(
    matchedPrompt || prompt || allInputText,
  );
  if (!execCommand) {
    return null;
  }
  return buildToolCallEventsWithArgs("exec", { command: execCommand });
};
if (
(QA_TOOL_SEARCH_PROMPT_RE.test(allInputText) ||
QA_TOOL_SEARCH_FAILURE_PROMPT_RE.test(allInputText)) &&
@@ -1812,7 +1825,10 @@ async function buildResponsesPayload(
}
if (QA_TOOL_PROGRESS_PROMPT_RE.test(allInputText) && toolProgressReplyDirective) {
if (!toolOutput) {
return buildToolProgressReadEvents(QA_TOOL_PROGRESS_PROMPT_RE);
return (
buildToolProgressExecEvents(QA_TOOL_PROGRESS_PROMPT_RE) ??
buildToolProgressReadEvents(QA_TOOL_PROGRESS_PROMPT_RE)
);
}
return buildAssistantEvents(toolProgressReplyDirective);
}

View File

@@ -32,7 +32,6 @@ import {
createMatrixQaScenarioClient,
isMatrixQaExactMarkerReply,
isMatrixQaMessageLikeKind,
MATRIX_QA_TOOL_PROGRESS_MENTION_FILENAME,
MATRIX_QA_TOOL_PROGRESS_TASK_FILENAME,
primeMatrixQaActorCursor,
primeMatrixQaDriverScenarioClient,
@@ -870,7 +869,6 @@ async function runMatrixToolProgressScenario(
finalText: string;
allowFinalOnly?: boolean;
allowTopLevelFinalWithProgress?: boolean;
taskFilename?: string;
label: string;
allowGenericProgressLine?: boolean;
mentionSafety?: boolean;
@@ -880,7 +878,7 @@ async function runMatrixToolProgressScenario(
) {
const { client, startSince } = await primeMatrixQaDriverScenarioClient(context);
const startObservedIndex = context.observedEvents.length;
await writeMatrixToolProgressTaskFile(context, params.finalText, params.taskFilename);
await writeMatrixToolProgressTaskFile(context, params.finalText);
const triggerBody = params.triggerBodyBuilder(context.sutUserId, params.finalText);
const driverEventId = await client.sendTextMessage({
body: triggerBody,
@@ -1188,13 +1186,12 @@ async function runMatrixToolProgressScenario(
async function writeMatrixToolProgressTaskFile(
context: MatrixQaScenarioContext,
finalText: string,
taskFilename = MATRIX_QA_TOOL_PROGRESS_TASK_FILENAME,
) {
if (!context.gatewayWorkspaceDir) {
return;
}
await writeFile(
path.join(context.gatewayWorkspaceDir, taskFilename),
path.join(context.gatewayWorkspaceDir, MATRIX_QA_TOOL_PROGRESS_TASK_FILENAME),
`${buildMatrixToolProgressTaskContent(finalText)}\n`,
"utf8",
);
@@ -1231,7 +1228,6 @@ export async function runToolProgressMentionSafetyScenario(context: MatrixQaScen
finalText: buildMatrixQaToken("MATRIX_QA_TOOL_PROGRESS_MENTION_SAFE"),
label: "tool progress mention safety",
mentionSafety: true,
taskFilename: MATRIX_QA_TOOL_PROGRESS_MENTION_FILENAME,
progressPattern: /@room|@alice:matrix-qa\.test|!room:matrix-qa\.test/i,
triggerBodyBuilder: buildMatrixToolProgressMentionSafetyPrompt,
});

View File

@@ -121,10 +121,12 @@ export function buildMatrixToolProgressErrorPrompt(sutUserId: string, text: stri
}
/**
 * Builds the trigger message for the Matrix tool-progress mention-safety
 * scenario.
 *
 * @param sutUserId - Prefixed to the sentences that open the request.
 * @param text - Exact reply the prompt asks for once the tool call is done.
 * @returns All prompt sentences joined with single spaces.
 */
export function buildMatrixToolProgressMentionSafetyPrompt(sutUserId: string, text: string) {
// `rg` search for the mention-like filename in the current directory, followed by `sleep 2`.
const command = `rg -n '${MATRIX_QA_TOOL_PROGRESS_MENTION_FILENAME}' . ; sleep 2`;
return [
// NOTE(review): the next three sentences ask for one read-tool call and forbid
// search, while the four after them ask for one exec call running the `rg`
// search above. This looks like the removed and added sides of a diff shown
// together - confirm which set actually belongs in the file.
`${sutUserId} Tool progress QA check: call the read tool exactly once on \`${MATRIX_QA_TOOL_PROGRESS_MENTION_FILENAME}\` before answering.`,
`Do not use search for this check.`,
`After the read completes, reply exactly \`${text}\`.`,
// The command is wrapped in backticks after the "call the exec tool exactly once ..." phrase.
`${sutUserId} Tool progress QA check: call the exec tool exactly once with this exact command before answering: \`${command}\`.`,
`The QA harness must observe that exec tool call in a Matrix tool-progress preview.`,
`Do not guess or send any marker before the tool result returns.`,
`After that exec command completes or fails, reply exactly \`${text}\`.`,
].join(" ");
}

View File

@@ -3865,87 +3865,73 @@ describe("matrix live qa scenarios", () => {
it("keeps Matrix-looking tool progress mentions inert in partial previews", async () => {
const previewEventId = "$tool-progress-mention-preview";
const gatewayWorkspaceDir = await mkdtemp(path.join(os.tmpdir(), "matrix-qa-workspace-"));
try {
const { sendTextMessage } = mockMatrixQaRoomClient({
driverEventId: "$tool-progress-mention-trigger",
events: [
{
event: matrixQaMessageEvent({
kind: "message",
const { sendTextMessage } = mockMatrixQaRoomClient({
driverEventId: "$tool-progress-mention-trigger",
events: [
{
event: matrixQaMessageEvent({
kind: "message",
eventId: previewEventId,
body: "Working...\n- `tool: exec`",
}),
since: "driver-sync-preview",
},
{
event: matrixQaMessageEvent({
kind: "message",
eventId: "$tool-progress-mention-edit",
body:
'Working...\n- `search "matrix-progress-@room-@alice:matrix-qa.test-!room:matrix-qa.test.txt" in . -> run sleep 2`',
formattedBody:
'Working...<br><ul><li><code>search "matrix-progress-@room-@alice:matrix-qa.test-!room:matrix-qa.test.txt" in . -&gt; run sleep 2</code></li></ul>',
mentions: {},
relatesTo: {
relType: "m.replace",
eventId: previewEventId,
body: "Working...\n- `tool: read`",
}),
since: "driver-sync-preview",
},
{
event: matrixQaMessageEvent({
},
}),
since: "driver-sync-progress",
},
{
event: ({ sendTextMessage }) =>
matrixQaMessageEvent({
kind: "message",
eventId: "$tool-progress-mention-edit",
body: "Working...\n- `tool: read`\n- `read from matrix-progress-@room-@alice:matrix-qa.test-!room:matrix-qa.test.txt`",
formattedBody:
"Working...<br><ul><li><code>read from matrix-progress-@room-@alice:matrix-qa.test-!room:matrix-qa.test.txt</code></li></ul>",
mentions: {},
eventId: "$tool-progress-mention-final",
body: readMatrixQaReplyDirective(
mockMessageBody(sendTextMessage, "sendTextMessage"),
"MATRIX_QA_TOOL_PROGRESS_MENTION_SAFE_FIXED",
),
relatesTo: {
relType: "m.replace",
eventId: previewEventId,
},
}),
since: "driver-sync-progress",
},
{
event: ({ sendTextMessage }) =>
matrixQaMessageEvent({
kind: "message",
eventId: "$tool-progress-mention-final",
body: readMatrixQaReplyDirective(
mockMessageBody(sendTextMessage, "sendTextMessage"),
"MATRIX_QA_TOOL_PROGRESS_MENTION_SAFE_FIXED",
),
relatesTo: {
relType: "m.replace",
eventId: previewEventId,
},
}),
since: "driver-sync-next",
},
],
});
since: "driver-sync-next",
},
],
});
const scenario = requireMatrixQaScenario("matrix-room-tool-progress-mention-safety");
const scenario = requireMatrixQaScenario("matrix-room-tool-progress-mention-safety");
const result = await runMatrixQaScenario(scenario, {
...matrixQaScenarioContext(),
gatewayWorkspaceDir,
});
const artifacts = result.artifacts as {
driverEventId?: unknown;
previewEventId?: unknown;
previewMentions?: unknown;
reply?: { eventId?: unknown };
token?: unknown;
};
expect(artifacts.driverEventId).toBe("$tool-progress-mention-trigger");
expect(artifacts.previewEventId).toBe("$tool-progress-mention-preview");
expect(artifacts.previewMentions).toEqual({});
expect(artifacts.reply?.eventId).toBe("$tool-progress-mention-final");
const prompt = mockMessageBody(sendTextMessage, "sendTextMessage");
expect(prompt).toContain(
"call the read tool exactly once on `matrix-progress-@room-@alice:matrix-qa.test-!room:matrix-qa.test.txt`",
);
expect(prompt).toContain("Do not use search for this check.");
await expect(
readFile(
path.join(
gatewayWorkspaceDir,
"matrix-progress-@room-@alice:matrix-qa.test-!room:matrix-qa.test.txt",
),
"utf8",
),
).resolves.toContain(String(artifacts.token));
} finally {
await rm(gatewayWorkspaceDir, { force: true, recursive: true });
}
const result = await runMatrixQaScenario(scenario, matrixQaScenarioContext());
const artifacts = result.artifacts as {
driverEventId?: unknown;
previewEventId?: unknown;
previewMentions?: unknown;
reply?: { eventId?: unknown };
};
expect(artifacts.driverEventId).toBe("$tool-progress-mention-trigger");
expect(artifacts.previewEventId).toBe("$tool-progress-mention-preview");
expect(artifacts.previewMentions).toEqual({});
expect(artifacts.reply?.eventId).toBe("$tool-progress-mention-final");
const prompt = mockMessageBody(sendTextMessage, "sendTextMessage");
expect(prompt).toContain(
"call the exec tool exactly once with this exact command before answering",
);
expect(prompt).toContain(
"`rg -n 'matrix-progress-@room-@alice:matrix-qa.test-!room:matrix-qa.test.txt' . ; sleep 2`",
);
expect(prompt).toContain("The QA harness must observe that exec tool call");
});
it("preserves separate finalized block events when Matrix block streaming is enabled", async () => {