diff --git a/extensions/qa-lab/src/providers/mock-openai/server.test.ts b/extensions/qa-lab/src/providers/mock-openai/server.test.ts index 86925bc5444..a8e0aed0367 100644 --- a/extensions/qa-lab/src/providers/mock-openai/server.test.ts +++ b/extensions/qa-lab/src/providers/mock-openai/server.test.ts @@ -397,6 +397,29 @@ describe("qa mock openai server", () => { expect(final.output[0]?.content?.[0]?.text).toBe("TOOL_PROGRESS_MARKER_OK"); }); + it("plans deterministic tool-progress exec commands from exact command prompts", async () => { + const server = await startMockServer(); + const command = + "rg -n 'matrix-progress-@room-@alice:matrix-qa.test-!room:matrix-qa.test.txt' . ; sleep 2"; + const prompt = `Tool progress QA check: call the exec tool exactly once with this exact command before answering: \`${command}\`. After that exec command completes or fails, reply exactly \`TOOL_PROGRESS_EXEC_OK\`.`; + + const response = await fetch(`${server.baseUrl}/v1/responses`, { + method: "POST", + headers: { + "content-type": "application/json", + }, + body: JSON.stringify({ + stream: true, + input: [makeUserInput(prompt)], + }), + }); + + expect(response.status).toBe(200); + const body = await response.text(); + expect(body).toContain('"name":"exec"'); + expect(body).toContain(command); + }); + it("honors exact replies after QA kickoff reads without marker wording", async () => { const server = await startMockServer(); const prompt = diff --git a/extensions/qa-lab/src/providers/mock-openai/server.ts b/extensions/qa-lab/src/providers/mock-openai/server.ts index e7ca3bf7d0b..2b097566ab5 100644 --- a/extensions/qa-lab/src/providers/mock-openai/server.ts +++ b/extensions/qa-lab/src/providers/mock-openai/server.ts @@ -635,6 +635,14 @@ function readTargetFromPrompt(prompt: string) { return "repo/package.json"; } +function execCommandFromToolProgressPrompt(prompt: string) { + return ( + /call the exec tool exactly once with this exact command before answering:\s*`([^`]+)`/i + .exec(prompt)?.[1] + ?.trim() || null + ); +} + function buildToolCallEventsWithArgs(name: string, args: Record): StreamEvent[] { const serialized = JSON.stringify(args); const callSuffix = createHash("sha1") @@ -1626,6 +1634,11 @@ async function buildResponsesPayload( path: readTargetFromPrompt(toolProgressPrompt || prompt || allInputText), }); }; + const buildToolProgressExecEvents = (pattern: RegExp) => { + const toolProgressPrompt = extractLastMatchingUserText(extractAllUserTexts(input), pattern); + const command = execCommandFromToolProgressPrompt(toolProgressPrompt || prompt || allInputText); + return command ? buildToolCallEventsWithArgs("exec", { command }) : null; + }; if ( (QA_TOOL_SEARCH_PROMPT_RE.test(allInputText) || QA_TOOL_SEARCH_FAILURE_PROMPT_RE.test(allInputText)) && @@ -1812,7 +1825,10 @@ async function buildResponsesPayload( } if (QA_TOOL_PROGRESS_PROMPT_RE.test(allInputText) && toolProgressReplyDirective) { if (!toolOutput) { - return buildToolProgressReadEvents(QA_TOOL_PROGRESS_PROMPT_RE); + return ( + buildToolProgressExecEvents(QA_TOOL_PROGRESS_PROMPT_RE) ?? + buildToolProgressReadEvents(QA_TOOL_PROGRESS_PROMPT_RE) + ); } return buildAssistantEvents(toolProgressReplyDirective); } diff --git a/extensions/qa-matrix/src/runners/contract/scenario-runtime-room.ts b/extensions/qa-matrix/src/runners/contract/scenario-runtime-room.ts index f7c58b450d8..0812e30e34e 100644 --- a/extensions/qa-matrix/src/runners/contract/scenario-runtime-room.ts +++ b/extensions/qa-matrix/src/runners/contract/scenario-runtime-room.ts @@ -32,7 +32,6 @@ import { createMatrixQaScenarioClient, isMatrixQaExactMarkerReply, isMatrixQaMessageLikeKind, - MATRIX_QA_TOOL_PROGRESS_MENTION_FILENAME, MATRIX_QA_TOOL_PROGRESS_TASK_FILENAME, primeMatrixQaActorCursor, primeMatrixQaDriverScenarioClient, @@ -870,7 +869,6 @@ async function runMatrixToolProgressScenario( finalText: string; allowFinalOnly?: boolean; allowTopLevelFinalWithProgress?: boolean; - taskFilename?: string; label: string; allowGenericProgressLine?: boolean; mentionSafety?: boolean; @@ -880,7 +878,7 @@ async function runMatrixToolProgressScenario( ) { const { client, startSince } = await primeMatrixQaDriverScenarioClient(context); const startObservedIndex = context.observedEvents.length; - await writeMatrixToolProgressTaskFile(context, params.finalText, params.taskFilename); + await writeMatrixToolProgressTaskFile(context, params.finalText); const triggerBody = params.triggerBodyBuilder(context.sutUserId, params.finalText); const driverEventId = await client.sendTextMessage({ body: triggerBody, @@ -1188,13 +1186,12 @@ async function runMatrixToolProgressScenario( async function writeMatrixToolProgressTaskFile( context: MatrixQaScenarioContext, finalText: string, - taskFilename = MATRIX_QA_TOOL_PROGRESS_TASK_FILENAME, ) { if (!context.gatewayWorkspaceDir) { return; } await writeFile( - path.join(context.gatewayWorkspaceDir, taskFilename), + path.join(context.gatewayWorkspaceDir, MATRIX_QA_TOOL_PROGRESS_TASK_FILENAME), `${buildMatrixToolProgressTaskContent(finalText)}\n`, "utf8", ); @@ -1231,7 +1228,6 @@ export async function runToolProgressMentionSafetyScenario(context: MatrixQaScen finalText: buildMatrixQaToken("MATRIX_QA_TOOL_PROGRESS_MENTION_SAFE"), label: "tool progress mention safety", mentionSafety: true, - taskFilename: MATRIX_QA_TOOL_PROGRESS_MENTION_FILENAME, progressPattern: /@room|@alice:matrix-qa\.test|!room:matrix-qa\.test/i, triggerBodyBuilder: buildMatrixToolProgressMentionSafetyPrompt, }); diff --git a/extensions/qa-matrix/src/runners/contract/scenario-runtime-shared.ts b/extensions/qa-matrix/src/runners/contract/scenario-runtime-shared.ts index fb7a2f8ba48..f4b7ed25c09 100644 --- a/extensions/qa-matrix/src/runners/contract/scenario-runtime-shared.ts +++ b/extensions/qa-matrix/src/runners/contract/scenario-runtime-shared.ts @@ -121,10 +121,12 @@ export function buildMatrixToolProgressErrorPrompt(sutUserId: string, text: stri } export function buildMatrixToolProgressMentionSafetyPrompt(sutUserId: string, text: string) { + const command = `rg -n '${MATRIX_QA_TOOL_PROGRESS_MENTION_FILENAME}' . ; sleep 2`; return [ - `${sutUserId} Tool progress QA check: call the read tool exactly once on \`${MATRIX_QA_TOOL_PROGRESS_MENTION_FILENAME}\` before answering.`, - `Do not use search for this check.`, - `After the read completes, reply exactly \`${text}\`.`, + `${sutUserId} Tool progress QA check: call the exec tool exactly once with this exact command before answering: \`${command}\`.`, + `The QA harness must observe that exec tool call in a Matrix tool-progress preview.`, + `Do not guess or send any marker before the tool result returns.`, + `After that exec command completes or fails, reply exactly \`${text}\`.`, ].join(" "); } diff --git a/extensions/qa-matrix/src/runners/contract/scenarios.test.ts b/extensions/qa-matrix/src/runners/contract/scenarios.test.ts index 8e55b2259e5..9dafd990873 100644 --- a/extensions/qa-matrix/src/runners/contract/scenarios.test.ts +++ b/extensions/qa-matrix/src/runners/contract/scenarios.test.ts @@ -3865,87 +3865,73 @@ describe("matrix live qa scenarios", () => { it("keeps Matrix-looking tool progress mentions inert in partial previews", async () => { const previewEventId = "$tool-progress-mention-preview"; - const gatewayWorkspaceDir = await mkdtemp(path.join(os.tmpdir(), "matrix-qa-workspace-")); - try { - const { sendTextMessage } = mockMatrixQaRoomClient({ - driverEventId: "$tool-progress-mention-trigger", - events: [ - { - event: matrixQaMessageEvent({ - kind: "message", + const { sendTextMessage } = mockMatrixQaRoomClient({ + driverEventId: "$tool-progress-mention-trigger", + events: [ + { + event: matrixQaMessageEvent({ + kind: "message", + eventId: previewEventId, + body: "Working...\n- `tool: exec`", + }), + since: "driver-sync-preview", + }, + { + event: matrixQaMessageEvent({ + kind: "message", + eventId: "$tool-progress-mention-edit", + body: + 'Working...\n- `search "matrix-progress-@room-@alice:matrix-qa.test-!room:matrix-qa.test.txt" in . -> run sleep 2`', + formattedBody: + 'Working...
', + mentions: {}, + relatesTo: { + relType: "m.replace", eventId: previewEventId, - body: "Working...\n- `tool: read`", - }), - since: "driver-sync-preview", - }, - { - event: matrixQaMessageEvent({ + }, + }), + since: "driver-sync-progress", + }, + { + event: ({ sendTextMessage }) => + matrixQaMessageEvent({ kind: "message", - eventId: "$tool-progress-mention-edit", - body: "Working...\n- `tool: read`\n- `read from matrix-progress-@room-@alice:matrix-qa.test-!room:matrix-qa.test.txt`", - formattedBody: - "Working...
", - mentions: {}, + eventId: "$tool-progress-mention-final", + body: readMatrixQaReplyDirective( + mockMessageBody(sendTextMessage, "sendTextMessage"), + "MATRIX_QA_TOOL_PROGRESS_MENTION_SAFE_FIXED", + ), relatesTo: { relType: "m.replace", eventId: previewEventId, }, }), - since: "driver-sync-progress", - }, - { - event: ({ sendTextMessage }) => - matrixQaMessageEvent({ - kind: "message", - eventId: "$tool-progress-mention-final", - body: readMatrixQaReplyDirective( - mockMessageBody(sendTextMessage, "sendTextMessage"), - "MATRIX_QA_TOOL_PROGRESS_MENTION_SAFE_FIXED", - ), - relatesTo: { - relType: "m.replace", - eventId: previewEventId, - }, - }), - since: "driver-sync-next", - }, - ], - }); + since: "driver-sync-next", + }, + ], + }); - const scenario = requireMatrixQaScenario("matrix-room-tool-progress-mention-safety"); + const scenario = requireMatrixQaScenario("matrix-room-tool-progress-mention-safety"); - const result = await runMatrixQaScenario(scenario, { - ...matrixQaScenarioContext(), - gatewayWorkspaceDir, - }); - const artifacts = result.artifacts as { - driverEventId?: unknown; - previewEventId?: unknown; - previewMentions?: unknown; - reply?: { eventId?: unknown }; - token?: unknown; - }; - expect(artifacts.driverEventId).toBe("$tool-progress-mention-trigger"); - expect(artifacts.previewEventId).toBe("$tool-progress-mention-preview"); - expect(artifacts.previewMentions).toEqual({}); - expect(artifacts.reply?.eventId).toBe("$tool-progress-mention-final"); - const prompt = mockMessageBody(sendTextMessage, "sendTextMessage"); - expect(prompt).toContain( - "call the read tool exactly once on `matrix-progress-@room-@alice:matrix-qa.test-!room:matrix-qa.test.txt`", - ); - expect(prompt).toContain("Do not use search for this check."); - await expect( - readFile( - path.join( - gatewayWorkspaceDir, - "matrix-progress-@room-@alice:matrix-qa.test-!room:matrix-qa.test.txt", - ), - "utf8", - ), - ).resolves.toContain(String(artifacts.token)); - } finally { - await rm(gatewayWorkspaceDir, { force: true, recursive: true }); - } + const result = await runMatrixQaScenario(scenario, matrixQaScenarioContext()); + const artifacts = result.artifacts as { + driverEventId?: unknown; + previewEventId?: unknown; + previewMentions?: unknown; + reply?: { eventId?: unknown }; + }; + expect(artifacts.driverEventId).toBe("$tool-progress-mention-trigger"); + expect(artifacts.previewEventId).toBe("$tool-progress-mention-preview"); + expect(artifacts.previewMentions).toEqual({}); + expect(artifacts.reply?.eventId).toBe("$tool-progress-mention-final"); + const prompt = mockMessageBody(sendTextMessage, "sendTextMessage"); + expect(prompt).toContain( + "call the exec tool exactly once with this exact command before answering", + ); + expect(prompt).toContain( + "`rg -n 'matrix-progress-@room-@alice:matrix-qa.test-!room:matrix-qa.test.txt' . ; sleep 2`", + ); + expect(prompt).toContain("The QA harness must observe that exec tool call"); }); it("preserves separate finalized block events when Matrix block streaming is enabled", async () => {