diff --git a/extensions/qa-lab/src/providers/mock-openai/server.test.ts b/extensions/qa-lab/src/providers/mock-openai/server.test.ts
index 8d0dace3ba9..3e10546b3eb 100644
--- a/extensions/qa-lab/src/providers/mock-openai/server.test.ts
+++ b/extensions/qa-lab/src/providers/mock-openai/server.test.ts
@@ -206,6 +206,21 @@ describe("qa mock openai server", () => {
     expect(quietBody).toContain('"phase":"final_answer"');
     expect(quietBody).toContain("QA_STREAMING_OK");
 
+    const partialResponse = await fetch(`${server.baseUrl}/v1/responses`, {
+      method: "POST",
+      headers: {
+        "content-type": "application/json",
+      },
+      body: JSON.stringify({
+        stream: true,
+        input: [makeUserInput("Partial streaming QA check: reply exactly `QA_PARTIAL_OK`.")],
+      }),
+    });
+    expect(partialResponse.status).toBe(200);
+    const partialBody = await partialResponse.text();
+    expect(partialBody).toContain('"type":"response.output_text.delta"');
+    expect(partialBody).toContain("QA_PARTIAL_OK");
+
     const blockResponse = await fetch(`${server.baseUrl}/v1/responses`, {
       method: "POST",
       headers: {
@@ -228,6 +243,113 @@ describe("qa mock openai server", () => {
     expect(blockBody).toContain("BLOCK_TWO_OK");
   });
 
+  it("plans deterministic tool-progress reads from prompt paths", async () => {
+    const server = await startMockServer();
+
+    const response = await fetch(`${server.baseUrl}/v1/responses`, {
+      method: "POST",
+      headers: {
+        "content-type": "application/json",
+      },
+      body: JSON.stringify({
+        stream: true,
+        input: [
+          makeUserInput(
+            "Tool progress QA check: read `qa-progress-target.txt` before answering. After the read completes, reply exactly `TOOL_PROGRESS_OK`.",
+          ),
+        ],
+      }),
+    });
+
+    expect(response.status).toBe(200);
+    const body = await response.text();
+    expect(body).toContain('"name":"read"');
+    expect(body).toContain("qa-progress-target.txt");
+  });
+
+  it("requires deterministic tool-progress error prompts to observe a failed tool", async () => {
+    const server = await startMockServer();
+    const prompt =
+      "Tool progress error QA check: read `missing-tool-progress-target.txt` before answering. After the read fails, reply exactly `TOOL_PROGRESS_ERROR_OK`.";
+
+    const toolPlan = await fetch(`${server.baseUrl}/v1/responses`, {
+      method: "POST",
+      headers: {
+        "content-type": "application/json",
+      },
+      body: JSON.stringify({
+        stream: true,
+        input: [makeUserInput(prompt)],
+      }),
+    });
+
+    expect(toolPlan.status).toBe(200);
+    const toolPlanBody = await toolPlan.text();
+    expect(toolPlanBody).toContain('"name":"read"');
+    expect(toolPlanBody).toContain("missing-tool-progress-target.txt");
+
+    const successOutput = await expectResponsesJson<{
+      output: Array<{ content?: Array<{ text?: string }> }>;
+    }>(server, {
+      stream: false,
+      input: [
+        makeUserInput(prompt),
+        {
+          type: "function_call_output",
+          call_id: "call_mock_read_1",
+          output: JSON.stringify({ text: "unexpected success" }),
+        },
+      ],
+    });
+    expect(successOutput.output[0]?.content?.[0]?.text).toBe("BUG-TOOL-DID-NOT-FAIL");
+
+    const errorOutput = await expectResponsesJson<{
+      output: Array<{ content?: Array<{ text?: string }> }>;
+    }>(server, {
+      stream: false,
+      input: [
+        makeUserInput(prompt),
+        {
+          type: "function_call_output",
+          call_id: "call_mock_read_1",
+          output: JSON.stringify({ error: "ENOENT: no such file or directory" }),
+        },
+      ],
+    });
+    expect(errorOutput.output[0]?.content?.[0]?.text).toBe("TOOL_PROGRESS_ERROR_OK");
+  });
+
+  it("uses the latest user prompt path for tool-progress plans", async () => {
+    const server = await startMockServer();
+
+    const response = await fetch(`${server.baseUrl}/v1/responses`, {
+      method: "POST",
+      headers: {
+        "content-type": "application/json",
+      },
+      body: JSON.stringify({
+        stream: true,
+        input: [
+          makeUserInput(
+            "Tool progress QA check: read `older-progress-target.txt` before answering. After the read completes, reply exactly `OLD_PROGRESS_OK`.",
+          ),
+          makeUserInput(
+            "Tool progress error QA check: read `latest-missing-progress-target.txt` before answering. After the read fails, reply exactly `LATEST_PROGRESS_OK`.",
+          ),
+          makeUserInput(
+            "Continue with the QA scenario plan and report worked, failed, and blocked items.",
+          ),
+        ],
+      }),
+    });
+
+    expect(response.status).toBe(200);
+    const body = await response.text();
+    expect(body).toContain('"name":"read"');
+    expect(body).toContain("latest-missing-progress-target.txt");
+    expect(body).not.toContain("older-progress-target.txt");
+  });
+
   it("prefers path-like refs over generic quoted keys in prompts", async () => {
     const server = await startQaMockOpenAiServer({
       host: "127.0.0.1",
@@ -1674,7 +1796,7 @@ describe("qa mock openai server", () => {
         content: [
           {
             type: "input_text",
-            text: "@qa-sut:matrix-qa.test reply with only this exact marker: MATRIX_QA_CANARY_TEST",
+            text: "@qa-sut.example.test reply with only this exact marker: QA_CANARY_TEST",
           },
         ],
       },
@@ -1695,7 +1817,7 @@ describe("qa mock openai server", () => {
     expect(await response.json()).toMatchObject({
       output: [
         {
-          content: [{ text: "MATRIX_QA_CANARY_TEST" }],
+          content: [{ text: "QA_CANARY_TEST" }],
         },
       ],
     });
@@ -1710,8 +1832,8 @@ describe("qa mock openai server", () => {
       await server.stop();
     });
 
-    const matrixPrompt =
-      "@qa-sut:matrix-qa.test Image generation check: generate a QA lighthouse image and summarize it in one short sentence.";
+    const channelPrompt =
+      "@qa-sut.example.test Image generation check: generate a QA lighthouse image and summarize it in one short sentence.";
     const genericPrompt =
       "Continue with the QA scenario plan and report worked, failed, and blocked items.";
 
@@ -1722,7 +1844,7 @@ describe("qa mock openai server", () => {
       },
       body: JSON.stringify({
         stream: false,
-        input: [makeUserInput(matrixPrompt), makeUserInput(genericPrompt)],
+        input: [makeUserInput(channelPrompt), makeUserInput(genericPrompt)],
       }),
     });
 
@@ -1745,7 +1867,7 @@ describe("qa mock openai server", () => {
       body: JSON.stringify({
         stream: false,
         input: [
-          makeUserInput(matrixPrompt),
+          makeUserInput(channelPrompt),
           makeUserInput(genericPrompt),
           {
             type: "function_call",
diff --git a/extensions/qa-lab/src/providers/mock-openai/server.ts b/extensions/qa-lab/src/providers/mock-openai/server.ts
index 7992a6ece48..22adf2515c2 100644
--- a/extensions/qa-lab/src/providers/mock-openai/server.ts
+++ b/extensions/qa-lab/src/providers/mock-openai/server.ts
@@ -145,8 +145,10 @@ const QA_THINKING_VISIBILITY_OFF_PROMPT_RE = /qa thinking visibility check off/i
 const QA_THINKING_VISIBILITY_MAX_PROMPT_RE = /qa thinking visibility check max/i;
 const QA_EMPTY_RESPONSE_RECOVERY_PROMPT_RE = /empty response continuation qa check/i;
 const QA_EMPTY_RESPONSE_EXHAUSTION_PROMPT_RE = /empty response exhaustion qa check/i;
-const QA_QUIET_STREAMING_PROMPT_RE = /quiet streaming qa check/i;
+const QA_STREAMING_PROMPT_RE = /(?:partial|quiet) streaming qa check/i;
 const QA_BLOCK_STREAMING_PROMPT_RE = /block streaming qa check/i;
+const QA_TOOL_PROGRESS_ERROR_PROMPT_RE = /tool progress error qa check/i;
+const QA_TOOL_PROGRESS_PROMPT_RE = /tool progress qa check/i;
 const QA_SUBAGENT_DIRECT_FALLBACK_PROMPT_RE = /subagent direct fallback qa check/i;
 const QA_SUBAGENT_DIRECT_FALLBACK_WORKER_RE = /subagent direct fallback worker/i;
 const QA_SUBAGENT_DIRECT_FALLBACK_MARKER = "QA-SUBAGENT-DIRECT-FALLBACK-OK";
@@ -530,6 +532,16 @@ function extractLastCapture(text: string, pattern: RegExp) {
   return lastMatch?.[1]?.trim() || null;
 }
 
+// Walk the user texts newest-first and return the most recent one matching
+// `pattern`; empty string when none matches.
+function extractLastMatchingUserText(texts: string[], pattern: RegExp) {
+  for (let index = texts.length - 1; index >= 0; index -= 1) {
+    const text = texts[index] ?? "";
+    if (pattern.test(text)) {
+      return text;
+    }
+  }
+  return "";
+}
+
 function extractExactReplyDirective(text: string) {
   const backtickedMatch = extractLastCapture(text, /reply(?: with)? exactly\s+`([^`]+)`/i);
   if (backtickedMatch) {
@@ -642,6 +654,19 @@ function extractToolErrorForNamedCall(params: {
   return undefined;
 }
 
+// Heuristic: does the tool output represent a failure? Checks a structured
+// `error` string, a failure-ish `status`, then falls back to scanning the raw
+// output text for error keywords.
+function hasToolErrorOutput(toolJson: Record<string, unknown> | null, toolOutput: string) {
+  if (typeof toolJson?.error === "string" && toolJson.error.trim()) {
+    return true;
+  }
+  if (
+    typeof toolJson?.status === "string" &&
+    /\b(?:error|failed|failure)\b/i.test(toolJson.status)
+  ) {
+    return true;
+  }
+  return /\b(?:error|failed|failure|not found|no such file|enoent)\b/i.test(toolOutput);
+}
+
 function isHeartbeatPrompt(text: string) {
   const trimmed = text.trim();
   if (!trimmed || /remember this fact/i.test(trimmed)) {
@@ -1165,6 +1190,12 @@ async function buildResponsesPayload(
   const hasEmptyResponseRetryInstruction = allInputText.includes(QA_EMPTY_RESPONSE_RETRY_NEEDLE);
   const canCallSessionsSpawn = hasDeclaredTool(body, "sessions_spawn");
   const canCallSessionsYield = hasDeclaredTool(body, "sessions_yield");
+  const buildToolProgressReadEvents = (pattern: RegExp) => {
+    const toolProgressPrompt = extractLastMatchingUserText(extractAllUserTexts(input), pattern);
+    return buildToolCallEventsWithArgs("read", {
+      path: readTargetFromPrompt(toolProgressPrompt || prompt || allInputText),
+    });
+  };
   if (
     allInputText.includes(QA_SUBAGENT_DIRECT_FALLBACK_MARKER) &&
     /Internal task completion event/i.test(allInputText)
@@ -1250,7 +1281,7 @@ async function buildResponsesPayload(
     }
     return buildAssistantEvents("");
   }
-  if (QA_QUIET_STREAMING_PROMPT_RE.test(allInputText) && exactReplyDirective) {
+  if (QA_STREAMING_PROMPT_RE.test(allInputText) && exactReplyDirective) {
     return buildAssistantEvents([
       {
         id: "msg_mock_quiet_stream",
@@ -1260,6 +1291,20 @@ async function buildResponsesPayload(
       },
     ]);
   }
+  if (QA_TOOL_PROGRESS_ERROR_PROMPT_RE.test(allInputText) && exactReplyDirective) {
+    if (!toolOutput) {
+      return buildToolProgressReadEvents(QA_TOOL_PROGRESS_ERROR_PROMPT_RE);
+    }
+    return buildAssistantEvents(
+      hasToolErrorOutput(toolJson, toolOutput) ? exactReplyDirective : "BUG-TOOL-DID-NOT-FAIL",
+    );
+  }
+  if (QA_TOOL_PROGRESS_PROMPT_RE.test(allInputText) && exactReplyDirective) {
+    if (!toolOutput) {
+      return buildToolProgressReadEvents(QA_TOOL_PROGRESS_PROMPT_RE);
+    }
+    return buildAssistantEvents(exactReplyDirective);
+  }
   if (
     QA_BLOCK_STREAMING_PROMPT_RE.test(allInputText) &&
     firstExactMarkerDirective &&