qa-lab: generalize tool progress prompts

This commit is contained in:
Gustavo Madeira Santana
2026-04-27 16:19:20 -04:00
parent 24068f19c6
commit 3132f4990c
2 changed files with 175 additions and 8 deletions

View File

@@ -206,6 +206,21 @@ describe("qa mock openai server", () => {
expect(quietBody).toContain('"phase":"final_answer"');
expect(quietBody).toContain("QA_STREAMING_OK");
const partialResponse = await fetch(`${server.baseUrl}/v1/responses`, {
method: "POST",
headers: {
"content-type": "application/json",
},
body: JSON.stringify({
stream: true,
input: [makeUserInput("Partial streaming QA check: reply exactly `QA_PARTIAL_OK`.")],
}),
});
expect(partialResponse.status).toBe(200);
const partialBody = await partialResponse.text();
expect(partialBody).toContain('"type":"response.output_text.delta"');
expect(partialBody).toContain("QA_PARTIAL_OK");
const blockResponse = await fetch(`${server.baseUrl}/v1/responses`, {
method: "POST",
headers: {
@@ -228,6 +243,113 @@ describe("qa mock openai server", () => {
expect(blockBody).toContain("BLOCK_TWO_OK");
});
it("plans deterministic tool-progress reads from prompt paths", async () => {
const server = await startMockServer();
// A deterministic tool-progress prompt should make the mock plan a `read`
// tool call against the backticked path named in the prompt text.
const prompt =
"Tool progress QA check: read `qa-progress-target.txt` before answering. After the read completes, reply exactly `TOOL_PROGRESS_OK`.";
const response = await fetch(`${server.baseUrl}/v1/responses`, {
method: "POST",
headers: { "content-type": "application/json" },
body: JSON.stringify({ stream: true, input: [makeUserInput(prompt)] }),
});
expect(response.status).toBe(200);
const body = await response.text();
// The streamed events must name the `read` tool and carry the prompt's path.
expect(body).toContain('"name":"read"');
expect(body).toContain("qa-progress-target.txt");
});
it("requires deterministic tool-progress error prompts to observe a failed tool", async () => {
const server = await startMockServer();
const prompt =
"Tool progress error QA check: read `missing-tool-progress-target.txt` before answering. After the read fails, reply exactly `TOOL_PROGRESS_ERROR_OK`.";
// Turn 1: the mock should plan a `read` of the path named in the prompt.
const planResponse = await fetch(`${server.baseUrl}/v1/responses`, {
method: "POST",
headers: { "content-type": "application/json" },
body: JSON.stringify({ stream: true, input: [makeUserInput(prompt)] }),
});
expect(planResponse.status).toBe(200);
const planBody = await planResponse.text();
expect(planBody).toContain('"name":"read"');
expect(planBody).toContain("missing-tool-progress-target.txt");
// Turn 2 helper: replay the prompt with a given tool output attached.
const followUp = (output: string) =>
expectResponsesJson<{
output: Array<{ content?: Array<{ text?: string }> }>;
}>(server, {
stream: false,
input: [
makeUserInput(prompt),
{
type: "function_call_output",
call_id: "call_mock_read_1",
output,
},
],
});
// A tool that unexpectedly succeeded must be reported as a bug marker…
const successOutput = await followUp(JSON.stringify({ text: "unexpected success" }));
expect(successOutput.output[0]?.content?.[0]?.text).toBe("BUG-TOOL-DID-NOT-FAIL");
// …while a genuine tool error unlocks the exact-reply directive.
const errorOutput = await followUp(
JSON.stringify({ error: "ENOENT: no such file or directory" }),
);
expect(errorOutput.output[0]?.content?.[0]?.text).toBe("TOOL_PROGRESS_ERROR_OK");
});
it("uses the latest user prompt path for tool-progress plans", async () => {
const server = await startMockServer();
// Three user turns: an older tool-progress prompt, a newer error-variant
// prompt, and a generic continuation. The mock must plan its `read` against
// the newest prompt that matches a tool-progress pattern.
const prompts = [
"Tool progress QA check: read `older-progress-target.txt` before answering. After the read completes, reply exactly `OLD_PROGRESS_OK`.",
"Tool progress error QA check: read `latest-missing-progress-target.txt` before answering. After the read fails, reply exactly `LATEST_PROGRESS_OK`.",
"Continue with the QA scenario plan and report worked, failed, and blocked items.",
];
const response = await fetch(`${server.baseUrl}/v1/responses`, {
method: "POST",
headers: { "content-type": "application/json" },
body: JSON.stringify({
stream: true,
input: prompts.map((text) => makeUserInput(text)),
}),
});
expect(response.status).toBe(200);
const body = await response.text();
expect(body).toContain('"name":"read"');
// Only the most recent tool-progress path may appear in the plan.
expect(body).toContain("latest-missing-progress-target.txt");
expect(body).not.toContain("older-progress-target.txt");
});
it("prefers path-like refs over generic quoted keys in prompts", async () => {
const server = await startQaMockOpenAiServer({
host: "127.0.0.1",
@@ -1674,7 +1796,7 @@ describe("qa mock openai server", () => {
content: [
{
type: "input_text",
text: "@qa-sut:matrix-qa.test reply with only this exact marker: MATRIX_QA_CANARY_TEST",
text: "@qa-sut.example.test reply with only this exact marker: QA_CANARY_TEST",
},
],
},
@@ -1695,7 +1817,7 @@ describe("qa mock openai server", () => {
expect(await response.json()).toMatchObject({
output: [
{
content: [{ text: "MATRIX_QA_CANARY_TEST" }],
content: [{ text: "QA_CANARY_TEST" }],
},
],
});
@@ -1710,8 +1832,8 @@ describe("qa mock openai server", () => {
await server.stop();
});
const matrixPrompt =
"@qa-sut:matrix-qa.test Image generation check: generate a QA lighthouse image and summarize it in one short sentence.";
const channelPrompt =
"@qa-sut.example.test Image generation check: generate a QA lighthouse image and summarize it in one short sentence.";
const genericPrompt =
"Continue with the QA scenario plan and report worked, failed, and blocked items.";
@@ -1722,7 +1844,7 @@ describe("qa mock openai server", () => {
},
body: JSON.stringify({
stream: false,
input: [makeUserInput(matrixPrompt), makeUserInput(genericPrompt)],
input: [makeUserInput(channelPrompt), makeUserInput(genericPrompt)],
}),
});
@@ -1745,7 +1867,7 @@ describe("qa mock openai server", () => {
body: JSON.stringify({
stream: false,
input: [
makeUserInput(matrixPrompt),
makeUserInput(channelPrompt),
makeUserInput(genericPrompt),
{
type: "function_call",

View File

@@ -145,8 +145,10 @@ const QA_THINKING_VISIBILITY_OFF_PROMPT_RE = /qa thinking visibility check off/i
// Prompt-routing patterns: each regex recognizes one deterministic QA
// scenario phrase inside the user transcript.
const QA_THINKING_VISIBILITY_MAX_PROMPT_RE = /qa thinking visibility check max/i;
const QA_EMPTY_RESPONSE_RECOVERY_PROMPT_RE = /empty response continuation qa check/i;
const QA_EMPTY_RESPONSE_EXHAUSTION_PROMPT_RE = /empty response exhaustion qa check/i;
const QA_QUIET_STREAMING_PROMPT_RE = /quiet streaming qa check/i;
// Matches both the "partial" and "quiet" variants of the streaming check.
const QA_STREAMING_PROMPT_RE = /(?:partial|quiet) streaming qa check/i;
const QA_BLOCK_STREAMING_PROMPT_RE = /block streaming qa check/i;
// NOTE: the error variant's phrase also matches QA_TOOL_PROGRESS_PROMPT_RE,
// so callers must test the error pattern first to disambiguate the scenarios.
const QA_TOOL_PROGRESS_ERROR_PROMPT_RE = /tool progress error qa check/i;
const QA_TOOL_PROGRESS_PROMPT_RE = /tool progress qa check/i;
const QA_SUBAGENT_DIRECT_FALLBACK_PROMPT_RE = /subagent direct fallback qa check/i;
const QA_SUBAGENT_DIRECT_FALLBACK_WORKER_RE = /subagent direct fallback worker/i;
// Marker searched for in the input transcript by the subagent direct-fallback scenario.
const QA_SUBAGENT_DIRECT_FALLBACK_MARKER = "QA-SUBAGENT-DIRECT-FALLBACK-OK";
@@ -530,6 +532,16 @@ function extractLastCapture(text: string, pattern: RegExp) {
return lastMatch?.[1]?.trim() || null;
}
/**
 * Returns the most recent user text that matches `pattern`, scanning the
 * transcript from newest to oldest; returns "" when nothing matches.
 */
function extractLastMatchingUserText(texts: string[], pattern: RegExp) {
const newestFirst = [...texts].reverse();
const hit = newestFirst.find((candidate) => pattern.test(candidate ?? ""));
return hit ?? "";
}
function extractExactReplyDirective(text: string) {
const backtickedMatch = extractLastCapture(text, /reply(?: with)? exactly\s+`([^`]+)`/i);
if (backtickedMatch) {
@@ -642,6 +654,19 @@ function extractToolErrorForNamedCall(params: {
return undefined;
}
/**
 * Decides whether a tool result represents a failure.
 *
 * Checks, in order: a non-blank string `error` field in the parsed JSON, a
 * string `status` field spelling out a failure, and finally a scan of the raw
 * output text for well-known failure phrases.
 */
function hasToolErrorOutput(toolJson: Record<string, unknown> | null, toolOutput: string) {
// Structured signal #1: a non-blank `error` string.
const errorField = toolJson?.error;
if (typeof errorField === "string" && errorField.trim()) {
return true;
}
// Structured signal #2: a status string that names a failure state.
const statusField = toolJson?.status;
if (typeof statusField === "string" && /\b(?:error|failed|failure)\b/i.test(statusField)) {
return true;
}
// Fallback: look for common failure wording in the raw tool output.
return /\b(?:error|failed|failure|not found|no such file|enoent)\b/i.test(toolOutput);
}
function isHeartbeatPrompt(text: string) {
const trimmed = text.trim();
if (!trimmed || /remember this fact/i.test(trimmed)) {
@@ -1165,6 +1190,12 @@ async function buildResponsesPayload(
const hasEmptyResponseRetryInstruction = allInputText.includes(QA_EMPTY_RESPONSE_RETRY_NEEDLE);
const canCallSessionsSpawn = hasDeclaredTool(body, "sessions_spawn");
const canCallSessionsYield = hasDeclaredTool(body, "sessions_yield");
const buildToolProgressReadEvents = (pattern: RegExp) => {
const toolProgressPrompt = extractLastMatchingUserText(extractAllUserTexts(input), pattern);
return buildToolCallEventsWithArgs("read", {
path: readTargetFromPrompt(toolProgressPrompt || prompt || allInputText),
});
};
if (
allInputText.includes(QA_SUBAGENT_DIRECT_FALLBACK_MARKER) &&
/Internal task completion event/i.test(allInputText)
@@ -1250,7 +1281,7 @@ async function buildResponsesPayload(
}
return buildAssistantEvents("");
}
if (QA_QUIET_STREAMING_PROMPT_RE.test(allInputText) && exactReplyDirective) {
if (QA_STREAMING_PROMPT_RE.test(allInputText) && exactReplyDirective) {
return buildAssistantEvents([
{
id: "msg_mock_quiet_stream",
@@ -1260,6 +1291,20 @@ async function buildResponsesPayload(
},
]);
}
if (QA_TOOL_PROGRESS_ERROR_PROMPT_RE.test(allInputText) && exactReplyDirective) {
if (!toolOutput) {
return buildToolProgressReadEvents(QA_TOOL_PROGRESS_ERROR_PROMPT_RE);
}
return buildAssistantEvents(
hasToolErrorOutput(toolJson, toolOutput) ? exactReplyDirective : "BUG-TOOL-DID-NOT-FAIL",
);
}
if (QA_TOOL_PROGRESS_PROMPT_RE.test(allInputText) && exactReplyDirective) {
if (!toolOutput) {
return buildToolProgressReadEvents(QA_TOOL_PROGRESS_PROMPT_RE);
}
return buildAssistantEvents(exactReplyDirective);
}
if (
QA_BLOCK_STREAMING_PROMPT_RE.test(allInputText) &&
firstExactMarkerDirective &&