fix(qa): align mock tool progress markers

This commit is contained in:
Vincent Koc
2026-05-03 16:37:52 -07:00
parent b1db87fb36
commit 07b52b4a01
3 changed files with 50 additions and 6 deletions

View File

@@ -267,6 +267,43 @@ describe("qa mock openai server", () => {
expect(body).toContain("qa-progress-target.txt");
});
// Two-turn check for tool-progress prompts that state the reply as an
// "exact marker ...: `X`" directive: the first (streaming) turn must plan a
// `read` call, and the follow-up turn carrying the tool output must answer
// with the marker alone.
it("plans deterministic tool-progress reads for exact-marker prompts", async () => {
  const mock = await startMockServer();
  const markerPrompt =
    "Tool progress QA check: use the read tool exactly once on `QA_KICKOFF_TASK.md` before answering. After that read completes, reply with only this exact marker and no other text: `TOOL_PROGRESS_MARKER_OK`.";

  // Turn 1: no tool output yet, so the streamed body should contain a read plan.
  const planResponse = await fetch(`${mock.baseUrl}/v1/responses`, {
    method: "POST",
    headers: { "content-type": "application/json" },
    body: JSON.stringify({ stream: true, input: [makeUserInput(markerPrompt)] }),
  });
  expect(planResponse.status).toBe(200);
  const planText = await planResponse.text();
  for (const expectedFragment of ['"name":"read"', "QA_KICKOFF_TASK.md"]) {
    expect(planText).toContain(expectedFragment);
  }

  // Turn 2: replay the prompt together with the read result and expect the bare marker.
  type FinalPayload = { output: Array<{ content?: Array<{ text?: string }> }> };
  const finalPayload = await expectResponsesJson<FinalPayload>(mock, {
    stream: false,
    input: [
      makeUserInput(markerPrompt),
      {
        type: "function_call_output",
        call_id: "call_mock_read_1",
        output: JSON.stringify({ text: "kickoff task" }),
      },
    ],
  });
  const replyText = finalPayload.output[0]?.content?.[0]?.text;
  expect(replyText).toBe("TOOL_PROGRESS_MARKER_OK");
});
it("requires deterministic tool-progress error prompts to observe a failed tool", async () => {
const server = await startMockServer();
const prompt =

View File

@@ -562,11 +562,14 @@ function extractFinishExactlyDirective(text: string) {
}
/**
 * Extracts the marker named by an "exact marker ...: <marker>" directive.
 *
 * Up to 120 characters other than `:` or a newline may appear between the
 * phrase "exact marker" and the colon, so both `exact marker: X` and
 * `exact marker and no other text: X` are recognized. Matching is
 * case-insensitive.
 *
 * A backtick-quoted marker is tried first. If none is found, the fallback
 * captures a bare token made of characters other than whitespace, backticks
 * and the punctuation `.,;:!?`.
 *
 * @param text - Prompt text to scan for the directive.
 * @returns The result of `extractLastCapture` for the first pattern that
 *   yields a truthy value (presumably the capture from the last occurrence
 *   in `text`; confirm against that helper), otherwise the fallback result.
 */
function extractExactMarkerDirective(text: string) {
  // Only the relaxed patterns are kept: they also match the strict
  // "exact marker:" form (zero filler characters), and a second
  // `const backtickedMatch` declaration would be a redeclaration error.
  const backtickedMatch = extractLastCapture(text, /exact marker\b[^:\n]{0,120}:\s*`([^`]+)`/i);
  if (backtickedMatch) {
    return backtickedMatch;
  }
  return extractLastCapture(
    text,
    /exact marker\b[^:\n]{0,120}:\s*([^\s`.,;:!?]+(?:-[^\s`.,;:!?]+)*)/i,
  );
}
function extractLabeledMarkerDirective(text: string, label: string) {
@@ -1294,19 +1297,22 @@ async function buildResponsesPayload(
},
]);
}
if (QA_TOOL_PROGRESS_ERROR_PROMPT_RE.test(allInputText) && exactReplyDirective) {
const toolProgressReplyDirective = exactReplyDirective ?? exactMarkerDirective;
if (QA_TOOL_PROGRESS_ERROR_PROMPT_RE.test(allInputText) && toolProgressReplyDirective) {
if (!toolOutput) {
return buildToolProgressReadEvents(QA_TOOL_PROGRESS_ERROR_PROMPT_RE);
}
return buildAssistantEvents(
hasToolErrorOutput(toolJson, toolOutput) ? exactReplyDirective : "BUG-TOOL-DID-NOT-FAIL",
hasToolErrorOutput(toolJson, toolOutput)
? toolProgressReplyDirective
: "BUG-TOOL-DID-NOT-FAIL",
);
}
if (QA_TOOL_PROGRESS_PROMPT_RE.test(allInputText) && exactReplyDirective) {
if (QA_TOOL_PROGRESS_PROMPT_RE.test(allInputText) && toolProgressReplyDirective) {
if (!toolOutput) {
return buildToolProgressReadEvents(QA_TOOL_PROGRESS_PROMPT_RE);
}
return buildAssistantEvents(exactReplyDirective);
return buildAssistantEvents(toolProgressReplyDirective);
}
if (
QA_BLOCK_STREAMING_PROMPT_RE.test(allInputText) &&