qa-lab: generalize tool progress prompts

This commit is contained in:
Gustavo Madeira Santana
2026-04-27 16:19:20 -04:00
parent 24068f19c6
commit 3132f4990c
2 changed files with 175 additions and 8 deletions

View File

@@ -206,6 +206,21 @@ describe("qa mock openai server", () => {
expect(quietBody).toContain('"phase":"final_answer"');
expect(quietBody).toContain("QA_STREAMING_OK");
const partialResponse = await fetch(`${server.baseUrl}/v1/responses`, {
method: "POST",
headers: {
"content-type": "application/json",
},
body: JSON.stringify({
stream: true,
input: [makeUserInput("Partial streaming QA check: reply exactly `QA_PARTIAL_OK`.")],
}),
});
expect(partialResponse.status).toBe(200);
const partialBody = await partialResponse.text();
expect(partialBody).toContain('"type":"response.output_text.delta"');
expect(partialBody).toContain("QA_PARTIAL_OK");
const blockResponse = await fetch(`${server.baseUrl}/v1/responses`, {
method: "POST",
headers: {
@@ -228,6 +243,113 @@ describe("qa mock openai server", () => {
expect(blockBody).toContain("BLOCK_TWO_OK");
});
it("plans deterministic tool-progress reads from prompt paths", async () => {
const server = await startMockServer();
// A deterministic tool-progress prompt should make the mock plan a `read`
// tool call against the backticked path named in the prompt text.
const prompt =
"Tool progress QA check: read `qa-progress-target.txt` before answering. After the read completes, reply exactly `TOOL_PROGRESS_OK`.";
const response = await fetch(`${server.baseUrl}/v1/responses`, {
method: "POST",
headers: { "content-type": "application/json" },
body: JSON.stringify({ stream: true, input: [makeUserInput(prompt)] }),
});
expect(response.status).toBe(200);
const body = await response.text();
// The streamed events must name the `read` tool and carry the prompt's path.
expect(body).toContain('"name":"read"');
expect(body).toContain("qa-progress-target.txt");
});
it("requires deterministic tool-progress error prompts to observe a failed tool", async () => {
const server = await startMockServer();
const prompt =
"Tool progress error QA check: read `missing-tool-progress-target.txt` before answering. After the read fails, reply exactly `TOOL_PROGRESS_ERROR_OK`.";
// Turn 1: the mock should plan a `read` of the path named in the prompt.
const planResponse = await fetch(`${server.baseUrl}/v1/responses`, {
method: "POST",
headers: { "content-type": "application/json" },
body: JSON.stringify({ stream: true, input: [makeUserInput(prompt)] }),
});
expect(planResponse.status).toBe(200);
const planBody = await planResponse.text();
expect(planBody).toContain('"name":"read"');
expect(planBody).toContain("missing-tool-progress-target.txt");
// Turn 2 helper: replay the prompt with a given tool output attached.
const followUp = (output: string) =>
expectResponsesJson<{
output: Array<{ content?: Array<{ text?: string }> }>;
}>(server, {
stream: false,
input: [
makeUserInput(prompt),
{
type: "function_call_output",
call_id: "call_mock_read_1",
output,
},
],
});
// A tool that unexpectedly succeeded must be reported as a bug marker…
const successOutput = await followUp(JSON.stringify({ text: "unexpected success" }));
expect(successOutput.output[0]?.content?.[0]?.text).toBe("BUG-TOOL-DID-NOT-FAIL");
// …while a genuine tool error unlocks the exact-reply directive.
const errorOutput = await followUp(
JSON.stringify({ error: "ENOENT: no such file or directory" }),
);
expect(errorOutput.output[0]?.content?.[0]?.text).toBe("TOOL_PROGRESS_ERROR_OK");
});
it("uses the latest user prompt path for tool-progress plans", async () => {
const server = await startMockServer();
// Three user turns: an older tool-progress prompt, a newer error-variant
// prompt, and a generic continuation. The mock must plan its `read` against
// the newest prompt that matches a tool-progress pattern.
const prompts = [
"Tool progress QA check: read `older-progress-target.txt` before answering. After the read completes, reply exactly `OLD_PROGRESS_OK`.",
"Tool progress error QA check: read `latest-missing-progress-target.txt` before answering. After the read fails, reply exactly `LATEST_PROGRESS_OK`.",
"Continue with the QA scenario plan and report worked, failed, and blocked items.",
];
const response = await fetch(`${server.baseUrl}/v1/responses`, {
method: "POST",
headers: { "content-type": "application/json" },
body: JSON.stringify({
stream: true,
input: prompts.map((text) => makeUserInput(text)),
}),
});
expect(response.status).toBe(200);
const body = await response.text();
expect(body).toContain('"name":"read"');
// Only the most recent tool-progress path may appear in the plan.
expect(body).toContain("latest-missing-progress-target.txt");
expect(body).not.toContain("older-progress-target.txt");
});
it("prefers path-like refs over generic quoted keys in prompts", async () => {
const server = await startQaMockOpenAiServer({
host: "127.0.0.1",
@@ -1674,7 +1796,7 @@ describe("qa mock openai server", () => {
content: [
{
type: "input_text",
text: "@qa-sut:matrix-qa.test reply with only this exact marker: MATRIX_QA_CANARY_TEST",
text: "@qa-sut.example.test reply with only this exact marker: QA_CANARY_TEST",
},
],
},
@@ -1695,7 +1817,7 @@ describe("qa mock openai server", () => {
expect(await response.json()).toMatchObject({
output: [
{
content: [{ text: "MATRIX_QA_CANARY_TEST" }],
content: [{ text: "QA_CANARY_TEST" }],
},
],
});
@@ -1710,8 +1832,8 @@ describe("qa mock openai server", () => {
await server.stop();
});
const matrixPrompt =
"@qa-sut:matrix-qa.test Image generation check: generate a QA lighthouse image and summarize it in one short sentence.";
const channelPrompt =
"@qa-sut.example.test Image generation check: generate a QA lighthouse image and summarize it in one short sentence.";
const genericPrompt =
"Continue with the QA scenario plan and report worked, failed, and blocked items.";
@@ -1722,7 +1844,7 @@ describe("qa mock openai server", () => {
},
body: JSON.stringify({
stream: false,
input: [makeUserInput(matrixPrompt), makeUserInput(genericPrompt)],
input: [makeUserInput(channelPrompt), makeUserInput(genericPrompt)],
}),
});
@@ -1745,7 +1867,7 @@ describe("qa mock openai server", () => {
body: JSON.stringify({
stream: false,
input: [
makeUserInput(matrixPrompt),
makeUserInput(channelPrompt),
makeUserInput(genericPrompt),
{
type: "function_call",

View File

@@ -145,8 +145,10 @@ const QA_THINKING_VISIBILITY_OFF_PROMPT_RE = /qa thinking visibility check off/i
// Prompt-routing patterns: each regex recognizes one deterministic QA
// scenario phrase inside the user transcript.
const QA_THINKING_VISIBILITY_MAX_PROMPT_RE = /qa thinking visibility check max/i;
const QA_EMPTY_RESPONSE_RECOVERY_PROMPT_RE = /empty response continuation qa check/i;
const QA_EMPTY_RESPONSE_EXHAUSTION_PROMPT_RE = /empty response exhaustion qa check/i;
const QA_QUIET_STREAMING_PROMPT_RE = /quiet streaming qa check/i;
// Matches both the "partial" and "quiet" variants of the streaming check.
const QA_STREAMING_PROMPT_RE = /(?:partial|quiet) streaming qa check/i;
const QA_BLOCK_STREAMING_PROMPT_RE = /block streaming qa check/i;
// NOTE: the error variant's phrase also matches QA_TOOL_PROGRESS_PROMPT_RE,
// so callers must test the error pattern first to disambiguate the scenarios.
const QA_TOOL_PROGRESS_ERROR_PROMPT_RE = /tool progress error qa check/i;
const QA_TOOL_PROGRESS_PROMPT_RE = /tool progress qa check/i;
const QA_SUBAGENT_DIRECT_FALLBACK_PROMPT_RE = /subagent direct fallback qa check/i;
const QA_SUBAGENT_DIRECT_FALLBACK_WORKER_RE = /subagent direct fallback worker/i;
// Marker searched for in the input transcript by the subagent direct-fallback scenario.
const QA_SUBAGENT_DIRECT_FALLBACK_MARKER = "QA-SUBAGENT-DIRECT-FALLBACK-OK";
@@ -530,6 +532,16 @@ function extractLastCapture(text: string, pattern: RegExp) {
return lastMatch?.[1]?.trim() || null;
}
/**
 * Returns the most recent user text that matches `pattern`, scanning the
 * transcript from newest to oldest; returns "" when nothing matches.
 */
function extractLastMatchingUserText(texts: string[], pattern: RegExp) {
const newestFirst = [...texts].reverse();
const hit = newestFirst.find((candidate) => pattern.test(candidate ?? ""));
return hit ?? "";
}
function extractExactReplyDirective(text: string) {
const backtickedMatch = extractLastCapture(text, /reply(?: with)? exactly\s+`([^`]+)`/i);
if (backtickedMatch) {
@@ -642,6 +654,19 @@ function extractToolErrorForNamedCall(params: {
return undefined;
}
/**
 * Decides whether a tool result represents a failure.
 *
 * Checks, in order: a non-blank string `error` field in the parsed JSON, a
 * string `status` field spelling out a failure, and finally a scan of the raw
 * output text for well-known failure phrases.
 */
function hasToolErrorOutput(toolJson: Record<string, unknown> | null, toolOutput: string) {
// Structured signal #1: a non-blank `error` string.
const errorField = toolJson?.error;
if (typeof errorField === "string" && errorField.trim()) {
return true;
}
// Structured signal #2: a status string that names a failure state.
const statusField = toolJson?.status;
if (typeof statusField === "string" && /\b(?:error|failed|failure)\b/i.test(statusField)) {
return true;
}
// Fallback: look for common failure wording in the raw tool output.
return /\b(?:error|failed|failure|not found|no such file|enoent)\b/i.test(toolOutput);
}
function isHeartbeatPrompt(text: string) {
const trimmed = text.trim();
if (!trimmed || /remember this fact/i.test(trimmed)) {
@@ -1165,6 +1190,12 @@ async function buildResponsesPayload(
const hasEmptyResponseRetryInstruction = allInputText.includes(QA_EMPTY_RESPONSE_RETRY_NEEDLE);
const canCallSessionsSpawn = hasDeclaredTool(body, "sessions_spawn");
const canCallSessionsYield = hasDeclaredTool(body, "sessions_yield");
const buildToolProgressReadEvents = (pattern: RegExp) => {
const toolProgressPrompt = extractLastMatchingUserText(extractAllUserTexts(input), pattern);
return buildToolCallEventsWithArgs("read", {
path: readTargetFromPrompt(toolProgressPrompt || prompt || allInputText),
});
};
if (
allInputText.includes(QA_SUBAGENT_DIRECT_FALLBACK_MARKER) &&
/Internal task completion event/i.test(allInputText)
@@ -1250,7 +1281,7 @@ async function buildResponsesPayload(
}
return buildAssistantEvents("");
}
if (QA_QUIET_STREAMING_PROMPT_RE.test(allInputText) && exactReplyDirective) {
if (QA_STREAMING_PROMPT_RE.test(allInputText) && exactReplyDirective) {
return buildAssistantEvents([
{
id: "msg_mock_quiet_stream",
@@ -1260,6 +1291,20 @@ async function buildResponsesPayload(
},
]);
}
if (QA_TOOL_PROGRESS_ERROR_PROMPT_RE.test(allInputText) && exactReplyDirective) {
if (!toolOutput) {
return buildToolProgressReadEvents(QA_TOOL_PROGRESS_ERROR_PROMPT_RE);
}
return buildAssistantEvents(
hasToolErrorOutput(toolJson, toolOutput) ? exactReplyDirective : "BUG-TOOL-DID-NOT-FAIL",
);
}
if (QA_TOOL_PROGRESS_PROMPT_RE.test(allInputText) && exactReplyDirective) {
if (!toolOutput) {
return buildToolProgressReadEvents(QA_TOOL_PROGRESS_PROMPT_RE);
}
return buildAssistantEvents(exactReplyDirective);
}
if (
QA_BLOCK_STREAMING_PROMPT_RE.test(allInputText) &&
firstExactMarkerDirective &&