fix(qa): align mock tool progress markers

This commit is contained in:
Vincent Koc
2026-05-03 16:37:52 -07:00
parent b1db87fb36
commit 07b52b4a01
3 changed files with 50 additions and 6 deletions

View File

@@ -34,6 +34,7 @@ Docs: https://docs.openclaw.ai
### Fixes
- Gateway/systemd: preserve operator-added secrets in the Gateway env file across re-stage while clearing OpenClaw-managed keys (such as `OPENCLAW_GATEWAY_TOKEN`) so a fresh staging value is never shadowed by a stale env-file copy; operator secrets are also retained when the state-dir `.env` is empty. Fixes #76860. Thanks @hclsys.
- QA/Matrix: keep the mock OpenAI tool-progress provider aligned with exact-marker Matrix prompts so the hardened live preview scenario still forces a deterministic read before final delivery. Thanks @vincentkoc.
- OpenAI/Google Meet: wait for realtime voice `session.updated` before treating the bridge as connected, so Meet joins do not return with audio queued behind an unconfigured realtime session. Thanks @vincentkoc.
- Plugins/catalog: merge official external catalog descriptors into partial package channel config metadata, so lagging WeCom/Yuanbao manifests keep their own schema while still exposing host-supplied labels and setup text. Thanks @vincentkoc.
- Plugins/catalog: supplement lagging official external WeCom and Yuanbao npm manifests with channel config descriptors and declared tool contracts from the OpenClaw catalog, so trusted package sweeps no longer fail because external package metadata trails the host contract. Thanks @vincentkoc.

View File

@@ -267,6 +267,43 @@ describe("qa mock openai server", () => {
expect(body).toContain("qa-progress-target.txt");
});
// Verifies the mock OpenAI server's deterministic tool-progress flow for
// hardened "exact marker" prompts: the first streamed request must answer
// with a plan to read the named file (not a final message), and the
// follow-up request carrying the tool result must reply with only the
// marker text and nothing else.
it("plans deterministic tool-progress reads for exact-marker prompts", async () => {
const server = await startMockServer();
// Prompt uses the "reply with only this exact marker and no other text"
// phrasing; the backticked token at the end is the expected final reply
// the server's marker-directive extractor must pick up.
const prompt =
"Tool progress QA check: use the read tool exactly once on `QA_KICKOFF_TASK.md` before answering. After that read completes, reply with only this exact marker and no other text: `TOOL_PROGRESS_MARKER_OK`.";
// First pass: stream the prompt with no tool output attached, so the mock
// is expected to emit a tool-call plan rather than the final answer.
const toolPlan = await fetch(`${server.baseUrl}/v1/responses`, {
method: "POST",
headers: {
"content-type": "application/json",
},
body: JSON.stringify({
stream: true,
input: [makeUserInput(prompt)],
}),
});
expect(toolPlan.status).toBe(200);
const toolPlanBody = await toolPlan.text();
// The streamed plan must invoke the "read" tool against the exact file
// named in the prompt.
expect(toolPlanBody).toContain('"name":"read"');
expect(toolPlanBody).toContain("QA_KICKOFF_TASK.md");
// Second pass: resend the prompt together with a function_call_output so
// the mock observes the completed read and produces the final delivery.
const final = await expectResponsesJson<{
output: Array<{ content?: Array<{ text?: string }> }>;
}>(server, {
stream: false,
input: [
makeUserInput(prompt),
{
type: "function_call_output",
call_id: "call_mock_read_1",
output: JSON.stringify({ text: "kickoff task" }),
},
],
});
// Final delivery must be exactly the marker, with no surrounding text.
expect(final.output[0]?.content?.[0]?.text).toBe("TOOL_PROGRESS_MARKER_OK");
});
it("requires deterministic tool-progress error prompts to observe a failed tool", async () => {
const server = await startMockServer();
const prompt =

View File

@@ -562,11 +562,14 @@ function extractFinishExactlyDirective(text: string) {
}
function extractExactMarkerDirective(text: string) {
const backtickedMatch = extractLastCapture(text, /exact marker:\s*`([^`]+)`/i);
const backtickedMatch = extractLastCapture(text, /exact marker\b[^:\n]{0,120}:\s*`([^`]+)`/i);
if (backtickedMatch) {
return backtickedMatch;
}
return extractLastCapture(text, /exact marker:\s*([^\s`.,;:!?]+(?:-[^\s`.,;:!?]+)*)/i);
return extractLastCapture(
text,
/exact marker\b[^:\n]{0,120}:\s*([^\s`.,;:!?]+(?:-[^\s`.,;:!?]+)*)/i,
);
}
function extractLabeledMarkerDirective(text: string, label: string) {
@@ -1294,19 +1297,22 @@ async function buildResponsesPayload(
},
]);
}
if (QA_TOOL_PROGRESS_ERROR_PROMPT_RE.test(allInputText) && exactReplyDirective) {
const toolProgressReplyDirective = exactReplyDirective ?? exactMarkerDirective;
if (QA_TOOL_PROGRESS_ERROR_PROMPT_RE.test(allInputText) && toolProgressReplyDirective) {
if (!toolOutput) {
return buildToolProgressReadEvents(QA_TOOL_PROGRESS_ERROR_PROMPT_RE);
}
return buildAssistantEvents(
hasToolErrorOutput(toolJson, toolOutput) ? exactReplyDirective : "BUG-TOOL-DID-NOT-FAIL",
hasToolErrorOutput(toolJson, toolOutput)
? toolProgressReplyDirective
: "BUG-TOOL-DID-NOT-FAIL",
);
}
if (QA_TOOL_PROGRESS_PROMPT_RE.test(allInputText) && exactReplyDirective) {
if (QA_TOOL_PROGRESS_PROMPT_RE.test(allInputText) && toolProgressReplyDirective) {
if (!toolOutput) {
return buildToolProgressReadEvents(QA_TOOL_PROGRESS_PROMPT_RE);
}
return buildAssistantEvents(exactReplyDirective);
return buildAssistantEvents(toolProgressReplyDirective);
}
if (
QA_BLOCK_STREAMING_PROMPT_RE.test(allInputText) &&