QA: genericize mock streaming fixtures

This commit is contained in:
Gustavo Madeira Santana
2026-04-14 23:44:24 -04:00
parent 5042b8b8e3
commit fb92ca1a4d
2 changed files with 188 additions and 20 deletions

View File

@@ -99,6 +99,47 @@ describe("qa mock openai server", () => {
expect(body).toContain('"name":"read"');
});
it("emits deterministic text deltas for generic streaming QA prompts", async () => {
const server = await startMockServer();
const quietResponse = await fetch(`${server.baseUrl}/v1/responses`, {
method: "POST",
headers: {
"content-type": "application/json",
},
body: JSON.stringify({
stream: true,
input: [makeUserInput("Quiet streaming QA check: reply exactly `MATRIX_QA_STREAMING_OK`.")],
}),
});
expect(quietResponse.status).toBe(200);
const quietBody = await quietResponse.text();
expect(quietBody).toContain('"type":"response.output_text.delta"');
expect(quietBody).toContain('"phase":"final_answer"');
expect(quietBody).toContain("MATRIX_QA_STREAMING_OK");
const blockResponse = await fetch(`${server.baseUrl}/v1/responses`, {
method: "POST",
headers: {
"content-type": "application/json",
},
body: JSON.stringify({
stream: true,
input: [
makeUserInput(
"Block streaming QA check: emit exactly two assistant message blocks in order. First exact marker: `BLOCK_ONE_OK`. Second exact marker: `BLOCK_TWO_OK`.",
),
],
}),
});
expect(blockResponse.status).toBe(200);
const blockBody = await blockResponse.text();
expect(blockBody).toContain('"item_id":"msg_mock_block_1"');
expect(blockBody).toContain('"item_id":"msg_mock_block_2"');
expect(blockBody).toContain("BLOCK_ONE_OK");
expect(blockBody).toContain("BLOCK_TWO_OK");
});
it("prefers path-like refs over generic quoted keys in prompts", async () => {
const server = await startQaMockOpenAiServer({
host: "127.0.0.1",

View File

@@ -6,6 +6,20 @@ type ResponsesInputItem = Record<string, unknown>;
type StreamEvent =
| { type: "response.output_item.added"; item: Record<string, unknown> }
| {
type: "response.output_text.delta";
item_id: string;
output_index: number;
content_index: number;
delta: string;
}
| {
type: "response.output_text.done";
item_id: string;
output_index: number;
content_index: number;
text: string;
}
| { type: "response.function_call_arguments.delta"; delta: string }
| { type: "response.output_item.done"; item: Record<string, unknown> }
| {
@@ -128,6 +142,8 @@ const QA_REASONING_ONLY_RECOVERY_PROMPT_RE = /reasoning-only continuation qa che
const QA_REASONING_ONLY_SIDE_EFFECT_PROMPT_RE = /reasoning-only after write safety check/i;
const QA_EMPTY_RESPONSE_RECOVERY_PROMPT_RE = /empty response continuation qa check/i;
const QA_EMPTY_RESPONSE_EXHAUSTION_PROMPT_RE = /empty response exhaustion qa check/i;
// Prompt triggers for the generic streaming QA fixtures. The optional leading
// "matrix " keeps prompts that prefix the check name matching as well.
const QA_QUIET_STREAMING_PROMPT_RE = /(?:matrix\s+)?quiet streaming qa check/i;
const QA_BLOCK_STREAMING_PROMPT_RE = /(?:matrix\s+)?block streaming qa check/i;
const QA_REASONING_ONLY_RETRY_NEEDLE =
"recorded reasoning but did not produce a user-visible answer";
const QA_EMPTY_RESPONSE_RETRY_NEEDLE =
@@ -507,6 +523,21 @@ function extractExactMarkerDirective(text: string) {
return extractLastCapture(text, /exact marker:\s*([^\s`.,;:!?]+(?:-[^\s`.,;:!?]+)*)/i);
}
// Pulls the value of a "<label>: value" directive out of prompt text.
// A backticked value (`like-this`) wins; otherwise fall back to a bare token
// (no whitespace/backticks/terminal punctuation, hyphen runs allowed).
function extractLabeledMarkerDirective(text: string, label: string) {
  // Escape regex metacharacters so the label is embedded literally.
  const literalLabel = label.replaceAll(/[.*+?^${}()|[\]\\]/g, "\\$&");
  const backtickedPattern = new RegExp(`${literalLabel}:\\s*\`([^\\\`]+)\``, "i");
  const bareTokenPattern = new RegExp(
    `${literalLabel}:\\s*([^\\s\\\`.,;:!?]+(?:-[^\\s\\\`.,;:!?]+)*)`,
    "i",
  );
  const fromBackticks = extractLastCapture(text, backtickedPattern);
  return fromBackticks ? fromBackticks : extractLastCapture(text, bareTokenPattern);
}
function isHeartbeatPrompt(text: string) {
const trimmed = text.trim();
if (!trimmed || /remember this fact/i.test(trimmed)) {
@@ -691,39 +722,95 @@ function extractPlannedToolName(events: StreamEvent[]) {
return undefined;
}
function buildAssistantEvents(text: string): StreamEvent[] {
const outputItem = {
// Describes one mock assistant message emitted by buildAssistantEvents.
type MockAssistantMessageSpec = {
  // Output item id; surfaced as `item_id` on the streaming delta/done events.
  id: string;
  // Optional channel tag copied onto the message item when present.
  phase?: "commentary" | "final_answer";
  // When non-empty, each entry becomes a response.output_text.delta event,
  // followed by a response.output_text.done carrying the full `text`.
  streamDeltas?: string[];
  // Full text of the completed message content.
  text: string;
};
// Splits text into roughly `parts` equal chunks for use as streaming deltas.
// Strings of length <= 1 come back as a single chunk; anything longer is
// guaranteed to produce at least two chunks so consumers see real streaming.
function splitMockStreamingText(text: string, parts = 3) {
  if (text.length <= 1) {
    return [text];
  }
  const size = Math.max(1, Math.ceil(text.length / parts));
  const pieces: string[] = [];
  let cursor = 0;
  while (cursor < text.length) {
    pieces.push(text.slice(cursor, cursor + size));
    cursor += size;
  }
  if (pieces.length > 1) {
    return pieces;
  }
  // Forced two-way split when the chunk size swallowed the whole string.
  return [text.charAt(0), text.slice(1)];
}
function buildAssistantOutputItem(spec: MockAssistantMessageSpec) {
return {
type: "message",
id: "msg_mock_1",
id: spec.id,
role: "assistant",
status: "completed",
content: [{ type: "output_text", text, annotations: [] }],
...(spec.phase ? { phase: spec.phase } : {}),
content: [{ type: "output_text", text: spec.text, annotations: [] }],
} as const;
return [
{
}
function buildAssistantEvents(specsOrText: MockAssistantMessageSpec[] | string): StreamEvent[] {
const specs =
typeof specsOrText === "string"
? [
{
id: "msg_mock_1",
text: specsOrText,
},
]
: specsOrText;
const output = specs.map((spec) => buildAssistantOutputItem(spec));
const events: StreamEvent[] = [];
for (const [outputIndex, spec] of specs.entries()) {
events.push({
type: "response.output_item.added",
item: {
type: "message",
id: "msg_mock_1",
id: spec.id,
role: "assistant",
...(spec.phase ? { phase: spec.phase } : {}),
content: [],
status: "in_progress",
},
},
{
});
for (const delta of spec.streamDeltas ?? []) {
events.push({
type: "response.output_text.delta",
item_id: spec.id,
output_index: outputIndex,
content_index: 0,
delta,
});
}
if ((spec.streamDeltas ?? []).length > 0) {
events.push({
type: "response.output_text.done",
item_id: spec.id,
output_index: outputIndex,
content_index: 0,
text: spec.text,
});
}
events.push({
type: "response.output_item.done",
item: outputItem,
item: output[outputIndex],
});
}
events.push({
type: "response.completed",
response: {
id: "resp_mock_msg_1",
status: "completed",
output,
usage: { input_tokens: 64, output_tokens: 24, total_tokens: 88 },
},
{
type: "response.completed",
response: {
id: "resp_mock_msg_1",
status: "completed",
output: [outputItem],
usage: { input_tokens: 64, output_tokens: 24, total_tokens: 88 },
},
},
];
});
return events;
}
function buildReasoningOnlyEvents(summaryText: string, id: string): StreamEvent[] {
@@ -766,6 +853,16 @@ async function buildResponsesPayload(
const toolOutput = extractToolOutput(input);
const toolJson = parseToolOutputJson(toolOutput);
const allInputText = extractAllRequestTexts(input, body);
// Directives parsed from the request that drive the canned streaming replies.
// Prefer the latest user prompt; fall back to scanning all request text.
const exactReplyDirective =
  extractExactReplyDirective(prompt) ?? extractExactReplyDirective(allInputText);
// Markers for the two-block streaming fixture ("first/second exact marker: ...").
const firstExactMarkerDirective = extractLabeledMarkerDirective(
  allInputText,
  "first exact marker",
);
const secondExactMarkerDirective = extractLabeledMarkerDirective(
  allInputText,
  "second exact marker",
);
const isGroupChat = allInputText.includes('"is_group_chat": true');
const isBaselineUnmentionedChannelChatter = /\bno bot ping here\b/i.test(prompt);
const hasReasoningOnlyRetryInstruction = allInputText.includes(QA_REASONING_ONLY_RETRY_NEEDLE);
@@ -818,6 +915,36 @@ async function buildResponsesPayload(
}
return buildAssistantEvents("");
}
// Quiet streaming fixture: stream one final-answer message whose text is the
// exact reply requested by the prompt, split into multiple deltas.
if (QA_QUIET_STREAMING_PROMPT_RE.test(allInputText) && exactReplyDirective) {
  return buildAssistantEvents([
    {
      id: "msg_mock_quiet_stream",
      phase: "final_answer",
      streamDeltas: splitMockStreamingText(exactReplyDirective),
      text: exactReplyDirective,
    },
  ]);
}
// Block streaming fixture: emit two separate assistant message items, each
// streamed via deltas, carrying the first/second markers from the prompt.
if (
  QA_BLOCK_STREAMING_PROMPT_RE.test(allInputText) &&
  firstExactMarkerDirective &&
  secondExactMarkerDirective
) {
  return buildAssistantEvents([
    {
      id: "msg_mock_block_1",
      phase: "final_answer",
      streamDeltas: splitMockStreamingText(firstExactMarkerDirective),
      text: firstExactMarkerDirective,
    },
    {
      id: "msg_mock_block_2",
      phase: "final_answer",
      streamDeltas: splitMockStreamingText(secondExactMarkerDirective),
      text: secondExactMarkerDirective,
    },
  ]);
}
if (/lobster invaders/i.test(prompt)) {
if (!toolOutput) {
return buildToolCallEventsWithArgs("read", { path: "QA_KICKOFF_TASK.md" });