fix(qa): align runtime parity evidence with Codex

2026-06-26 14:39:33 +00:00 · 2026-06-24 19:49:37 +08:00
parent 31a65e0647
commit dad7168c2f
3 changed files with 61 additions and 16 deletions
--- a/extensions/qa-lab/src/runtime-parity.test.ts
+++ b/extensions/qa-lab/src/runtime-parity.test.ts
@@ -168,23 +168,42 @@ describe("runtime parity", () => {
    const scoped = __testing.filterMockRequestsForParentPrompt(
      [
        {
+          prompt: "Fanout worker alpha: inspect the QA workspace and finish with exactly ALPHA-OK.",
+          allInputText:
+            "Delegate one bounded QA task to a subagent. Fanout worker alpha: inspect the QA workspace and finish with exactly ALPHA-OK.",
+          plannedToolName: "read",
+        },
+        {
+          prompt: "Delegate one bounded QA task to a subagent.",
          allInputText: "Delegate one bounded QA task to a subagent.",
          plannedToolName: "sessions_spawn",
        },
+        {
+          prompt: "Continue the bounded QA task with the retained child result.",
+          allInputText:
+            "Delegate one bounded QA task to a subagent. Continue the bounded QA task with the retained child result.",
+          plannedToolName: "sessions_spawn",
+        },
        {
          allInputText: "Inspect the QA workspace and return one concise protocol note.",
          plannedToolName: "read",
        },
        {
+          prompt: "Delegate one bounded QA task to a subagent.",
          allInputText: "Delegate one bounded QA task to a subagent. Tool result: child accepted.",
          toolOutput: "child accepted",
        },
      ],
      "Delegate one bounded QA task to a subagent.",
+      [
+        "Delegate one bounded QA task to a subagent.",
+        "Continue the bounded QA task with the retained child result.",
+      ],
    );

-    expect(scoped).toHaveLength(2);
+    expect(scoped).toHaveLength(3);
    expect(scoped.map((request) => request.plannedToolName ?? "result")).toEqual([
+      "sessions_spawn",
      "sessions_spawn",
      "result",
    ]);
--- a/extensions/qa-lab/src/runtime-parity.ts
+++ b/extensions/qa-lab/src/runtime-parity.ts
@@ -120,6 +120,7 @@ type RuntimeParityTranscriptRecord = {
 };

 type RuntimeParityMockRequestSnapshot = {
+  prompt?: string;
  allInputText?: string;
  plannedToolName?: string;
  plannedToolArgs?: unknown;
@@ -759,14 +760,22 @@ function resolveRuntimeParityToolCalls(params: {
 function filterMockRequestsForParentPrompt(
  requests: RuntimeParityMockRequestSnapshot[],
  parentPrompt: string,
+  parentPrompts: readonly string[] = [parentPrompt],
 ) {
-  const normalizedParentPrompt = normalizeTextForParity(parentPrompt);
-  if (!normalizedParentPrompt) {
+  const normalizedParentPrompts = parentPrompts
+    .map(normalizeTextForParity)
+    .filter((prompt) => prompt.length > 0);
+  if (normalizedParentPrompts.length === 0) {
    return requests;
  }
-  const matching = requests.filter((request) =>
-    normalizeTextForParity(request.allInputText ?? "").includes(normalizedParentPrompt),
-  );
+  const matching = requests.filter((request) => {
+    const normalizedPrompt = normalizeTextForParity(request.prompt ?? "");
+    if (normalizedPrompt) {
+      return normalizedParentPrompts.some((prompt) => normalizedPrompt.includes(prompt));
+    }
+    const normalizedHistory = normalizeTextForParity(request.allInputText ?? "");
+    return normalizedParentPrompts.some((prompt) => normalizedHistory.includes(prompt));
+  });
  return matching.length > 0 ? matching : requests;
 }

@@ -966,6 +975,7 @@ async function loadRuntimeParityTranscripts(params: {
 async function loadRuntimeParityMockToolCalls(
  mockBaseUrl: string | undefined,
  parentPrompt: string,
+  parentPrompts: readonly string[] = [parentPrompt],
 ): Promise<RuntimeParityToolCall[] | null> {
  const normalizedBaseUrl = mockBaseUrl?.trim().replace(/\/+$/u, "");
  if (!normalizedBaseUrl) {
@@ -991,6 +1001,7 @@ async function loadRuntimeParityMockToolCalls(
    }
    const requests = payload.filter(isMessageRecord).map(
      (entry): RuntimeParityMockRequestSnapshot => ({
+        prompt: readNonEmptyString(entry.prompt),
        allInputText: readNonEmptyString(entry.allInputText),
        plannedToolName: readNonEmptyString(entry.plannedToolName),
        plannedToolArgs: entry.plannedToolArgs ?? null,
@@ -998,7 +1009,7 @@ async function loadRuntimeParityMockToolCalls(
      }),
    );
    return resolveToolCallOrderFromMockRequests(
-      filterMockRequestsForParentPrompt(requests, parentPrompt),
+      filterMockRequestsForParentPrompt(requests, parentPrompt, parentPrompts),
    );
  } catch {
    return null;
@@ -1015,12 +1026,16 @@ export async function captureRuntimeParityCell(
  });
  const transcriptRecords = buildTranscriptRecords(transcriptBytes);
  const transcriptToolCalls = resolveToolCallOrder(transcriptRecords);
-  const parentPrompt =
-    transcriptRecords
-      .filter((record) => record.role === "user" && !isToolResultLikeMessage(record.message))
-      .map((record) => extractAssistantText(record.message))
-      .find(Boolean) ?? "";
-  const mockToolCalls = await loadRuntimeParityMockToolCalls(params.mockBaseUrl, parentPrompt);
+  const parentPrompts = transcriptRecords
+    .filter((record) => record.role === "user")
+    .map((record) => extractAssistantText(record.message))
+    .filter((prompt) => prompt.length > 0);
+  const parentPrompt = parentPrompts[0] ?? "";
+  const mockToolCalls = await loadRuntimeParityMockToolCalls(
+    params.mockBaseUrl,
+    parentPrompt,
+    parentPrompts,
+  );
  const gatewayLogs = params.gateway.logs?.();
  const sentinelFindings = [
    ...scanGatewayLogSentinels(gatewayLogs),
--- a/qa/scenarios/runtime/long-context-cache-stability.yaml
+++ b/qa/scenarios/runtime/long-context-cache-stability.yaml
@@ -26,7 +26,9 @@ scenario:
    config:
      sessionKey: agent:qa:long-context-cache-stability
      fixtureFile: large-cache-fixture.txt
-      cacheEvidenceNeedle: CACHE-FIXTURE-0550
+      cacheEvidenceNeedle: CACHE-FIXTURE-0050
+      cacheEvidenceLine: "CACHE-FIXTURE-0050: stable tool-result evidence for prompt-cache reuse across long sessions."
+      followupPromptNeedle: Using the already-read
      warmupMarker: QA-LARGE-CACHE-WARMUP-OK
      hitMarker: QA-LARGE-CACHE-HIT-OK

@@ -84,8 +86,17 @@ flow:
        - set: debugRequests
          value:
            expr: "env.mock ? [...(await fetchJson(`${env.mock.baseUrl}/debug/requests`))] : []"
+        - set: cappedReadOutputIndex
+          value:
+            expr: "debugRequests.reduce((found, planned, index) => { if (found >= 0 || !planned.plannedToolCallId || planned.plannedToolName !== 'read' || planned.plannedToolArgs?.path !== config.fixtureFile) return found; const outputOffset = debugRequests.slice(index + 1).findIndex((candidate) => Boolean(candidate.toolOutputCallId) && candidate.toolOutputCallId === planned.plannedToolCallId); if (outputOffset < 0) return found; const output = debugRequests[index + 1 + outputOffset]; const evidence = [planned.allInputText, output.allInputText, output.toolOutput].filter((value) => typeof value === 'string').join('\\n'); const hasCodexFormattedTruncation = evidence.includes('Warning: truncated output') && (evidence.includes('chars truncated') || evidence.includes('tokens truncated')); return evidence.includes(config.cacheEvidenceLine) && (evidence.includes('[Read output capped at 50KB') || evidence.includes('...(OpenClaw truncated dynamic tool result') || evidence.includes('...(truncated)...') || hasCodexFormattedTruncation) ? index + 1 + outputOffset : found; }, -1)"
+        - set: hasCappedReadEvidence
+          value:
+            expr: "cappedReadOutputIndex >= 0"
+        - set: hasFollowupCacheEvidence
+          value:
+            expr: "cappedReadOutputIndex >= 0 && debugRequests.some((request, index) => index > cappedReadOutputIndex && String(request.prompt ?? '').includes(config.followupPromptNeedle) && String(request.allInputText ?? '').includes(config.cacheEvidenceLine))"
        - assert:
-            expr: "!env.mock || debugRequests.some((request, index) => request.plannedToolName === 'read' && request.plannedToolArgs?.path === config.fixtureFile && typeof request.plannedToolCallId === 'string' && debugRequests.slice(index + 1).some((result, resultOffset) => result.toolOutputCallId === request.plannedToolCallId && String(result.toolOutput ?? '').includes(config.cacheEvidenceNeedle) && (String(result.toolOutput ?? '').includes('[Read output capped at 50KB') || (String(result.toolOutput ?? '').includes('...(truncated)...') && String(result.toolOutput ?? '').length <= 13000)) && debugRequests.slice(index + resultOffset + 2).some((followup) => followup.plannedToolName === 'read' && followup.plannedToolArgs?.path === config.fixtureFile && String(followup.allInputText ?? '').includes(config.cacheEvidenceNeedle) && (String(followup.allInputText ?? '').includes('[Read output capped at 50KB') || String(followup.allInputText ?? '').includes('...(truncated)...')))))"
+            expr: "!env.mock || (hasCappedReadEvidence && hasFollowupCacheEvidence)"
            message:
-              expr: "`large capped read tool result was not observed: ${JSON.stringify(debugRequests.slice(-8).map((request) => ({ plannedToolName: request.plannedToolName ?? null, plannedToolArgs: request.plannedToolArgs ?? null, plannedToolCallId: request.plannedToolCallId ?? null, toolOutputCallId: request.toolOutputCallId ?? null, toolOutputLength: String(request.toolOutput ?? '').length, toolOutputHasNeedle: String(request.toolOutput ?? '').includes(config.cacheEvidenceNeedle), toolOutputHasReadCap: String(request.toolOutput ?? '').includes('[Read output capped at 50KB'), toolOutputHasCodexTruncation: String(request.toolOutput ?? '').includes('...(truncated)...'), inputHasNeedle: String(request.allInputText ?? '').includes(config.cacheEvidenceNeedle), inputHasReadCap: String(request.allInputText ?? '').includes('[Read output capped at 50KB'), inputHasCodexTruncation: String(request.allInputText ?? '').includes('...(truncated)...') })))}`"
+              expr: "`large capped read cache evidence was not observed: ${JSON.stringify({ hasCappedReadEvidence, hasFollowupCacheEvidence, requests: debugRequests.slice(-8).map((request) => ({ prompt: request.prompt ?? null, plannedToolName: request.plannedToolName ?? null, plannedToolArgs: request.plannedToolArgs ?? null, plannedToolCallId: request.plannedToolCallId ?? null, toolOutputCallId: request.toolOutputCallId ?? null, toolOutputLength: String(request.toolOutput ?? '').length, outputHasReadCap: String(request.toolOutput ?? '').includes('[Read output capped at 50KB'), outputHasCodexTruncation: String(request.toolOutput ?? '').includes('...(truncated)...'), inputHasEvidenceLine: String(request.allInputText ?? '').includes(config.cacheEvidenceLine) })) })}`"
      detailsExpr: "outbound?.text ?? config.hitMarker"