diff --git a/extensions/qa-lab/src/runtime-parity.test.ts b/extensions/qa-lab/src/runtime-parity.test.ts index c3770360605..a55eb98b0b0 100644 --- a/extensions/qa-lab/src/runtime-parity.test.ts +++ b/extensions/qa-lab/src/runtime-parity.test.ts @@ -168,23 +168,42 @@ describe("runtime parity", () => { const scoped = __testing.filterMockRequestsForParentPrompt( [ { + prompt: "Fanout worker alpha: inspect the QA workspace and finish with exactly ALPHA-OK.", + allInputText: + "Delegate one bounded QA task to a subagent. Fanout worker alpha: inspect the QA workspace and finish with exactly ALPHA-OK.", + plannedToolName: "read", + }, + { + prompt: "Delegate one bounded QA task to a subagent.", allInputText: "Delegate one bounded QA task to a subagent.", plannedToolName: "sessions_spawn", }, + { + prompt: "Continue the bounded QA task with the retained child result.", + allInputText: + "Delegate one bounded QA task to a subagent. Continue the bounded QA task with the retained child result.", + plannedToolName: "sessions_spawn", + }, { allInputText: "Inspect the QA workspace and return one concise protocol note.", plannedToolName: "read", }, { + prompt: "Delegate one bounded QA task to a subagent.", allInputText: "Delegate one bounded QA task to a subagent. Tool result: child accepted.", toolOutput: "child accepted", }, ], "Delegate one bounded QA task to a subagent.", + [ + "Delegate one bounded QA task to a subagent.", + "Continue the bounded QA task with the retained child result.", + ], ); - expect(scoped).toHaveLength(2); + expect(scoped).toHaveLength(3); expect(scoped.map((request) => request.plannedToolName ?? "result")).toEqual([ + "sessions_spawn", "sessions_spawn", "result", ]); diff --git a/extensions/qa-lab/src/runtime-parity.ts b/extensions/qa-lab/src/runtime-parity.ts index cbfeb22ea6f..19101da4c03 100644 --- a/extensions/qa-lab/src/runtime-parity.ts +++ b/extensions/qa-lab/src/runtime-parity.ts @@ -120,6 +120,7 @@ type RuntimeParityTranscriptRecord = { }; type RuntimeParityMockRequestSnapshot = { + prompt?: string; allInputText?: string; plannedToolName?: string; plannedToolArgs?: unknown; @@ -759,14 +760,22 @@ function resolveRuntimeParityToolCalls(params: { function filterMockRequestsForParentPrompt( requests: RuntimeParityMockRequestSnapshot[], parentPrompt: string, + parentPrompts: readonly string[] = [parentPrompt], ) { - const normalizedParentPrompt = normalizeTextForParity(parentPrompt); - if (!normalizedParentPrompt) { + const normalizedParentPrompts = parentPrompts + .map(normalizeTextForParity) + .filter((prompt) => prompt.length > 0); + if (normalizedParentPrompts.length === 0) { return requests; } - const matching = requests.filter((request) => - normalizeTextForParity(request.allInputText ?? "").includes(normalizedParentPrompt), - ); + const matching = requests.filter((request) => { + const normalizedPrompt = normalizeTextForParity(request.prompt ?? ""); + if (normalizedPrompt) { + return normalizedParentPrompts.some((prompt) => normalizedPrompt.includes(prompt)); + } + const normalizedHistory = normalizeTextForParity(request.allInputText ?? ""); + return normalizedParentPrompts.some((prompt) => normalizedHistory.includes(prompt)); + }); return matching.length > 0 ? matching : requests; } @@ -966,6 +975,7 @@ async function loadRuntimeParityTranscripts(params: { async function loadRuntimeParityMockToolCalls( mockBaseUrl: string | undefined, parentPrompt: string, + parentPrompts: readonly string[] = [parentPrompt], ): Promise { const normalizedBaseUrl = mockBaseUrl?.trim().replace(/\/+$/u, ""); if (!normalizedBaseUrl) { @@ -991,6 +1001,7 @@ async function loadRuntimeParityMockToolCalls( } const requests = payload.filter(isMessageRecord).map( (entry): RuntimeParityMockRequestSnapshot => ({ + prompt: readNonEmptyString(entry.prompt), allInputText: readNonEmptyString(entry.allInputText), plannedToolName: readNonEmptyString(entry.plannedToolName), plannedToolArgs: entry.plannedToolArgs ?? null, @@ -998,7 +1009,7 @@ async function loadRuntimeParityMockToolCalls( }), ); return resolveToolCallOrderFromMockRequests( - filterMockRequestsForParentPrompt(requests, parentPrompt), + filterMockRequestsForParentPrompt(requests, parentPrompt, parentPrompts), ); } catch { return null; @@ -1015,12 +1026,16 @@ export async function captureRuntimeParityCell( }); const transcriptRecords = buildTranscriptRecords(transcriptBytes); const transcriptToolCalls = resolveToolCallOrder(transcriptRecords); - const parentPrompt = - transcriptRecords - .filter((record) => record.role === "user" && !isToolResultLikeMessage(record.message)) - .map((record) => extractAssistantText(record.message)) - .find(Boolean) ?? ""; - const mockToolCalls = await loadRuntimeParityMockToolCalls(params.mockBaseUrl, parentPrompt); + const parentPrompts = transcriptRecords + .filter((record) => record.role === "user") + .map((record) => extractAssistantText(record.message)) + .filter((prompt) => prompt.length > 0); + const parentPrompt = parentPrompts[0] ?? ""; + const mockToolCalls = await loadRuntimeParityMockToolCalls( + params.mockBaseUrl, + parentPrompt, + parentPrompts, + ); const gatewayLogs = params.gateway.logs?.(); const sentinelFindings = [ ...scanGatewayLogSentinels(gatewayLogs), diff --git a/qa/scenarios/runtime/long-context-cache-stability.yaml b/qa/scenarios/runtime/long-context-cache-stability.yaml index 67c8314a54a..b0040da7125 100644 --- a/qa/scenarios/runtime/long-context-cache-stability.yaml +++ b/qa/scenarios/runtime/long-context-cache-stability.yaml @@ -26,7 +26,9 @@ scenario: config: sessionKey: agent:qa:long-context-cache-stability fixtureFile: large-cache-fixture.txt - cacheEvidenceNeedle: CACHE-FIXTURE-0550 + cacheEvidenceNeedle: CACHE-FIXTURE-0050 + cacheEvidenceLine: "CACHE-FIXTURE-0050: stable tool-result evidence for prompt-cache reuse across long sessions." + followupPromptNeedle: Using the already-read warmupMarker: QA-LARGE-CACHE-WARMUP-OK hitMarker: QA-LARGE-CACHE-HIT-OK @@ -84,8 +86,17 @@ flow: - set: debugRequests value: expr: "env.mock ? [...(await fetchJson(`${env.mock.baseUrl}/debug/requests`))] : []" + - set: cappedReadOutputIndex + value: + expr: "debugRequests.reduce((found, planned, index) => { if (found >= 0 || !planned.plannedToolCallId || planned.plannedToolName !== 'read' || planned.plannedToolArgs?.path !== config.fixtureFile) return found; const outputOffset = debugRequests.slice(index + 1).findIndex((candidate) => Boolean(candidate.toolOutputCallId) && candidate.toolOutputCallId === planned.plannedToolCallId); if (outputOffset < 0) return found; const output = debugRequests[index + 1 + outputOffset]; const evidence = [planned.allInputText, output.allInputText, output.toolOutput].filter((value) => typeof value === 'string').join('\\n'); const hasCodexFormattedTruncation = evidence.includes('Warning: truncated output') && (evidence.includes('chars truncated') || evidence.includes('tokens truncated')); return evidence.includes(config.cacheEvidenceLine) && (evidence.includes('[Read output capped at 50KB') || evidence.includes('...(OpenClaw truncated dynamic tool result') || evidence.includes('...(truncated)...') || hasCodexFormattedTruncation) ? index + 1 + outputOffset : found; }, -1)" + - set: hasCappedReadEvidence + value: + expr: "cappedReadOutputIndex >= 0" + - set: hasFollowupCacheEvidence + value: + expr: "cappedReadOutputIndex >= 0 && debugRequests.some((request, index) => index > cappedReadOutputIndex && String(request.prompt ?? '').includes(config.followupPromptNeedle) && String(request.allInputText ?? '').includes(config.cacheEvidenceLine))" - assert: - expr: "!env.mock || debugRequests.some((request, index) => request.plannedToolName === 'read' && request.plannedToolArgs?.path === config.fixtureFile && typeof request.plannedToolCallId === 'string' && debugRequests.slice(index + 1).some((result, resultOffset) => result.toolOutputCallId === request.plannedToolCallId && String(result.toolOutput ?? '').includes(config.cacheEvidenceNeedle) && (String(result.toolOutput ?? '').includes('[Read output capped at 50KB') || (String(result.toolOutput ?? '').includes('...(truncated)...') && String(result.toolOutput ?? '').length <= 13000)) && debugRequests.slice(index + resultOffset + 2).some((followup) => followup.plannedToolName === 'read' && followup.plannedToolArgs?.path === config.fixtureFile && String(followup.allInputText ?? '').includes(config.cacheEvidenceNeedle) && (String(followup.allInputText ?? '').includes('[Read output capped at 50KB') || String(followup.allInputText ?? '').includes('...(truncated)...')))))" + expr: "!env.mock || (hasCappedReadEvidence && hasFollowupCacheEvidence)" message: - expr: "`large capped read tool result was not observed: ${JSON.stringify(debugRequests.slice(-8).map((request) => ({ plannedToolName: request.plannedToolName ?? null, plannedToolArgs: request.plannedToolArgs ?? null, plannedToolCallId: request.plannedToolCallId ?? null, toolOutputCallId: request.toolOutputCallId ?? null, toolOutputLength: String(request.toolOutput ?? '').length, toolOutputHasNeedle: String(request.toolOutput ?? '').includes(config.cacheEvidenceNeedle), toolOutputHasReadCap: String(request.toolOutput ?? '').includes('[Read output capped at 50KB'), toolOutputHasCodexTruncation: String(request.toolOutput ?? '').includes('...(truncated)...'), inputHasNeedle: String(request.allInputText ?? '').includes(config.cacheEvidenceNeedle), inputHasReadCap: String(request.allInputText ?? '').includes('[Read output capped at 50KB'), inputHasCodexTruncation: String(request.allInputText ?? '').includes('...(truncated)...') })))}`" + expr: "`large capped read cache evidence was not observed: ${JSON.stringify({ hasCappedReadEvidence, hasFollowupCacheEvidence, requests: debugRequests.slice(-8).map((request) => ({ prompt: request.prompt ?? null, plannedToolName: request.plannedToolName ?? null, plannedToolArgs: request.plannedToolArgs ?? null, plannedToolCallId: request.plannedToolCallId ?? null, toolOutputCallId: request.toolOutputCallId ?? null, toolOutputLength: String(request.toolOutput ?? '').length, outputHasReadCap: String(request.toolOutput ?? '').includes('[Read output capped at 50KB'), outputHasCodexTruncation: String(request.toolOutput ?? '').includes('...(truncated)...'), inputHasEvidenceLine: String(request.allInputText ?? '').includes(config.cacheEvidenceLine) })) })}`" detailsExpr: "outbound?.text ?? config.hitMarker"