fix(qa): align runtime parity evidence with Codex

This commit is contained in:
Vincent Koc
2026-06-24 19:49:37 +08:00
committed by Vincent Koc
parent 31a65e0647
commit dad7168c2f
3 changed files with 61 additions and 16 deletions

View File

@@ -168,23 +168,42 @@ describe("runtime parity", () => {
const scoped = __testing.filterMockRequestsForParentPrompt(
[
{
prompt: "Fanout worker alpha: inspect the QA workspace and finish with exactly ALPHA-OK.",
allInputText:
"Delegate one bounded QA task to a subagent. Fanout worker alpha: inspect the QA workspace and finish with exactly ALPHA-OK.",
plannedToolName: "read",
},
{
prompt: "Delegate one bounded QA task to a subagent.",
allInputText: "Delegate one bounded QA task to a subagent.",
plannedToolName: "sessions_spawn",
},
{
prompt: "Continue the bounded QA task with the retained child result.",
allInputText:
"Delegate one bounded QA task to a subagent. Continue the bounded QA task with the retained child result.",
plannedToolName: "sessions_spawn",
},
{
allInputText: "Inspect the QA workspace and return one concise protocol note.",
plannedToolName: "read",
},
{
prompt: "Delegate one bounded QA task to a subagent.",
allInputText: "Delegate one bounded QA task to a subagent. Tool result: child accepted.",
toolOutput: "child accepted",
},
],
"Delegate one bounded QA task to a subagent.",
[
"Delegate one bounded QA task to a subagent.",
"Continue the bounded QA task with the retained child result.",
],
);
expect(scoped).toHaveLength(2);
expect(scoped).toHaveLength(3);
expect(scoped.map((request) => request.plannedToolName ?? "result")).toEqual([
"sessions_spawn",
"sessions_spawn",
"result",
]);

View File

@@ -120,6 +120,7 @@ type RuntimeParityTranscriptRecord = {
};
type RuntimeParityMockRequestSnapshot = {
prompt?: string;
allInputText?: string;
plannedToolName?: string;
plannedToolArgs?: unknown;
@@ -759,14 +760,22 @@ function resolveRuntimeParityToolCalls(params: {
/**
 * Narrows captured mock request snapshots to the ones that mention a parent prompt.
 *
 * NOTE(review): this span appears to be rendered from a diff whose +/- markers were
 * stripped, so superseded and replacement lines sit side by side (two declarations of
 * "matching", and an "if" on normalizedParentPrompt that is never closed). The
 * description below follows the parentPrompts-based lines; confirm against the
 * checked-in file before relying on it.
 *
 * @param requests - Mock request snapshots to filter.
 * @param parentPrompt - Single parent prompt; supplies the default one-element value
 *   of parentPrompts.
 * @param parentPrompts - Candidate parent prompts. Each is passed through
 *   normalizeTextForParity and entries that normalize to an empty string are dropped.
 * @returns The requests whose text contains at least one normalized parent prompt, or
 *   the unfiltered requests array when no usable prompt remains or nothing matches.
 */
function filterMockRequestsForParentPrompt(
requests: RuntimeParityMockRequestSnapshot[],
parentPrompt: string,
parentPrompts: readonly string[] = [parentPrompt],
) {
// NOTE(review): the next two lines look like the superseded single-prompt guard.
const normalizedParentPrompt = normalizeTextForParity(parentPrompt);
if (!normalizedParentPrompt) {
const normalizedParentPrompts = parentPrompts
.map(normalizeTextForParity)
.filter((prompt) => prompt.length > 0);
// Without any usable prompt there is nothing to scope by, so return the input untouched.
if (normalizedParentPrompts.length === 0) {
return requests;
}
// NOTE(review): this first "matching" looks like the superseded allInputText-only filter.
const matching = requests.filter((request) =>
normalizeTextForParity(request.allInputText ?? "").includes(normalizedParentPrompt),
);
const matching = requests.filter((request) => {
// When the request has a non-empty normalized prompt, only that prompt is tested;
// allInputText is not consulted for such requests.
const normalizedPrompt = normalizeTextForParity(request.prompt ?? "");
if (normalizedPrompt) {
return normalizedParentPrompts.some((prompt) => normalizedPrompt.includes(prompt));
}
// Requests without a usable prompt are matched against allInputText instead.
const normalizedHistory = normalizeTextForParity(request.allInputText ?? "");
return normalizedParentPrompts.some((prompt) => normalizedHistory.includes(prompt));
});
// An empty match set falls back to the full list instead of returning no requests.
return matching.length > 0 ? matching : requests;
}
@@ -966,6 +975,7 @@ async function loadRuntimeParityTranscripts(params: {
async function loadRuntimeParityMockToolCalls(
mockBaseUrl: string | undefined,
parentPrompt: string,
parentPrompts: readonly string[] = [parentPrompt],
): Promise<RuntimeParityToolCall[] | null> {
const normalizedBaseUrl = mockBaseUrl?.trim().replace(/\/+$/u, "");
if (!normalizedBaseUrl) {
@@ -991,6 +1001,7 @@ async function loadRuntimeParityMockToolCalls(
}
const requests = payload.filter(isMessageRecord).map(
(entry): RuntimeParityMockRequestSnapshot => ({
prompt: readNonEmptyString(entry.prompt),
allInputText: readNonEmptyString(entry.allInputText),
plannedToolName: readNonEmptyString(entry.plannedToolName),
plannedToolArgs: entry.plannedToolArgs ?? null,
@@ -998,7 +1009,7 @@ async function loadRuntimeParityMockToolCalls(
}),
);
return resolveToolCallOrderFromMockRequests(
filterMockRequestsForParentPrompt(requests, parentPrompt),
filterMockRequestsForParentPrompt(requests, parentPrompt, parentPrompts),
);
} catch {
return null;
@@ -1015,12 +1026,16 @@ export async function captureRuntimeParityCell(
});
const transcriptRecords = buildTranscriptRecords(transcriptBytes);
const transcriptToolCalls = resolveToolCallOrder(transcriptRecords);
const parentPrompt =
transcriptRecords
.filter((record) => record.role === "user" && !isToolResultLikeMessage(record.message))
.map((record) => extractAssistantText(record.message))
.find(Boolean) ?? "";
const mockToolCalls = await loadRuntimeParityMockToolCalls(params.mockBaseUrl, parentPrompt);
const parentPrompts = transcriptRecords
.filter((record) => record.role === "user")
.map((record) => extractAssistantText(record.message))
.filter((prompt) => prompt.length > 0);
const parentPrompt = parentPrompts[0] ?? "";
const mockToolCalls = await loadRuntimeParityMockToolCalls(
params.mockBaseUrl,
parentPrompt,
parentPrompts,
);
const gatewayLogs = params.gateway.logs?.();
const sentinelFindings = [
...scanGatewayLogSentinels(gatewayLogs),

View File

@@ -26,7 +26,9 @@ scenario:
config:
sessionKey: agent:qa:long-context-cache-stability
fixtureFile: large-cache-fixture.txt
cacheEvidenceNeedle: CACHE-FIXTURE-0550
cacheEvidenceNeedle: CACHE-FIXTURE-0050
cacheEvidenceLine: "CACHE-FIXTURE-0050: stable tool-result evidence for prompt-cache reuse across long sessions."
followupPromptNeedle: Using the already-read
warmupMarker: QA-LARGE-CACHE-WARMUP-OK
hitMarker: QA-LARGE-CACHE-HIT-OK
@@ -84,8 +86,17 @@ flow:
- set: debugRequests
value:
expr: "env.mock ? [...(await fetchJson(`${env.mock.baseUrl}/debug/requests`))] : []"
- set: cappedReadOutputIndex
value:
expr: "debugRequests.reduce((found, planned, index) => { if (found >= 0 || !planned.plannedToolCallId || planned.plannedToolName !== 'read' || planned.plannedToolArgs?.path !== config.fixtureFile) return found; const outputOffset = debugRequests.slice(index + 1).findIndex((candidate) => Boolean(candidate.toolOutputCallId) && candidate.toolOutputCallId === planned.plannedToolCallId); if (outputOffset < 0) return found; const output = debugRequests[index + 1 + outputOffset]; const evidence = [planned.allInputText, output.allInputText, output.toolOutput].filter((value) => typeof value === 'string').join('\\n'); const hasCodexFormattedTruncation = evidence.includes('Warning: truncated output') && (evidence.includes('chars truncated') || evidence.includes('tokens truncated')); return evidence.includes(config.cacheEvidenceLine) && (evidence.includes('[Read output capped at 50KB') || evidence.includes('...(OpenClaw truncated dynamic tool result') || evidence.includes('...(truncated)...') || hasCodexFormattedTruncation) ? index + 1 + outputOffset : found; }, -1)"
- set: hasCappedReadEvidence
value:
expr: "cappedReadOutputIndex >= 0"
- set: hasFollowupCacheEvidence
value:
expr: "cappedReadOutputIndex >= 0 && debugRequests.some((request, index) => index > cappedReadOutputIndex && String(request.prompt ?? '').includes(config.followupPromptNeedle) && String(request.allInputText ?? '').includes(config.cacheEvidenceLine))"
- assert:
expr: "!env.mock || debugRequests.some((request, index) => request.plannedToolName === 'read' && request.plannedToolArgs?.path === config.fixtureFile && typeof request.plannedToolCallId === 'string' && debugRequests.slice(index + 1).some((result, resultOffset) => result.toolOutputCallId === request.plannedToolCallId && String(result.toolOutput ?? '').includes(config.cacheEvidenceNeedle) && (String(result.toolOutput ?? '').includes('[Read output capped at 50KB') || (String(result.toolOutput ?? '').includes('...(truncated)...') && String(result.toolOutput ?? '').length <= 13000)) && debugRequests.slice(index + resultOffset + 2).some((followup) => followup.plannedToolName === 'read' && followup.plannedToolArgs?.path === config.fixtureFile && String(followup.allInputText ?? '').includes(config.cacheEvidenceNeedle) && (String(followup.allInputText ?? '').includes('[Read output capped at 50KB') || String(followup.allInputText ?? '').includes('...(truncated)...')))))"
expr: "!env.mock || (hasCappedReadEvidence && hasFollowupCacheEvidence)"
message:
expr: "`large capped read tool result was not observed: ${JSON.stringify(debugRequests.slice(-8).map((request) => ({ plannedToolName: request.plannedToolName ?? null, plannedToolArgs: request.plannedToolArgs ?? null, plannedToolCallId: request.plannedToolCallId ?? null, toolOutputCallId: request.toolOutputCallId ?? null, toolOutputLength: String(request.toolOutput ?? '').length, toolOutputHasNeedle: String(request.toolOutput ?? '').includes(config.cacheEvidenceNeedle), toolOutputHasReadCap: String(request.toolOutput ?? '').includes('[Read output capped at 50KB'), toolOutputHasCodexTruncation: String(request.toolOutput ?? '').includes('...(truncated)...'), inputHasNeedle: String(request.allInputText ?? '').includes(config.cacheEvidenceNeedle), inputHasReadCap: String(request.allInputText ?? '').includes('[Read output capped at 50KB'), inputHasCodexTruncation: String(request.allInputText ?? '').includes('...(truncated)...') })))}`"
expr: "`large capped read cache evidence was not observed: ${JSON.stringify({ hasCappedReadEvidence, hasFollowupCacheEvidence, requests: debugRequests.slice(-8).map((request) => ({ prompt: request.prompt ?? null, plannedToolName: request.plannedToolName ?? null, plannedToolArgs: request.plannedToolArgs ?? null, plannedToolCallId: request.plannedToolCallId ?? null, toolOutputCallId: request.toolOutputCallId ?? null, toolOutputLength: String(request.toolOutput ?? '').length, outputHasReadCap: String(request.toolOutput ?? '').includes('[Read output capped at 50KB'), outputHasCodexTruncation: String(request.toolOutput ?? '').includes('...(truncated)...'), inputHasEvidenceLine: String(request.allInputText ?? '').includes(config.cacheEvidenceLine) })) })}`"
detailsExpr: "outbound?.text ?? config.hitMarker"