mirror of
https://github.com/openclaw/openclaw.git
synced 2026-06-26 14:39:33 +00:00
fix(qa): align runtime parity evidence with Codex
This commit is contained in:
@@ -168,23 +168,42 @@ describe("runtime parity", () => {
|
||||
const scoped = __testing.filterMockRequestsForParentPrompt(
|
||||
[
|
||||
{
|
||||
prompt: "Fanout worker alpha: inspect the QA workspace and finish with exactly ALPHA-OK.",
|
||||
allInputText:
|
||||
"Delegate one bounded QA task to a subagent. Fanout worker alpha: inspect the QA workspace and finish with exactly ALPHA-OK.",
|
||||
plannedToolName: "read",
|
||||
},
|
||||
{
|
||||
prompt: "Delegate one bounded QA task to a subagent.",
|
||||
allInputText: "Delegate one bounded QA task to a subagent.",
|
||||
plannedToolName: "sessions_spawn",
|
||||
},
|
||||
{
|
||||
prompt: "Continue the bounded QA task with the retained child result.",
|
||||
allInputText:
|
||||
"Delegate one bounded QA task to a subagent. Continue the bounded QA task with the retained child result.",
|
||||
plannedToolName: "sessions_spawn",
|
||||
},
|
||||
{
|
||||
allInputText: "Inspect the QA workspace and return one concise protocol note.",
|
||||
plannedToolName: "read",
|
||||
},
|
||||
{
|
||||
prompt: "Delegate one bounded QA task to a subagent.",
|
||||
allInputText: "Delegate one bounded QA task to a subagent. Tool result: child accepted.",
|
||||
toolOutput: "child accepted",
|
||||
},
|
||||
],
|
||||
"Delegate one bounded QA task to a subagent.",
|
||||
[
|
||||
"Delegate one bounded QA task to a subagent.",
|
||||
"Continue the bounded QA task with the retained child result.",
|
||||
],
|
||||
);
|
||||
|
||||
expect(scoped).toHaveLength(2);
|
||||
expect(scoped).toHaveLength(3);
|
||||
expect(scoped.map((request) => request.plannedToolName ?? "result")).toEqual([
|
||||
"sessions_spawn",
|
||||
"sessions_spawn",
|
||||
"result",
|
||||
]);
|
||||
|
||||
@@ -120,6 +120,7 @@ type RuntimeParityTranscriptRecord = {
|
||||
};
|
||||
|
||||
type RuntimeParityMockRequestSnapshot = {
|
||||
prompt?: string;
|
||||
allInputText?: string;
|
||||
plannedToolName?: string;
|
||||
plannedToolArgs?: unknown;
|
||||
@@ -759,14 +760,22 @@ function resolveRuntimeParityToolCalls(params: {
|
||||
function filterMockRequestsForParentPrompt(
|
||||
requests: RuntimeParityMockRequestSnapshot[],
|
||||
parentPrompt: string,
|
||||
parentPrompts: readonly string[] = [parentPrompt],
|
||||
) {
|
||||
const normalizedParentPrompt = normalizeTextForParity(parentPrompt);
|
||||
if (!normalizedParentPrompt) {
|
||||
const normalizedParentPrompts = parentPrompts
|
||||
.map(normalizeTextForParity)
|
||||
.filter((prompt) => prompt.length > 0);
|
||||
if (normalizedParentPrompts.length === 0) {
|
||||
return requests;
|
||||
}
|
||||
const matching = requests.filter((request) =>
|
||||
normalizeTextForParity(request.allInputText ?? "").includes(normalizedParentPrompt),
|
||||
);
|
||||
const matching = requests.filter((request) => {
|
||||
const normalizedPrompt = normalizeTextForParity(request.prompt ?? "");
|
||||
if (normalizedPrompt) {
|
||||
return normalizedParentPrompts.some((prompt) => normalizedPrompt.includes(prompt));
|
||||
}
|
||||
const normalizedHistory = normalizeTextForParity(request.allInputText ?? "");
|
||||
return normalizedParentPrompts.some((prompt) => normalizedHistory.includes(prompt));
|
||||
});
|
||||
return matching.length > 0 ? matching : requests;
|
||||
}
|
||||
|
||||
@@ -966,6 +975,7 @@ async function loadRuntimeParityTranscripts(params: {
|
||||
async function loadRuntimeParityMockToolCalls(
|
||||
mockBaseUrl: string | undefined,
|
||||
parentPrompt: string,
|
||||
parentPrompts: readonly string[] = [parentPrompt],
|
||||
): Promise<RuntimeParityToolCall[] | null> {
|
||||
const normalizedBaseUrl = mockBaseUrl?.trim().replace(/\/+$/u, "");
|
||||
if (!normalizedBaseUrl) {
|
||||
@@ -991,6 +1001,7 @@ async function loadRuntimeParityMockToolCalls(
|
||||
}
|
||||
const requests = payload.filter(isMessageRecord).map(
|
||||
(entry): RuntimeParityMockRequestSnapshot => ({
|
||||
prompt: readNonEmptyString(entry.prompt),
|
||||
allInputText: readNonEmptyString(entry.allInputText),
|
||||
plannedToolName: readNonEmptyString(entry.plannedToolName),
|
||||
plannedToolArgs: entry.plannedToolArgs ?? null,
|
||||
@@ -998,7 +1009,7 @@ async function loadRuntimeParityMockToolCalls(
|
||||
}),
|
||||
);
|
||||
return resolveToolCallOrderFromMockRequests(
|
||||
filterMockRequestsForParentPrompt(requests, parentPrompt),
|
||||
filterMockRequestsForParentPrompt(requests, parentPrompt, parentPrompts),
|
||||
);
|
||||
} catch {
|
||||
return null;
|
||||
@@ -1015,12 +1026,16 @@ export async function captureRuntimeParityCell(
|
||||
});
|
||||
const transcriptRecords = buildTranscriptRecords(transcriptBytes);
|
||||
const transcriptToolCalls = resolveToolCallOrder(transcriptRecords);
|
||||
const parentPrompt =
|
||||
transcriptRecords
|
||||
.filter((record) => record.role === "user" && !isToolResultLikeMessage(record.message))
|
||||
.map((record) => extractAssistantText(record.message))
|
||||
.find(Boolean) ?? "";
|
||||
const mockToolCalls = await loadRuntimeParityMockToolCalls(params.mockBaseUrl, parentPrompt);
|
||||
const parentPrompts = transcriptRecords
|
||||
.filter((record) => record.role === "user")
|
||||
.map((record) => extractAssistantText(record.message))
|
||||
.filter((prompt) => prompt.length > 0);
|
||||
const parentPrompt = parentPrompts[0] ?? "";
|
||||
const mockToolCalls = await loadRuntimeParityMockToolCalls(
|
||||
params.mockBaseUrl,
|
||||
parentPrompt,
|
||||
parentPrompts,
|
||||
);
|
||||
const gatewayLogs = params.gateway.logs?.();
|
||||
const sentinelFindings = [
|
||||
...scanGatewayLogSentinels(gatewayLogs),
|
||||
|
||||
@@ -26,7 +26,9 @@ scenario:
|
||||
config:
|
||||
sessionKey: agent:qa:long-context-cache-stability
|
||||
fixtureFile: large-cache-fixture.txt
|
||||
cacheEvidenceNeedle: CACHE-FIXTURE-0550
|
||||
cacheEvidenceNeedle: CACHE-FIXTURE-0050
|
||||
cacheEvidenceLine: "CACHE-FIXTURE-0050: stable tool-result evidence for prompt-cache reuse across long sessions."
|
||||
followupPromptNeedle: Using the already-read
|
||||
warmupMarker: QA-LARGE-CACHE-WARMUP-OK
|
||||
hitMarker: QA-LARGE-CACHE-HIT-OK
|
||||
|
||||
@@ -84,8 +86,17 @@ flow:
|
||||
- set: debugRequests
|
||||
value:
|
||||
expr: "env.mock ? [...(await fetchJson(`${env.mock.baseUrl}/debug/requests`))] : []"
|
||||
- set: cappedReadOutputIndex
|
||||
value:
|
||||
expr: "debugRequests.reduce((found, planned, index) => { if (found >= 0 || !planned.plannedToolCallId || planned.plannedToolName !== 'read' || planned.plannedToolArgs?.path !== config.fixtureFile) return found; const outputOffset = debugRequests.slice(index + 1).findIndex((candidate) => Boolean(candidate.toolOutputCallId) && candidate.toolOutputCallId === planned.plannedToolCallId); if (outputOffset < 0) return found; const output = debugRequests[index + 1 + outputOffset]; const evidence = [planned.allInputText, output.allInputText, output.toolOutput].filter((value) => typeof value === 'string').join('\\n'); const hasCodexFormattedTruncation = evidence.includes('Warning: truncated output') && (evidence.includes('chars truncated') || evidence.includes('tokens truncated')); return evidence.includes(config.cacheEvidenceLine) && (evidence.includes('[Read output capped at 50KB') || evidence.includes('...(OpenClaw truncated dynamic tool result') || evidence.includes('...(truncated)...') || hasCodexFormattedTruncation) ? index + 1 + outputOffset : found; }, -1)"
|
||||
- set: hasCappedReadEvidence
|
||||
value:
|
||||
expr: "cappedReadOutputIndex >= 0"
|
||||
- set: hasFollowupCacheEvidence
|
||||
value:
|
||||
expr: "cappedReadOutputIndex >= 0 && debugRequests.some((request, index) => index > cappedReadOutputIndex && String(request.prompt ?? '').includes(config.followupPromptNeedle) && String(request.allInputText ?? '').includes(config.cacheEvidenceLine))"
|
||||
- assert:
|
||||
expr: "!env.mock || debugRequests.some((request, index) => request.plannedToolName === 'read' && request.plannedToolArgs?.path === config.fixtureFile && typeof request.plannedToolCallId === 'string' && debugRequests.slice(index + 1).some((result, resultOffset) => result.toolOutputCallId === request.plannedToolCallId && String(result.toolOutput ?? '').includes(config.cacheEvidenceNeedle) && (String(result.toolOutput ?? '').includes('[Read output capped at 50KB') || (String(result.toolOutput ?? '').includes('...(truncated)...') && String(result.toolOutput ?? '').length <= 13000)) && debugRequests.slice(index + resultOffset + 2).some((followup) => followup.plannedToolName === 'read' && followup.plannedToolArgs?.path === config.fixtureFile && String(followup.allInputText ?? '').includes(config.cacheEvidenceNeedle) && (String(followup.allInputText ?? '').includes('[Read output capped at 50KB') || String(followup.allInputText ?? '').includes('...(truncated)...')))))"
|
||||
expr: "!env.mock || (hasCappedReadEvidence && hasFollowupCacheEvidence)"
|
||||
message:
|
||||
expr: "`large capped read tool result was not observed: ${JSON.stringify(debugRequests.slice(-8).map((request) => ({ plannedToolName: request.plannedToolName ?? null, plannedToolArgs: request.plannedToolArgs ?? null, plannedToolCallId: request.plannedToolCallId ?? null, toolOutputCallId: request.toolOutputCallId ?? null, toolOutputLength: String(request.toolOutput ?? '').length, toolOutputHasNeedle: String(request.toolOutput ?? '').includes(config.cacheEvidenceNeedle), toolOutputHasReadCap: String(request.toolOutput ?? '').includes('[Read output capped at 50KB'), toolOutputHasCodexTruncation: String(request.toolOutput ?? '').includes('...(truncated)...'), inputHasNeedle: String(request.allInputText ?? '').includes(config.cacheEvidenceNeedle), inputHasReadCap: String(request.allInputText ?? '').includes('[Read output capped at 50KB'), inputHasCodexTruncation: String(request.allInputText ?? '').includes('...(truncated)...') })))}`"
|
||||
expr: "`large capped read cache evidence was not observed: ${JSON.stringify({ hasCappedReadEvidence, hasFollowupCacheEvidence, requests: debugRequests.slice(-8).map((request) => ({ prompt: request.prompt ?? null, plannedToolName: request.plannedToolName ?? null, plannedToolArgs: request.plannedToolArgs ?? null, plannedToolCallId: request.plannedToolCallId ?? null, toolOutputCallId: request.toolOutputCallId ?? null, toolOutputLength: String(request.toolOutput ?? '').length, outputHasReadCap: String(request.toolOutput ?? '').includes('[Read output capped at 50KB'), outputHasCodexTruncation: String(request.toolOutput ?? '').includes('...(truncated)...'), inputHasEvidenceLine: String(request.allInputText ?? '').includes(config.cacheEvidenceLine) })) })}`"
|
||||
detailsExpr: "outbound?.text ?? config.hitMarker"
|
||||
|
||||
Reference in New Issue
Block a user