mirror of
https://github.com/openclaw/openclaw.git
synced 2026-06-05 20:22:53 +00:00
* refactor: extract agent core package Introduce packages/agent-core as the OpenClaw-owned home for reusable agent loop, harness, session, prompt, and runtime dependency contracts. * refactor: extract shared llm runtime Move provider model registries, stream wrappers, OAuth helpers, and LLM utilities into src/llm with plugin-sdk barrels instead of depending on the old embedded runtime layout. * refactor: remove pi runtime internals Rename remaining Pi-shaped agent surfaces to OpenClaw agent runtime names, delete obsolete Pi docs and package graph checks, and add the third-party notice for incorporated code. * refactor: tighten agent session runtime Make agent-core/runtime dependencies explicit, consolidate compaction and session transcript helpers, and move model/session helpers behind OpenClaw-owned contracts. * refactor: remove static model and pi auth paths Drop static model catalogs and Pi auth bridges, move model/provider facts to manifest-owned runtime contracts, and harden internal embedded-agent utilities. 
* refactor: remove legacy provider compat paths * docs: remove agent parity notes * fix: skip provider wildcard metadata parsing * refactor: share session extension sdk loading * refactor: inline acpx proxy error formatter * refactor: fold edit recovery into edit tool * fix: accept extension batch separator * test: align startup provider plugin expectations * fix: restore provider-scoped release discovery * test: align static asset packaging expectations * fix: run static provider catalogs during scoped discovery * fix: add provider entry catalogs for scoped live discovery * fix: load lightweight provider catalog entries * fix: refresh provider-scoped plugin metadata * fix: keep provider catalog entries on release live path * fix: keep static manifest models in release live checks * fix: harden release model discovery * fix: reduce OpenAI live cache probe reasoning * fix: disable OpenAI cache probe reasoning * ci: extend OpenAI gateway live timeout * fix: extend live gateway model budget * fix: stabilize release validation regressions * fix: honor provider aliases in model rows * fix: stabilize release validation lanes * fix: stabilize release memory qa * ci: stabilize release validation lanes * ci: prefer ipv4 for live docker node calls * fix: restore shared tool-call stream wrapper * ci: remove legacy pi test shard alias * fix: clean up embedded agent test drift * fix: stabilize runtime alias status * fix: clean up embedded agent ci drift * fix: restore release ci invariants * fix: clean up post-rebase runtime drift * fix: restore release ci checks * fix: restore release ci after rebase * fix: remove stale pi runtime path * test: align compaction runtime expectations * test: update plugin prerelease expectations * fix: handle claude live tool approvals * fix: stabilize release validation gates * fix: finish agent runtime import * test: finish post-rebase agent runtime mocks * fix: keep codex compaction native * fix: stabilize codex app-server hook tests * test: 
isolate codex diagnostic active run * test: remove codex diagnostic completion race # Conflicts: # extensions/codex/src/app-server/run-attempt.test.ts * ci: fix full release manifest performance run id * refactor: narrow llm plugin sdk boundary * chore: drop generated google boundary stamps * fix: repair rebase fallout * fix: clean up rebased runtime references * fix: decode codex jwt payloads as base64url * fix: preserve shipped pi runtime alias * fix: add scoped sdk virtual modules * fix: decode llm codex oauth jwt as base64url * fix: avoid stale vertex adc negative cache * fix: harden tool arg decoding and codeql path * fix: keep vertex adc negative checks live * refactor: consolidate codex jwt and edit helpers * fix: await codex oauth node runtime imports * fix: preserve sdk tool and notice contracts * fix: preserve shipped compat config boundaries * fix: align codex oauth callback host * fix: terminate agent-core loop streams on failure * fix: keep codex oauth callback alive during fallback * ci: include session tools in critical codeql scans * fix: keep Cloudflare Anthropic provider auth header * docs: redirect legacy pi runtime pages * fix: honor bundled web provider compat discovery * fix: protect session output spill files * fix: keep legacy agent dir env blocked * fix: contain auto-discovered skill symlinks * fix: harden agent core sdk proxy surfaces * fix: restore approval reaction sdk compat * fix: keep live docker runs bounded * fix: keep codex oauth redirect host aligned * fix: resolve post-rebase agent runtime drift * fix: redact anthropic oauth parse failures * fix: preserve responses strict tool shaping * fix: repair agent runtime rebase cleanup * docs: redirect retired parity pages * fix: bound auto-discovered resources to roots * fix: repair post-rebase agent test drift * fix: preserve bundled provider allowlist migration * fix: preserve manifest-owned provider aliases * fix: declare photon image dependency * fix: keep provider headers out of 
proxy body * fix: preserve shipped env aliases * fix: refresh control ui i18n generated state * fix: quote read fallback paths * fix: preview edits through configured backend * test: satisfy core test typecheck * fix: preserve ZAI usage auth fallback * test: repair codex diagnostic test * fix: repair agent runtime rebase drift * test: finish embedded runner import rename * fix: repair agent runtime rebase integrations * test: align compaction oauth fallback expectations * fix: allow sdk-auth session models * fix: update doctor tool schema import * fix: preserve bedrock plugin region * fix: stream harmony-like prose immediately * ci: include session runtime in codeql shards * fix: repair latest rebase integrations * fix: honor explicit codex websocket transport * fix: keep openai-compatible credentials provider-scoped * fix: refresh sdk api baseline after rebase * fix: route cli runtime aliases through openclaw harness * test: rename stale harness mock expectation * test: rename embedded agent overflow calls * test: clean embedded auth test wording * test: use openclaw stream types in deepinfra cache test * fix: refresh sdk api baseline on latest main * fix: honor bundled discovery compat allowlists * fix: refresh sdk api baseline after latest rebase * fix: remove stale rebase imports * test: rename stale model catalog mock * test: mock renamed doctor runtime modules * fix: map canonical kimi env auth * fix: use internal model registry in bench script * fix: migrate deepinfra provider catalog entry * fix: enforce builtin tool suppression * fix: route compaction auth and proxy payloads safely * refactor: prune unused llm registry leftovers * test: update codex hooks session import * test: fix model picker ci coverage * test: align model picker auth mock types
1027 lines
31 KiB
TypeScript
1027 lines
31 KiB
TypeScript
import { createHash } from "node:crypto";
|
|
import fs from "node:fs/promises";
|
|
import path from "node:path";
|
|
import { fetchWithSsrFGuard } from "openclaw/plugin-sdk/ssrf-runtime";
|
|
import {
|
|
asFiniteNumber as readFiniteNumber,
|
|
isRecord as isMessageRecord,
|
|
normalizeOptionalString as readNonEmptyString,
|
|
} from "openclaw/plugin-sdk/string-coerce-runtime";
|
|
import {
|
|
scanDirectReplyTranscriptSentinels,
|
|
scanGatewayLogSentinels,
|
|
type GatewayLogSentinelFinding,
|
|
} from "./gateway-log-sentinel.js";
|
|
|
|
/** Identifier of one of the two agent runtimes whose behavior is compared in this module. */
export type RuntimeId = "openclaw" | "codex";
|
|
|
|
/**
 * One tool invocation reduced to the fields used for parity comparison:
 * the tool name, hashes of its arguments and result, and an optional
 * error classification.
 */
export type RuntimeParityToolCall = {
  /** Tool name; extractors in this file fall back to "unknown" when none is present. */
  tool: string;
  /** Hash of the call arguments. */
  argsHash: string;
  /** Hash of the tool result (the hash of `null` while no result has been matched). */
  resultHash: string;
  /** Present when the result was classified as an error, e.g. "tool-result-error". */
  errorClass?: string;
};
|
|
|
|
/**
 * Token usage totals for a runtime run. The cache counters are optional and
 * only present when the underlying usage payload reported them.
 */
export type RuntimeParityUsage = {
  /** Number of input tokens. */
  inputTokens: number;
  /** Number of output tokens. */
  outputTokens: number;
  /** Reported total, or the sum of the components when no total was reported. */
  totalTokens: number;
  /** Cache-read token count, when reported. */
  cacheRead?: number;
  /** Cache-write token count, when reported. */
  cacheWrite?: number;
};
|
|
|
|
/**
 * Everything captured for a single runtime in a parity scenario. Two cells
 * (one per runtime) are compared to classify drift.
 */
export type RuntimeParityCell = {
  /** Which runtime produced this cell. */
  runtime: RuntimeId;
  /** Raw transcript text; this file parses it as one JSON record per line. */
  transcriptBytes: string;
  /** Tool calls in the order they were issued. */
  toolCalls: RuntimeParityToolCall[];
  /** Final assistant text used for the text comparison. */
  finalText: string;
  /** Aggregated token usage. */
  usage: RuntimeParityUsage;
  /** Elapsed time of the run; presumably milliseconds, going by the name. */
  wallClockMs: number;
  /** When set, the cell counts as failed (transport-level failure class). */
  transportErrorClass?: string;
  /** When set, the cell counts as failed (runtime-level failure class). */
  runtimeErrorClass?: string;
  /** Log lines describing boot state; NOTE(review): presumably filled from gateway logs. */
  bootStateLines: string[];
  /** Sentinel findings attached to this cell, if any. */
  sentinelFindings?: GatewayLogSentinelFinding[];
};
|
|
|
|
/**
 * Classification of how two runtime cells differ.
 *
 * - "none": no difference detected.
 * - "text-only": only the final text differs after whitespace normalization.
 * - "tool-call-shape": tool names, argument hashes, or call counts differ.
 * - "tool-result-shape": tool result hashes or error classes differ.
 * - "structural": transcript line counts or presence of final text differ.
 * - "failure-mode": at least one runtime failed.
 */
export type RuntimeParityDrift =
  | "none"
  | "text-only"
  | "tool-call-shape"
  | "tool-result-shape"
  | "structural"
  | "failure-mode";
|
|
|
|
/** Outcome of comparing both runtimes on one scenario. */
export type RuntimeParityResult = {
  /** Identifier of the scenario that was run. */
  scenarioId: string;
  /** Captured cell for each runtime. */
  cells: {
    openclaw: RuntimeParityCell;
    codex: RuntimeParityCell;
  };
  /** Drift classification for the pair of cells. */
  drift: RuntimeParityDrift;
  /** Human-readable explanation of the drift, when there is one. */
  driftDetails?: string;
};
|
|
|
|
/** Result of executing one scenario against one runtime. */
export type RuntimeParityScenarioExecution = {
  /** Whether the scenario itself passed or failed. */
  scenarioStatus: "pass" | "fail";
  /** Optional details reported by the scenario. */
  scenarioDetails?: string;
  /** The parity cell captured for the run. */
  cell: RuntimeParityCell;
};
|
|
|
|
/**
 * Summarizes a cell as a status.
 *
 * @param cell - The captured cell, or `undefined` when nothing was captured.
 * @returns "missing" when there is no cell, "fail" when the cell carries a
 *   runtime or transport error class, otherwise "pass".
 */
export function runtimeParityCellStatus(
  cell: RuntimeParityCell | undefined,
): "pass" | "fail" | "missing" {
  if (!cell) {
    return "missing";
  }
  // An empty-string error class is treated as "no error", same as an absent one.
  const hasError = Boolean(cell.runtimeErrorClass) || Boolean(cell.transportErrorClass);
  return hasError ? "fail" : "pass";
}
|
|
|
|
/**
 * Reports whether a parity result counts as passing: the drift must not be
 * "failure-mode" and both runtime cells must have status "pass".
 */
export function isRuntimeParityResultPass(result: RuntimeParityResult) {
  if (result.drift === "failure-mode") {
    return false;
  }
  const { openclaw, codex } = result.cells;
  return [openclaw, codex].every((cell) => runtimeParityCellStatus(cell) === "pass");
}
|
|
|
|
/** Minimal view of a QA gateway as needed by this module. */
type QaGatewayLike = {
  /** Optional accessor returning the gateway log text. */
  logs?: () => string;
  /** Temporary root directory of the gateway; NOTE(review): layout beneath it is not visible here. */
  tempRoot: string;
};
|
|
|
|
/** Minimal view of a QA suite scenario result as needed by this module. */
type QaSuiteScenarioLike = {
  /** Optional free-form details reported by the scenario. */
  details?: string;
  /** Scenario outcome. */
  status: "pass" | "fail";
};
|
|
|
|
/** Inputs needed to capture a parity cell for one runtime. */
type RuntimeParityCaptureParams = {
  /** Runtime the capture belongs to. */
  runtime: RuntimeId;
  /** Gateway that ran the scenario. */
  gateway: QaGatewayLike;
  /** Outcome of the scenario. */
  scenarioResult: QaSuiteScenarioLike;
  /** Elapsed time of the run; presumably milliseconds, going by the name. */
  wallClockMs: number;
  /** Agent id override; NOTE(review): presumably defaults to DEFAULT_AGENT_ID — confirm at the use site. */
  agentId?: string;
  /** Base URL of a mock server, when one is in use. */
  mockBaseUrl?: string;
};
|
|
|
|
/**
 * Session metadata entry as read by this module. All fields are optional;
 * the store format itself is not visible in this part of the file.
 */
type RuntimeParitySessionEntry = {
  /** Session identifier. */
  sessionId?: string;
  /** Explicit transcript file; may be absolute or relative to the sessions directory. */
  sessionFile?: string;
  /** Last update marker; NOTE(review): presumably a timestamp — confirm unit. */
  updatedAt?: number;
  /** Set when the session was spawned by another session. */
  spawnedBy?: string;
  /** Key of the parent session, when there is one. */
  parentSessionKey?: string;
  /** Nesting depth of a spawned session. */
  spawnDepth?: number;
  /** Role of a sub-agent session. */
  subagentRole?: string;
};
|
|
|
|
/** A transcript row that carried a message with a recognized role. */
type RuntimeParityTranscriptRecord = {
  /** The message object taken from the transcript row. */
  message: Record<string, unknown>;
  /** The message role, restricted to the roles this module processes. */
  role: "user" | "assistant" | "tool" | "toolResult";
};
|
|
|
|
/** Snapshot of one request seen by the mock server, as consumed by this module. */
type RuntimeParityMockRequestSnapshot = {
  /** Name of the tool the mock planned to call for this request, if any. */
  plannedToolName?: string;
  /** Arguments of the planned tool call. */
  plannedToolArgs?: unknown;
  /** Tool output carried by this request, if any. */
  toolOutput?: string;
};
|
|
|
|
/**
 * Working representation of a tool call while calls and results are being
 * paired; `_resolved` is stripped before the calls are returned.
 */
type RuntimeParityPendingToolCall = RuntimeParityToolCall & {
  /** True once a result has been attached to the call. */
  _resolved: boolean;
};
|
|
|
|
/** Agent id used when none is supplied; NOTE(review): use site is outside the visible part of the file. */
const DEFAULT_AGENT_ID = "qa";
/** Tool name that marks a heartbeat prompt when it appears in user text. */
const HEARTBEAT_RESPONSE_TOOL_NAME = "heartbeat_respond";
/** Exact user text of a heartbeat poll as it appears in transcripts. */
const HEARTBEAT_TRANSCRIPT_PROMPT = "[OpenClaw heartbeat poll]";
/** Leading text of a periodic-task heartbeat prompt. */
const HEARTBEAT_TASK_PROMPT_PREFIX =
  "Run the following periodic tasks (only those due based on their intervals):";
/**
 * Matches log lines that describe boot/runtime state.
 *
 * `restart mode:` is kept outside the group that ends in `\b`: a word
 * boundary can never follow the colon when the next character is a space or
 * the end of the line, so with a trailing `\b` that alternative would not
 * match ordinary "restart mode: <value>" lines.
 */
const BOOT_STATE_LINE_RE =
  /\b(?:(?:FailoverError|No API key found|Codex app-server|auth profile|runtime policy|plugin|doctor)\b|restart mode:)/i;
/** Heuristic for text that indicates a failed tool result. */
const TOOL_RESULT_ERROR_RE = /\b(?:error|failed|failure|timeout|denied|enoent|not found)\b/i;
|
|
|
|
/**
 * Collapses every run of whitespace to a single space and strips leading and
 * trailing whitespace, so texts can be compared independent of layout.
 */
function normalizeTextForParity(text: string) {
  // Trimming first means the split never yields empty edge segments.
  return text.trim().split(/\s+/u).join(" ");
}
|
|
|
|
/** Returns the lowercase hex SHA-256 digest of `value`. */
function sha256(value: string) {
  const hasher = createHash("sha256");
  hasher.update(value);
  return hasher.digest("hex");
}
|
|
|
|
/**
 * Produces a copy of `value` in which every object has its keys in sorted
 * order (recursively, including objects nested in arrays), so serializing
 * the result does not depend on key insertion order.
 *
 * Arrays keep their element order; non-object values are returned as-is.
 */
function normalizeForStableHash(value: unknown): unknown {
  if (Array.isArray(value)) {
    return value.map((entry) => normalizeForStableHash(entry));
  }
  if (value === null || typeof value !== "object") {
    return value;
  }
  const source = value as Record<string, unknown>;
  // Object.keys returns a fresh array, so sorting it in place is safe.
  const sortedKeys = Object.keys(source).sort((left, right) => left.localeCompare(right));
  const entries = sortedKeys.map((key): [string, unknown] => [
    key,
    normalizeForStableHash(source[key]),
  ]);
  return Object.fromEntries(entries);
}
|
|
|
|
/**
 * Hashes `value` independent of object key order: the value is normalized,
 * serialized as JSON, and digested with SHA-256. Values that JSON cannot
 * serialize (such as `undefined`) hash like `null`.
 */
function stableHash(value: unknown) {
  const serialized = JSON.stringify(normalizeForStableHash(value));
  return sha256(serialized ?? "null");
}
|
|
|
|
/**
 * Reads token usage from a loosely shaped usage payload.
 *
 * Several key spellings are accepted for each counter (for example `input`,
 * `inputTokens`, `input_tokens`); the first one holding a finite number wins.
 * Missing input/output counts default to 0. When no total is reported it is
 * computed as input + output + cache read + cache write.
 *
 * @param raw - The usage payload; anything that is not a record is treated as empty.
 */
function readUsageTotals(raw: unknown): RuntimeParityUsage {
  const usage: Record<string, unknown> = isMessageRecord(raw) ? raw : {};
  // Returns the first finite number found under the given keys, in order.
  const pick = (...keys: string[]) =>
    keys.reduce<number | undefined>(
      (found, key) => found ?? readFiniteNumber(usage[key]),
      undefined,
    );

  const inputTokens = pick("input", "inputTokens", "input_tokens") ?? 0;
  const outputTokens = pick("output", "outputTokens", "output_tokens") ?? 0;
  const cacheRead = pick("cacheRead", "cache_read_tokens");
  const cacheWrite = pick("cacheWrite", "cache_write_tokens");
  const componentTotal = inputTokens + outputTokens + (cacheRead ?? 0) + (cacheWrite ?? 0);
  const totalTokens = pick("total", "totalTokens", "total_tokens") ?? componentTotal;

  const totals: RuntimeParityUsage = { inputTokens, outputTokens, totalTokens };
  if (cacheRead !== undefined) {
    totals.cacheRead = cacheRead;
  }
  if (cacheWrite !== undefined) {
    totals.cacheWrite = cacheWrite;
  }
  return totals;
}
|
|
|
|
/**
 * Adds `next` onto `target` in place. Cache counters are only touched when
 * `next` reports them, so they stay absent on `target` until first seen.
 */
function addUsage(target: RuntimeParityUsage, next: RuntimeParityUsage) {
  target.inputTokens += next.inputTokens;
  target.outputTokens += next.outputTokens;
  target.totalTokens += next.totalTokens;
  for (const key of ["cacheRead", "cacheWrite"] as const) {
    const increment = next[key];
    if (increment !== undefined) {
      target[key] = (target[key] ?? 0) + increment;
    }
  }
}
|
|
|
|
/**
 * Extracts the plain text of a message.
 *
 * - String content is returned trimmed.
 * - Array content contributes one part per block: string blocks (trimmed,
 *   skipped when blank), a block's `text`, or — for blocks of type
 *   "output_text", "text" or "message" — the block's string `content`.
 *   Parts are joined with newlines.
 * - Any other content yields an empty string.
 */
function extractAssistantText(message: Record<string, unknown>) {
  const { content } = message;
  if (typeof content === "string") {
    return content.trim();
  }
  if (!Array.isArray(content)) {
    return "";
  }
  const parts = content.flatMap((block: unknown): string[] => {
    if (typeof block === "string") {
      const trimmed = block.trim();
      return trimmed ? [trimmed] : [];
    }
    if (!isMessageRecord(block)) {
      return [];
    }
    const text = readNonEmptyString(block.text);
    if (text) {
      return [text];
    }
    // Only text-bearing block types may contribute their nested `content`.
    const nestedText = readNonEmptyString(block.content);
    const isTextBlock =
      block.type === "output_text" || block.type === "text" || block.type === "message";
    return nestedText && isTextBlock ? [nestedText] : [];
  });
  return parts.join("\n").trim();
}
|
|
|
|
/** Reads a tool call id from an arbitrary value; yields nothing unless it is a usable string. */
function normalizeToolCallId(value: unknown) {
  const id = readNonEmptyString(value);
  return id;
}
|
|
|
|
/**
 * Parses `value` as JSON and returns it only when the result is a record.
 * Blank input, invalid JSON, and non-record JSON all yield `undefined`.
 */
function parseJsonRecord(value: string): Record<string, unknown> | undefined {
  if (value.trim().length === 0) {
    return undefined;
  }
  let parsed: unknown;
  try {
    parsed = JSON.parse(value);
  } catch {
    // Not JSON: callers treat the raw string as the value instead.
    return undefined;
  }
  return isMessageRecord(parsed) ? parsed : undefined;
}
|
|
|
|
/**
 * Collects the tool calls issued by a message, in order.
 *
 * Two sources are read: content blocks whose type is `tool_use`, `toolcall`
 * or `tool_call` (case-insensitive), followed by the message-level
 * `tool_calls` / `toolCalls` / `function_call` / `functionCall` field (a
 * single record is treated as a one-element list).
 *
 * @returns One entry per call with its id (when present), tool name
 *   ("unknown" when missing) and raw arguments (`null` when missing).
 */
function extractToolCalls(message: Record<string, unknown>): Array<{
  id?: string;
  tool: string;
  args: unknown;
}> {
  // The id may be stored under any of these keys, on blocks and calls alike.
  const readId = (record: Record<string, unknown>) =>
    normalizeToolCallId(record.id) ??
    normalizeToolCallId(record.toolCallId) ??
    normalizeToolCallId(record.toolUseId);

  const calls: Array<{ id?: string; tool: string; args: unknown }> = [];

  const contentBlocks: unknown[] = Array.isArray(message.content) ? message.content : [];
  for (const block of contentBlocks) {
    if (!isMessageRecord(block)) {
      continue;
    }
    const blockType = readNonEmptyString(block.type)?.toLowerCase();
    const isToolCallBlock =
      blockType === "tool_use" || blockType === "toolcall" || blockType === "tool_call";
    if (!isToolCallBlock) {
      continue;
    }
    calls.push({
      id: readId(block),
      tool: readNonEmptyString(block.name) ?? "unknown",
      args: block.input ?? block.arguments ?? block.args ?? block.payload ?? null,
    });
  }

  const rawToolCalls =
    message.tool_calls ?? message.toolCalls ?? message.function_call ?? message.functionCall;
  let messageLevelCalls: unknown[] = [];
  if (Array.isArray(rawToolCalls)) {
    messageLevelCalls = rawToolCalls;
  } else if (rawToolCalls) {
    messageLevelCalls = [rawToolCalls];
  }
  for (const call of messageLevelCalls) {
    if (!isMessageRecord(call)) {
      continue;
    }
    // The name and arguments may be nested under `function`.
    const functionRecord = isMessageRecord(call.function) ? call.function : undefined;
    calls.push({
      id: readId(call),
      tool: readNonEmptyString(call.name) ?? readNonEmptyString(functionRecord?.name) ?? "unknown",
      args:
        call.arguments ?? functionRecord?.arguments ?? call.input ?? functionRecord?.input ?? null,
    });
  }
  return calls;
}
|
|
|
|
/**
 * Collects the tool results carried by a message.
 *
 * Two sources are read:
 * 1. A message whose role is "tool" or "toolResult" contributes its whole
 *    `content` as one result. The result carries the message-level tool call
 *    id (`toolCallId`, `tool_call_id`, `toolUseId` or `tool_use_id`) when one
 *    is present, so callers can pair it with its call by id.
 * 2. Content blocks of type `tool_result`, `toolresult` or
 *    `tool_result_error` (case-insensitive) each contribute one result.
 *    `toolresult` is accepted so this function agrees with the block types
 *    the rest of this file treats as tool results.
 *
 * A result is tagged "tool-result-error" when the block is flagged as an
 * error or its text matches the error heuristic.
 *
 * @returns Results in message order; `tool` is the message-level tool name, if any.
 */
function extractToolResults(message: Record<string, unknown>): Array<{
  id?: string;
  tool?: string;
  result: unknown;
  errorClass?: string;
}> {
  const results: Array<{ id?: string; tool?: string; result: unknown; errorClass?: string }> = [];
  const toolName =
    readNonEmptyString(message.toolName) ??
    readNonEmptyString(message.tool_name) ??
    readNonEmptyString(message.name) ??
    readNonEmptyString(message.tool);

  if ((message.role === "tool" || message.role === "toolResult") && message.content !== undefined) {
    const messageId =
      normalizeToolCallId(message.toolCallId) ??
      normalizeToolCallId(message.tool_call_id) ??
      normalizeToolCallId(message.toolUseId) ??
      normalizeToolCallId(message.tool_use_id);
    const contentText = extractAssistantText(message);
    results.push({
      // Only add the key when an id exists, keeping the shape unchanged otherwise.
      ...(messageId ? { id: messageId } : {}),
      tool: toolName,
      result: message.content,
      ...(TOOL_RESULT_ERROR_RE.test(contentText) ? { errorClass: "tool-result-error" } : {}),
    });
  }

  const rawContent = message.content;
  if (!Array.isArray(rawContent)) {
    return results;
  }
  for (const block of rawContent) {
    if (!isMessageRecord(block)) {
      continue;
    }
    const type = readNonEmptyString(block.type)?.toLowerCase();
    if (type !== "tool_result" && type !== "toolresult" && type !== "tool_result_error") {
      continue;
    }
    const content = block.content ?? block.result ?? block.output ?? block.text ?? null;
    // Non-string content is serialized so the error heuristic can scan it.
    const contentText =
      typeof content === "string" ? content : (JSON.stringify(content ?? "") ?? "");
    const isError =
      block.is_error === true ||
      type === "tool_result_error" ||
      TOOL_RESULT_ERROR_RE.test(contentText);
    results.push({
      id:
        normalizeToolCallId(block.tool_use_id) ??
        normalizeToolCallId(block.toolUseId) ??
        normalizeToolCallId(block.tool_call_id) ??
        normalizeToolCallId(block.toolCallId),
      tool: toolName,
      result: content,
      ...(isError ? { errorClass: "tool-result-error" } : {}),
    });
  }
  return results;
}
|
|
|
|
/**
 * Decides whether a tool output represents an error.
 *
 * For parsed (JSON record) output, a non-empty `error` field or a `status`
 * containing "error", "failed" or "failure" marks an error. For unparsed
 * output, a small set of textual markers is checked case-insensitively.
 *
 * @returns "tool-result-error" when the output looks like an error, else `undefined`.
 */
function classifyToolResultError(params: {
  rawOutput: string;
  parsedOutput: Record<string, unknown> | undefined;
}) {
  const { rawOutput, parsedOutput } = params;
  if (readNonEmptyString(parsedOutput?.error)) {
    return "tool-result-error";
  }
  const status = readNonEmptyString(parsedOutput?.status);
  if (status && /\b(?:error|failed|failure)\b/i.test(status)) {
    return "tool-result-error";
  }
  if (parsedOutput) {
    // Structured output without error markers is never scanned as raw text.
    return undefined;
  }
  const normalized = rawOutput.trim().toLowerCase();
  const errorPrefixes = ["error:", "failed:"];
  const errorFragments = ["unsupported call:", "permission denied", "no such file", "enoent"];
  const looksLikeError =
    errorPrefixes.some((prefix) => normalized.startsWith(prefix)) ||
    errorFragments.some((fragment) => normalized.includes(fragment));
  return looksLikeError ? "tool-result-error" : undefined;
}
|
|
|
|
/**
 * Builds the ordered list of tool calls from transcript records, pairing
 * each tool result with the call it belongs to.
 *
 * Calls found in assistant records are appended in order and tracked as
 * unresolved. Results found in user/tool/toolResult records are matched to a
 * call by, in order of preference: tool call id, the oldest unresolved call
 * of the same tool, then the oldest unresolved call overall. A result that
 * matches nothing is appended as its own entry.
 *
 * @param records - Transcript records in transcript order.
 * @returns Tool calls with argument and result hashes, without internal bookkeeping.
 */
function resolveToolCallOrder(records: RuntimeParityTranscriptRecord[]): RuntimeParityToolCall[] {
  const ordered: RuntimeParityPendingToolCall[] = [];
  const byId = new Map<string, number>();
  const unresolvedByTool = new Map<string, number[]>();
  const unresolvedOrder: number[] = [];

  const enqueueUnresolved = (tool: string, index: number) => {
    const indices = unresolvedByTool.get(tool) ?? [];
    indices.push(index);
    unresolvedByTool.set(tool, indices);
    unresolvedOrder.push(index);
  };

  // `pendingTool` must be the name the call was enqueued under. The resolved
  // entry may carry a different name taken from the result, and looking the
  // bucket up by that name would leave a stale index behind that a later
  // result could match again.
  const markResolved = (index: number, pendingTool: string) => {
    const unresolvedIndex = unresolvedOrder.indexOf(index);
    if (unresolvedIndex >= 0) {
      unresolvedOrder.splice(unresolvedIndex, 1);
    }
    const toolIndices = unresolvedByTool.get(pendingTool);
    if (!toolIndices) {
      return;
    }
    const nextIndices = toolIndices.filter((candidate) => candidate !== index);
    if (nextIndices.length > 0) {
      unresolvedByTool.set(pendingTool, nextIndices);
      return;
    }
    unresolvedByTool.delete(pendingTool);
  };

  const matchPendingIndex = (result: { id?: string; tool?: string }) => {
    if (result.id && byId.has(result.id)) {
      return byId.get(result.id);
    }
    if (result.tool) {
      const toolIndices = unresolvedByTool.get(result.tool);
      if (toolIndices && toolIndices.length > 0) {
        return toolIndices[0];
      }
    }
    return unresolvedOrder[0];
  };

  for (const record of records) {
    if (record.role === "assistant") {
      for (const call of extractToolCalls(record.message)) {
        const index =
          ordered.push({
            tool: call.tool,
            argsHash: stableHash(call.args),
            resultHash: stableHash(null),
            _resolved: false,
          }) - 1;
        if (call.id) {
          byId.set(call.id, index);
        }
        enqueueUnresolved(call.tool, index);
      }
    }
    if (record.role === "user" || record.role === "tool" || record.role === "toolResult") {
      for (const result of extractToolResults(record.message)) {
        const pendingIndex = matchPendingIndex(result);
        const pending = pendingIndex !== undefined ? ordered[pendingIndex] : undefined;
        const nextValue: RuntimeParityToolCall = {
          tool: result.tool ?? pending?.tool ?? "unknown",
          argsHash: pending?.argsHash ?? stableHash(null),
          resultHash: stableHash(result.result),
          ...(result.errorClass ? { errorClass: result.errorClass } : {}),
        };
        if (pendingIndex === undefined || !pending) {
          ordered.push({ ...nextValue, _resolved: true });
          continue;
        }
        ordered[pendingIndex] = {
          ...nextValue,
          _resolved: true,
        };
        markResolved(pendingIndex, pending.tool);
      }
    }
  }

  return ordered.map(({ _resolved: _ignored, ...toolCall }) => toolCall);
}
|
|
|
|
/**
 * Builds the ordered list of tool calls from mock request snapshots.
 *
 * Each request is processed in two steps: a tool output, when present, is
 * attached to the oldest call still waiting for a result (or appended as an
 * "unknown" call when none is waiting); then a planned tool call, when
 * present, is appended and starts waiting for its result.
 *
 * @param requests - Mock request snapshots in request order.
 */
function resolveToolCallOrderFromMockRequests(
  requests: RuntimeParityMockRequestSnapshot[],
): RuntimeParityToolCall[] {
  const ordered: RuntimeParityPendingToolCall[] = [];
  // Indices into `ordered` of calls that have no result yet, oldest first.
  const pendingQueue: number[] = [];

  for (const request of requests) {
    const rawToolOutput = readNonEmptyString(request.toolOutput) ?? "";
    if (rawToolOutput) {
      const pendingIndex = pendingQueue.shift();
      const pending = pendingIndex === undefined ? undefined : ordered[pendingIndex];
      const parsedOutput = parseJsonRecord(rawToolOutput);
      const errorClass = classifyToolResultError({ rawOutput: rawToolOutput, parsedOutput });
      const resolved: RuntimeParityPendingToolCall = {
        tool: pending?.tool ?? "unknown",
        argsHash: pending?.argsHash ?? stableHash(null),
        resultHash: stableHash(parsedOutput ?? rawToolOutput),
        ...(errorClass ? { errorClass: "tool-result-error" } : {}),
        _resolved: true,
      };
      if (pendingIndex === undefined || !pending) {
        ordered.push(resolved);
      } else {
        ordered[pendingIndex] = resolved;
      }
    }

    const plannedToolName = readNonEmptyString(request.plannedToolName);
    if (plannedToolName) {
      const position =
        ordered.push({
          tool: plannedToolName,
          argsHash: stableHash(request.plannedToolArgs ?? null),
          resultHash: stableHash(null),
          _resolved: false,
        }) - 1;
      pendingQueue.push(position);
    }
  }

  return ordered.map(({ _resolved: _ignored, ...toolCall }) => toolCall);
}
|
|
|
|
/**
 * Maps scenario failure details to an error class by keyword.
 *
 * The details are whitespace-normalized and lowercased, then checked against
 * the rules below in order; the first rule with a matching keyword wins.
 *
 * @returns `undefined` for empty details, otherwise the matching class, or
 *   "scenario-failure" when no keyword matches.
 */
function classifyScenarioError(details: string | undefined): string | undefined {
  const normalized = normalizeTextForParity(details ?? "").toLowerCase();
  if (!normalized) {
    return undefined;
  }
  // Order matters: e.g. "no api key found" must win over the broader "api key".
  const rules: ReadonlyArray<readonly [readonly string[], string]> = [
    [["no api key found"], "missing-api-key"],
    [["failover"], "failover"],
    [["timeout", "timed out"], "timeout"],
    [["codex app-server"], "codex-app-server"],
    [["auth profile", "oauth", "api key"], "auth"],
    [["tool"], "tool-error"],
  ];
  for (const [keywords, errorClass] of rules) {
    if (keywords.some((keyword) => normalized.includes(keyword))) {
      return errorClass;
    }
  }
  return "scenario-failure";
}
|
|
|
|
/**
 * Picks the log lines that describe boot state.
 *
 * @param logs - Full log text, possibly absent.
 * @returns The trimmed, non-empty lines matching the boot-state pattern,
 *   limited to the last 30 matches.
 */
function extractBootStateLines(logs: string | undefined): string[] {
  if (!logs) {
    return [];
  }
  const matches: string[] = [];
  for (const rawLine of logs.split(/\r?\n/u)) {
    const line = rawLine.trim();
    if (line.length > 0 && BOOT_STATE_LINE_RE.test(line)) {
      matches.push(line);
    }
  }
  return matches.slice(-30);
}
|
|
|
|
/**
 * Parses transcript text into message records.
 *
 * The text is read as one JSON value per line. A line is kept only when it
 * parses, has a record under `message`, and that message's role is "user",
 * "assistant", "tool" or "toolResult". Blank and malformed lines are skipped.
 */
function buildTranscriptRecords(transcriptBytes: string): RuntimeParityTranscriptRecord[] {
  const isKnownRole = (
    candidate: string | undefined,
  ): candidate is RuntimeParityTranscriptRecord["role"] =>
    candidate === "user" ||
    candidate === "assistant" ||
    candidate === "tool" ||
    candidate === "toolResult";

  const records: RuntimeParityTranscriptRecord[] = [];
  for (const line of transcriptBytes.split(/\r?\n/u)) {
    const trimmed = line.trim();
    if (!trimmed) {
      continue;
    }
    let parsed: unknown;
    try {
      parsed = JSON.parse(trimmed);
    } catch {
      // Ignore malformed QA transcript rows and keep the classifier deterministic.
      continue;
    }
    if (!isMessageRecord(parsed) || !isMessageRecord(parsed.message)) {
      continue;
    }
    const message = parsed.message;
    const role = readNonEmptyString(message.role);
    if (isKnownRole(role)) {
      records.push({ message, role });
    }
  }
  return records;
}
|
|
|
|
/**
 * Reports whether a transcript consists solely of heartbeat traffic: it must
 * contain at least one genuine user message (tool-result messages do not
 * count) and every such message must be a heartbeat prompt.
 */
function isHeartbeatOnlyRuntimeTranscript(transcriptBytes: string) {
  let sawUserText = false;
  for (const record of buildTranscriptRecords(transcriptBytes)) {
    if (record.role !== "user" || isToolResultLikeMessage(record.message)) {
      continue;
    }
    if (!isHeartbeatRuntimeUserText(extractAssistantText(record.message))) {
      return false;
    }
    sawUserText = true;
  }
  return sawUserText;
}
|
|
|
|
/**
 * Reports whether a message carries tool results: either its role is "tool"
 * or "toolResult", or its content array holds a block of type `tool_result`,
 * `toolresult` or `tool_result_error` (case-insensitive).
 */
function isToolResultLikeMessage(message: Record<string, unknown>) {
  const { role, content } = message;
  if (role === "tool" || role === "toolResult") {
    return true;
  }
  if (!Array.isArray(content)) {
    return false;
  }
  for (const block of content) {
    if (!isMessageRecord(block)) {
      continue;
    }
    const blockType = readNonEmptyString(block.type)?.toLowerCase();
    if (
      blockType === "tool_result" ||
      blockType === "toolresult" ||
      blockType === "tool_result_error"
    ) {
      return true;
    }
  }
  return false;
}
|
|
|
|
/**
 * Reports whether a user message text is a heartbeat prompt.
 *
 * After whitespace normalization and lowercasing, the text qualifies when it
 * equals the heartbeat poll marker, or when it starts like a heartbeat
 * prompt ("read heartbeat.md" or the periodic-task prefix) and mentions
 * either "heartbeat_ok" or the heartbeat response tool.
 */
function isHeartbeatRuntimeUserText(text: string) {
  const normalized = normalizeTextForParity(text).toLowerCase();
  if (!normalized) {
    return false;
  }
  if (normalized === HEARTBEAT_TRANSCRIPT_PROMPT.toLowerCase()) {
    return true;
  }
  const startsLikeHeartbeat =
    normalized.startsWith("read heartbeat.md") ||
    normalized.startsWith(HEARTBEAT_TASK_PROMPT_PREFIX.toLowerCase());
  const mentionsAcknowledgement =
    normalized.includes("heartbeat_ok") || normalized.includes(HEARTBEAT_RESPONSE_TOOL_NAME);
  return startsLikeHeartbeat && mentionsAcknowledgement;
}
|
|
|
|
/**
 * Returns the whitespace-normalized text of the last assistant record that
 * has any text, or an empty string when no assistant record has text.
 */
function extractFinalAssistantText(records: RuntimeParityTranscriptRecord[]) {
  // Walk backwards: the first assistant text found from the end is the final one.
  for (let position = records.length - 1; position >= 0; position -= 1) {
    const record = records[position];
    if (!record || record.role !== "assistant") {
      continue;
    }
    const text = extractAssistantText(record.message);
    if (text) {
      return normalizeTextForParity(text);
    }
  }
  return "";
}
|
|
|
|
/**
 * Sums the token usage reported on all assistant records. Records with
 * other roles are ignored; a missing `usage` counts as zero.
 */
function aggregateUsage(records: RuntimeParityTranscriptRecord[]): RuntimeParityUsage {
  const totals: RuntimeParityUsage = { inputTokens: 0, outputTokens: 0, totalTokens: 0 };
  records
    .filter((record) => record.role === "assistant")
    .forEach((record) => {
      addUsage(totals, readUsageTotals(record.message.usage ?? null));
    });
  return totals;
}
|
|
|
|
/**
 * Compares two tool call lists by shape: the number of calls and, pairwise,
 * the tool name and argument hash. Result hashes are not considered here.
 *
 * @returns A description of the first difference, or `undefined` when the shapes match.
 */
function compareToolCallShape(
  left: RuntimeParityToolCall[],
  right: RuntimeParityToolCall[],
): string | undefined {
  if (left.length !== right.length) {
    return `tool call count differs (${left.length} vs ${right.length})`;
  }
  for (const [index, leftCall] of left.entries()) {
    const rightCall = right[index];
    if (!leftCall || !rightCall) {
      return `tool call row ${index + 1} missing`;
    }
    const sameShape =
      leftCall.tool === rightCall.tool && leftCall.argsHash === rightCall.argsHash;
    if (!sameShape) {
      return `tool call ${index + 1} differs (${leftCall.tool}/${leftCall.argsHash} vs ${rightCall.tool}/${rightCall.argsHash})`;
    }
  }
  return undefined;
}
|
|
|
|
/**
 * Compares two tool call lists by result: pairwise result hash and error
 * class (a missing error class equals an empty one). Only the overlapping
 * prefix of the two lists is compared.
 *
 * @returns A description of the first difference, or `undefined` when all compared results match.
 */
function compareToolResultShape(
  left: RuntimeParityToolCall[],
  right: RuntimeParityToolCall[],
): string | undefined {
  const pairCount = Math.min(left.length, right.length);
  for (let position = 0; position < pairCount; position += 1) {
    const leftCall = left[position];
    const rightCall = right[position];
    if (!leftCall || !rightCall) {
      continue;
    }
    const sameHash = leftCall.resultHash === rightCall.resultHash;
    const sameErrorClass = (leftCall.errorClass ?? "") === (rightCall.errorClass ?? "");
    if (!(sameHash && sameErrorClass)) {
      return `tool result ${position + 1} differs (${leftCall.tool})`;
    }
  }
  return undefined;
}
|
|
|
|
/**
 * Reports whether a runtime error class is a hard failure: one of the fixed
 * classes below, or any class starting with "sentinel:".
 */
function isHardFailureRuntimeError(errorClass: string | undefined) {
  if (errorClass === undefined) {
    return false;
  }
  const hardFailureClasses = [
    "missing-api-key",
    "failover",
    "codex-app-server",
    "auth",
    "capture-missing",
  ];
  return hardFailureClasses.includes(errorClass) || errorClass.startsWith("sentinel:");
}
|
|
|
|
/**
 * Builds an error class from sentinel findings: "sentinel:" followed by the
 * findings' kinds, sorted and comma-separated.
 *
 * @returns `undefined` when there are no findings.
 */
function summarizeSentinelErrorClass(findings: readonly GatewayLogSentinelFinding[]) {
  if (findings.length === 0) {
    return undefined;
  }
  // `map` yields a fresh array, so the in-place sort leaves `findings` untouched.
  const kinds = findings.map((finding) => finding.kind);
  kinds.sort((left, right) => left.localeCompare(right));
  return `sentinel:${kinds.join(",")}`;
}
|
|
|
|
/**
 * Classifies the drift between the two runtime cells of a scenario.
 *
 * Checks run in this order and the first hit decides the result:
 * 1. hard runtime failure or any transport failure -> "failure-mode"
 * 2. tool call shape differs -> "tool-call-shape"
 * 3. tool result differs -> "tool-result-shape"
 * 4. transcript line count differs, or only one side has final text -> "structural"
 * 5. a scenario failed or a runtime error class is set -> "failure-mode"
 * 6. final texts equal after whitespace normalization -> "none"
 * 7. otherwise -> "text-only"
 */
function classifyRuntimeParityCells(params: {
  openclaw: RuntimeParityCell;
  codex: RuntimeParityCell;
  openclawScenarioStatus: "pass" | "fail";
  codexScenarioStatus: "pass" | "fail";
}): Pick<RuntimeParityResult, "drift" | "driftDetails"> {
  const { openclaw, codex, openclawScenarioStatus, codexScenarioStatus } = params;

  const hasTransportFailure = Boolean(openclaw.transportErrorClass || codex.transportErrorClass);
  const hasHardRuntimeFailure =
    isHardFailureRuntimeError(openclaw.runtimeErrorClass) ||
    isHardFailureRuntimeError(codex.runtimeErrorClass);
  if (hasHardRuntimeFailure || hasTransportFailure) {
    return {
      drift: "failure-mode",
      driftDetails: hasTransportFailure
        ? "at least one runtime hit a transport failure"
        : "at least one runtime hit a hard runtime failure",
    };
  }

  const toolCallShapeDetails = compareToolCallShape(openclaw.toolCalls, codex.toolCalls);
  if (toolCallShapeDetails) {
    return { drift: "tool-call-shape", driftDetails: toolCallShapeDetails };
  }

  const toolResultShapeDetails = compareToolResultShape(openclaw.toolCalls, codex.toolCalls);
  if (toolResultShapeDetails) {
    return { drift: "tool-result-shape", driftDetails: toolResultShapeDetails };
  }

  // Counts lines of the trimmed transcript; an empty transcript has zero lines.
  const countTranscriptLines = (transcript: string) => {
    const trimmed = transcript.trim();
    return trimmed.length ? trimmed.split(/\r?\n/u).length : 0;
  };
  const openclawTranscriptLines = countTranscriptLines(openclaw.transcriptBytes);
  const codexTranscriptLines = countTranscriptLines(codex.transcriptBytes);
  const onlyOneHasFinalText = Boolean(openclaw.finalText) !== Boolean(codex.finalText);
  if (openclawTranscriptLines !== codexTranscriptLines || onlyOneHasFinalText) {
    return {
      drift: "structural",
      driftDetails: `transcript/final-text structure differs (${openclawTranscriptLines} lines vs ${codexTranscriptLines})`,
    };
  }

  const anyFailure =
    openclawScenarioStatus === "fail" ||
    codexScenarioStatus === "fail" ||
    Boolean(openclaw.runtimeErrorClass) ||
    Boolean(codex.runtimeErrorClass);
  if (anyFailure) {
    return {
      drift: "failure-mode",
      driftDetails:
        openclawScenarioStatus === codexScenarioStatus
          ? "at least one runtime failed"
          : `scenario status differs (${openclawScenarioStatus} vs ${codexScenarioStatus})`,
    };
  }

  const sameFinalText =
    normalizeTextForParity(openclaw.finalText) === normalizeTextForParity(codex.finalText);
  if (sameFinalText) {
    return { drift: "none" };
  }
  return { drift: "text-only", driftDetails: "final text differs after whitespace normalization" };
}
|
|
|
|
/**
 * Resolves the transcript file path for a session.
 *
 * When the session entry names a non-empty `sessionFile`, that path is used:
 * absolute paths are returned unchanged and relative ones are joined onto
 * `sessionsDir`. Otherwise the path defaults to `<sessionsDir>/<sessionId>.jsonl`.
 *
 * @param params - Sessions directory, session id, and the optional store entry.
 * @returns The resolved transcript path.
 */
function resolveSessionTranscriptFile(params: {
  sessionsDir: string;
  sessionId: string;
  sessionEntry?: RuntimeParitySessionEntry;
}): string | undefined {
  const { sessionsDir, sessionId, sessionEntry } = params;
  const explicitSessionFile = readNonEmptyString(sessionEntry?.sessionFile);
  if (!explicitSessionFile) {
    return path.join(sessionsDir, `${sessionId}.jsonl`);
  }
  if (path.isAbsolute(explicitSessionFile)) {
    return explicitSessionFile;
  }
  return path.join(sessionsDir, explicitSessionFile);
}
|
|
|
|
/**
 * Reports whether a session entry is a root session rather than a spawned one.
 *
 * An entry is treated as non-root when it has a non-empty `spawnedBy` or
 * `parentSessionKey`, a numeric `spawnDepth` above zero, or a non-empty
 * `subagentRole`.
 *
 * @param entry - Session store entry to inspect.
 * @returns `true` when none of the spawn markers are present.
 */
function isRuntimeParityRootSession(entry: RuntimeParitySessionEntry) {
  return (
    !readNonEmptyString(entry.spawnedBy) &&
    !readNonEmptyString(entry.parentSessionKey) &&
    !(typeof entry.spawnDepth === "number" && entry.spawnDepth > 0) &&
    !readNonEmptyString(entry.subagentRole)
  );
}
|
|
|
|
/**
 * Reads the agent's `sessions.json` store and returns candidate session entries.
 *
 * Entries without a non-empty `sessionId` are dropped. Root sessions are
 * preferred; when the store has no root sessions, every remaining entry is a
 * candidate. The result is ordered by `updatedAt` descending, with a missing
 * `updatedAt` treated as 0.
 *
 * @param params - State directory and agent id used to locate the store file.
 * @returns The ordered candidates, or an empty array when the store cannot be
 *   read, parsed, or processed.
 */
async function readRuntimeParitySessionEntries(params: {
  stateDir: string;
  agentId: string;
}): Promise<Array<RuntimeParitySessionEntry>> {
  const { stateDir, agentId } = params;
  const storePath = path.join(stateDir, "agents", agentId, "sessions", "sessions.json");
  try {
    const store = JSON.parse(await fs.readFile(storePath, "utf8")) as Record<
      string,
      RuntimeParitySessionEntry
    >;
    const identified: RuntimeParitySessionEntry[] = [];
    const roots: RuntimeParitySessionEntry[] = [];
    for (const entry of Object.values(store)) {
      if (!readNonEmptyString(entry?.sessionId)) {
        continue;
      }
      identified.push(entry);
      if (isRuntimeParityRootSession(entry)) {
        roots.push(entry);
      }
    }
    // Fall back to every identified entry when the store holds no root sessions.
    const pool = roots.length > 0 ? roots : identified;
    return pool.toSorted((a, b) => (b.updatedAt ?? 0) - (a.updatedAt ?? 0));
  } catch {
    // Any failure while reading or processing the store yields no candidates.
    return [];
  }
}
|
|
|
|
/**
 * Loads the transcript text used for a runtime parity cell.
 *
 * Candidate sessions are tried in the order returned by
 * `readRuntimeParitySessionEntries`; the first transcript that is non-blank
 * and not heartbeat-only is returned with trailing whitespace removed.
 *
 * @param params - Gateway (for its `tempRoot`) and the agent id.
 * @returns The selected transcript, or an empty string when none qualifies.
 */
async function loadRuntimeParityTranscripts(params: {
  gateway: QaGatewayLike;
  agentId: string;
}): Promise<string> {
  const { gateway, agentId } = params;
  const sessionsDir = path.join(gateway.tempRoot, "state", "agents", agentId, "sessions");
  const stateDir = path.join(gateway.tempRoot, "state");
  const sessionEntries = await readRuntimeParitySessionEntries({ stateDir, agentId });

  for (const sessionEntry of sessionEntries) {
    const sessionId = readNonEmptyString(sessionEntry.sessionId);
    if (!sessionId) {
      continue;
    }
    const sessionFile = resolveSessionTranscriptFile({ sessionsDir, sessionId, sessionEntry });
    if (!sessionFile) {
      continue;
    }
    try {
      const transcript = await fs.readFile(sessionFile, "utf8");
      const hasContent = transcript.trim().length > 0;
      if (hasContent && !isHeartbeatOnlyRuntimeTranscript(transcript)) {
        // Only the first qualifying transcript is used.
        return transcript.trimEnd();
      }
    } catch {
      // Ignore missing transcript files so failed cells still render.
    }
  }
  return "";
}
|
|
|
|
/**
 * Fetches the mock server's recorded requests and derives the tool-call order.
 *
 * The base URL is trimmed and stripped of trailing slashes before
 * `/debug/requests` is appended. The guarded fetch handle is always released
 * once the response has been inspected.
 *
 * @param mockBaseUrl - Base URL of the mock server, if one is configured.
 * @returns The tool calls resolved from the recorded requests, or `null` when
 *   no base URL is given, the response is not OK, the payload is not an array,
 *   or any step throws.
 */
async function loadRuntimeParityMockToolCalls(
  mockBaseUrl: string | undefined,
): Promise<RuntimeParityToolCall[] | null> {
  const baseUrl = mockBaseUrl?.trim().replace(/\/+$/u, "");
  if (!baseUrl) {
    return null;
  }
  try {
    const { response, release } = await fetchWithSsrFGuard({
      url: `${baseUrl}/debug/requests`,
      policy: { allowPrivateNetwork: true },
      auditContext: "qa-lab-runtime-parity-mock-tool-calls",
    });
    // Stays null for a non-OK response, which the array check below turns into `null`.
    let payload: unknown = null;
    try {
      if (response.ok) {
        payload = await response.json();
      }
    } finally {
      await release();
    }
    if (!Array.isArray(payload)) {
      return null;
    }
    const snapshots: RuntimeParityMockRequestSnapshot[] = [];
    for (const entry of payload) {
      if (!isMessageRecord(entry)) {
        continue;
      }
      snapshots.push({
        plannedToolName: readNonEmptyString(entry.plannedToolName),
        plannedToolArgs: entry.plannedToolArgs ?? null,
        toolOutput: readNonEmptyString(entry.toolOutput) ?? "",
      });
    }
    return resolveToolCallOrderFromMockRequests(snapshots);
  } catch {
    // Any fetch, release, or parsing failure is reported as "no mock data".
    return null;
  }
}
|
|
|
|
/**
 * Captures one runtime's parity cell after a scenario run.
 *
 * Combines the session transcript, tool calls (from the mock server when
 * available, otherwise from the transcript records), final assistant text,
 * usage, boot-state log lines, and sentinel findings from the gateway logs and
 * the transcript.
 *
 * `runtimeErrorClass` is only set when a scenario or sentinel error class is
 * present; `sentinelFindings` is only set when at least one finding exists.
 *
 * @param params - Gateway, runtime id, scenario result, timing, and optional
 *   agent id / mock base URL.
 * @returns The assembled parity cell.
 */
export async function captureRuntimeParityCell(
  params: RuntimeParityCaptureParams,
): Promise<RuntimeParityCell> {
  const { gateway } = params;
  const agentId = params.agentId ?? DEFAULT_AGENT_ID;

  const transcriptBytes = await loadRuntimeParityTranscripts({ gateway, agentId });
  const transcriptRecords = buildTranscriptRecords(transcriptBytes);
  const mockToolCalls = await loadRuntimeParityMockToolCalls(params.mockBaseUrl);
  const gatewayLogs = gateway.logs?.();

  const logFindings = scanGatewayLogSentinels(gatewayLogs);
  const transcriptFindings = scanDirectReplyTranscriptSentinels(transcriptBytes);
  const sentinelFindings = [...logFindings, ...transcriptFindings];

  const scenarioErrorClass = classifyScenarioError(params.scenarioResult.details);
  const sentinelErrorClass = summarizeSentinelErrorClass(sentinelFindings);

  // Tool calls recorded by the mock server win over those derived from the transcript.
  const toolCalls = mockToolCalls ?? resolveToolCallOrder(transcriptRecords);
  const finalText = extractFinalAssistantText(transcriptRecords);
  const usage = aggregateUsage(transcriptRecords);
  const bootStateLines = extractBootStateLines(gatewayLogs);

  return {
    runtime: params.runtime,
    transcriptBytes,
    toolCalls,
    finalText,
    usage,
    wallClockMs: params.wallClockMs,
    ...(scenarioErrorClass || sentinelErrorClass
      ? { runtimeErrorClass: scenarioErrorClass ?? sentinelErrorClass }
      : {}),
    bootStateLines,
    ...(sentinelFindings.length > 0 ? { sentinelFindings } : {}),
  };
}
|
|
|
|
/**
 * Runs one scenario on both runtimes and classifies the drift between them.
 *
 * The cells run one after the other, `openclaw` first and then `codex`.
 *
 * @param params - Scenario id and a callback that executes the scenario for a
 *   given runtime.
 * @returns Both cells plus the drift category; `driftDetails` is included only
 *   when the classifier produced a non-empty detail.
 */
export async function runRuntimeParityScenario(params: {
  scenarioId: string;
  runCell: (runtime: RuntimeId) => Promise<RuntimeParityScenarioExecution>;
}): Promise<RuntimeParityResult> {
  const openclawRun = await params.runCell("openclaw");
  const codexRun = await params.runCell("codex");

  const cells = { openclaw: openclawRun.cell, codex: codexRun.cell };
  const { drift, driftDetails } = classifyRuntimeParityCells({
    ...cells,
    openclawScenarioStatus: openclawRun.scenarioStatus,
    codexScenarioStatus: codexRun.scenarioStatus,
  });

  return {
    scenarioId: params.scenarioId,
    cells,
    drift,
    ...(driftDetails ? { driftDetails } : {}),
  };
}
|