diff --git a/CHANGELOG.md b/CHANGELOG.md index 08baa2699ff..2b14279c927 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -83,6 +83,7 @@ Docs: https://docs.openclaw.ai - CLI/setup: collapse raw gateway config keys in existing-config summaries into friendly `Model` and `Gateway` rows. - CLI/config: show concise human config-write output with an indented backup path instead of printing checksum-heavy overwrite audit details by default. - CLI/docs: call the canonical lowercase docs MCP search tool and surface MCP errors instead of returning empty search results. Fixes #82702. (#82704) Thanks @hclsys. +- QA-Lab: add gateway log sentinels for plugin hook failures, Codex app-server stalls/timeouts, cron allowlist drift, live quota blockers, and direct-reply self-message transcripts so harness proof fails on self-health regressions. (#80323) Thanks @100yenadmin. - QA-Lab: ignore heartbeat-only operational transcripts when capturing runtime parity cells so background checks cannot replace the scenario reply. (#80323) Thanks @100yenadmin. - QA-Lab: pin threaded-memory parity runs to `memory-core`, keep bundled plugin resolution enabled for QA commands, and retry transient session-store lock reads. (#72045) Thanks @WuKongAI-CMU. - QA-Lab/qa-channel: keep mock memory ranking, inbound media notes, and opened-file realpath checks stable for mock OpenAI qa-channel runs. (#66826) Thanks @gumadeiras. 
diff --git a/extensions/qa-lab/src/gateway-log-sentinel.test.ts b/extensions/qa-lab/src/gateway-log-sentinel.test.ts new file mode 100644 index 00000000000..ba0c8896c8a --- /dev/null +++ b/extensions/qa-lab/src/gateway-log-sentinel.test.ts @@ -0,0 +1,123 @@ +import { describe, expect, it } from "vitest"; +import { + assertNoGatewayLogSentinels, + formatGatewayLogSentinelSummary, + scanDirectReplyTranscriptSentinels, + scanGatewayLogSentinels, +} from "./gateway-log-sentinel.js"; + +describe("gateway log sentinels", () => { + it("classifies May 13 beta.5 operational failure signatures", () => { + const findings = scanGatewayLogSentinels( + [ + "2026-05-13T00:00:01Z plugin before_prompt_build hook failed: TypeError: boom", + "2026-05-13T00:00:02Z plugin before_tool_call crashed while evaluating policy", + "2026-05-13T00:00:03Z plugin manifest invalid: missing contracts.tools registration", + "2026-05-13T00:00:04Z codex app-server attempt timed out after 180000ms", + "2026-05-13T00:00:05Z codex_app_server progress stalled for run abc123", + "2026-05-13T00:00:06Z cron payload model openai/gpt-5.4 is not in model allowlist", + "2026-05-13T00:00:07Z OpenAI quota exceeded for live-frontier request", + ].join("\n"), + ); + + expect(findings.map((finding) => finding.kind)).toEqual([ + "plugin-hook-failure", + "plugin-hook-failure", + "plugin-contract-error", + "codex-app-server-timeout", + "stalled-agent-run", + "cron-model-allowlist", + "live-quota-or-subscription", + ]); + expect(findings.find((finding) => finding.kind === "plugin-hook-failure")).toMatchObject({ + verdict: "qa-harness-bug", + owner: "plugin", + productImpact: "P1", + }); + expect(findings.find((finding) => finding.kind === "live-quota-or-subscription")).toMatchObject( + { + verdict: "environment-blocked", + owner: "environment", + productImpact: "P4", + }, + ); + }); + + it("honors log cursors while preserving absolute line numbers", () => { + const prefix = "safe line\n"; + const findings = 
scanGatewayLogSentinels(`${prefix}codex app-server attempt timed out`, { + since: prefix.length, + }); + + expect(findings).toHaveLength(1); + expect(findings[0]).toMatchObject({ + kind: "codex-app-server-timeout", + line: 2, + }); + }); + + it("throws actionable summaries unless only environment blockers are allowed", () => { + expect(() => assertNoGatewayLogSentinels("codex_app_server progress stalled")).toThrow( + "stalled-agent-run", + ); + expect(() => + assertNoGatewayLogSentinels("OpenAI quota exceeded", { allowEnvironmentBlocked: true }), + ).not.toThrow(); + expect(formatGatewayLogSentinelSummary(scanGatewayLogSentinels("OpenAI quota exceeded"))).toBe( + "live-quota-or-subscription@1 environment-blocked owner=environment: OpenAI quota exceeded", + ); + }); + + it("detects direct reply self-message transcripts separately from gateway logs", () => { + const findings = scanDirectReplyTranscriptSentinels( + [ + JSON.stringify({ + message: { + role: "assistant", + content: [ + { + type: "tool_use", + name: "message", + input: { action: "send", conversationId: "qa-operator", text: "hello" }, + }, + ], + }, + }), + JSON.stringify({ message: { role: "assistant", content: "Sent." } }), + ].join("\n"), + ); + + expect(findings).toHaveLength(1); + expect(findings[0]).toMatchObject({ + kind: "direct-reply-self-message", + verdict: "product-bug", + owner: "openclaw-routing", + }); + }); + + it("detects OpenAI function_call-shaped direct reply transcripts", () => { + const findings = scanDirectReplyTranscriptSentinels( + [ + JSON.stringify({ + message: { + role: "assistant", + content: [ + { + type: "function_call", + name: "message", + arguments: JSON.stringify({ + action: "send", + target: "current", + text: "hello", + }), + }, + ], + }, + }), + JSON.stringify({ message: { role: "assistant", content: "Sent." 
} }), + ].join("\n"), + ); + + expect(findings.map((finding) => finding.kind)).toEqual(["direct-reply-self-message"]); + }); +}); diff --git a/extensions/qa-lab/src/gateway-log-sentinel.ts b/extensions/qa-lab/src/gateway-log-sentinel.ts new file mode 100644 index 00000000000..a0efef601ad --- /dev/null +++ b/extensions/qa-lab/src/gateway-log-sentinel.ts @@ -0,0 +1,367 @@ +export type GatewayLogSentinelKind = + | "plugin-hook-failure" + | "plugin-contract-error" + | "direct-reply-self-message" + | "codex-app-server-timeout" + | "stalled-agent-run" + | "cron-model-allowlist" + | "live-quota-or-subscription"; + +export type GatewayLogSentinelVerdict = + | "product-bug" + | "qa-harness-bug" + | "fixture-bug" + | "environment-blocked"; + +export type GatewayLogSentinelOwner = + | "plugin" + | "openclaw-routing" + | "codex-runtime" + | "openclaw-cron" + | "environment"; + +export type GatewayLogSentinelFinding = { + kind: GatewayLogSentinelKind; + verdict: GatewayLogSentinelVerdict; + owner: GatewayLogSentinelOwner; + productImpact: "P0" | "P1" | "P2" | "P3" | "P4"; + qaImpact: "P0" | "P1" | "P2" | "P3" | "P4"; + line: number; + text: string; +}; + +export type GatewayLogSentinelScanOptions = { + since?: number; + kinds?: readonly GatewayLogSentinelKind[]; + ignoreKinds?: readonly GatewayLogSentinelKind[]; +}; + +export type GatewayLogSentinelAssertOptions = GatewayLogSentinelScanOptions & { + allowEnvironmentBlocked?: boolean; +}; + +type GatewayLogSentinelRule = Omit<GatewayLogSentinelFinding, "line" | "text"> & { /* static classification of a finding; line and text are filled in per match by the scanner */ + test: (line: string) => boolean; +}; + +type GatewayLogSentinelToolCall = { + name: string; + args: unknown; +}; + +const GATEWAY_LOG_SENTINEL_RULES: GatewayLogSentinelRule[] = [ + { + kind: "plugin-hook-failure", + verdict: "qa-harness-bug", + owner: "plugin", + productImpact: "P1", + qaImpact: "P0", + test: (line) => + /\bbefore_(?:prompt_build|tool_call)\b/iu.test(line) && + /\b(?:crash(?:ed)?|exception|failed|failure|error)\b/iu.test(line), + }, + { + kind: "plugin-contract-error", + verdict: 
"qa-harness-bug", + owner: "plugin", + productImpact: "P1", + qaImpact: "P0", + test: (line) => + /\bcontracts\.tools\b/iu.test(line) && + /\b(?:missing|invalid|registration|register|manifest|contract|schema|error)\b/iu.test(line), + }, + { + kind: "codex-app-server-timeout", + verdict: "product-bug", + owner: "codex-runtime", + productImpact: "P1", + qaImpact: "P0", + test: (line) => + /\bcodex app-server\b.*\btimed out\b|\btimed out\b.*\bcodex app-server\b/iu.test(line), + }, + { + kind: "stalled-agent-run", + verdict: "product-bug", + owner: "codex-runtime", + productImpact: "P1", + qaImpact: "P0", + test: (line) => + /\bcodex_app_server\b.*\b(?:stalled|no progress|progress stalled)\b|\b(?:stalled|no progress|progress stalled)\b.*\bcodex_app_server\b/iu.test( + line, + ), + }, + { + kind: "cron-model-allowlist", + verdict: "product-bug", + owner: "openclaw-cron", + productImpact: "P2", + qaImpact: "P0", + test: (line) => + /\bcron\b/iu.test(line) && + (/\bmodel allowlist\b/iu.test(line) || + /\ballowlist\b.*\bmodel\b/iu.test(line) || + /\bmodel\b.*\b(?:not in|outside|blocked by)\b.*\ballowlist\b/iu.test(line)), + }, + { + kind: "live-quota-or-subscription", + verdict: "environment-blocked", + owner: "environment", + productImpact: "P4", + qaImpact: "P0", + test: (line) => + /\b(?:quota exceeded|insufficient_quota|subscription exhausted|no active subscription|billing hard limit|usage limit)\b/iu.test( + line, + ), + }, +]; + +function filterGatewayLogSentinelFindings( + findings: GatewayLogSentinelFinding[], + options: GatewayLogSentinelScanOptions | undefined, +) { + const kinds = new Set(options?.kinds ?? []); + const ignoreKinds = new Set(options?.ignoreKinds ?? 
[]); + return findings.filter((finding) => { + if (kinds.size > 0 && !kinds.has(finding.kind)) { + return false; + } + return !ignoreKinds.has(finding.kind); + }); +} + +function lineNumberForOffset(logs: string, offset: number) { + if (offset <= 0) { + return 1; + } + return logs.slice(0, offset).split(/\r?\n/u).length; +} + +function isRecord(value: unknown): value is Record<string, unknown> { /* object guard: rejects null, primitives and arrays */ + return Boolean(value) && typeof value === "object" && !Array.isArray(value); +} + +function readNonEmptyString(value: unknown): string | undefined { + return typeof value === "string" && value.trim().length > 0 ? value.trim() : undefined; +} + +function extractMessageText(message: Record<string, unknown>) { + const rawContent = message.content; + if (typeof rawContent === "string") { + return rawContent.trim(); + } + if (!Array.isArray(rawContent)) { + return ""; + } + const parts: string[] = []; + for (const block of rawContent) { + if (typeof block === "string") { + if (block.trim()) { + parts.push(block.trim()); + } + continue; + } + if (!isRecord(block)) { + continue; + } + const text = readNonEmptyString(block.text); + if (text) { + parts.push(text); + continue; + } + const nestedText = readNonEmptyString(block.content); + if ( + nestedText && + (block.type === "output_text" || block.type === "text" || block.type === "message") + ) { + parts.push(nestedText); + } + } + return parts.join("\n").trim(); +} + +function parseJsonArguments(value: unknown): unknown { + if (typeof value !== "string") { + return value; + } + try { + return JSON.parse(value) as unknown; + } catch { + return value; + } +} + +function extractAssistantToolCalls(message: Record<string, unknown>): GatewayLogSentinelToolCall[] { + const calls: GatewayLogSentinelToolCall[] = []; + const rawContent = message.content; + if (Array.isArray(rawContent)) { + for (const block of rawContent) { + if (!isRecord(block)) { + continue; + } + const type = readNonEmptyString(block.type)?.toLowerCase(); + if ( + type !== "tool_use" && + type !== "toolcall" && + type 
!== "tool_call" && + type !== "function_call" + ) { + continue; + } + calls.push({ + name: readNonEmptyString(block.name) ?? "unknown", + args: parseJsonArguments(block.input ?? block.arguments ?? block.args ?? null), + }); + } + } + + const rawToolCalls = + message.tool_calls ?? message.toolCalls ?? message.function_call ?? message.functionCall; + const toolCalls = Array.isArray(rawToolCalls) ? rawToolCalls : rawToolCalls ? [rawToolCalls] : []; + for (const call of toolCalls) { + if (!isRecord(call)) { + continue; + } + const functionRecord = isRecord(call.function) ? call.function : undefined; + calls.push({ + name: readNonEmptyString(call.name) ?? readNonEmptyString(functionRecord?.name) ?? "unknown", + args: parseJsonArguments( + call.arguments ?? functionRecord?.arguments ?? call.input ?? functionRecord?.input ?? null, + ), + }); + } + return calls; +} + +function isCurrentChatMessageSend(call: GatewayLogSentinelToolCall) { + if (call.name !== "message") { + return false; + } + if (!isRecord(call.args) || readNonEmptyString(call.args.action)?.toLowerCase() !== "send") { + return false; + } + const explicitTarget = + readNonEmptyString(call.args.conversationId) ?? + readNonEmptyString(call.args.conversation) ?? + readNonEmptyString(call.args.to) ?? + readNonEmptyString(call.args.target); + if (!explicitTarget) { + return true; + } + return /\b(?:current|same-chat|qa-operator|dm:qa-operator)\b/iu.test(explicitTarget); +} + +function normalizeTranscriptText(text: string) { + return text.replace(/\s+/gu, " ").trim(); +} + +function transcriptHasDirectReplySelfMessage(transcriptBytes: string) { + let lastAssistantText = ""; + const toolCalls: GatewayLogSentinelToolCall[] = []; + for (const line of transcriptBytes.split(/\r?\n/u)) { + const trimmed = line.trim(); + if (!trimmed) { + continue; + } + try { + const parsed = JSON.parse(trimmed) as unknown; + const message = isRecord(parsed) && isRecord(parsed.message) ? 
parsed.message : undefined; + if (!message || message.role !== "assistant") { + continue; + } + const text = extractMessageText(message); + if (text) { + lastAssistantText = text; + } + toolCalls.push(...extractAssistantToolCalls(message)); + } catch { + // Ignore malformed QA transcript rows and keep sentinel scans deterministic. + } + } + return ( + toolCalls.some(isCurrentChatMessageSend) && + normalizeTranscriptText(lastAssistantText).toLowerCase() === "sent." + ); +} + +export function scanGatewayLogSentinels( + logs: string | undefined, + options?: GatewayLogSentinelScanOptions, +): GatewayLogSentinelFinding[] { + if (!logs) { + return []; + } + const startOffset = Math.max(0, Math.min(logs.length, Math.floor(options?.since ?? 0))); + const lineOffset = lineNumberForOffset(logs, startOffset) - 1; + const findings: GatewayLogSentinelFinding[] = []; + for (const [index, rawLine] of logs.slice(startOffset).split(/\r?\n/u).entries()) { + const text = rawLine.trim(); + if (!text) { + continue; + } + for (const rule of GATEWAY_LOG_SENTINEL_RULES) { + if (!rule.test(text)) { + continue; + } + findings.push({ + kind: rule.kind, + verdict: rule.verdict, + owner: rule.owner, + productImpact: rule.productImpact, + qaImpact: rule.qaImpact, + line: lineOffset + index + 1, + text, + }); + } + } + return filterGatewayLogSentinelFindings(findings, options); +} + +export function scanDirectReplyTranscriptSentinels( + transcriptBytes: string, +): GatewayLogSentinelFinding[] { + if (!transcriptHasDirectReplySelfMessage(transcriptBytes)) { + return []; + } + return [ + { + kind: "direct-reply-self-message", + verdict: "product-bug", + owner: "openclaw-routing", + productImpact: "P1", + qaImpact: "P0", + line: 1, + text: "assistant called message(action=send) and then produced final text Sent.", + }, + ]; +} + +export function formatGatewayLogSentinelSummary(findings: readonly GatewayLogSentinelFinding[]) { + if (findings.length === 0) { + return "no gateway log sentinels"; + } + 
return findings + .map( + (finding) => + `${finding.kind}@${finding.line} ${finding.verdict} owner=${finding.owner}: ${finding.text}`, + ) + .join("\n"); +} + +export function assertNoGatewayLogSentinels( + logs: string | undefined, + options?: GatewayLogSentinelAssertOptions, +) { + const findings = scanGatewayLogSentinels(logs, options); + if (findings.length === 0) { + return findings; + } + if ( + options?.allowEnvironmentBlocked === true && + findings.every((finding) => finding.verdict === "environment-blocked") + ) { + return findings; + } + throw new Error( + `Gateway log sentinel(s) detected:\n${formatGatewayLogSentinelSummary(findings)}`, + ); +} diff --git a/extensions/qa-lab/src/runtime-parity.test.ts b/extensions/qa-lab/src/runtime-parity.test.ts index 4c5b7075d52..6a804bc56f6 100644 --- a/extensions/qa-lab/src/runtime-parity.test.ts +++ b/extensions/qa-lab/src/runtime-parity.test.ts @@ -101,11 +101,7 @@ async function createRuntimeParityGatewayTempRoot( }, ]), ); - await fs.writeFile( - path.join(sessionsDir, "sessions.json"), - JSON.stringify(store), - "utf8", - ); + await fs.writeFile(path.join(sessionsDir, "sessions.json"), JSON.stringify(store), "utf8"); await Promise.all( fixtures.map((entry) => fs.writeFile( @@ -671,4 +667,72 @@ describe("runtime parity", () => { expect(cell.finalText).toBe("scenario final"); expect(cell.transcriptBytes).not.toContain("deployment ok"); }); + + it("marks captured cells failed when gateway logs contain QA sentinel signatures", async () => { + const tempRoot = await createRuntimeParityGatewayTempRoot( + JSON.stringify({ + message: { + role: "assistant", + content: "scenario final", + }, + }), + ); + + const cell = await captureRuntimeParityCell({ + runtime: "codex", + gateway: { + tempRoot, + logs: () => "codex_app_server progress stalled for run abc123", + }, + scenarioResult: { + status: "pass", + }, + wallClockMs: 42, + }); + + expect(cell.runtimeErrorClass).toBe("sentinel:stalled-agent-run"); + 
expect(cell.sentinelFindings?.map((finding) => finding.kind)).toEqual(["stalled-agent-run"]); + }); + + it("marks direct-reply self-message transcripts as captured cell failures", async () => { + const tempRoot = await createRuntimeParityGatewayTempRoot( + [ + JSON.stringify({ + message: { + role: "assistant", + content: [ + { + type: "tool_use", + name: "message", + input: { action: "send", conversationId: "qa-operator", text: "hello" }, + }, + ], + }, + }), + JSON.stringify({ + message: { + role: "assistant", + content: "Sent.", + }, + }), + ].join("\n"), + ); + + const cell = await captureRuntimeParityCell({ + runtime: "pi", + gateway: { + tempRoot, + }, + scenarioResult: { + status: "pass", + }, + wallClockMs: 42, + }); + + expect(cell.finalText).toBe("Sent."); + expect(cell.runtimeErrorClass).toBe("sentinel:direct-reply-self-message"); + expect(cell.sentinelFindings?.map((finding) => finding.kind)).toEqual([ + "direct-reply-self-message", + ]); + }); }); diff --git a/extensions/qa-lab/src/runtime-parity.ts b/extensions/qa-lab/src/runtime-parity.ts index 605f6bd7868..6879ab2f873 100644 --- a/extensions/qa-lab/src/runtime-parity.ts +++ b/extensions/qa-lab/src/runtime-parity.ts @@ -2,6 +2,11 @@ import { createHash } from "node:crypto"; import fs from "node:fs/promises"; import path from "node:path"; import { fetchWithSsrFGuard } from "openclaw/plugin-sdk/ssrf-runtime"; +import { + scanDirectReplyTranscriptSentinels, + scanGatewayLogSentinels, + type GatewayLogSentinelFinding, +} from "./gateway-log-sentinel.js"; export type RuntimeId = "pi" | "codex"; @@ -30,6 +35,7 @@ export type RuntimeParityCell = { transportErrorClass?: string; runtimeErrorClass?: string; bootStateLines: string[]; + sentinelFindings?: GatewayLogSentinelFinding[]; }; export type RuntimeParityDrift = @@ -725,10 +731,21 @@ function isHardFailureRuntimeError(errorClass: string | undefined) { errorClass === "failover" || errorClass === "codex-app-server" || errorClass === "auth" || - errorClass 
=== "capture-missing" + errorClass === "capture-missing" || + errorClass?.startsWith("sentinel:") === true ); } +function summarizeSentinelErrorClass(findings: readonly GatewayLogSentinelFinding[]) { + if (findings.length === 0) { + return undefined; + } + /* Dedupe kinds first so the class names each kind once, however many log lines matched it. */ + return `sentinel:${[...new Set(findings.map((finding) => finding.kind))] + .toSorted((left, right) => left.localeCompare(right)) + .join(",")}`; +} + function classifyRuntimeParityCells(params: { pi: RuntimeParityCell; codex: RuntimeParityCell; @@ -946,6 +963,13 @@ export async function captureRuntimeParityCell( }); const transcriptRecords = buildTranscriptRecords(transcriptBytes); const mockToolCalls = await loadRuntimeParityMockToolCalls(params.mockBaseUrl); + const gatewayLogs = params.gateway.logs?.(); + const sentinelFindings = [ + ...scanGatewayLogSentinels(gatewayLogs), + ...scanDirectReplyTranscriptSentinels(transcriptBytes), + ]; + const scenarioErrorClass = classifyScenarioError(params.scenarioResult.details); + const sentinelErrorClass = summarizeSentinelErrorClass(sentinelFindings); return { runtime: params.runtime, transcriptBytes, @@ -953,10 +977,11 @@ export async function captureRuntimeParityCell( finalText: extractFinalAssistantText(transcriptRecords), usage: aggregateUsage(transcriptRecords), wallClockMs: params.wallClockMs, - ...(classifyScenarioError(params.scenarioResult.details) - ? { runtimeErrorClass: classifyScenarioError(params.scenarioResult.details) } + ...(scenarioErrorClass || sentinelErrorClass + ? { runtimeErrorClass: scenarioErrorClass ?? sentinelErrorClass } : {}), - bootStateLines: extractBootStateLines(params.gateway.logs?.()), + bootStateLines: extractBootStateLines(gatewayLogs), + ...(sentinelFindings.length > 0 ? 
{ sentinelFindings } : {}), }; } diff --git a/extensions/qa-lab/src/suite.ts b/extensions/qa-lab/src/suite.ts index 63682a39247..1b73d133c00 100644 --- a/extensions/qa-lab/src/suite.ts +++ b/extensions/qa-lab/src/suite.ts @@ -281,6 +281,7 @@ function isRuntimeParityPass(result: RuntimeParityResult) { function formatRuntimeParityCellDetails(cell: RuntimeParityCell) { const errors = [cell.transportErrorClass, cell.runtimeErrorClass].filter(Boolean).join(", "); + const sentinels = cell.sentinelFindings?.map((finding) => finding.kind).join(", "); return [ `runtime=${cell.runtime}`, `wallMs=${cell.wallClockMs}`, @@ -288,6 +289,7 @@ function formatRuntimeParityCellDetails(cell: RuntimeParityCell) { `finalChars=${cell.finalText.length}`, `tokens=${cell.usage.totalTokens}`, ...(errors ? [`errors=${errors}`] : []), + ...(sentinels ? [`sentinels=${sentinels}`] : []), ].join(" "); }