fix(qa-lab): add gateway log sentinels

This commit is contained in:
Vincent Koc
2026-05-17 10:44:24 +08:00
parent ca236d098d
commit d801d27dbc
6 changed files with 591 additions and 9 deletions

View File

@@ -83,6 +83,7 @@ Docs: https://docs.openclaw.ai
- CLI/setup: collapse raw gateway config keys in existing-config summaries into friendly `Model` and `Gateway` rows.
- CLI/config: show concise human config-write output with an indented backup path instead of printing checksum-heavy overwrite audit details by default.
- CLI/docs: call the canonical lowercase docs MCP search tool and surface MCP errors instead of returning empty search results. Fixes #82702. (#82704) Thanks @hclsys.
- QA-Lab: add gateway log sentinels for plugin hook failures, Codex app-server stalls/timeouts, cron allowlist drift, live quota blockers, and direct-reply self-message transcripts so harness proof fails on self-health regressions. (#80323) Thanks @100yenadmin.
- QA-Lab: ignore heartbeat-only operational transcripts when capturing runtime parity cells so background checks cannot replace the scenario reply. (#80323) Thanks @100yenadmin.
- QA-Lab: pin threaded-memory parity runs to `memory-core`, keep bundled plugin resolution enabled for QA commands, and retry transient session-store lock reads. (#72045) Thanks @WuKongAI-CMU.
- QA-Lab/qa-channel: keep mock memory ranking, inbound media notes, and opened-file realpath checks stable for mock OpenAI qa-channel runs. (#66826) Thanks @gumadeiras.

View File

@@ -0,0 +1,123 @@
import { describe, expect, it } from "vitest";
import {
assertNoGatewayLogSentinels,
formatGatewayLogSentinelSummary,
scanDirectReplyTranscriptSentinels,
scanGatewayLogSentinels,
} from "./gateway-log-sentinel.js";
// Unit tests for the gateway log sentinel scanners: log-line classification,
// cursor handling, the throwing assertion helper, and transcript detection.
describe("gateway log sentinels", () => {
// One sample line per signature; findings are expected in input order.
it("classifies May 13 beta.5 operational failure signatures", () => {
const findings = scanGatewayLogSentinels(
[
"2026-05-13T00:00:01Z plugin before_prompt_build hook failed: TypeError: boom",
"2026-05-13T00:00:02Z plugin before_tool_call crashed while evaluating policy",
"2026-05-13T00:00:03Z plugin manifest invalid: missing contracts.tools registration",
"2026-05-13T00:00:04Z codex app-server attempt timed out after 180000ms",
"2026-05-13T00:00:05Z codex_app_server progress stalled for run abc123",
"2026-05-13T00:00:06Z cron payload model openai/gpt-5.4 is not in model allowlist",
"2026-05-13T00:00:07Z OpenAI quota exceeded for live-frontier request",
].join("\n"),
);
expect(findings.map((finding) => finding.kind)).toEqual([
"plugin-hook-failure",
"plugin-hook-failure",
"plugin-contract-error",
"codex-app-server-timeout",
"stalled-agent-run",
"cron-model-allowlist",
"live-quota-or-subscription",
]);
// Spot-check the classification metadata attached to two of the kinds.
expect(findings.find((finding) => finding.kind === "plugin-hook-failure")).toMatchObject({
verdict: "qa-harness-bug",
owner: "plugin",
productImpact: "P1",
});
expect(findings.find((finding) => finding.kind === "live-quota-or-subscription")).toMatchObject(
{
verdict: "environment-blocked",
owner: "environment",
productImpact: "P4",
},
);
});
// `since` is a character offset (here: the length of the first line plus its
// newline); the reported line number still counts from the start of the log.
it("honors log cursors while preserving absolute line numbers", () => {
const prefix = "safe line\n";
const findings = scanGatewayLogSentinels(`${prefix}codex app-server attempt timed out`, {
since: prefix.length,
});
expect(findings).toHaveLength(1);
expect(findings[0]).toMatchObject({
kind: "codex-app-server-timeout",
line: 2,
});
});
// The assert helper throws with the finding kind in the message, tolerates
// environment-blocked findings only when asked to, and the summary formatter
// renders one `kind@line verdict owner=...: text` row per finding.
it("throws actionable summaries unless only environment blockers are allowed", () => {
expect(() => assertNoGatewayLogSentinels("codex_app_server progress stalled")).toThrow(
"stalled-agent-run",
);
expect(() =>
assertNoGatewayLogSentinels("OpenAI quota exceeded", { allowEnvironmentBlocked: true }),
).not.toThrow();
expect(formatGatewayLogSentinelSummary(scanGatewayLogSentinels("OpenAI quota exceeded"))).toBe(
"live-quota-or-subscription@1 environment-blocked owner=environment: OpenAI quota exceeded",
);
});
// Transcript shape: a `tool_use` content block calling `message` with
// action "send", followed by an assistant row whose text is "Sent.".
it("detects direct reply self-message transcripts separately from gateway logs", () => {
const findings = scanDirectReplyTranscriptSentinels(
[
JSON.stringify({
message: {
role: "assistant",
content: [
{
type: "tool_use",
name: "message",
input: { action: "send", conversationId: "qa-operator", text: "hello" },
},
],
},
}),
JSON.stringify({ message: { role: "assistant", content: "Sent." } }),
].join("\n"),
);
expect(findings).toHaveLength(1);
expect(findings[0]).toMatchObject({
kind: "direct-reply-self-message",
verdict: "product-bug",
owner: "openclaw-routing",
});
});
// Same scenario, but the tool call is a `function_call` block whose
// `arguments` field is a JSON-encoded string rather than an object.
it("detects OpenAI function_call-shaped direct reply transcripts", () => {
const findings = scanDirectReplyTranscriptSentinels(
[
JSON.stringify({
message: {
role: "assistant",
content: [
{
type: "function_call",
name: "message",
arguments: JSON.stringify({
action: "send",
target: "current",
text: "hello",
}),
},
],
},
}),
JSON.stringify({ message: { role: "assistant", content: "Sent." } }),
].join("\n"),
);
expect(findings.map((finding) => finding.kind)).toEqual(["direct-reply-self-message"]);
});
});

View File

@@ -0,0 +1,367 @@
/**
 * Identifier of a failure signature that the sentinel scanners can report.
 * `direct-reply-self-message` comes from the transcript scanner; the remaining
 * kinds each have a matching entry in the log rule table.
 */
export type GatewayLogSentinelKind =
| "plugin-hook-failure"
| "plugin-contract-error"
| "direct-reply-self-message"
| "codex-app-server-timeout"
| "stalled-agent-run"
| "cron-model-allowlist"
| "live-quota-or-subscription";
/** Triage verdict attached to a finding. */
export type GatewayLogSentinelVerdict =
| "product-bug"
| "qa-harness-bug"
| "fixture-bug"
| "environment-blocked";
/** Area named as the owner of a finding. */
export type GatewayLogSentinelOwner =
| "plugin"
| "openclaw-routing"
| "codex-runtime"
| "openclaw-cron"
| "environment";
/** A single sentinel match together with its static classification. */
export type GatewayLogSentinelFinding = {
kind: GatewayLogSentinelKind;
verdict: GatewayLogSentinelVerdict;
owner: GatewayLogSentinelOwner;
/** Impact label on the product side. */
productImpact: "P0" | "P1" | "P2" | "P3" | "P4";
/** Impact label on the QA side. */
qaImpact: "P0" | "P1" | "P2" | "P3" | "P4";
/** 1-based line number within the scanned log text; transcript findings always report 1. */
line: number;
/** Trimmed log line that matched, or a fixed description for transcript findings. */
text: string;
};
/** Options accepted by `scanGatewayLogSentinels`. */
export type GatewayLogSentinelScanOptions = {
/** Character offset into the log text where scanning starts; floored and clamped to the text length. */
since?: number;
/** When non-empty, only findings of these kinds are returned. */
kinds?: readonly GatewayLogSentinelKind[];
/** Findings of these kinds are dropped from the result. */
ignoreKinds?: readonly GatewayLogSentinelKind[];
};
/** Scan options plus the switches that only affect `assertNoGatewayLogSentinels`. */
export type GatewayLogSentinelAssertOptions = GatewayLogSentinelScanOptions & {
/** When true, the assertion returns instead of throwing if every finding is `environment-blocked`. */
allowEnvironmentBlocked?: boolean;
};
/** Static classification for one rule plus the predicate that decides whether a log line matches. */
type GatewayLogSentinelRule = Omit<GatewayLogSentinelFinding, "line" | "text"> & {
test: (line: string) => boolean;
};
/** Tool call pulled out of an assistant transcript message. */
type GatewayLogSentinelToolCall = {
/** Tool name, or "unknown" when the transcript did not carry one. */
name: string;
/** Call arguments; string arguments are JSON-decoded when they parse, otherwise kept as-is. */
args: unknown;
};
// Line patterns used by the rule table below. None of them carry the `g` or
// `y` flag, so the shared RegExp objects hold no state between `test` calls.
const PLUGIN_HOOK_NAME_PATTERN = /\bbefore_(?:prompt_build|tool_call)\b/iu;
const PLUGIN_HOOK_FAILURE_PATTERN = /\b(?:crash(?:ed)?|exception|failed|failure|error)\b/iu;
const PLUGIN_CONTRACT_KEY_PATTERN = /\bcontracts\.tools\b/iu;
const PLUGIN_CONTRACT_PROBLEM_PATTERN =
  /\b(?:missing|invalid|registration|register|manifest|contract|schema|error)\b/iu;
const CODEX_APP_SERVER_TIMEOUT_PATTERN =
  /\bcodex app-server\b.*\btimed out\b|\btimed out\b.*\bcodex app-server\b/iu;
const CODEX_APP_SERVER_STALL_PATTERN =
  /\bcodex_app_server\b.*\b(?:stalled|no progress|progress stalled)\b|\b(?:stalled|no progress|progress stalled)\b.*\bcodex_app_server\b/iu;
const CRON_MENTION_PATTERN = /\bcron\b/iu;
const CRON_ALLOWLIST_PATTERNS = [
  /\bmodel allowlist\b/iu,
  /\ballowlist\b.*\bmodel\b/iu,
  /\bmodel\b.*\b(?:not in|outside|blocked by)\b.*\ballowlist\b/iu,
];
const LIVE_QUOTA_PATTERN =
  /\b(?:quota exceeded|insufficient_quota|subscription exhausted|no active subscription|billing hard limit|usage limit)\b/iu;

/**
 * Ordered table of log-line sentinels. Every rule whose `test` accepts a line
 * yields one finding for that line, so a single line can produce several
 * findings; rule order determines the order of findings within a line.
 */
const GATEWAY_LOG_SENTINEL_RULES: GatewayLogSentinelRule[] = [
  {
    kind: "plugin-hook-failure",
    verdict: "qa-harness-bug",
    owner: "plugin",
    productImpact: "P1",
    qaImpact: "P0",
    // A hook name and a failure word must both appear on the line.
    test: (line) => PLUGIN_HOOK_NAME_PATTERN.test(line) && PLUGIN_HOOK_FAILURE_PATTERN.test(line),
  },
  {
    kind: "plugin-contract-error",
    verdict: "qa-harness-bug",
    owner: "plugin",
    productImpact: "P1",
    qaImpact: "P0",
    test: (line) =>
      PLUGIN_CONTRACT_KEY_PATTERN.test(line) && PLUGIN_CONTRACT_PROBLEM_PATTERN.test(line),
  },
  {
    kind: "codex-app-server-timeout",
    verdict: "product-bug",
    owner: "codex-runtime",
    productImpact: "P1",
    qaImpact: "P0",
    test: (line) => CODEX_APP_SERVER_TIMEOUT_PATTERN.test(line),
  },
  {
    kind: "stalled-agent-run",
    verdict: "product-bug",
    owner: "codex-runtime",
    productImpact: "P1",
    qaImpact: "P0",
    test: (line) => CODEX_APP_SERVER_STALL_PATTERN.test(line),
  },
  {
    kind: "cron-model-allowlist",
    verdict: "product-bug",
    owner: "openclaw-cron",
    productImpact: "P2",
    qaImpact: "P0",
    // The line must mention cron and match any one of the allowlist phrasings.
    test: (line) =>
      CRON_MENTION_PATTERN.test(line) &&
      CRON_ALLOWLIST_PATTERNS.some((pattern) => pattern.test(line)),
  },
  {
    kind: "live-quota-or-subscription",
    verdict: "environment-blocked",
    owner: "environment",
    productImpact: "P4",
    qaImpact: "P0",
    test: (line) => LIVE_QUOTA_PATTERN.test(line),
  },
];
/**
 * Applies the `kinds` / `ignoreKinds` scan options to a list of findings.
 *
 * An absent or empty `kinds` list means "no restriction"; `ignoreKinds` is
 * always applied and wins when a kind appears in both lists.
 *
 * @param findings Findings in scan order; the input array is not modified.
 * @param options Scan options, or `undefined` to keep every finding.
 * @returns A new array holding the findings that survive both filters.
 */
function filterGatewayLogSentinelFindings(
  findings: GatewayLogSentinelFinding[],
  options: GatewayLogSentinelScanOptions | undefined,
) {
  const allowed = new Set(options?.kinds ?? []);
  const ignored = new Set(options?.ignoreKinds ?? []);
  const restrictToAllowed = allowed.size > 0;
  return findings.filter(
    ({ kind }) => (!restrictToAllowed || allowed.has(kind)) && !ignored.has(kind),
  );
}
/**
 * Returns the 1-based number of the line that contains character `offset`.
 *
 * Each `\n` before the offset starts a new line; a preceding `\r` belongs to
 * the same line break, so `\r\n` counts once.
 *
 * @param logs Full log text.
 * @param offset Character offset into `logs`; values at or below zero map to line 1.
 */
function lineNumberForOffset(logs: string, offset: number) {
  if (offset <= 0) {
    return 1;
  }
  const head = logs.slice(0, offset);
  let lineNumber = 1;
  for (let at = head.indexOf("\n"); at !== -1; at = head.indexOf("\n", at + 1)) {
    lineNumber += 1;
  }
  return lineNumber;
}
/** Type guard: true for non-null, non-array values whose `typeof` is "object". */
function isRecord(value: unknown): value is Record<string, unknown> {
  if (value === null || typeof value !== "object") {
    return false;
  }
  return !Array.isArray(value);
}
/**
 * Returns the trimmed form of `value` when it is a string with non-whitespace
 * content, and `undefined` for anything else.
 */
function readNonEmptyString(value: unknown): string | undefined {
  if (typeof value !== "string") {
    return undefined;
  }
  const trimmed = value.trim();
  return trimmed.length > 0 ? trimmed : undefined;
}
/**
 * Collects the text carried by a transcript message's `content`.
 *
 * - String content is returned trimmed.
 * - Array content contributes one part per entry: bare strings, a record's
 *   non-empty `text`, or a record's non-empty string `content` when its
 *   `type` is `output_text`, `text` or `message`. Parts are joined with "\n".
 * - Any other content yields an empty string.
 */
function extractMessageText(message: Record<string, unknown>) {
  const { content } = message;
  if (typeof content === "string") {
    return content.trim();
  }
  if (!Array.isArray(content)) {
    return "";
  }
  const textBearingTypes = ["output_text", "text", "message"];
  const pieces = content.flatMap((entry: unknown): string[] => {
    if (typeof entry === "string") {
      const trimmed = entry.trim();
      return trimmed ? [trimmed] : [];
    }
    if (!isRecord(entry)) {
      return [];
    }
    // A block's own `text` wins over any nested `content`.
    const direct = readNonEmptyString(entry.text);
    if (direct) {
      return [direct];
    }
    const nested = readNonEmptyString(entry.content);
    const isTextBlock = typeof entry.type === "string" && textBearingTypes.includes(entry.type);
    return nested && isTextBlock ? [nested] : [];
  });
  return pieces.join("\n").trim();
}
/**
 * Decodes tool-call arguments that arrive as a JSON string.
 *
 * Non-string values are returned untouched; strings that fail to parse are
 * returned as the original string rather than raising.
 */
function parseJsonArguments(value: unknown): unknown {
  if (typeof value === "string") {
    try {
      const decoded: unknown = JSON.parse(value);
      return decoded;
    } catch {
      // Not JSON: fall through and hand back the raw string.
    }
  }
  return value;
}
/**
 * Gathers every tool call an assistant message carries, in two passes:
 *
 * 1. `content` array entries whose `type` (case-insensitive) is `tool_use`,
 *    `toolcall`, `tool_call` or `function_call`;
 * 2. the first present of `tool_calls`, `toolCalls`, `function_call`,
 *    `functionCall`, accepted either as an array or as a single object, with
 *    name/arguments read from the call itself or from its nested `function`.
 *
 * Calls from pass 1 precede calls from pass 2. A missing name becomes
 * "unknown" and string arguments are JSON-decoded when they parse.
 */
function extractAssistantToolCalls(message: Record<string, unknown>): GatewayLogSentinelToolCall[] {
  const toolBlockTypes = new Set(["tool_use", "toolcall", "tool_call", "function_call"]);
  const contentBlocks: unknown[] = Array.isArray(message.content) ? message.content : [];
  const fromContent = contentBlocks.flatMap((entry): GatewayLogSentinelToolCall[] => {
    if (!isRecord(entry)) {
      return [];
    }
    const blockType = readNonEmptyString(entry.type)?.toLowerCase();
    if (blockType === undefined || !toolBlockTypes.has(blockType)) {
      return [];
    }
    return [
      {
        name: readNonEmptyString(entry.name) ?? "unknown",
        args: parseJsonArguments(entry.input ?? entry.arguments ?? entry.args ?? null),
      },
    ];
  });

  const declared =
    message.tool_calls ?? message.toolCalls ?? message.function_call ?? message.functionCall;
  let declaredCalls: unknown[];
  if (Array.isArray(declared)) {
    declaredCalls = declared;
  } else if (declared) {
    declaredCalls = [declared];
  } else {
    declaredCalls = [];
  }
  const fromFields = declaredCalls.flatMap((entry): GatewayLogSentinelToolCall[] => {
    if (!isRecord(entry)) {
      return [];
    }
    const nestedFunction = isRecord(entry.function) ? entry.function : undefined;
    const rawArguments =
      entry.arguments ?? nestedFunction?.arguments ?? entry.input ?? nestedFunction?.input ?? null;
    return [
      {
        name:
          readNonEmptyString(entry.name) ?? readNonEmptyString(nestedFunction?.name) ?? "unknown",
        args: parseJsonArguments(rawArguments),
      },
    ];
  });

  return [...fromContent, ...fromFields];
}
/**
 * Decides whether a tool call is a `message` tool "send" aimed at the current
 * chat.
 *
 * The target is the first non-empty string among `conversationId`,
 * `conversation`, `to` and `target`. A call with no explicit target counts as
 * addressed to the current chat; otherwise the target must contain one of the
 * recognised current-chat markers.
 */
function isCurrentChatMessageSend(call: GatewayLogSentinelToolCall) {
  const { name, args } = call;
  if (name !== "message" || !isRecord(args)) {
    return false;
  }
  if (readNonEmptyString(args.action)?.toLowerCase() !== "send") {
    return false;
  }
  let explicitTarget: string | undefined;
  for (const key of ["conversationId", "conversation", "to", "target"]) {
    explicitTarget = readNonEmptyString(args[key]);
    if (explicitTarget !== undefined) {
      break;
    }
  }
  return (
    explicitTarget === undefined ||
    /\b(?:current|same-chat|qa-operator|dm:qa-operator)\b/iu.test(explicitTarget)
  );
}
/** Collapses every whitespace run to a single space and drops leading/trailing whitespace. */
function normalizeTranscriptText(text: string) {
  const words = text.split(/\s+/u).filter((word) => word.length > 0);
  return words.join(" ");
}
/**
 * Detects the "direct reply self-message" shape in a JSONL transcript: some
 * assistant row issued a `message` send to the current chat, and the last
 * assistant row that carried any text said exactly "Sent." (compared after
 * whitespace normalisation and lower-casing).
 *
 * Blank rows, rows that are not JSON, rows without a `message` object and
 * non-assistant rows are skipped.
 */
function transcriptHasDirectReplySelfMessage(transcriptBytes: string) {
  let finalAssistantText = "";
  let sawCurrentChatSend = false;
  const rows = transcriptBytes
    .split(/\r?\n/u)
    .map((row) => row.trim())
    .filter((row) => row.length > 0);
  for (const row of rows) {
    try {
      const record: unknown = JSON.parse(row);
      const assistantMessage =
        isRecord(record) && isRecord(record.message) ? record.message : undefined;
      if (assistantMessage === undefined || assistantMessage.role !== "assistant") {
        continue;
      }
      const spoken = extractMessageText(assistantMessage);
      if (spoken) {
        finalAssistantText = spoken;
      }
      if (extractAssistantToolCalls(assistantMessage).some(isCurrentChatMessageSend)) {
        sawCurrentChatSend = true;
      }
    } catch {
      // Ignore malformed QA transcript rows and keep sentinel scans deterministic.
    }
  }
  if (!sawCurrentChatSend) {
    return false;
  }
  return normalizeTranscriptText(finalAssistantText).toLowerCase() === "sent.";
}
/**
 * Scans gateway log text for known failure signatures.
 *
 * Each non-blank line from the `since` offset onward is trimmed and tested
 * against every rule; each matching rule produces one finding, so one line can
 * yield several. Line numbers are counted from the start of `logs`, not from
 * the offset. The `kinds` / `ignoreKinds` options are applied to the result.
 *
 * @param logs Log text; `undefined` or an empty string yields no findings.
 * @param options Optional cursor and kind filters.
 * @returns Findings ordered by line, then by rule order.
 */
export function scanGatewayLogSentinels(
  logs: string | undefined,
  options?: GatewayLogSentinelScanOptions,
): GatewayLogSentinelFinding[] {
  if (!logs) {
    return [];
  }
  const requestedOffset = Math.floor(options?.since ?? 0);
  const startOffset = Math.max(0, Math.min(logs.length, requestedOffset));
  // Number of the line the cursor sits on; later lines count up from it.
  const firstLineNumber = lineNumberForOffset(logs, startOffset);
  const matches = logs
    .slice(startOffset)
    .split(/\r?\n/u)
    .flatMap((rawLine, index): GatewayLogSentinelFinding[] => {
      const text = rawLine.trim();
      if (!text) {
        return [];
      }
      return GATEWAY_LOG_SENTINEL_RULES.filter((rule) => rule.test(text)).map((rule) => ({
        kind: rule.kind,
        verdict: rule.verdict,
        owner: rule.owner,
        productImpact: rule.productImpact,
        qaImpact: rule.qaImpact,
        line: firstLineNumber + index,
        text,
      }));
    });
  return filterGatewayLogSentinelFindings(matches, options);
}
/**
 * Checks a JSONL transcript for the direct-reply self-message pattern.
 *
 * @param transcriptBytes Transcript text, one JSON record per line.
 * @returns A single `direct-reply-self-message` finding (always reported at
 *   line 1 with a fixed description) when the pattern is present, otherwise
 *   an empty array.
 */
export function scanDirectReplyTranscriptSentinels(
  transcriptBytes: string,
): GatewayLogSentinelFinding[] {
  const detected = transcriptHasDirectReplySelfMessage(transcriptBytes);
  const finding: GatewayLogSentinelFinding = {
    kind: "direct-reply-self-message",
    verdict: "product-bug",
    owner: "openclaw-routing",
    productImpact: "P1",
    qaImpact: "P0",
    line: 1,
    text: "assistant called message(action=send) and then produced final text Sent.",
  };
  return detected ? [finding] : [];
}
/**
 * Renders findings as a multi-line summary, one
 * `kind@line verdict owner=<owner>: text` row per finding.
 *
 * @param findings Findings to render, in the order they should appear.
 * @returns The joined rows, or a fixed placeholder when there are no findings.
 */
export function formatGatewayLogSentinelSummary(findings: readonly GatewayLogSentinelFinding[]) {
  const rows: string[] = [];
  for (const { kind, line, verdict, owner, text } of findings) {
    rows.push(`${kind}@${line} ${verdict} owner=${owner}: ${text}`);
  }
  return rows.length > 0 ? rows.join("\n") : "no gateway log sentinels";
}
/**
 * Scans gateway logs and fails when any sentinel finding is present.
 *
 * @param logs Log text to scan; `undefined` or empty text never throws.
 * @param options Scan options, plus `allowEnvironmentBlocked` to tolerate a
 *   result in which every finding has verdict `environment-blocked`.
 * @returns The findings when none exist or when all of them are tolerated.
 * @throws Error whose message lists the formatted findings otherwise.
 */
export function assertNoGatewayLogSentinels(
  logs: string | undefined,
  options?: GatewayLogSentinelAssertOptions,
) {
  const findings = scanGatewayLogSentinels(logs, options);
  const onlyEnvironmentBlockers = findings.every(
    (finding) => finding.verdict === "environment-blocked",
  );
  const tolerated =
    findings.length === 0 ||
    (options?.allowEnvironmentBlocked === true && onlyEnvironmentBlockers);
  if (!tolerated) {
    throw new Error(
      `Gateway log sentinel(s) detected:\n${formatGatewayLogSentinelSummary(findings)}`,
    );
  }
  return findings;
}

View File

@@ -101,11 +101,7 @@ async function createRuntimeParityGatewayTempRoot(
},
]),
);
await fs.writeFile(
path.join(sessionsDir, "sessions.json"),
JSON.stringify(store),
"utf8",
);
await fs.writeFile(path.join(sessionsDir, "sessions.json"), JSON.stringify(store), "utf8");
await Promise.all(
fixtures.map((entry) =>
fs.writeFile(
@@ -671,4 +667,72 @@ describe("runtime parity", () => {
expect(cell.finalText).toBe("scenario final");
expect(cell.transcriptBytes).not.toContain("deployment ok");
});
// A passing scenario whose gateway logs contain a stall signature: the captured
// cell is expected to carry a `sentinel:`-prefixed runtime error class and to
// expose the matching finding.
it("marks captured cells failed when gateway logs contain QA sentinel signatures", async () => {
const tempRoot = await createRuntimeParityGatewayTempRoot(
JSON.stringify({
message: {
role: "assistant",
content: "scenario final",
},
}),
);
const cell = await captureRuntimeParityCell({
runtime: "codex",
gateway: {
tempRoot,
logs: () => "codex_app_server progress stalled for run abc123",
},
scenarioResult: {
status: "pass",
},
wallClockMs: 42,
});
expect(cell.runtimeErrorClass).toBe("sentinel:stalled-agent-run");
expect(cell.sentinelFindings?.map((finding) => finding.kind)).toEqual(["stalled-agent-run"]);
});
// No gateway `logs` callback is supplied here, so the finding has to come from
// the transcript alone: a `message` send tool call followed by a final "Sent.".
it("marks direct-reply self-message transcripts as captured cell failures", async () => {
const tempRoot = await createRuntimeParityGatewayTempRoot(
[
JSON.stringify({
message: {
role: "assistant",
content: [
{
type: "tool_use",
name: "message",
input: { action: "send", conversationId: "qa-operator", text: "hello" },
},
],
},
}),
JSON.stringify({
message: {
role: "assistant",
content: "Sent.",
},
}),
].join("\n"),
);
const cell = await captureRuntimeParityCell({
runtime: "pi",
gateway: {
tempRoot,
},
scenarioResult: {
status: "pass",
},
wallClockMs: 42,
});
// The final text is still captured as-is; the sentinel only adds the error class.
expect(cell.finalText).toBe("Sent.");
expect(cell.runtimeErrorClass).toBe("sentinel:direct-reply-self-message");
expect(cell.sentinelFindings?.map((finding) => finding.kind)).toEqual([
"direct-reply-self-message",
]);
});
});

View File

@@ -2,6 +2,11 @@ import { createHash } from "node:crypto";
import fs from "node:fs/promises";
import path from "node:path";
import { fetchWithSsrFGuard } from "openclaw/plugin-sdk/ssrf-runtime";
import {
scanDirectReplyTranscriptSentinels,
scanGatewayLogSentinels,
type GatewayLogSentinelFinding,
} from "./gateway-log-sentinel.js";
export type RuntimeId = "pi" | "codex";
@@ -30,6 +35,7 @@ export type RuntimeParityCell = {
transportErrorClass?: string;
runtimeErrorClass?: string;
bootStateLines: string[];
sentinelFindings?: GatewayLogSentinelFinding[];
};
export type RuntimeParityDrift =
@@ -725,10 +731,21 @@ function isHardFailureRuntimeError(errorClass: string | undefined) {
errorClass === "failover" ||
errorClass === "codex-app-server" ||
errorClass === "auth" ||
errorClass === "capture-missing"
errorClass === "capture-missing" ||
errorClass?.startsWith("sentinel:") === true
);
}
/**
 * Builds the `sentinel:<kind>[,<kind>...]` runtime error class for a set of
 * sentinel findings.
 *
 * Kinds are de-duplicated and sorted, so the class depends only on which
 * signatures were seen, not on how many log lines repeated a signature or in
 * which order they appeared.
 *
 * @param findings Sentinel findings collected for one captured cell.
 * @returns The error class string, or `undefined` when there are no findings.
 */
function summarizeSentinelErrorClass(findings: readonly GatewayLogSentinelFinding[]) {
  if (findings.length === 0) {
    return undefined;
  }
  const uniqueKinds = [...new Set(findings.map((finding) => finding.kind))];
  // `uniqueKinds` is a fresh array, so sorting it in place cannot affect the caller's data.
  uniqueKinds.sort((left, right) => left.localeCompare(right));
  return `sentinel:${uniqueKinds.join(",")}`;
}
function classifyRuntimeParityCells(params: {
pi: RuntimeParityCell;
codex: RuntimeParityCell;
@@ -946,6 +963,13 @@ export async function captureRuntimeParityCell(
});
const transcriptRecords = buildTranscriptRecords(transcriptBytes);
const mockToolCalls = await loadRuntimeParityMockToolCalls(params.mockBaseUrl);
const gatewayLogs = params.gateway.logs?.();
const sentinelFindings = [
...scanGatewayLogSentinels(gatewayLogs),
...scanDirectReplyTranscriptSentinels(transcriptBytes),
];
const scenarioErrorClass = classifyScenarioError(params.scenarioResult.details);
const sentinelErrorClass = summarizeSentinelErrorClass(sentinelFindings);
return {
runtime: params.runtime,
transcriptBytes,
@@ -953,10 +977,11 @@ export async function captureRuntimeParityCell(
finalText: extractFinalAssistantText(transcriptRecords),
usage: aggregateUsage(transcriptRecords),
wallClockMs: params.wallClockMs,
...(classifyScenarioError(params.scenarioResult.details)
? { runtimeErrorClass: classifyScenarioError(params.scenarioResult.details) }
...(scenarioErrorClass || sentinelErrorClass
? { runtimeErrorClass: scenarioErrorClass ?? sentinelErrorClass }
: {}),
bootStateLines: extractBootStateLines(params.gateway.logs?.()),
bootStateLines: extractBootStateLines(gatewayLogs),
...(sentinelFindings.length > 0 ? { sentinelFindings } : {}),
};
}

View File

@@ -281,6 +281,7 @@ function isRuntimeParityPass(result: RuntimeParityResult) {
function formatRuntimeParityCellDetails(cell: RuntimeParityCell) {
const errors = [cell.transportErrorClass, cell.runtimeErrorClass].filter(Boolean).join(", ");
const sentinels = cell.sentinelFindings?.map((finding) => finding.kind).join(", ");
return [
`runtime=${cell.runtime}`,
`wallMs=${cell.wallClockMs}`,
@@ -288,6 +289,7 @@ function formatRuntimeParityCellDetails(cell: RuntimeParityCell) {
`finalChars=${cell.finalText.length}`,
`tokens=${cell.usage.totalTokens}`,
...(errors ? [`errors=${errors}`] : []),
...(sentinels ? [`sentinels=${sentinels}`] : []),
].join(" ");
}