fix: harden startup readiness and discord replies

(cherry picked from commit 3956672106b3387d42427a485a9ca01e77f3b78f)
This commit is contained in:
Satoshi
2026-05-04 14:42:16 +01:00
committed by Peter Steinberger
parent 7e229f0d3d
commit e259938e96
12 changed files with 381 additions and 49 deletions

View File

@@ -0,0 +1,93 @@
# OpenClaw Startup Readiness And Leak Fix - 2026-05-04
## Current Truth
- Incident inputs confirmed Discord front-channel leakage of internal execution/commentary-like traces and Gateway startup instability in the same window.
- Observed bad startup keywords from local operator evidence:
- `gateway event loop readiness timeout`
- `discord: gateway was not ready after 15000ms; restarting gateway`
- `sessions.list` requests around 40 seconds
- `exit 78` with systemd `RestartPreventExitStatus=78`
- This source fix addresses the startup terminal-fail path and Discord final outbound leakage guard. It does not restart any running Gateway by itself.
## Code Changes
- Startup control-plane load shedding:
- Added `sessions.list` to `STARTUP_UNAVAILABLE_GATEWAY_METHODS`.
- During sidecar startup, Gateway now returns retryable startup `UNAVAILABLE` for `sessions.list` instead of dispatching the costly session scan path.
- Native approval bootstrap readiness handling:
- Changed approval-client readiness failure text away from the production incident keyword.
- Changed exec-approval runtime readiness failure text away from the production incident keyword.
- Classified gateway readiness/startup close errors as retryable bootstrap deferrals.
- Normalized legacy readiness-timeout errors before logging retry deferrals, so old incident keywords do not reappear in native-approval retry logs.
- Native approval handler startup now warns and retries instead of emitting the old terminal-looking `failed to start native approval handler` path for readiness-only failures.
- Discord gateway READY wait:
- Replaced the one-restart-then-throw startup behavior with reconnect plus 2 second backoff until READY, stop, or abort.
- Removed the old log string `gateway was not ready after 15000ms; restarting gateway` from the nonfatal retry path.
- Discord final outbound safety filter:
- Added `extensions/discord/src/monitor/reply-safety.ts`.
- `deliverDiscordReply` sanitizes payload text at the final Discord send boundary.
- The filter uses the existing assistant-visible-text sanitizer, strips standalone internal trace/channel lines outside code fences, drops pure-internal text-only payloads, and preserves media-only payloads.
## Why This Should Work
- The startup window no longer allows Control UI `sessions.list` polling to compete with sidecar/channel readiness through the expensive session listing path.
- Discord READY timeout no longer escalates a transient event-loop stall into a thrown startup failure after a single reconnect attempt.
- Approval handler readiness failures are treated as recoverable gateway-readiness deferrals, matching the actual failure mode from the incident.
- Leakage protection is placed at the last Discord send boundary, so upstream mistakes in agent output assembly, commentary routing, or tool-call formatting get one final scrub before front-channel delivery.
## Modified Files
- `src/gateway/server-startup-unavailable-methods.ts`
- `src/gateway/operator-approvals-client.ts`
- `src/infra/approval-handler-bootstrap.ts`
- `src/infra/approval-handler-bootstrap.test.ts`
- `src/infra/exec-approval-channel-runtime.ts`
- `src/infra/exec-approval-channel-runtime.test.ts`
- `extensions/discord/src/monitor/provider.lifecycle.ts`
- `extensions/discord/src/monitor/provider.lifecycle.test.ts`
- `extensions/discord/src/monitor/reply-delivery.ts`
- `extensions/discord/src/monitor/reply-delivery.test.ts`
- `extensions/discord/src/monitor/reply-safety.ts`
- `docs/status/openclaw-startup-readiness-and-leak-fix-20260504.md`
## Validation
- `node scripts/run-vitest.mjs run --config test/vitest/vitest.extension-discord.config.ts extensions/discord/src/monitor/provider.lifecycle.test.ts extensions/discord/src/monitor/reply-delivery.test.ts`
- Passed: 2 files, 28 tests.
- `OPENCLAW_GATEWAY_PROJECT_SHARDS=1 node scripts/run-vitest.mjs run --config test/vitest/vitest.gateway.config.ts src/gateway/server-methods.control-plane-rate-limit.test.ts`
- Passed: 1 file, 12 tests.
- `node scripts/run-vitest.mjs run --config test/vitest/vitest.infra.config.ts src/infra/approval-handler-bootstrap.test.ts src/infra/exec-approval-channel-runtime.test.ts`
- Passed: 2 files, 30 tests.
- `git diff --check`
- Passed.
## Acceptance Log Keywords
- Must stay absent during the 30-60 minute post-deploy startup soak:
- `gateway event loop readiness timeout`
- `discord: gateway was not ready after 15000ms; restarting gateway`
- `discord gateway did not reach READY within 15000ms after restart`
- `sessions.list` requests with durations on the order of 40 seconds
- `exit 78`
- Expected nonterminal readiness retry keyword if Discord is slow to become READY:
- `discord: gateway READY wait timed out after 15000ms; reconnecting with backoff`
- Expected approval bootstrap deferral keyword if Gateway is still starting:
- `native approval handler deferred until gateway readiness recovers`
## Risks
- `sessions.list` is temporarily unavailable during startup until sidecars clear startup gating. Control UI must retry retryable `UNAVAILABLE` responses.
- The Discord READY wait can keep reconnecting until stop/abort. If credentials or network are truly broken, operator-visible status remains `startup-not-ready` instead of crashing the Gateway.
- The final outbound scrub intentionally removes standalone internal trace lines. Any line of a user-visible reply that literally begins with `analysis:`, `commentary:`, or a tool execution label (for example `Exec:` or `Read:`) outside a code fence will be removed from the Discord text; the rest of the reply is still delivered. Code-fenced examples are preserved.
## Rollback
- Source rollback: `git revert <commit-hash>` from this repo.
- If already deployed, rebuild/reinstall the reverted source using the normal OpenClaw packaging path, then restart the Gateway using the operator's configured service manager.
## Next Action
- Deploy this source build to an isolated or production-managed OpenClaw path.
- Run a 30-60 minute startup soak with Control UI open and Discord connected.
- During the soak, watch `/tmp/openclaw/openclaw-2026-05-04.log` or the active daily log for the acceptance keywords above.

View File

@@ -333,7 +333,7 @@ describe("runDiscordGatewayLifecycle", () => {
expect(statusSink).toHaveBeenCalledTimes(callCountAfterCleanup);
});
it("restarts the gateway once when startup never reaches READY, then recovers", async () => {
it("reconnects with backoff when startup never reaches READY, then recovers", async () => {
vi.useFakeTimers();
try {
const { emitter, gateway } = createGatewayHarness();
@@ -347,10 +347,13 @@ describe("runDiscordGatewayLifecycle", () => {
const { lifecycleParams, runtimeError, statusSink } = createLifecycleHarness({ gateway });
const lifecyclePromise = runDiscordGatewayLifecycle(lifecycleParams);
await vi.advanceTimersByTimeAsync(16_500);
await vi.advanceTimersByTimeAsync(18_500);
await expect(lifecyclePromise).resolves.toBeUndefined();
expect(runtimeError).toHaveBeenCalledWith(
expect.stringContaining("gateway READY wait timed out after 15000ms"),
);
expect(runtimeError).not.toHaveBeenCalledWith(
expect.stringContaining("gateway was not ready after 15000ms; restarting gateway"),
);
expect(gateway.disconnect).toHaveBeenCalledTimes(1);
@@ -396,14 +399,14 @@ describe("runDiscordGatewayLifecycle", () => {
expect(gateway.connect).toHaveBeenCalledTimes(1);
expect(gateway.connect).toHaveBeenCalledWith(false);
await vi.advanceTimersByTimeAsync(1_000);
await vi.advanceTimersByTimeAsync(3_000);
await expect(lifecyclePromise).resolves.toBeUndefined();
} finally {
vi.useRealTimers();
}
});
it("fails when startup still is not ready after a restart", async () => {
it("keeps retrying when startup still is not ready after a reconnect", async () => {
vi.useFakeTimers();
try {
const { emitter, gateway } = createGatewayHarness();
@@ -414,19 +417,17 @@ describe("runDiscordGatewayLifecycle", () => {
const lifecyclePromise = runDiscordGatewayLifecycle(lifecycleParams);
lifecyclePromise.catch(() => {});
await vi.advanceTimersByTimeAsync(31_000);
await vi.advanceTimersByTimeAsync(34_000);
await expect(lifecyclePromise).rejects.toThrow(
"discord gateway did not reach READY within 15000ms after restart",
);
expect(gateway.disconnect).toHaveBeenCalledTimes(1);
expect(gateway.connect).toHaveBeenCalledTimes(1);
expect(gateway.disconnect).toHaveBeenCalledTimes(2);
expect(gateway.connect).toHaveBeenCalledTimes(2);
expect(gateway.connect).toHaveBeenCalledWith(false);
expectLifecycleCleanup({
threadStop,
waitCalls: 0,
gatewaySupervisor,
});
expect(waitForDiscordGatewayStopMock).not.toHaveBeenCalled();
gateway.isConnected = true;
await vi.advanceTimersByTimeAsync(2_500);
await expect(lifecyclePromise).resolves.toBeUndefined();
expectLifecycleCleanup({ threadStop, waitCalls: 1, gatewaySupervisor });
} finally {
vi.useRealTimers();
}

View File

@@ -25,6 +25,7 @@ const MAX_DISCORD_GATEWAY_READY_TIMEOUT_MS = 120_000;
const DISCORD_GATEWAY_READY_TIMEOUT_ENV = "OPENCLAW_DISCORD_READY_TIMEOUT_MS";
const DISCORD_GATEWAY_RUNTIME_READY_TIMEOUT_ENV = "OPENCLAW_DISCORD_RUNTIME_READY_TIMEOUT_MS";
const DISCORD_GATEWAY_READY_POLL_MS = 250;
const DISCORD_GATEWAY_READY_RETRY_BACKOFF_MS = 2_000;
const DISCORD_GATEWAY_STARTUP_DISCONNECT_DRAIN_TIMEOUT_MS = 5_000;
const DISCORD_GATEWAY_STARTUP_TERMINATE_CLOSE_TIMEOUT_MS = 1_000;
const DISCORD_GATEWAY_TRANSPORT_ACTIVITY_STATUS_MIN_INTERVAL_MS = 30_000;
@@ -355,41 +356,50 @@ async function waitForGatewayReady(params: {
return "stopped";
};
const firstAttempt = await waitUntilReady();
if (firstAttempt !== "timeout") {
return;
}
if (!params.gateway) {
throw new Error(`discord gateway did not reach READY within ${params.readyTimeoutMs}ms`);
}
const restartAt = Date.now();
params.runtime.error?.(
danger(`discord: gateway was not ready after ${params.readyTimeoutMs}ms; restarting gateway`),
);
params.pushStatus?.({
connected: false,
lastEventAt: restartAt,
lastDisconnect: {
at: restartAt,
error: "startup-not-ready",
},
lastError: "startup-not-ready",
});
if (params.abortSignal?.aborted) {
const attempt = await waitUntilReady();
if (attempt === "timeout") {
throw new Error(`discord gateway did not reach READY within ${params.readyTimeoutMs}ms`);
}
return;
}
await params.beforeRestart?.();
await restartGatewayAfterReadyTimeout({
gateway: params.gateway,
abortSignal: params.abortSignal,
runtime: params.runtime,
});
if ((await waitUntilReady()) === "timeout") {
throw new Error(
`discord gateway did not reach READY within ${params.readyTimeoutMs}ms after restart`,
let attempt = 0;
while (!params.abortSignal?.aborted) {
const result = await waitUntilReady();
if (result !== "timeout") {
return;
}
attempt += 1;
const restartAt = Date.now();
params.runtime.error?.(
danger(
`discord: gateway READY wait timed out after ${params.readyTimeoutMs}ms; reconnecting with backoff (attempt ${attempt})`,
),
);
params.pushStatus?.({
connected: false,
lastEventAt: restartAt,
lastDisconnect: {
at: restartAt,
error: "startup-not-ready",
},
lastError: "startup-not-ready",
});
await params.beforeRestart?.();
await restartGatewayAfterReadyTimeout({
gateway: params.gateway,
abortSignal: params.abortSignal,
runtime: params.runtime,
});
if (params.abortSignal?.aborted) {
return;
}
await new Promise<void>((resolve) => {
const timeout = setTimeout(resolve, DISCORD_GATEWAY_READY_RETRY_BACKOFF_MS);
timeout.unref?.();
});
}
}

View File

@@ -105,6 +105,76 @@ describe("deliverDiscordReply", () => {
);
});
// Mixed payload: three emoji-prefixed status/tool trace lines followed by one
// ordinary line. Only the ordinary line is expected to reach the outbound
// delivery mock.
it("strips internal execution trace lines at the final Discord send boundary", async () => {
await deliverDiscordReply({
replies: [
{
text: [
"📊 Session Status: current",
"🛠️ Exec: run git status",
"📖 Read: lines 1-40 from secret.md",
"Visible reply.",
].join("\n"),
},
],
target: "channel:101",
token: "token",
accountId: "default",
runtime,
cfg,
textLimit: 2000,
});
// The payload handed to the delivery layer must contain just the visible text.
expect(deliverOutboundPayloadsMock).toHaveBeenCalledWith(
expect.objectContaining({
payloads: [{ text: "Visible reply." }],
}),
);
});
// Every text line is an internal channel label, but the payload also carries a
// media URL: the payload is expected to be delivered with its text cleared.
it("drops pure internal trace text while preserving media-only delivery", async () => {
await deliverDiscordReply({
replies: [
{
text: "commentary: calling tool\nanalysis: inspect private state",
mediaUrl: "https://example.com/result.png",
},
],
target: "channel:101",
token: "token",
accountId: "default",
runtime,
cfg,
textLimit: 2000,
});
// Media is kept; text is expected to be undefined rather than an empty string.
expect(deliverOutboundPayloadsMock).toHaveBeenCalledWith(
expect.objectContaining({
payloads: [{ mediaUrl: "https://example.com/result.png", text: undefined }],
}),
);
});
// A tool-call label that appears inside a ``` fence is user-visible example
// content, so the text is expected to be delivered unchanged.
it("does not strip ordinary code-fenced examples of tool-call labels", async () => {
const text = ["Example:", "```", "🛠️ Exec: run ls", "```"].join("\n");
await deliverDiscordReply({
replies: [{ text }],
target: "channel:101",
token: "token",
accountId: "default",
runtime,
cfg,
textLimit: 2000,
});
// Same string in, same string out.
expect(deliverOutboundPayloadsMock).toHaveBeenCalledWith(
expect.objectContaining({
payloads: [{ text }],
}),
);
});
it("passes resolved Discord formatting options as explicit delivery options", async () => {
const baseCfg = {
channels: {

View File

@@ -18,6 +18,7 @@ import type { RuntimeEnv } from "openclaw/plugin-sdk/runtime-env";
import { normalizeOptionalString } from "openclaw/plugin-sdk/text-runtime";
import type { RequestClient } from "../internal/discord.js";
import { sendMessageDiscord, sendVoiceMessageDiscord } from "../send.js";
import { sanitizeDiscordFrontChannelReplyPayloads } from "./reply-safety.js";
export type DiscordThreadBindingLookupRecord = {
accountId: string;
@@ -175,13 +176,17 @@ export async function deliverDiscordReply(params: {
void params.runtime;
const delivery = resolveDiscordDeliveryOptions(params);
const payloads = sanitizeDiscordFrontChannelReplyPayloads(params.replies);
if (payloads.length === 0) {
return;
}
await deliverOutboundPayloads({
cfg: params.cfg,
channel: "discord",
to: delivery.to,
accountId: params.accountId,
payloads: params.replies,
payloads,
replyToId: normalizeOptionalString(params.replyToId),
replyToMode: delivery.replyToMode,
formatting: delivery.formatting,

View File

@@ -0,0 +1,64 @@
import type { ReplyPayload } from "openclaw/plugin-sdk/reply-dispatch-runtime";
import { resolveSendableOutboundReplyParts } from "openclaw/plugin-sdk/reply-payload";
import { sanitizeAssistantVisibleText } from "openclaw/plugin-sdk/text-runtime";
/**
 * Matches a (trimmed) line that looks like an internal tool/status trace:
 * an optional `>` quote marker, an optional status emoji, one of the known
 * tool labels, and a colon. Matching is case-insensitive.
 *
 * The emoji alternatives for the hammer-and-wrench (U+1F6E0) and gear (U+2699)
 * are written as escapes and the emoji variation selector U+FE0F is optional
 * after any emoji, so trace lines are recognised whether or not the producer
 * emitted the selector.
 */
const DISCORD_INTERNAL_TRACE_LINE_RE =
  /^(?:>\s*)?(?:(?:📊|\uD83D\uDEE0|📖|📝|🔍|🔎|\u2699)\uFE0F?\s*)?(?:Session Status|Exec|Read|Edit|Write|Patch|Search|Open|Click|Find|Screenshot|Update Plan|Tool Call|Tool Result|Function Call|Shell|Command)\s*:/i;

/**
 * Matches a (trimmed) line that starts with an internal channel label such as
 * `analysis:` or `tool_call=`, optionally preceded by a `>` quote marker.
 * Matching is case-insensitive.
 */
const DISCORD_INTERNAL_CHANNEL_LINE_RE =
  /^(?:>\s*)?(?:analysis|commentary|tool[-_ ]?call|tool[-_ ]?result|function[-_ ]?call|thinking|reasoning)\s*[:=]/i;
/**
 * Removes lines that look like internal trace or channel output from `text`.
 *
 * Lines are split on LF or CRLF and re-joined with LF. A line starting with
 * optional whitespace followed by three backticks toggles the "inside a code
 * fence" state and is always kept; lines inside a fence are kept verbatim.
 * Outside a fence, a line is dropped when its trimmed form matches either
 * internal-line pattern.
 */
function stripDiscordInternalTraceLines(text: string): string {
  let insideCodeFence = false;

  const looksInternal = (candidate: string): boolean =>
    DISCORD_INTERNAL_TRACE_LINE_RE.test(candidate) ||
    DISCORD_INTERNAL_CHANNEL_LINE_RE.test(candidate);

  return text
    .split(/\r?\n/)
    .filter((rawLine) => {
      if (/^\s*```/.test(rawLine)) {
        // Fence markers flip the state and are themselves preserved.
        insideCodeFence = !insideCodeFence;
        return true;
      }
      if (insideCodeFence) {
        return true;
      }
      return !looksInternal(rawLine.trim());
    })
    .join("\n");
}
/**
 * Captures one fenced code block: an opening line that starts with optional
 * whitespace and three backticks, every following line up to the next such
 * line, and that closing line when present. An unclosed fence extends to the
 * end of the text. The single capture group makes `String.prototype.split`
 * return the fenced blocks interleaved with the surrounding prose.
 */
const DISCORD_FENCED_CODE_BLOCK_RE =
  /(^[^\S\n]*```[^\n]*(?:\n(?![^\S\n]*```)[^\n]*)*(?:\n[^\S\n]*```[^\n]*)?)/m;

/**
 * Strips trailing spaces/tabs before newlines and collapses runs of three or
 * more newlines into two, but only outside fenced code blocks.
 *
 * Fenced blocks (including their fence lines) are returned byte-for-byte so
 * that code examples keep their trailing whitespace and blank lines.
 *
 * @param text - Text using LF line endings.
 * @returns The text with prose whitespace normalised and fences untouched.
 */
function collapseExcessBlankLines(text: string): string {
  return text
    .split(DISCORD_FENCED_CODE_BLOCK_RE)
    .map((segment, index) =>
      // Odd indices are the captured fenced blocks; even indices are prose.
      index % 2 === 1
        ? segment
        : segment.replace(/[ \t]+\n/g, "\n").replace(/\n{3,}/g, "\n\n"),
    )
    .join("");
}
/**
 * Produces the text that may be sent to a Discord channel.
 *
 * The input is passed through `sanitizeAssistantVisibleText`, then through the
 * internal trace-line filter, then through blank-line collapsing, and the
 * result is trimmed.
 *
 * @param text - Raw reply text.
 * @returns The sanitized, trimmed text (may be an empty string).
 */
export function sanitizeDiscordFrontChannelText(text: string): string {
  return collapseExcessBlankLines(
    stripDiscordInternalTraceLines(sanitizeAssistantVisibleText(text)),
  ).trim();
}
/**
 * Sanitizes the text of each reply payload before it is sent to Discord.
 *
 * - A payload whose text is unchanged by sanitization is forwarded as the same
 *   object.
 * - A payload whose text changes is shallow-copied with the sanitized text; an
 *   empty result is stored as `undefined`.
 * - A payload is dropped only when it originally had sendable text, the
 *   sanitizer removed all of it, and the payload has no media.
 *
 * Payloads that never had sendable text are forwarded unchanged, so this
 * filter does not discard anything it did not scrub.
 * NOTE(review): this keeps text-less, media-less payloads flowing to the
 * delivery layer as they did before the filter existed — confirm that is the
 * desired handling for such payloads.
 *
 * @param payloads - Reply payloads about to be delivered.
 * @returns A new array with the payloads that remain deliverable.
 */
export function sanitizeDiscordFrontChannelReplyPayloads(
  payloads: readonly ReplyPayload[],
): ReplyPayload[] {
  const safePayloads: ReplyPayload[] = [];
  for (const payload of payloads) {
    const originalParts = resolveSendableOutboundReplyParts(payload);
    const safeText =
      typeof payload.text === "string"
        ? sanitizeDiscordFrontChannelText(payload.text)
        : payload.text;
    const nextPayload =
      safeText === payload.text
        ? payload
        : ({ ...payload, text: safeText || undefined } as ReplyPayload);
    const nextParts = resolveSendableOutboundReplyParts(nextPayload);
    // Drop only when the scrub itself emptied the visible text and there is
    // no media left to deliver.
    const textScrubbedAway = originalParts.hasText && !nextParts.hasText;
    if (textScrubbedAway && !originalParts.hasMedia) {
      continue;
    }
    safePayloads.push(nextPayload);
  }
  return safePayloads;
}

View File

@@ -118,7 +118,11 @@ export async function withOperatorApprovalsGatewayClient<T>(
clientOptions: { preauthHandshakeTimeoutMs: params.config.gateway?.handshakeTimeoutMs },
});
if (!readiness.ready) {
throw new Error("gateway event loop readiness timeout");
throw new Error(
readiness.aborted
? "gateway approval client start aborted before readiness"
: "gateway readiness unavailable before approval client start",
);
}
await ready;
return await run(gatewayClient);

View File

@@ -2,6 +2,7 @@ export const STARTUP_UNAVAILABLE_GATEWAY_METHODS = [
"agent.wait",
"chat.history",
"models.list",
"sessions.list",
"sessions.abort",
"sessions.create",
"sessions.send",

View File

@@ -232,6 +232,55 @@ describe("startChannelApprovalHandlerBootstrap", () => {
await cleanup();
});
// Scenario: the handler's first start() rejects with the legacy readiness
// timeout message and the second start() resolves. The bootstrap is expected
// to log a warning (not the terminal-looking error), hide the legacy keyword,
// and try again after the retry timer elapses.
it("defers retryable gateway readiness startup failures without terminal error logs", async () => {
vi.useFakeTimers();
const channelRuntime = createRuntimeChannel();
const readinessError = new Error("gateway event loop readiness timeout");
// First start attempt fails with the readiness error, second succeeds.
const start = vi.fn().mockRejectedValueOnce(readinessError).mockResolvedValueOnce(undefined);
const stop = vi.fn().mockResolvedValue(undefined);
const logger = {
error: vi.fn(),
warn: vi.fn(),
info: vi.fn(),
debug: vi.fn(),
child: vi.fn(),
isEnabled: vi.fn().mockReturnValue(true),
isVerboseEnabled: vi.fn().mockReturnValue(false),
verbose: vi.fn(),
};
// The handler factory is queued to return the same start/stop pair twice.
createChannelApprovalHandlerFromCapability
.mockResolvedValueOnce({ start, stop })
.mockResolvedValueOnce({ start, stop });
const cleanup = await startTestBootstrap({ channelRuntime, logger });
registerApprovalContext(channelRuntime);
await flushTransitions();
expect(start).toHaveBeenCalledTimes(1);
await flushTransitions();
// The failure must not be reported through the generic error path.
expect(logger.error).not.toHaveBeenCalledWith(
expect.stringContaining("failed to start native approval handler"),
);
// Instead a deferral warning is expected, carrying the normalized message.
expect(logger.warn).toHaveBeenCalledWith(
expect.stringContaining("native approval handler deferred until gateway readiness recovers"),
);
expect(logger.warn).toHaveBeenCalledWith(
expect.stringContaining("gateway readiness unavailable before approval handler start"),
);
// The legacy incident keyword must not appear in the warning.
expect(logger.warn).not.toHaveBeenCalledWith(
expect.stringContaining("gateway event loop readiness timeout"),
);
// Advance fake time by one second; a second create/start attempt is expected.
await vi.advanceTimersByTimeAsync(1_000);
await flushTransitions();
expect(createChannelApprovalHandlerFromCapability).toHaveBeenCalledTimes(2);
expect(start).toHaveBeenCalledTimes(2);
await cleanup();
});
it("does not retry terminal native approval startup failures", async () => {
vi.useFakeTimers();
const channelRuntime = createRuntimeChannel();

View File

@@ -17,6 +17,28 @@ import { isExecApprovalChannelRuntimeTerminalStartError } from "./exec-approval-
type ApprovalBootstrapHandler = ChannelApprovalHandler;
const APPROVAL_HANDLER_BOOTSTRAP_RETRY_MS = 1_000;
/**
 * Substrings that mark a handler start failure as a gateway readiness/startup
 * condition. Includes the legacy readiness-timeout wording.
 */
const RETRYABLE_APPROVAL_BOOTSTRAP_ERROR_MARKERS = [
  "gateway readiness unavailable before approval client start",
  "gateway approval client start aborted before readiness",
  "gateway readiness unavailable before exec approval runtime start",
  "gateway approval runtime start aborted before readiness",
  "gateway event loop readiness timeout",
  "gateway starting",
  "code=1013",
  "close code 1013",
] as const;

/**
 * Reports whether the stringified `error` contains any of the known
 * gateway-readiness markers.
 *
 * @param error - Whatever was thrown while starting the approval handler.
 * @returns `true` when the failure should be treated as a retryable deferral.
 */
function isRetryableApprovalBootstrapStartError(error: unknown): boolean {
  const rendered = String(error);
  return RETRYABLE_APPROVAL_BOOTSTRAP_ERROR_MARKERS.some((marker) => rendered.includes(marker));
}
/**
 * Renders a retryable start failure for logging.
 *
 * When the stringified error contains the legacy readiness-timeout wording, a
 * fixed replacement message is returned so the old keyword is not logged;
 * otherwise the stringified error is returned as is.
 *
 * @param error - Whatever was thrown while starting the approval handler.
 * @returns The message to include in the deferral log line.
 */
function formatRetryableApprovalBootstrapStartError(error: unknown): string {
  const rendered = String(error);
  const hasLegacyKeyword = rendered.includes("gateway event loop readiness timeout");
  return hasLegacyKeyword
    ? "gateway readiness unavailable before approval handler start"
    : rendered;
}
export async function startChannelApprovalHandlerBootstrap(params: {
plugin: Pick<ChannelPlugin, "id" | "meta" | "approvalCapability">;
cfg: OpenClawConfig;
@@ -122,6 +144,13 @@ export async function startChannelApprovalHandlerBootstrap(params: {
logger.error(`native approval handler disabled: ${String(error)}`);
return;
}
if (isRetryableApprovalBootstrapStartError(error)) {
logger.warn(
`native approval handler deferred until gateway readiness recovers: ${formatRetryableApprovalBootstrapStartError(error)}`,
);
scheduleRetryForContext(context, generation);
return;
}
logger.error(`failed to start native approval handler: ${String(error)}`);
scheduleRetryForContext(context, generation);
}

View File

@@ -291,7 +291,9 @@ describe("createExecApprovalChannelRuntime", () => {
finalizeResolved: async () => undefined,
});
await expect(runtime.start()).rejects.toThrow("gateway event loop readiness timeout");
await expect(runtime.start()).rejects.toThrow(
"gateway readiness unavailable before exec approval runtime start",
);
expect(mockGatewayClientStarts).not.toHaveBeenCalled();
expect(mockGatewayClientStops).toHaveBeenCalledTimes(1);

View File

@@ -365,7 +365,11 @@ export function createExecApprovalChannelRuntime<
},
});
if (!readiness.ready) {
throw new Error("gateway event loop readiness timeout");
throw new Error(
readiness.aborted
? "gateway approval runtime start aborted before readiness"
: "gateway readiness unavailable before exec approval runtime start",
);
}
await ready;
if (stopClientIfInactive(client)) {