From 99950c7f1272dff6e2c34c2be45dfc5f89e62a60 Mon Sep 17 00:00:00 2001 From: Peter Steinberger Date: Wed, 29 Apr 2026 11:32:02 +0100 Subject: [PATCH] fix(doctor): use lightweight gateway liveness check --- CHANGELOG.md | 1 + src/commands/doctor-gateway-health.test.ts | 46 ++++++++++++++++++- src/commands/doctor-gateway-health.ts | 8 +++- src/gateway/server-methods/health.ts | 3 +- .../server-methods/server-methods.test.ts | 24 +++++++++- 5 files changed, 77 insertions(+), 5 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 0c388fd289e..1d4b564690b 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -77,6 +77,7 @@ Docs: https://docs.openclaw.ai - Gateway: expose `gateway.handshakeTimeoutMs` in config, schema, and docs while preserving `OPENCLAW_HANDSHAKE_TIMEOUT_MS` precedence, so loaded or low-powered hosts can tune local WebSocket pre-auth handshakes without patching dist files. Supersedes #51282; refs #73592 and #73652. Thanks @henry-the-frog. - Gateway/TUI/status: align configured and env-based WebSocket handshake budgets across local clients, probes, and fallback RPCs while preserving explicit status timeouts and paired-device auth fallback, so slow local gateways are not marked unreachable by a shorter client watchdog. Refs #73524, #73535, #73592, and #73602. Thanks @harshcatsystems-collab, @DJBlackhawk, and @Vksh07. - Gateway/startup: return retryable `UNAVAILABLE` during the sidecar startup window and keep CLI/TUI/status clients retrying inside their existing timeout budget, so early connects no longer surface as terminal handshake failures. Fixes #73652. Thanks @spenceryang1996-dot. +- Doctor/Gateway: use a lightweight `status` RPC without channel summary work for doctor Gateway liveness, so slow health snapshots do not falsely drive service restart repair. Fixes #64400; supersedes #64511. Thanks @CHE10X and @EronFan. - Agents/auth: scope external CLI credential discovery to configured providers during model auth status and startup prewarm, so opencode-only and other single-provider gateways do not block on unrelated Claude CLI Keychain probes. Fixes #73908. Thanks @Ailuras. - Agents/model selection: resolve slash-form aliases before provider/model parsing and keep alias-resolved primary models subject to transient provider cooldowns, so cron and persisted sessions do not retry cooled-down raw aliases. Fixes #73573 and #73657. Thanks @akai-shuuichi and @hashslingers. - Agents/Claude CLI: reuse already-cached macOS Keychain credentials for no-prompt Claude credential reads, so doctor/runtime checks do not miss fresh interactive Claude auth. Fixes #73682. Thanks @RyanSandoval. diff --git a/src/commands/doctor-gateway-health.test.ts b/src/commands/doctor-gateway-health.test.ts index e734461966e..3a84d6170eb 100644 --- a/src/commands/doctor-gateway-health.test.ts +++ b/src/commands/doctor-gateway-health.test.ts @@ -14,7 +14,51 @@ vi.mock("./health.js", () => ({ healthCommand: vi.fn(), })); -import { probeGatewayMemoryStatus } from "./doctor-gateway-health.js"; +import { checkGatewayHealth, probeGatewayMemoryStatus } from "./doctor-gateway-health.js"; + +describe("checkGatewayHealth", () => { + const cfg = {} as OpenClawConfig; + + beforeEach(() => { + callGateway.mockReset(); + }); + + it("uses a lightweight status RPC for the restart liveness gate", async () => { + callGateway.mockResolvedValueOnce({ ok: true }).mockResolvedValueOnce({}); + const runtime = { log: vi.fn(), error: vi.fn(), exit: vi.fn() }; + + await expect( + checkGatewayHealth({ runtime: runtime as never, cfg, timeoutMs: 3000 }), + ).resolves.toEqual({ healthOk: true }); + + expect(callGateway).toHaveBeenNthCalledWith(1, { + method: "status", + params: { includeChannelSummary: false }, + timeoutMs: 3000, + config: cfg, + }); + expect(callGateway).toHaveBeenNthCalledWith(2, { + method: "channels.status", + params: { probe: true, timeoutMs: 5000 }, + timeoutMs: 6000, + }); + expect(runtime.error).not.toHaveBeenCalled(); + }); + + it("does not run follow-up channel probes when liveness fails", async () => { + callGateway.mockRejectedValueOnce(new Error("gateway timeout after 3000ms")); + const runtime = { log: vi.fn(), error: vi.fn(), exit: vi.fn() }; + + await expect( + checkGatewayHealth({ runtime: runtime as never, cfg, timeoutMs: 3000 }), + ).resolves.toEqual({ healthOk: false }); + + expect(callGateway).toHaveBeenCalledTimes(1); + expect(runtime.error).toHaveBeenCalledWith( + expect.stringContaining("Health check failed: Error: gateway timeout after 3000ms"), + ); + }); +}); describe("probeGatewayMemoryStatus", () => { const cfg = {} as OpenClawConfig; diff --git a/src/commands/doctor-gateway-health.ts b/src/commands/doctor-gateway-health.ts index c8dc97762e4..12469d383c2 100644 --- a/src/commands/doctor-gateway-health.ts +++ b/src/commands/doctor-gateway-health.ts @@ -6,7 +6,6 @@ import { formatErrorMessage } from "../infra/errors.js"; import type { RuntimeEnv } from "../runtime.js"; import { note } from "../terminal/note.js"; import { formatHealthCheckFailure } from "./health-format.js"; -import { healthCommand } from "./health.js"; export type GatewayMemoryProbe = { checked: boolean; @@ -28,7 +27,12 @@ export async function checkGatewayHealth(params: { typeof params.timeoutMs === "number" && params.timeoutMs > 0 ? params.timeoutMs : 10_000; let healthOk = false; try { - await healthCommand({ json: false, timeoutMs, config: params.cfg }, params.runtime); + await callGateway({ + method: "status", + params: { includeChannelSummary: false }, + timeoutMs, + config: params.cfg, + }); healthOk = true; } catch (err) { const message = String(err); diff --git a/src/gateway/server-methods/health.ts b/src/gateway/server-methods/health.ts index b7e8d55f701..4f8e724b47d 100644 --- a/src/gateway/server-methods/health.ts +++ b/src/gateway/server-methods/health.ts @@ -29,10 +29,11 @@ export const healthHandlers: GatewayRequestHandlers = { respond(false, undefined, errorShape(ErrorCodes.UNAVAILABLE, formatForLog(err))); } }, - status: async ({ respond, client }) => { + status: async ({ respond, client, params }) => { const scopes = Array.isArray(client?.connect?.scopes) ? client.connect.scopes : []; const status = await getStatusSummary({ includeSensitive: scopes.includes(ADMIN_SCOPE), + includeChannelSummary: params.includeChannelSummary !== false, }); respond(true, status, undefined); }, diff --git a/src/gateway/server-methods/server-methods.test.ts b/src/gateway/server-methods/server-methods.test.ts index 2df7400b251..9672e377747 100644 --- a/src/gateway/server-methods/server-methods.test.ts +++ b/src/gateway/server-methods/server-methods.test.ts @@ -1893,10 +1893,32 @@ describe("gateway healthHandlers.status scope handling", () => { async ({ scopes, includeSensitive }) => { const respond = await runHealthStatus(scopes); - expect(vi.mocked(statusModule.getStatusSummary)).toHaveBeenCalledWith({ includeSensitive }); + expect(vi.mocked(statusModule.getStatusSummary)).toHaveBeenCalledWith({ + includeSensitive, + includeChannelSummary: true, + }); expect(respond).toHaveBeenCalledWith(true, { ok: true }, undefined); }, ); + + it("can skip channel summary work for liveness-only status requests", async () => { + const respond = vi.fn(); + + await healthHandlers.status({ + req: {} as never, + params: { includeChannelSummary: false }, + respond: respond as never, + context: {} as never, + client: { connect: { role: "operator", scopes: ["operator.read"] } } as never, + isWebchatConnect: () => false, + }); + + expect(vi.mocked(statusModule.getStatusSummary)).toHaveBeenCalledWith({ + includeSensitive: false, + includeChannelSummary: false, + }); + expect(respond).toHaveBeenCalledWith(true, { ok: true }, undefined); + }); }); describe("logs.tail", () => {