fix(doctor): use lightweight gateway liveness check

2026-05-06 14:10:51 +00:00 · 2026-04-29 11:32:02 +01:00
parent a1197b9075
commit 99950c7f12
5 changed files with 77 additions and 5 deletions
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -77,6 +77,7 @@ Docs: https://docs.openclaw.ai
 - Gateway: expose `gateway.handshakeTimeoutMs` in config, schema, and docs while preserving `OPENCLAW_HANDSHAKE_TIMEOUT_MS` precedence, so loaded or low-powered hosts can tune local WebSocket pre-auth handshakes without patching dist files. Supersedes #51282; refs #73592 and #73652. Thanks @henry-the-frog.
 - Gateway/TUI/status: align configured and env-based WebSocket handshake budgets across local clients, probes, and fallback RPCs while preserving explicit status timeouts and paired-device auth fallback, so slow local gateways are not marked unreachable by a shorter client watchdog. Refs #73524, #73535, #73592, and #73602. Thanks @harshcatsystems-collab, @DJBlackhawk, and @Vksh07.
 - Gateway/startup: return retryable `UNAVAILABLE` during the sidecar startup window and keep CLI/TUI/status clients retrying inside their existing timeout budget, so early connects no longer surface as terminal handshake failures. Fixes #73652. Thanks @spenceryang1996-dot.
+- Doctor/Gateway: use a lightweight `status` RPC without channel summary work for doctor Gateway liveness, so slow health snapshots do not falsely drive service restart repair. Fixes #64400; supersedes #64511. Thanks @CHE10X and @EronFan.
 - Agents/auth: scope external CLI credential discovery to configured providers during model auth status and startup prewarm, so opencode-only and other single-provider gateways do not block on unrelated Claude CLI Keychain probes. Fixes #73908. Thanks @Ailuras.
 - Agents/model selection: resolve slash-form aliases before provider/model parsing and keep alias-resolved primary models subject to transient provider cooldowns, so cron and persisted sessions do not retry cooled-down raw aliases. Fixes #73573 and #73657. Thanks @akai-shuuichi and @hashslingers.
 - Agents/Claude CLI: reuse already-cached macOS Keychain credentials for no-prompt Claude credential reads, so doctor/runtime checks do not miss fresh interactive Claude auth. Fixes #73682. Thanks @RyanSandoval.
--- a/src/commands/doctor-gateway-health.test.ts
+++ b/src/commands/doctor-gateway-health.test.ts
@@ -14,7 +14,51 @@ vi.mock("./health.js", () => ({
  healthCommand: vi.fn(),
 }));

-import { probeGatewayMemoryStatus } from "./doctor-gateway-health.js";
+import { checkGatewayHealth, probeGatewayMemoryStatus } from "./doctor-gateway-health.js";
+
+describe("checkGatewayHealth", () => { // covers the liveness gate: which RPCs are issued on success and on failure
+  const cfg = {} as OpenClawConfig;
+
+  beforeEach(() => {
+    callGateway.mockReset(); // start each test with no queued results or recorded calls
+  });
+
+  it("uses a lightweight status RPC for the restart liveness gate", async () => {
+    callGateway.mockResolvedValueOnce({ ok: true }).mockResolvedValueOnce({}); // 1st result answers `status`, 2nd answers `channels.status`
+    const runtime = { log: vi.fn(), error: vi.fn(), exit: vi.fn() };
+
+    await expect(
+      checkGatewayHealth({ runtime: runtime as never, cfg, timeoutMs: 3000 }),
+    ).resolves.toEqual({ healthOk: true });
+
+    expect(callGateway).toHaveBeenNthCalledWith(1, { // liveness call: channel summary explicitly disabled, caller's timeout forwarded
+      method: "status",
+      params: { includeChannelSummary: false },
+      timeoutMs: 3000,
+      config: cfg,
+    });
+    expect(callGateway).toHaveBeenNthCalledWith(2, { // second call is the channel probe, with its own timeouts rather than the 3000ms passed in
+      method: "channels.status",
+      params: { probe: true, timeoutMs: 5000 },
+      timeoutMs: 6000,
+    });
+    expect(runtime.error).not.toHaveBeenCalled();
+  });
+
+  it("does not run follow-up channel probes when liveness fails", async () => {
+    callGateway.mockRejectedValueOnce(new Error("gateway timeout after 3000ms")); // the single queued result rejects the liveness call
+    const runtime = { log: vi.fn(), error: vi.fn(), exit: vi.fn() };
+
+    await expect(
+      checkGatewayHealth({ runtime: runtime as never, cfg, timeoutMs: 3000 }),
+    ).resolves.toEqual({ healthOk: false });
+
+    expect(callGateway).toHaveBeenCalledTimes(1); // only the failed `status` call; no further RPCs were attempted
+    expect(runtime.error).toHaveBeenCalledWith(
+      expect.stringContaining("Health check failed: Error: gateway timeout after 3000ms"),
+    );
+  });
+});

 describe("probeGatewayMemoryStatus", () => {
  const cfg = {} as OpenClawConfig;
--- a/src/commands/doctor-gateway-health.ts
+++ b/src/commands/doctor-gateway-health.ts
@@ -6,7 +6,6 @@ import { formatErrorMessage } from "../infra/errors.js";
 import type { RuntimeEnv } from "../runtime.js";
 import { note } from "../terminal/note.js";
 import { formatHealthCheckFailure } from "./health-format.js";
-import { healthCommand } from "./health.js";

 export type GatewayMemoryProbe = {
  checked: boolean;
@@ -28,7 +27,12 @@ export async function checkGatewayHealth(params: {
    typeof params.timeoutMs === "number" && params.timeoutMs > 0 ? params.timeoutMs : 10_000;
  let healthOk = false;
  try {
-    await healthCommand({ json: false, timeoutMs, config: params.cfg }, params.runtime);
+    await callGateway({
+      method: "status",
+      params: { includeChannelSummary: false },
+      timeoutMs,
+      config: params.cfg,
+    });
    healthOk = true;
  } catch (err) {
    const message = String(err);
--- a/src/gateway/server-methods/health.ts
+++ b/src/gateway/server-methods/health.ts
@@ -29,10 +29,11 @@ export const healthHandlers: GatewayRequestHandlers = {
      respond(false, undefined, errorShape(ErrorCodes.UNAVAILABLE, formatForLog(err)));
    }
  },
-  status: async ({ respond, client }) => {
+  status: async ({ respond, client, params }) => { // `params` is read only for the optional includeChannelSummary flag
    const scopes = Array.isArray(client?.connect?.scopes) ? client.connect.scopes : [];
    const status = await getStatusSummary({
      includeSensitive: scopes.includes(ADMIN_SCOPE),
+      includeChannelSummary: params.includeChannelSummary !== false, // defaults to true; only an explicit `false` skips the channel summary
    });
    respond(true, status, undefined);
  },
--- a/src/gateway/server-methods/server-methods.test.ts
+++ b/src/gateway/server-methods/server-methods.test.ts
@@ -1893,10 +1893,32 @@ describe("gateway healthHandlers.status scope handling", () => {
    async ({ scopes, includeSensitive }) => {
      const respond = await runHealthStatus(scopes);

-      expect(vi.mocked(statusModule.getStatusSummary)).toHaveBeenCalledWith({ includeSensitive });
+      expect(vi.mocked(statusModule.getStatusSummary)).toHaveBeenCalledWith({
+        includeSensitive,
+        includeChannelSummary: true,
+      });
      expect(respond).toHaveBeenCalledWith(true, { ok: true }, undefined);
    },
  );
+
+  it("can skip channel summary work for liveness-only status requests", async () => {
+    const respond = vi.fn();
+
+    await healthHandlers.status({
+      req: {} as never,
+      params: { includeChannelSummary: false }, // explicit opt-out; expected to be forwarded to getStatusSummary unchanged
+      respond: respond as never,
+      context: {} as never,
+      client: { connect: { role: "operator", scopes: ["operator.read"] } } as never, // read-only scope; asserted below to yield includeSensitive: false
+      isWebchatConnect: () => false,
+    });
+
+    expect(vi.mocked(statusModule.getStatusSummary)).toHaveBeenCalledWith({
+      includeSensitive: false,
+      includeChannelSummary: false,
+    });
+    expect(respond).toHaveBeenCalledWith(true, { ok: true }, undefined);
+  });
 });

 describe("logs.tail", () => {