fix(doctor): use lightweight gateway liveness check

This commit is contained in:
Peter Steinberger
2026-04-29 11:32:02 +01:00
parent a1197b9075
commit 99950c7f12
5 changed files with 77 additions and 5 deletions

View File

@@ -77,6 +77,7 @@ Docs: https://docs.openclaw.ai
- Gateway: expose `gateway.handshakeTimeoutMs` in config, schema, and docs while preserving `OPENCLAW_HANDSHAKE_TIMEOUT_MS` precedence, so loaded or low-powered hosts can tune local WebSocket pre-auth handshakes without patching dist files. Supersedes #51282; refs #73592 and #73652. Thanks @henry-the-frog.
- Gateway/TUI/status: align configured and env-based WebSocket handshake budgets across local clients, probes, and fallback RPCs while preserving explicit status timeouts and paired-device auth fallback, so slow local gateways are not marked unreachable by a shorter client watchdog. Refs #73524, #73535, #73592, and #73602. Thanks @harshcatsystems-collab, @DJBlackhawk, and @Vksh07.
- Gateway/startup: return retryable `UNAVAILABLE` during the sidecar startup window and keep CLI/TUI/status clients retrying inside their existing timeout budget, so early connects no longer surface as terminal handshake failures. Fixes #73652. Thanks @spenceryang1996-dot.
- Doctor/Gateway: use a lightweight `status` RPC without channel summary work for doctor Gateway liveness, so slow health snapshots do not falsely drive service restart repair. Fixes #64400; supersedes #64511. Thanks @CHE10X and @EronFan.
- Agents/auth: scope external CLI credential discovery to configured providers during model auth status and startup prewarm, so opencode-only and other single-provider gateways do not block on unrelated Claude CLI Keychain probes. Fixes #73908. Thanks @Ailuras.
- Agents/model selection: resolve slash-form aliases before provider/model parsing and keep alias-resolved primary models subject to transient provider cooldowns, so cron and persisted sessions do not retry cooled-down raw aliases. Fixes #73573 and #73657. Thanks @akai-shuuichi and @hashslingers.
- Agents/Claude CLI: reuse already-cached macOS Keychain credentials for no-prompt Claude credential reads, so doctor/runtime checks do not miss fresh interactive Claude auth. Fixes #73682. Thanks @RyanSandoval.

View File

@@ -14,7 +14,51 @@ vi.mock("./health.js", () => ({
healthCommand: vi.fn(),
}));
import { probeGatewayMemoryStatus } from "./doctor-gateway-health.js";
import { checkGatewayHealth, probeGatewayMemoryStatus } from "./doctor-gateway-health.js";
/**
 * Covers the doctor liveness gate: these cases expect `checkGatewayHealth` to
 * issue a lightweight `status` RPC first and to run the follow-up channel
 * probe only after that call succeeds.
 */
describe("checkGatewayHealth", () => {
  const cfg = {} as OpenClawConfig;
  const timeoutMs = 3000;

  // Fresh spies per test so assertions never observe calls from a sibling case.
  const createRuntime = () => ({ log: vi.fn(), error: vi.fn(), exit: vi.fn() });

  beforeEach(() => {
    callGateway.mockReset();
  });

  it("uses a lightweight status RPC for the restart liveness gate", async () => {
    // First reply answers the liveness call, second answers the channel probe.
    callGateway.mockResolvedValueOnce({ ok: true });
    callGateway.mockResolvedValueOnce({});
    const runtime = createRuntime();

    const outcome = await checkGatewayHealth({ runtime: runtime as never, cfg, timeoutMs });

    expect(outcome).toEqual({ healthOk: true });
    expect(callGateway).toHaveBeenNthCalledWith(1, {
      method: "status",
      params: { includeChannelSummary: false },
      timeoutMs: 3000,
      config: cfg,
    });
    expect(callGateway).toHaveBeenNthCalledWith(2, {
      method: "channels.status",
      params: { probe: true, timeoutMs: 5000 },
      timeoutMs: 6000,
    });
    expect(runtime.error).not.toHaveBeenCalled();
  });

  it("does not run follow-up channel probes when liveness fails", async () => {
    const livenessFailure = new Error("gateway timeout after 3000ms");
    callGateway.mockRejectedValueOnce(livenessFailure);
    const runtime = createRuntime();

    const outcome = await checkGatewayHealth({ runtime: runtime as never, cfg, timeoutMs });

    expect(outcome).toEqual({ healthOk: false });
    // Only the liveness call may have been issued; no channel probe afterwards.
    expect(callGateway).toHaveBeenCalledTimes(1);
    expect(runtime.error).toHaveBeenCalledWith(
      expect.stringContaining("Health check failed: Error: gateway timeout after 3000ms"),
    );
  });
});
describe("probeGatewayMemoryStatus", () => {
const cfg = {} as OpenClawConfig;

View File

@@ -6,7 +6,6 @@ import { formatErrorMessage } from "../infra/errors.js";
import type { RuntimeEnv } from "../runtime.js";
import { note } from "../terminal/note.js";
import { formatHealthCheckFailure } from "./health-format.js";
import { healthCommand } from "./health.js";
export type GatewayMemoryProbe = {
checked: boolean;
@@ -28,7 +27,12 @@ export async function checkGatewayHealth(params: {
typeof params.timeoutMs === "number" && params.timeoutMs > 0 ? params.timeoutMs : 10_000;
let healthOk = false;
try {
await healthCommand({ json: false, timeoutMs, config: params.cfg }, params.runtime);
await callGateway({
method: "status",
params: { includeChannelSummary: false },
timeoutMs,
config: params.cfg,
});
healthOk = true;
} catch (err) {
const message = String(err);

View File

@@ -29,10 +29,11 @@ export const healthHandlers: GatewayRequestHandlers = {
respond(false, undefined, errorShape(ErrorCodes.UNAVAILABLE, formatForLog(err)));
}
},
status: async ({ respond, client }) => {
// Gateway `status` RPC: replies with the status summary, shaped by the
// caller's connect scopes and the request params.
status: async ({ respond, client, params }) => {
  // A missing or non-array scope list is treated as "no scopes".
  const grantedScopes = Array.isArray(client?.connect?.scopes) ? client.connect.scopes : [];
  const includeSensitive = grantedScopes.includes(ADMIN_SCOPE);
  // Channel summary is opt-out: only an explicit `false` skips it.
  const includeChannelSummary = params.includeChannelSummary !== false;
  const summary = await getStatusSummary({ includeSensitive, includeChannelSummary });
  respond(true, summary, undefined);
},

View File

@@ -1893,10 +1893,32 @@ describe("gateway healthHandlers.status scope handling", () => {
async ({ scopes, includeSensitive }) => {
const respond = await runHealthStatus(scopes);
expect(vi.mocked(statusModule.getStatusSummary)).toHaveBeenCalledWith({ includeSensitive });
expect(vi.mocked(statusModule.getStatusSummary)).toHaveBeenCalledWith({
includeSensitive,
includeChannelSummary: true,
});
expect(respond).toHaveBeenCalledWith(true, { ok: true }, undefined);
},
);
// A request with `includeChannelSummary: false` must reach getStatusSummary
// with that flag forwarded unchanged.
it("can skip channel summary work for liveness-only status requests", async () => {
  const respond = vi.fn();
  // Operator client carrying only the read scope; the test expects a non-sensitive summary.
  const readOnlyClient = { connect: { role: "operator", scopes: ["operator.read"] } };
  const expectedSummaryArgs = {
    includeSensitive: false,
    includeChannelSummary: false,
  };

  await healthHandlers.status({
    req: {} as never,
    params: { includeChannelSummary: false },
    respond: respond as never,
    context: {} as never,
    client: readOnlyClient as never,
    isWebchatConnect: () => false,
  });

  const getStatusSummaryMock = vi.mocked(statusModule.getStatusSummary);
  expect(getStatusSummaryMock).toHaveBeenCalledWith(expectedSummaryArgs);
  expect(respond).toHaveBeenCalledWith(true, { ok: true }, undefined);
});
});
describe("logs.tail", () => {