From e7f1b10ff8bcb4ee887d209adb001569945b2201 Mon Sep 17 00:00:00 2001 From: Peter Steinberger Date: Sun, 3 May 2026 17:11:07 +0100 Subject: [PATCH] fix: auto-repair gateway watch startup --- CHANGELOG.md | 1 + docs/help/debugging.md | 4 +++ docs/start/setup.md | 5 ++- scripts/watch-node.mjs | 50 +++++++++++++++++++++++++++++ src/infra/watch-node.test.ts | 62 +++++++++++++++++++++++++++++++++++- 5 files changed, 120 insertions(+), 2 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index ffd25c6317f..3ccc0fdb418 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -27,6 +27,7 @@ Docs: https://docs.openclaw.ai - Memory/status: keep plain `openclaw memory status` and `openclaw memory status --json` on the cheap read-only path by reserving vector and embedding provider probes for `--deep` or `--index`. Fixes #76769. Thanks @daruire. - Control UI/Sessions: avoid full `sessions.list` reloads for chat-turn `sessions.changed` payloads, so large session stores no longer add multi-second delays while chat responses are being delivered. (#76676) Thanks @VACInc. +- Gateway/watch: run `doctor --fix --non-interactive` once and retry when the dev Gateway child exits during startup, so stale local plugin install/config state does not leave the tmux watch session disappearing without a repair attempt. - Doctor/Telegram: warn when selected Telegram quote replies can suppress `streaming.preview.toolProgress`, and document the `replyToMode` trade-off without changing runtime delivery. Fixes #73487. Thanks @GodsBoy. - Channels/Discord: send a best-effort native typing cue immediately after an inbound DM is accepted, so slow pre-dispatch turns show Discord liveness before queueing, context assembly, model, or tool work starts. Fixes #76417. Thanks @mlopez14. - Plugins/install: reject source-only TypeScript package installs and installed plugin packages that are missing compiled runtime output, so broken npm artifacts fail at install/discovery time instead of falling through jiti and surfacing later as unavailable providers. Fixes #76720. diff --git a/docs/help/debugging.md b/docs/help/debugging.md index 441f51469a7..a88a286d114 100644 --- a/docs/help/debugging.md +++ b/docs/help/debugging.md @@ -152,6 +152,10 @@ The tmux wrapper carries common non-secret runtime selectors such as `OPENCLAW_GATEWAY_PORT`, and `OPENCLAW_SKIP_CHANNELS` into the pane. Put provider credentials in your normal profile/config, or use raw foreground mode for one-off ephemeral secrets. +If the watched Gateway exits during startup, the watcher runs +`openclaw doctor --fix --non-interactive` once and restarts the Gateway child. +Use `OPENCLAW_GATEWAY_WATCH_AUTO_DOCTOR=0` when you want the original startup +failure without the dev-only repair pass. The managed tmux pane also defaults to colored Gateway logs for readability; set `FORCE_COLOR=0` when starting `pnpm gateway:watch` to disable ANSI output. diff --git a/docs/start/setup.md b/docs/start/setup.md index cd23176eb2b..82e6a2f2cda 100644 --- a/docs/start/setup.md +++ b/docs/start/setup.md @@ -103,7 +103,10 @@ session and auto-attaches from interactive terminals. Non-interactive shells sta detached and print `tmux attach -t openclaw-gateway-watch-main`; use `OPENCLAW_GATEWAY_WATCH_ATTACH=0 pnpm gateway:watch` to keep an interactive run detached, or `pnpm gateway:watch:raw` for foreground watch mode. The watcher -reloads on relevant source, config, and bundled-plugin metadata changes. +reloads on relevant source, config, and bundled-plugin metadata changes. If the +watched Gateway exits during startup, `gateway:watch` runs +`openclaw doctor --fix --non-interactive` once and retries; set +`OPENCLAW_GATEWAY_WATCH_AUTO_DOCTOR=0` to disable that dev-only repair pass. `pnpm openclaw setup` is the one-time local config/workspace initialization step for a fresh checkout. `pnpm gateway:watch` does not rebuild `dist/control-ui`, so rerun `pnpm ui:build` after `ui/` changes or use `pnpm ui:dev` while developing the Control UI. diff --git a/scripts/watch-node.mjs b/scripts/watch-node.mjs index edefe2e7930..faf9f006b9e 100644 --- a/scripts/watch-node.mjs +++ b/scripts/watch-node.mjs @@ -15,8 +15,10 @@ const WATCH_IGNORED_PATH_SEGMENTS = new Set([".git", "dist", "node_modules"]); const WATCH_LOCK_WAIT_MS = 5_000; const WATCH_LOCK_POLL_MS = 100; const WATCH_LOCK_DIR = path.join(".local", "watch-node"); +const AUTO_DOCTOR_DISABLE_VALUES = new Set(["0", "false", "no", "off"]); const buildRunnerArgs = (args) => [WATCH_NODE_RUNNER, ...args]; +const buildDoctorRunnerArgs = () => [WATCH_NODE_RUNNER, "doctor", "--fix", "--non-interactive"]; const normalizePath = (filePath) => String(filePath ?? "") @@ -69,6 +71,15 @@ const shouldRestartAfterChildExit = (exitCode, exitSignal) => (typeof exitCode === "number" && WATCH_RESTARTABLE_CHILD_EXIT_CODES.has(exitCode)) || (typeof exitSignal === "string" && WATCH_RESTARTABLE_CHILD_SIGNALS.has(exitSignal)); +const isGatewayWatchCommand = (args) => args[0] === "gateway"; + +const shouldRunAutoDoctor = (deps, autoDoctorAttempted) => + !autoDoctorAttempted && + isGatewayWatchCommand(deps.args) && + !AUTO_DOCTOR_DISABLE_VALUES.has( + String(deps.env.OPENCLAW_GATEWAY_WATCH_AUTO_DOCTOR ?? "").toLowerCase(), + ); + const isProcessAlive = (pid, signalProcess) => { if (!Number.isInteger(pid) || pid <= 0) { return false; @@ -288,6 +299,7 @@ export async function runWatchMain(params = {}) { let restartRequested = false; let watchProcess = null; let lockHandle = null; + let autoDoctorAttempted = false; let onSigInt; let onSigTerm; @@ -334,6 +346,44 @@ export async function runWatchMain(params = {}) { startRunner(); return; } + if (shouldRunAutoDoctor(deps, autoDoctorAttempted)) { + runAutoDoctorAndRestart(); + return; + } + settle(exitSignal ? 1 : (exitCode ?? 1)); + }); + }; + + const runAutoDoctorAndRestart = () => { + autoDoctorAttempted = true; + logWatcher( + "Gateway exited early; running `openclaw doctor --fix --non-interactive` once.", + deps, + ); + watchProcess = deps.spawn(deps.process.execPath, buildDoctorRunnerArgs(), { + cwd: deps.cwd, + env: childEnv, + stdio: "inherit", + }); + watchProcess.on("error", (error) => { + watchProcess = null; + logWatcher(`Failed to spawn doctor repair: ${error?.message ?? "unknown error"}`, deps); + settle(1); + }); + watchProcess.on("exit", (exitCode, exitSignal) => { + watchProcess = null; + if (shuttingDown) { + return; + } + if (exitCode === 0 && !exitSignal) { + logWatcher("Doctor repair completed; restarting gateway watch child.", deps); + startRunner(); + return; + } + logWatcher( + `Doctor repair failed; gateway:watch exiting with code ${exitSignal ? 1 : (exitCode ?? 1)}.`, + deps, + ); settle(exitSignal ? 1 : (exitCode ?? 1)); }); }; diff --git a/src/infra/watch-node.test.ts b/src/infra/watch-node.test.ts index f761899ae50..3c573da5e43 100644 --- a/src/infra/watch-node.test.ts +++ b/src/infra/watch-node.test.ts @@ -201,7 +201,7 @@ describe("watch-node script", () => { const { child, spawn, watcher, createWatcher, fakeProcess } = createWatchHarness(); const runPromise = runWatch({ - args: ["gateway", "--force", "--help"], + args: ["config", "validate"], createWatcher, lockDisabled: true, process: fakeProcess, @@ -217,6 +217,66 @@ describe("watch-node script", () => { expect(fakeProcess.listenerCount("SIGTERM")).toBe(0); }); + it("runs doctor once and restarts when gateway exits nonzero", async () => { + const gatewayA = Object.assign(new EventEmitter(), { kill: vi.fn() }); + const doctor = Object.assign(new EventEmitter(), { kill: vi.fn() }); + const gatewayB = Object.assign(new EventEmitter(), { kill: vi.fn() }); + const spawn = vi + .fn() + .mockReturnValueOnce(gatewayA) + .mockReturnValueOnce(doctor) + .mockReturnValueOnce(gatewayB); + const { watcher, fakeProcess, runPromise } = startWatchRun({ spawn }); + + gatewayA.emit("exit", 1, null); + await new Promise((resolve) => setImmediate(resolve)); + + expect(spawn).toHaveBeenCalledTimes(2); + expect(spawn).toHaveBeenNthCalledWith( + 2, + "/usr/local/bin/node", + ["scripts/run-node.mjs", "doctor", "--fix", "--non-interactive"], + expect.objectContaining({ stdio: "inherit" }), + ); + + doctor.emit("exit", 0, null); + await new Promise((resolve) => setImmediate(resolve)); + + expect(spawn).toHaveBeenCalledTimes(3); + expect(spawn).toHaveBeenNthCalledWith( + 3, + "/usr/local/bin/node", + ["scripts/run-node.mjs", "gateway", "--force"], + expect.objectContaining({ stdio: "inherit" }), + ); + + fakeProcess.emit("SIGINT"); + const exitCode = await runPromise; + expect(exitCode).toBe(130); + expect(gatewayB.kill).toHaveBeenCalledWith("SIGTERM"); + expect(watcher.close).toHaveBeenCalledTimes(1); + }); + + it("does not run doctor after a gateway failure when auto doctor is disabled", async () => { + const { child, spawn, watcher, createWatcher, fakeProcess } = createWatchHarness(); + + const runPromise = runWatch({ + args: ["gateway", "--force"], + createWatcher, + env: { OPENCLAW_GATEWAY_WATCH_AUTO_DOCTOR: "0" }, + lockDisabled: true, + process: fakeProcess, + spawn, + }); + + child.emit("exit", 1, null); + const exitCode = await runPromise; + + expect(exitCode).toBe(1); + expect(spawn).toHaveBeenCalledTimes(1); + expect(watcher.close).toHaveBeenCalledTimes(1); + }); + it("restarts when the runner exits with a SIGTERM-derived code unexpectedly", async () => { const childA = Object.assign(new EventEmitter(), { kill: vi.fn(),