fix: harden gateway install recovery paths

This commit is contained in:
Peter Steinberger
2026-05-04 01:27:52 +01:00
parent 9799e412f8
commit 9c37cfcbdb
11 changed files with 317 additions and 36 deletions

View File

@@ -38,6 +38,10 @@ Docs: https://docs.openclaw.ai
### Fixes
- Channels/WhatsApp: allow `@whiskeysockets/libsignal-node` in `onlyBuiltDependencies` so pnpm v9+ `blockExoticSubdeps` no longer rejects the baileys git-tarball subdep and silences all inbound agent replies. Fixes #76539. Thanks @ottodeng and @vincentkoc.
- Gateway/install: keep `.env`-managed values in the macOS LaunchAgent env file while still tracking `OPENCLAW_SERVICE_MANAGED_ENV_KEYS`, so regenerated services do not boot without managed auth/provider keys. Fixes #75374.
- Gateway/restart: verify listener PIDs by argv when `lsof` reports only the Node process name, so stale gateway cleanup can find macOS `cnode` listeners. Fixes #70664.
- Gateway/logging: expand leading `~` in `logging.file` before creating the file logger, preventing startup crash loops for home-relative log paths. Fixes #73587.
- Channels/CLI: keep `openclaw channels list --json` usable when provider usage fetching fails, and report per-provider usage errors without aborting the channel list. Refs #67595.
- Gateway/systemd: preserve operator-added secrets in the Gateway env file across re-stage while clearing OpenClaw-managed keys (such as `OPENCLAW_GATEWAY_TOKEN`) so a fresh staging value is never shadowed by a stale env-file copy; operator secrets are also retained when the state-dir `.env` is empty. Fixes #76860. Thanks @hclsys.
- Plugin updates: do not short-circuit trusted official npm updates as unchanged when the default/latest spec still resolves to an already-installed prerelease that the installer should replace with a stable fallback. Thanks @vincentkoc.
- Plugin tools: keep auth-unavailable optional tools hidden even when another default tool from the same plugin is available and `tools.alsoAllow` names the optional tool. Thanks @vincentkoc.

View File

@@ -13,6 +13,7 @@ const mocks = vi.hoisted(() => ({
loadAuthProfileStoreWithoutExternalProfiles: vi.fn(),
listReadOnlyChannelPluginsForConfig: vi.fn<() => ChannelPlugin[]>(() => []),
buildChannelAccountSnapshot: vi.fn(),
loadProviderUsageSummary: vi.fn(),
}));
vi.mock("../config/config.js", () => ({
@@ -39,6 +40,11 @@ vi.mock("../channels/plugins/status.js", () => ({
buildChannelAccountSnapshot: mocks.buildChannelAccountSnapshot,
}));
// Stub the provider-usage module for this suite: usage loading is routed to
// the hoisted `mocks.loadProviderUsageSummary` spy so each test can make it
// resolve or reject, and report formatting always yields no lines.
vi.mock("../infra/provider-usage.js", () => ({
formatUsageReportLines: () => [],
loadProviderUsageSummary: mocks.loadProviderUsageSummary,
}));
import { channelsListCommand } from "./channels/list.js";
function createMockChannelPlugin(accountIds: string[]): ChannelPlugin {
@@ -64,6 +70,7 @@ describe("channels list auth profiles", () => {
mocks.readConfigFileSnapshot.mockReset();
mocks.resolveCommandConfigWithSecrets.mockClear();
mocks.loadAuthProfileStoreWithoutExternalProfiles.mockReset();
mocks.loadProviderUsageSummary.mockReset();
mocks.listReadOnlyChannelPluginsForConfig.mockReset();
mocks.listReadOnlyChannelPluginsForConfig.mockReturnValue([]);
mocks.buildChannelAccountSnapshot.mockReset();
@@ -143,6 +150,27 @@ describe("channels list auth profiles", () => {
expect(payload.chat?.telegram).toEqual(["alerts", "default"]);
});
it("keeps JSON output valid when usage loading fails", async () => {
  // Arrange: empty config and auth store, with the usage loader rejecting.
  const runtime = createTestRuntime();
  mocks.readConfigFileSnapshot.mockResolvedValue({ ...baseConfigSnapshot, config: {} });
  mocks.loadAuthProfileStoreWithoutExternalProfiles.mockReturnValue({
    version: 1,
    profiles: {},
  });
  mocks.loadProviderUsageSummary.mockRejectedValue(new Error("fetch failed"));

  // Act
  await channelsListCommand({ json: true }, runtime);

  // Assert: the first logged value still parses as JSON, carries no usage
  // field, and nothing was written to the error sink.
  const [firstLogged] = runtime.log.mock.calls[0] ?? [];
  const payload = JSON.parse(firstLogged as string) as { usage?: unknown };
  expect(payload.usage).toBeUndefined();
  expect(runtime.error).not.toHaveBeenCalled();
});
it("prints configured chat channel accounts before auth providers", async () => {
const runtime = createTestRuntime();
mocks.listReadOnlyChannelPluginsForConfig.mockReturnValue([

View File

@@ -91,14 +91,17 @@ function formatAccountLine(params: {
}
/**
 * Load the provider usage summary, optionally behind a progress indicator.
 *
 * @param runtime - Runtime whose `error` sink receives the stringified failure
 *   when `progress` is enabled.
 * @param progress - When `true` (the default) the progress indicator is
 *   enabled and a failure is reported through `runtime.error`. When `false`
 *   the indicator is disabled and failures are swallowed silently, so callers
 *   that emit machine-readable output are not polluted by error text.
 * @returns The usage summary, or `null` when loading throws.
 */
async function loadUsageWithProgress(
  runtime: RuntimeEnv,
  progress = true,
): Promise<Awaited<ReturnType<typeof loadProviderUsageSummary>> | null> {
  try {
    return await withProgress(
      { label: "Fetching usage snapshot…", indeterminate: true, enabled: progress },
      async () => await loadProviderUsageSummary({ skipPluginAuthWithoutCredentialSource: true }),
    );
  } catch (err) {
    // Only surface the failure when the caller asked for interactive output.
    if (progress) {
      runtime.error(String(err));
    }
    return null;
  }
}
@@ -125,9 +128,7 @@ export async function channelsListCommand(
isExternal: false,
}));
if (opts.json) {
const usage = includeUsage
? await loadProviderUsageSummary({ skipPluginAuthWithoutCredentialSource: true })
: undefined;
const usage = includeUsage ? await loadUsageWithProgress(runtime, false) : undefined;
const chat: Record<string, string[]> = {};
for (const plugin of plugins) {
chat[plugin.id] = plugin.config.listAccountIds(cfg);

View File

@@ -597,6 +597,67 @@ describe("buildGatewayInstallPlan — dotenv merge", () => {
);
});
it("retains managed .env values for macOS LaunchAgent env files", async () => {
  // Arrange: two provider keys live in the state-dir .env, and the service
  // environment identifies a launchd-managed gateway.
  const stateDir = path.join(tmpDir, ".openclaw");
  await writeStateDirDotEnv("TAVILY_API_KEY=dotenv-tavily\nOPENROUTER_API_KEY=or-key\n", {
    stateDir,
  });
  mockNodeGatewayPlanFixture({
    serviceEnvironment: {
      HOME: "/from-service",
      OPENCLAW_LAUNCHD_LABEL: "ai.openclaw.gateway",
      OPENCLAW_PORT: "3000",
    },
  });

  // Act
  const { environment } = await buildGatewayInstallPlan({
    env: { HOME: tmpDir },
    port: 3000,
    runtime: "node",
    platform: "darwin",
  });

  // Assert: both values are kept in the plan and both keys are still tracked
  // as managed.
  expect(environment.TAVILY_API_KEY).toBe("dotenv-tavily");
  expect(environment.OPENROUTER_API_KEY).toBe("or-key");
  expect(environment.OPENCLAW_SERVICE_MANAGED_ENV_KEYS).toBe("OPENROUTER_API_KEY,TAVILY_API_KEY");
});
it("does not retain config env values for macOS LaunchAgent env files", async () => {
  // Arrange: the state-dir .env and the config both define OPENROUTER_API_KEY
  // (with different values); BRAVE_API_KEY exists only in the config and
  // TAVILY_API_KEY only in the .env.
  const stateDir = path.join(tmpDir, ".openclaw");
  await writeStateDirDotEnv("OPENROUTER_API_KEY=or-dotenv\nTAVILY_API_KEY=dotenv-tavily\n", {
    stateDir,
  });
  mockNodeGatewayPlanFixture({
    serviceEnvironment: {
      HOME: "/from-service",
      OPENCLAW_LAUNCHD_LABEL: "ai.openclaw.gateway",
      OPENCLAW_PORT: "3000",
    },
  });
  const configEnvVars = {
    BRAVE_API_KEY: "brave-config-key",
    OPENROUTER_API_KEY: "or-config-key",
  };

  // Act
  const { environment } = await buildGatewayInstallPlan({
    env: { HOME: tmpDir },
    port: 3000,
    runtime: "node",
    platform: "darwin",
    config: { env: { vars: configEnvVars } },
  });

  // Assert: config-sourced values are absent from the plan, the .env-only
  // value is kept, and all three keys are tracked as managed.
  expect(environment.BRAVE_API_KEY).toBeUndefined();
  expect(environment.OPENROUTER_API_KEY).toBeUndefined();
  expect(environment.TAVILY_API_KEY).toBe("dotenv-tavily");
  expect(environment.OPENCLAW_SERVICE_MANAGED_ENV_KEYS).toBe(
    "BRAVE_API_KEY,OPENROUTER_API_KEY,TAVILY_API_KEY",
  );
});
it("works when .env file does not exist", async () => {
mockNodeGatewayPlanFixture({ serviceEnvironment: { OPENCLAW_PORT: "3000" } });

View File

@@ -3,7 +3,10 @@ import os from "node:os";
import path from "node:path";
import type { AuthProfileStore } from "../agents/auth-profiles/types.js";
import { formatCliCommand } from "../cli/command-format.js";
import { collectDurableServiceEnvVars } from "../config/state-dir-dotenv.js";
import {
collectDurableServiceEnvVars,
readStateDirDotEnvVars,
} from "../config/state-dir-dotenv.js";
import type { OpenClawConfig } from "../config/types.js";
import { resolveSecretInputRef } from "../config/types.secrets.js";
import { resolveGatewayLaunchAgentLabel } from "../daemon/constants.js";
@@ -392,6 +395,35 @@ function resolveGatewayInstallWorkingDirectory(params: {
return resolveGatewayStateDir(params.env);
}
/**
 * Copy state-dir `.env` values back into the install environment for macOS
 * LaunchAgent installs.
 *
 * Does nothing unless `platform` is `"darwin"` and the service environment
 * carries a non-blank `OPENCLAW_LAUNCHD_LABEL`, or when no managed keys are
 * recorded in `managedServiceEnvKeys`.
 *
 * A `.env` entry is copied only when all of the following hold:
 * - its normalized, upper-cased key is one of the managed keys;
 * - its value is a non-blank string;
 * - `durableEnvironment` holds the identical value under the same raw key.
 *
 * Mutates `params.environment` in place.
 */
function retainLaunchAgentManagedServiceEnvValues(params: {
  environment: Record<string, string | undefined>;
  durableEnvironment: Record<string, string | undefined>;
  managedServiceEnvKeys: string | undefined;
  stateDirDotEnvEnvironment: Record<string, string | undefined>;
  serviceEnvironment: Record<string, string | undefined>;
  platform: NodeJS.Platform;
}): void {
  if (params.platform !== "darwin") {
    return;
  }
  if (!params.serviceEnvironment.OPENCLAW_LAUNCHD_LABEL?.trim()) {
    return;
  }

  const managedKeys = readManagedServiceEnvKeysFromEnvironment({
    OPENCLAW_SERVICE_MANAGED_ENV_KEYS: params.managedServiceEnvKeys,
  });
  if (managedKeys.size === 0) {
    return;
  }

  const { environment, durableEnvironment, stateDirDotEnvEnvironment } = params;
  for (const rawKey of Object.keys(stateDirDotEnvEnvironment)) {
    const dotEnvValue = stateDirDotEnvEnvironment[rawKey];
    const normalizedKey = normalizeEnvVarKey(rawKey, { portable: true })?.toUpperCase();
    if (!normalizedKey || !managedKeys.has(normalizedKey)) {
      continue;
    }
    if (typeof dotEnvValue !== "string" || !dotEnvValue.trim()) {
      continue;
    }
    // Skip entries whose durable value differs from the .env value.
    if (durableEnvironment[rawKey] === dotEnvValue) {
      environment[rawKey] = dotEnvValue;
    }
  }
}
async function buildGatewayInstallEnvironment(params: {
env: Record<string, string | undefined>;
config?: OpenClawConfig;
@@ -408,6 +440,7 @@ async function buildGatewayInstallEnvironment(params: {
environment: Record<string, string | undefined>;
environmentValueSources: Record<string, GatewayServiceEnvironmentValueSource | undefined>;
}> {
const stateDirDotEnvEnvironment = readStateDirDotEnvVars(params.env);
const durableEnvironment = collectDurableServiceEnvVars({
env: params.env,
config: params.config,
@@ -463,6 +496,14 @@ async function buildGatewayInstallEnvironment(params: {
omitKeys: Object.keys(params.serviceEnvironment),
});
writeManagedServiceEnvKeysToEnvironment(environment, managedServiceEnvKeys);
retainLaunchAgentManagedServiceEnvValues({
environment,
durableEnvironment,
managedServiceEnvKeys,
stateDirDotEnvEnvironment,
serviceEnvironment: params.serviceEnvironment,
platform: params.platform,
});
if (environment.OPENCLAW_SERVICE_MANAGED_ENV_KEYS) {
environmentValueSources.OPENCLAW_SERVICE_MANAGED_ENV_KEYS = "inline";
}

View File

@@ -133,6 +133,48 @@ describe("provider-usage.load", () => {
}
});
it("keeps usage summary available when one provider fetch rejects", async () => {
  // The plugin resolver fails for anthropic and succeeds for every other provider.
  resolveProviderUsageSnapshotWithPluginMock.mockImplementation(
    async ({ provider }): Promise<ProviderUsageSnapshot | null> => {
      if (provider !== "anthropic") {
        return {
          provider: provider as ProviderUsageSnapshot["provider"],
          displayName: "Codex",
          windows: [{ label: "3h", usedPercent: 12 }],
        };
      }
      throw new Error("fetch failed");
    },
  );
  // The legacy fetch path must never be reached in this scenario.
  const mockFetch = createProviderUsageFetch(async () => {
    throw new Error("legacy fetch should not run");
  });
  const auths = [
    { provider: "anthropic", token: "token-a" },
    { provider: "openai-codex", token: "token-codex" },
  ];

  const summary = await loadUsageWithAuth(loadProviderUsageSummary, auths, mockFetch);

  // The rejected provider is reported as an error snapshot; the other is intact.
  const failedAnthropic = {
    provider: "anthropic",
    displayName: "Claude",
    windows: [],
    error: "fetch failed",
  };
  const healthyCodex = {
    provider: "openai-codex",
    displayName: "Codex",
    windows: [{ label: "3h", usedPercent: 12 }],
  };
  expect(summary.providers).toEqual([failedAnthropic, healthyCodex]);
});
it("throws when fetch is unavailable", async () => {
const previousFetch = globalThis.fetch;
vi.stubGlobal("fetch", undefined);

View File

@@ -103,8 +103,14 @@ export async function loadProviderUsageSummary(
return { updatedAt: now, providers: [] };
}
const tasks = auths.map((auth) =>
withTimeout(
const tasks = auths.map((auth) => {
const failureSnapshot = (error: string): ProviderUsageSnapshot => ({
provider: auth.provider,
displayName: PROVIDER_LABELS[auth.provider] ?? auth.provider,
windows: [],
error,
});
return withTimeout(
fetchProviderUsageSnapshot({
auth,
config,
@@ -121,8 +127,11 @@ export async function loadProviderUsageSummary(
windows: [],
error: "Timeout",
},
),
);
).catch((error: unknown) => {
const message = error instanceof Error ? error.message : String(error);
return failureSnapshot(message.trim() || "Fetch failed");
});
});
const snapshots = await Promise.all(tasks);
const providers = snapshots.filter((entry) => {

View File

@@ -256,6 +256,33 @@ describe.skipIf(isWindows)("restart-stale-pids", () => {
expect(pids).not.toContain(process.pid);
});
it("verifies argv when lsof reports the node process name instead of openclaw", () => {
  const stalePid = process.pid + 101;
  const succeedWith = (stdout: string) => ({ error: null, status: 0, stdout, stderr: "" });
  // `ps` answers with a gateway command line; every other spawn is treated as
  // lsof and reports the listener under the bare Node process name.
  mockSpawnSync.mockImplementation((command: unknown) =>
    command === "ps"
      ? succeedWith("node /opt/openclaw/dist/entry.js gateway\n")
      : succeedWith(lsofOutput([{ pid: stalePid, cmd: "cnode" }])),
  );

  const found = findGatewayPidsOnPortSync(18789);

  expect(found).toEqual([stalePid]);
  // NOTE(review): on Linux the code under test tries /proc/<pid>/cmdline
  // before falling back to ps; this assertion assumes that read yields
  // nothing for stalePid — confirm against the suite's fs mocks.
  expect(mockSpawnSync).toHaveBeenCalledWith(
    "ps",
    ["-ww", "-p", String(stalePid), "-o", "command="],
    expect.objectContaining({ timeout: 2000 }),
  );
});
it("excludes ancestor pids so a sidecar cannot kill its parent gateway — regression for #68451", () => {
// Regression: openclaw-weixin sidecar (child of the gateway) invoked
// cleanStaleGatewayProcessesSync during init. lsof reported the parent
@@ -1174,8 +1201,9 @@ describe.skipIf(isWindows)("restart-stale-pids", () => {
vi.spyOn(process, "kill").mockReturnValue(true);
// Should complete cleanly — no openclaw pids in status-1 output → free
expect(() => cleanStaleGatewayProcessesSync()).not.toThrow();
// Completed in exactly 2 calls (initial find + 1 free poll)
expect(getCallCount()).toBe(2);
// Completed with one argv verification after the status-1 poll output:
// initial lsof + poll lsof + ps argv check.
expect(getCallCount()).toBe(3);
});
});

View File

@@ -4,7 +4,7 @@ import path from "node:path";
import { resolveGatewayPort } from "../config/paths.js";
import { createSubsystemLogger } from "../logging/subsystem.js";
import { normalizeLowercaseStringOrEmpty } from "../shared/string-coerce.js";
import { isGatewayArgv } from "./gateway-process-argv.js";
import { isGatewayArgv, parseProcCmdline } from "./gateway-process-argv.js";
import { resolveLsofCommandSync } from "./ports-lsof.js";
import { getWindowsInstallRoots } from "./windows-install-roots.js";
import {
@@ -166,7 +166,7 @@ export function getSelfAndAncestorPidsSync(): Set<number> {
}
/**
* Parse openclaw gateway PIDs from lsof -Fpc stdout, excluding the current
* Parse raw PIDs from lsof -Fpc stdout, excluding the current
* process and its ancestors (see `getSelfAndAncestorPidsSync` for the full
* rationale). On Linux the ancestor lookup reads up to
* `MAX_ANCESTOR_WALK_DEPTH` entries from `/proc/<pid>/status`; each read is
@@ -174,19 +174,18 @@ export function getSelfAndAncestorPidsSync(): Set<number> {
* in try/catch and degrades silently. On macOS/Windows the lookup is
* in-memory via `process.ppid` only.
*/
function parsePidsFromLsofOutput(stdout: string): number[] {
const pids: number[] = [];
function parseLsofEntries(stdout: string): Array<{ pid: number; cmd?: string }> {
const entries: Array<{ pid: number; cmd?: string }> = [];
let currentPid: number | undefined;
let currentCmd: string | undefined;
const flush = () => {
if (currentPid != null) {
entries.push({ pid: currentPid, ...(currentCmd ? { cmd: currentCmd } : {}) });
}
};
for (const line of stdout.split(/\r?\n/).filter(Boolean)) {
if (line.startsWith("p")) {
if (
currentPid != null &&
currentCmd &&
normalizeLowercaseStringOrEmpty(currentCmd).includes("openclaw")
) {
pids.push(currentPid);
}
flush();
const parsed = Number.parseInt(line.slice(1), 10);
currentPid = Number.isFinite(parsed) && parsed > 0 ? parsed : undefined;
currentCmd = undefined;
@@ -194,19 +193,67 @@ function parsePidsFromLsofOutput(stdout: string): number[] {
currentCmd = line.slice(1);
}
}
if (
currentPid != null &&
currentCmd &&
normalizeLowercaseStringOrEmpty(currentCmd).includes("openclaw")
) {
pids.push(currentPid);
flush();
return entries;
}
/**
 * Split a single command-line string into argv-like tokens.
 *
 * Tokens are separated by whitespace. A span wrapped in double or single
 * quotes is returned as one token with the quotes removed. Empty tokens
 * (for example an empty pair of quotes) are dropped.
 *
 * @param raw - Command line text, e.g. the output of `ps -o command=`.
 * @returns The tokens in order of appearance; empty when `raw` has none.
 */
function parsePsCommandLine(raw: string): string[] {
  const tokenPattern = /"([^"]*)"|'([^']*)'|(\S+)/g;
  return Array.from(
    raw.matchAll(tokenPattern),
    ([, doubleQuoted, singleQuoted, bare]) => doubleQuoted ?? singleQuoted ?? bare,
  ).filter((token): token is string => Boolean(token));
}
/**
 * Read the argument vector of a running process.
 *
 * On Linux, `/proc/<pid>/cmdline` is tried first; if that read throws or
 * parses to no arguments, the lookup falls through to `ps`. On every other
 * platform `ps -ww -p <pid> -o command=` is used directly.
 *
 * @param pid - Process id to inspect.
 * @param spawnTimeoutMs - Timeout passed to the `ps` spawn.
 * @returns The parsed arguments, or `null` when `ps` errors, exits non-zero,
 *   or prints only whitespace.
 */
function readUnixProcessArgsSync(pid: number, spawnTimeoutMs: number): string[] | null {
  if (process.platform === "linux") {
    try {
      const procArgs = parseProcCmdline(readFileSync(`/proc/${pid}/cmdline`, "utf8"));
      if (procArgs.length > 0) {
        return procArgs;
      }
    } catch {
      // Fall back to ps below; /proc may be unavailable or restricted.
    }
  }

  const psResult = spawnSync("ps", ["-ww", "-p", String(pid), "-o", "command="], {
    encoding: "utf8",
    timeout: spawnTimeoutMs,
  });
  const psSucceeded = !psResult.error && psResult.status === 0;
  const commandLine = psSucceeded ? psResult.stdout.trim() : "";
  return commandLine ? parsePsCommandLine(commandLine) : null;
}
/**
 * Decide whether a process is a gateway by inspecting its argument vector.
 *
 * @param pid - Process id to inspect.
 * @param spawnTimeoutMs - Timeout forwarded to the argument lookup.
 * @returns `false` when the arguments cannot be read; otherwise the result of
 *   `isGatewayArgv` with `allowGatewayBinary` enabled.
 */
function verifyGatewayPidByArgvSync(pid: number, spawnTimeoutMs: number): boolean {
  const argv = readUnixProcessArgsSync(pid, spawnTimeoutMs);
  if (argv == null) {
    return false;
  }
  return isGatewayArgv(argv, { allowGatewayBinary: true });
}
/**
 * Resolve gateway PIDs from `lsof -Fpc` stdout.
 *
 * An entry is accepted when its lsof command name contains "openclaw" after
 * normalization, or, failing that, when its argument vector is verified as a
 * gateway. The second path covers listeners that lsof reports only under the
 * Node process name.
 *
 * @param stdout - Raw `lsof -Fpc` output.
 * @param spawnTimeoutMs - Timeout forwarded to the per-PID argv verification.
 * @returns Unique PIDs in first-seen order, never including the current
 *   process or any of its ancestors.
 */
function parsePidsFromLsofOutput(stdout: string, spawnTimeoutMs: number): number[] {
  // Exclude self and ancestors — terminating any ancestor cascade-kills the
  // caller via the supervisor, recreating the #68451 restart loop.
  const excluded = getSelfAndAncestorPidsSync();
  const pids: number[] = [];
  for (const entry of parseLsofEntries(stdout)) {
    if (excluded.has(entry.pid)) {
      continue;
    }
    if (entry.cmd && normalizeLowercaseStringOrEmpty(entry.cmd).includes("openclaw")) {
      pids.push(entry.pid);
      continue;
    }
    // The command name alone is inconclusive; fall back to checking argv.
    if (verifyGatewayPidByArgvSync(entry.pid, spawnTimeoutMs)) {
      pids.push(entry.pid);
    }
  }
  // Deduplicate: dual-stack listeners (IPv4 + IPv6) cause lsof to emit the
  // same PID twice. Return each PID at most once to avoid double-killing.
  return [...new Set(pids)];
}
/**
@@ -298,7 +345,7 @@ export function findGatewayPidsOnPortSync(
);
return [];
}
return parsePidsFromLsofOutput(res.stdout);
return parsePidsFromLsofOutput(res.stdout, spawnTimeoutMs);
}
/**
@@ -345,7 +392,7 @@ function pollPortOnce(port: number): PollResult {
// user namespaces), lsof can exit 1 AND still emit some output for the
// processes it could read. Parse stdout when non-empty to avoid false-free.
if (res.stdout) {
const pids = parsePidsFromLsofOutput(res.stdout);
const pids = parsePidsFromLsofOutput(res.stdout, POLL_SPAWN_TIMEOUT_MS);
return pids.length === 0 ? { free: true } : { free: false };
}
return { free: true };
@@ -358,7 +405,7 @@ function pollPortOnce(port: number): PollResult {
}
// status === 0: lsof found listeners. Parse pids from the stdout we
// already hold — no second lsof spawn, no new failure surface.
const pids = parsePidsFromLsofOutput(res.stdout);
const pids = parsePidsFromLsofOutput(res.stdout, POLL_SPAWN_TIMEOUT_MS);
return pids.length === 0 ? { free: true } : { free: false };
} catch {
return { free: null, permanent: false };

View File

@@ -1,4 +1,5 @@
import fs from "node:fs";
import path from "node:path";
import { afterAll, afterEach, beforeAll, describe, expect, it } from "vitest";
import {
createDiagnosticTraceContext,
@@ -7,12 +8,14 @@ import {
} from "../infra/diagnostic-trace-context.js";
import { getChildLogger, getLogger, resetLogger, setLoggerOverride } from "../logging.js";
import { createSuiteLogPathTracker } from "./log-test-helpers.js";
import { __test__ as loggerTest } from "./logger.js";
const secret = "sk-testsecret1234567890abcd";
const TRACE_ID = "4bf92f3577b34da6a3ce929d0e0e4736";
const SPAN_ID = "00f067aa0ba902b7";
const logPathTracker = createSuiteLogPathTracker("openclaw-log-redaction-");
const originalConfigPath = process.env.OPENCLAW_CONFIG_PATH;
const originalHome = process.env.HOME;
const originalTestFileLog = process.env.OPENCLAW_TEST_FILE_LOG;
beforeAll(async () => {
@@ -25,6 +28,11 @@ afterEach(() => {
} else {
process.env.OPENCLAW_CONFIG_PATH = originalConfigPath;
}
if (originalHome === undefined) {
delete process.env.HOME;
} else {
process.env.HOME = originalHome;
}
if (originalTestFileLog === undefined) {
delete process.env.OPENCLAW_TEST_FILE_LOG;
} else {
@@ -84,6 +92,15 @@ describe("file log redaction", () => {
expect(content).toContain("configured log path works");
});
it("expands leading tilde in logging.file", () => {
  // Point HOME at a throwaway directory next to the suite's tracked log paths.
  const fakeHome = path.join(path.dirname(logPathTracker.nextPath()), "home");
  process.env.HOME = fakeHome;

  const resolved = loggerTest.resolveActiveLogFile("~/custom-openclaw.log");

  expect(resolved).toBe(path.join(fakeHome, "custom-openclaw.log"));
});
it("writes trace context as top-level JSONL fields", () => {
const logPath = logPathTracker.nextPath();
setLoggerOverride({ level: "info", file: logPath });

View File

@@ -11,6 +11,7 @@ import {
isValidDiagnosticTraceId,
type DiagnosticTraceContext,
} from "../infra/diagnostic-trace-context.js";
import { expandHomePrefix } from "../infra/home-dir.js";
import { isBlockedObjectKey } from "../infra/prototype-keys.js";
import {
POSIX_OPENCLAW_TMP_DIR,
@@ -672,6 +673,7 @@ export function resetLogger() {
}
export const __test__ = {
resolveActiveLogFile,
shouldSkipMutatingLoggingConfigRead,
};
@@ -692,10 +694,11 @@ function rollingPathForDate(dir: string, date: Date): string {
}
/**
 * Resolve the file the logger should write to right now.
 *
 * The configured path is first passed through `expandHomePrefix`, so a
 * home-relative value such as `~/custom-openclaw.log` becomes a path under
 * the home directory. A non-rolling path is then returned as-is; a rolling
 * path is mapped to today's file in the same directory.
 *
 * @param file - Configured log file path; may start with `~`.
 * @returns The expanded path, or the dated rolling path for rolling files.
 */
function resolveActiveLogFile(file: string): string {
  const expandedFile = expandHomePrefix(file);
  if (!isRollingPath(expandedFile)) {
    return expandedFile;
  }
  return rollingPathForDate(path.dirname(expandedFile), new Date());
}
function isRollingPath(file: string): boolean {