fix: stabilize remote skill node probes

This commit is contained in:
Peter Steinberger
2026-04-25 23:41:19 +01:00
parent e4bfc8066e
commit b1b29a8fc2
6 changed files with 365 additions and 23 deletions

View File

@@ -69,6 +69,13 @@ Docs: https://docs.openclaw.ai
- Providers/Azure OpenAI: give deployment-scoped image generation requests a
longer 600s default timeout so slow `gpt-image-2` generations can complete
without a per-call `timeoutMs`. Fixes #71705. Thanks @voytas75.
- Gateway/plugins: link source-checkout bundled runtime dependency caches instead
of recursively copying `node_modules` on the gateway main thread, preventing
local status, node, and skill probes from timing out during startup cache
restores. Thanks @steipete.
- Skills/remote nodes: only expose remote macOS skill bins for connected nodes,
clear stale bin matches when node probes fail, and include probe command,
timeout, bin count, and connection state in timeout logs. Thanks @steipete.
- CLI/gateway: keep diagnostic probes from creating first-time read-only device
pairings, while still reusing cached device tokens for detailed read probes.
Fixes #71766. Thanks @SunboZ.

View File

@@ -349,7 +349,7 @@ agent.
If the Gateway is running on Linux but a **macOS node** is connected **with `system.run` allowed** (Exec approvals security not set to `deny`), OpenClaw can treat macOS-only skills as eligible when the required binaries are present on that node. The agent should execute those skills via the `exec` tool with `host=node`.
This relies on the node reporting its command support and on a bin probe via `system.run`. If the macOS node goes offline later, the skills remain visible; invocations may fail until the node reconnects.
This relies on the node reporting its command support and on a bin probe via `system.which` or `system.run`. Offline nodes do not make remote-only skills visible. If a connected node stops answering bin probes, OpenClaw clears its cached bin matches so agents no longer see skills that cannot currently run there.
## Skills watcher (auto-refresh)

View File

@@ -1,14 +1,25 @@
import { randomUUID } from "node:crypto";
import { describe, expect, it } from "vitest";
import fs from "node:fs";
import os from "node:os";
import path from "node:path";
import { afterEach, describe, expect, it } from "vitest";
import { getSkillsSnapshotVersion, resetSkillsRefreshForTest } from "../agents/skills/refresh.js";
import type { OpenClawConfig } from "../config/types.openclaw.js";
import type { NodeRegistry } from "../gateway/node-registry.js";
import {
getRemoteSkillEligibility,
recordRemoteNodeBins,
recordRemoteNodeInfo,
removeRemoteNodeInfo,
refreshRemoteNodeBins,
setSkillsRemoteRegistry,
} from "./skills-remote.js";
describe("skills-remote", () => {
afterEach(() => {
setSkillsRemoteRegistry(null);
});
it("removes disconnected nodes from remote skill eligibility", () => {
const nodeId = `node-${randomUUID()}`;
const bin = `bin-${randomUUID()}`;
@@ -134,4 +145,156 @@ describe("skills-remote", () => {
removeRemoteNodeInfo(nodeId);
}
});
it("does not expose bins for nodes that only have cached paired metadata", () => {
const nodeId = `node-${randomUUID()}`;
const bin = `bin-${randomUUID()}`;
try {
recordRemoteNodeBins(nodeId, [bin]);
expect(getRemoteSkillEligibility()?.hasBin(bin) ?? false).toBe(false);
} finally {
removeRemoteNodeInfo(nodeId);
}
});
it("clears stale bins when a connected node probe times out", async () => {
await resetSkillsRefreshForTest();
const workspaceDir = fs.mkdtempSync(path.join(os.tmpdir(), "openclaw-remote-skills-"));
const nodeId = `node-${randomUUID()}`;
const bin = `bin-${randomUUID()}`;
try {
fs.mkdirSync(path.join(workspaceDir, "remote-skill"), { recursive: true });
fs.writeFileSync(
path.join(workspaceDir, "remote-skill", "SKILL.md"),
[
"---",
"name: remote-skill",
"description: Needs a remote bin",
`metadata: { "openclaw": { "os": ["darwin"], "requires": { "bins": ["${bin}"] } } }`,
"---",
"# Remote Skill",
"",
].join("\n"),
);
const cfg = {
agents: {
defaults: {
workspace: workspaceDir,
},
},
} satisfies OpenClawConfig;
const invokeCalls: string[] = [];
setSkillsRemoteRegistry({
listConnected: () => [],
get: () => undefined,
invoke: async (params: { command: string }) => {
invokeCalls.push(params.command);
return {
ok: false,
error: { code: "TIMEOUT", message: "node invoke timed out" },
};
},
} as unknown as NodeRegistry);
recordRemoteNodeInfo({
nodeId,
displayName: "Remote Mac",
platform: "darwin",
commands: ["system.run", "system.which"],
});
recordRemoteNodeBins(nodeId, [bin]);
const before = getSkillsSnapshotVersion(workspaceDir);
await refreshRemoteNodeBins({
nodeId,
platform: "darwin",
commands: ["system.run", "system.which"],
cfg,
timeoutMs: 10,
});
expect(invokeCalls).toEqual(["system.which"]);
expect(getRemoteSkillEligibility()?.hasBin(bin) ?? false).toBe(false);
expect(getSkillsSnapshotVersion(workspaceDir)).toBeGreaterThan(before);
} finally {
removeRemoteNodeInfo(nodeId);
fs.rmSync(workspaceDir, { recursive: true, force: true });
}
});
it("coalesces overlapping bin probes for the same node", async () => {
const workspaceDir = fs.mkdtempSync(path.join(os.tmpdir(), "openclaw-remote-skills-"));
const nodeId = `node-${randomUUID()}`;
const bin = `bin-${randomUUID()}`;
let invokeCount = 0;
let releaseProbe!: () => void;
const probeStarted = new Promise<void>((resolve) => {
setSkillsRemoteRegistry({
listConnected: () => [],
get: () => undefined,
invoke: async () => {
invokeCount += 1;
resolve();
await new Promise<void>((release) => {
releaseProbe = release;
});
return {
ok: false,
error: { code: "TIMEOUT", message: "node invoke timed out" },
};
},
} as unknown as NodeRegistry);
});
try {
fs.mkdirSync(path.join(workspaceDir, "remote-skill"), { recursive: true });
fs.writeFileSync(
path.join(workspaceDir, "remote-skill", "SKILL.md"),
[
"---",
"name: remote-skill",
"description: Needs a remote bin",
`metadata: { "openclaw": { "os": ["darwin"], "requires": { "bins": ["${bin}"] } } }`,
"---",
"# Remote Skill",
"",
].join("\n"),
);
const cfg = {
agents: {
defaults: {
workspace: workspaceDir,
},
},
} satisfies OpenClawConfig;
recordRemoteNodeInfo({
nodeId,
displayName: "Remote Mac",
platform: "darwin",
commands: ["system.run", "system.which"],
});
const first = refreshRemoteNodeBins({
nodeId,
platform: "darwin",
commands: ["system.run", "system.which"],
cfg,
timeoutMs: 10,
});
await probeStarted;
const second = refreshRemoteNodeBins({
nodeId,
platform: "darwin",
commands: ["system.run", "system.which"],
cfg,
timeoutMs: 10,
});
releaseProbe();
await Promise.all([first, second]);
expect(invokeCount).toBe(1);
} finally {
removeRemoteNodeInfo(nodeId);
fs.rmSync(workspaceDir, { recursive: true, force: true });
}
});
});

View File

@@ -18,11 +18,13 @@ type RemoteNodeRecord = {
deviceFamily?: string;
commands?: string[];
bins: Set<string>;
connected: boolean;
remoteIp?: string;
};
const log = createSubsystemLogger("gateway/skills-remote");
const remoteNodes = new Map<string, RemoteNodeRecord>();
const remoteBinProbeInflight = new Map<string, Promise<void>>();
let remoteRegistry: NodeRegistry | null = null;
function describeNode(nodeId: string): string {
@@ -62,20 +64,36 @@ function extractErrorMessage(err: unknown): string | undefined {
return undefined;
}
function logRemoteBinProbeFailure(nodeId: string, err: unknown) {
function logRemoteBinProbeFailure(
nodeId: string,
err: unknown,
context?: { command?: string; timeoutMs?: number; requiredBinCount?: number },
) {
const message = extractErrorMessage(err);
const label = describeNode(nodeId);
const details = [
context?.command ? `command=${context.command}` : undefined,
typeof context?.timeoutMs === "number" ? `timeoutMs=${context.timeoutMs}` : undefined,
typeof context?.requiredBinCount === "number"
? `requiredBins=${context.requiredBinCount}`
: undefined,
`connected=${remoteNodes.get(nodeId)?.connected === true ? "yes" : "no"}`,
]
.filter(Boolean)
.join(" ");
// Node unavailable errors (not connected or disconnected mid-operation) are expected
// when nodes have transient connections - log at info level instead of warn
if (message?.includes("node not connected") || message?.includes("node disconnected")) {
log.info(`remote bin probe skipped: node unavailable (${label})`);
log.info(`remote bin probe skipped: node unavailable (${label}; ${details})`);
return;
}
if (message?.includes("invoke timed out") || message?.includes("timeout")) {
log.warn(`remote bin probe timed out (${label}); check node connectivity for ${label}`);
log.warn(
`remote bin probe timed out (${label}; ${details}); check node connectivity for ${label}`,
);
return;
}
log.warn(`remote bin probe error (${label}): ${message ?? "unknown"}`);
log.warn(`remote bin probe error (${label}; ${details}): ${message ?? "unknown"}`);
}
function isMacPlatform(platform?: string, deviceFamily?: string): boolean {
@@ -109,6 +127,7 @@ function upsertNode(record: {
commands?: string[];
remoteIp?: string;
bins?: string[];
connected?: boolean;
}) {
const existing = remoteNodes.get(record.nodeId);
const bins = new Set<string>(record.bins ?? existing?.bins ?? []);
@@ -120,9 +139,19 @@ function upsertNode(record: {
commands: record.commands ?? existing?.commands,
remoteIp: record.remoteIp ?? existing?.remoteIp,
bins,
connected: record.connected ?? existing?.connected ?? false,
});
}
function clearRemoteNodeBins(nodeId: string): boolean {
const existing = remoteNodes.get(nodeId);
if (!existing || existing.bins.size === 0) {
return false;
}
existing.bins = new Set();
return true;
}
export function setSkillsRemoteRegistry(registry: NodeRegistry | null) {
remoteRegistry = registry;
}
@@ -140,8 +169,14 @@ export async function primeRemoteSkillsCache() {
commands: node.commands,
remoteIp: node.remoteIp,
bins: node.bins,
connected: false,
});
if (isMacPlatform(node.platform, node.deviceFamily) && supportsSystemRun(node.commands)) {
if (
node.bins &&
node.bins.length > 0 &&
isMacPlatform(node.platform, node.deviceFamily) &&
supportsSystemRun(node.commands)
) {
sawMac = true;
}
}
@@ -161,7 +196,7 @@ export function recordRemoteNodeInfo(node: {
commands?: string[];
remoteIp?: string;
}) {
upsertNode(node);
upsertNode({ ...node, connected: true });
}
export function recordRemoteNodeBins(nodeId: string, bins: string[]) {
@@ -253,6 +288,28 @@ export async function refreshRemoteNodeBins(params: {
commands?: string[];
cfg: OpenClawConfig;
timeoutMs?: number;
}) {
const existing = remoteBinProbeInflight.get(params.nodeId);
if (existing) {
await existing;
return;
}
const run = refreshRemoteNodeBinsUncoalesced(params).finally(() => {
if (remoteBinProbeInflight.get(params.nodeId) === run) {
remoteBinProbeInflight.delete(params.nodeId);
}
});
remoteBinProbeInflight.set(params.nodeId, run);
await run;
}
async function refreshRemoteNodeBinsUncoalesced(params: {
nodeId: string;
platform?: string;
deviceFamily?: string;
commands?: string[];
cfg: OpenClawConfig;
timeoutMs?: number;
}) {
if (!remoteRegistry) {
return;
@@ -278,27 +335,34 @@ export async function refreshRemoteNodeBins(params: {
return;
}
const binsList = [...requiredBins];
const timeoutMs = params.timeoutMs ?? 15_000;
const command = canWhich ? "system.which" : "system.run";
const logContext = { command, timeoutMs, requiredBinCount: binsList.length };
try {
const binsList = [...requiredBins];
const res = await remoteRegistry.invoke(
canWhich
? {
nodeId: params.nodeId,
command: "system.which",
command,
params: { bins: binsList },
timeoutMs: params.timeoutMs ?? 15_000,
timeoutMs,
}
: {
nodeId: params.nodeId,
command: "system.run",
command,
params: {
command: ["/bin/sh", "-lc", buildBinProbeScript(binsList)],
},
timeoutMs: params.timeoutMs ?? 15_000,
timeoutMs,
},
);
if (!res.ok) {
logRemoteBinProbeFailure(params.nodeId, res.error?.message ?? "unknown");
const cleared = clearRemoteNodeBins(params.nodeId);
logRemoteBinProbeFailure(params.nodeId, res.error?.message ?? "unknown", logContext);
if (cleared) {
bumpSkillsSnapshotVersion({ reason: "remote-node" });
}
return;
}
const bins = parseBinProbePayload(res.payloadJSON, res.payload);
@@ -312,7 +376,11 @@ export async function refreshRemoteNodeBins(params: {
await updatePairedNodeMetadata(params.nodeId, { bins });
bumpSkillsSnapshotVersion({ reason: "remote-node" });
} catch (err) {
logRemoteBinProbeFailure(params.nodeId, err);
const cleared = clearRemoteNodeBins(params.nodeId);
logRemoteBinProbeFailure(params.nodeId, err, logContext);
if (cleared) {
bumpSkillsSnapshotVersion({ reason: "remote-node" });
}
}
}
@@ -320,7 +388,10 @@ export function getRemoteSkillEligibility(options?: {
advertiseExecNode?: boolean;
}): SkillEligibilityContext["remote"] | undefined {
const macNodes = [...remoteNodes.values()].filter(
(node) => isMacPlatform(node.platform, node.deviceFamily) && supportsSystemRun(node.commands),
(node) =>
node.connected &&
isMacPlatform(node.platform, node.deviceFamily) &&
supportsSystemRun(node.commands),
);
if (macNodes.length === 0) {
return undefined;

View File

@@ -817,6 +817,60 @@ describe("ensureBundledPluginRuntimeDeps", () => {
).toEqual({ installedSpecs: [], retainSpecs: [] });
});
it("links source-checkout runtime deps from the cache instead of copying them", () => {
const packageRoot = makeTempDir();
fs.writeFileSync(
path.join(packageRoot, "package.json"),
JSON.stringify({ name: "openclaw", version: "2026.4.25" }),
);
fs.writeFileSync(path.join(packageRoot, "pnpm-workspace.yaml"), "packages: []\n");
fs.mkdirSync(path.join(packageRoot, "src"), { recursive: true });
fs.mkdirSync(path.join(packageRoot, "extensions"), { recursive: true });
const pluginRoot = path.join(packageRoot, "dist", "extensions", "voice-call");
fs.mkdirSync(pluginRoot, { recursive: true });
fs.writeFileSync(
path.join(pluginRoot, "package.json"),
JSON.stringify({ dependencies: { "voice-runtime": "1.0.0" } }),
);
spawnSyncMock.mockImplementation((_command, _args, options) => {
const cwd = String(options?.cwd);
expect(cwd).toContain(path.join(".local", "bundled-plugin-runtime-deps"));
const depRoot = path.join(cwd, "node_modules", "voice-runtime");
fs.mkdirSync(depRoot, { recursive: true });
fs.writeFileSync(
path.join(depRoot, "package.json"),
JSON.stringify({ name: "voice-runtime", version: "1.0.0" }),
);
return { status: 0, stdout: "", stderr: "" } as ReturnType<typeof spawnSync>;
});
expect(
ensureBundledPluginRuntimeDeps({
env: {},
pluginId: "voice-call",
pluginRoot,
}),
).toEqual({
installedSpecs: ["voice-runtime@1.0.0"],
retainSpecs: ["voice-runtime@1.0.0"],
});
expect(spawnSyncMock).toHaveBeenCalledTimes(1);
expect(fs.lstatSync(path.join(pluginRoot, "node_modules")).isSymbolicLink()).toBe(true);
fs.rmSync(path.join(pluginRoot, "node_modules"), { recursive: true, force: true });
expect(
ensureBundledPluginRuntimeDeps({
env: {},
installDeps: () => {
throw new Error("cache restore should not reinstall");
},
pluginId: "voice-call",
pluginRoot,
}),
).toEqual({ installedSpecs: [], retainSpecs: [] });
expect(fs.lstatSync(path.join(pluginRoot, "node_modules")).isSymbolicLink()).toBe(true);
});
it("retains existing staged deps without a retained manifest before shared installs", () => {
const packageRoot = makeTempDir();
const stageDir = makeTempDir();
@@ -1194,6 +1248,7 @@ describe("ensureBundledPluginRuntimeDeps", () => {
installExecutionRoot: expect.stringContaining(
path.join(".local", "bundled-plugin-runtime-deps"),
),
linkNodeModulesFromExecutionRoot: true,
missingSpecs: ["tokenjuice@0.6.1"],
installSpecs: ["tokenjuice@0.6.1"],
},
@@ -1276,6 +1331,7 @@ describe("ensureBundledPluginRuntimeDeps", () => {
installExecutionRoot: expect.stringContaining(
path.join(".local", "bundled-plugin-runtime-deps"),
),
linkNodeModulesFromExecutionRoot: true,
missingSpecs: ["acpx@0.5.3"],
installSpecs: ["acpx@0.5.3"],
},

View File

@@ -27,6 +27,7 @@ export type RuntimeDepConflict = {
export type BundledRuntimeDepsInstallParams = {
installRoot: string;
installExecutionRoot?: string;
linkNodeModulesFromExecutionRoot?: boolean;
missingSpecs: string[];
installSpecs?: string[];
};
@@ -701,6 +702,31 @@ function replaceNodeModulesDir(targetDir: string, sourceDir: string): void {
}
}
function linkNodeModulesDir(targetDir: string, sourceDir: string): boolean {
const parentDir = path.dirname(targetDir);
const tempLink = path.join(parentDir, `.openclaw-runtime-deps-link-${process.pid}-${Date.now()}`);
try {
fs.symlinkSync(sourceDir, tempLink, process.platform === "win32" ? "junction" : "dir");
fs.rmSync(targetDir, { recursive: true, force: true });
fs.renameSync(tempLink, targetDir);
return true;
} catch {
try {
fs.rmSync(tempLink, { recursive: true, force: true });
} catch {
// Best-effort cleanup; caller falls back to copying.
}
return false;
}
}
function replaceNodeModulesDirFromCache(targetDir: string, sourceDir: string): void {
if (linkNodeModulesDir(targetDir, sourceDir)) {
return;
}
replaceNodeModulesDir(targetDir, sourceDir);
}
function restoreSourceCheckoutRuntimeDepsFromCache(params: {
cacheDir: string | null;
deps: readonly { name: string }[];
@@ -714,7 +740,10 @@ function restoreSourceCheckoutRuntimeDepsFromCache(params: {
return false;
}
try {
replaceNodeModulesDir(path.join(params.installRoot, "node_modules"), cachedNodeModulesDir);
replaceNodeModulesDirFromCache(
path.join(params.installRoot, "node_modules"),
cachedNodeModulesDir,
);
return true;
} catch {
return false;
@@ -1153,6 +1182,7 @@ function ensureNpmInstallExecutionManifest(installExecutionRoot: string): void {
export function installBundledRuntimeDeps(params: {
installRoot: string;
installExecutionRoot?: string;
linkNodeModulesFromExecutionRoot?: boolean;
missingSpecs: string[];
env: NodeJS.ProcessEnv;
}): void {
@@ -1198,7 +1228,12 @@ export function installBundledRuntimeDeps(params: {
if (!fs.existsSync(stagedNodeModulesDir)) {
throw new Error("npm install did not produce node_modules");
}
replaceNodeModulesDir(path.join(params.installRoot, "node_modules"), stagedNodeModulesDir);
const targetNodeModulesDir = path.join(params.installRoot, "node_modules");
if (params.linkNodeModulesFromExecutionRoot) {
replaceNodeModulesDirFromCache(targetNodeModulesDir, stagedNodeModulesDir);
} else {
replaceNodeModulesDir(targetNodeModulesDir, stagedNodeModulesDir);
}
}
} finally {
if (cleanInstallExecutionRoot) {
@@ -1314,9 +1349,7 @@ export function ensureBundledPluginRuntimeDeps(params: {
});
const isPluginRootInstall = path.resolve(installRoot) === path.resolve(params.pluginRoot);
const sourceCheckoutCacheStage =
cacheDir &&
isPluginRootInstall &&
resolveSourceCheckoutBundledPluginPackageRoot(params.pluginRoot)
cacheDir && isPluginRootInstall && resolveSourceCheckoutPackageRoot(params.pluginRoot)
? cacheDir
: undefined;
const installExecutionRoot =
@@ -1338,14 +1371,26 @@ export function ensureBundledPluginRuntimeDeps(params: {
installBundledRuntimeDeps({
installRoot: installParams.installRoot,
installExecutionRoot: installParams.installExecutionRoot,
linkNodeModulesFromExecutionRoot: installParams.linkNodeModulesFromExecutionRoot,
missingSpecs: installParams.installSpecs ?? installParams.missingSpecs,
env: params.env,
}));
install({ installRoot, installExecutionRoot, missingSpecs, installSpecs });
install({
installRoot,
installExecutionRoot,
...(sourceCheckoutCacheStage ? { linkNodeModulesFromExecutionRoot: true } : {}),
missingSpecs,
installSpecs,
});
const cacheAlreadyPopulated = Boolean(
sourceCheckoutCacheStage && hasAllDependencySentinels(sourceCheckoutCacheStage, deps),
);
if (persistRetainedManifest) {
writeRetainedRuntimeDepsManifest(installRoot, installSpecs);
}
storeSourceCheckoutRuntimeDepsCache({ cacheDir, installRoot });
if (!cacheAlreadyPopulated) {
storeSourceCheckoutRuntimeDepsCache({ cacheDir, installRoot });
}
return { installedSpecs: missingSpecs, retainSpecs: installSpecs };
});
}