mirror of
https://github.com/openclaw/openclaw.git
synced 2026-04-27 17:11:46 +00:00
fix(gateway): harden tailscale lock recovery
This commit is contained in:
@@ -59,74 +59,141 @@ function createOwnerStore() {
|
||||
};
|
||||
}
|
||||
|
||||
describe("startGatewayTailscaleExposure", () => {
|
||||
beforeEach(() => {
|
||||
vi.clearAllMocks();
|
||||
});
|
||||
const modeCases = [
|
||||
{
|
||||
mode: "serve" as const,
|
||||
enableMock: tailscaleState.enableServe,
|
||||
disableMock: tailscaleState.disableServe,
|
||||
},
|
||||
{
|
||||
mode: "funnel" as const,
|
||||
enableMock: tailscaleState.enableFunnel,
|
||||
disableMock: tailscaleState.disableFunnel,
|
||||
},
|
||||
];
|
||||
|
||||
it("skips stale serve cleanup after a newer gateway takes ownership", async () => {
|
||||
const ownerStore = createOwnerStore();
|
||||
const logTailscale = {
|
||||
info: vi.fn(),
|
||||
warn: vi.fn(),
|
||||
};
|
||||
|
||||
const cleanupA = await startGatewayTailscaleExposure({
|
||||
tailscaleMode: "serve",
|
||||
resetOnExit: true,
|
||||
port: 18789,
|
||||
logTailscale,
|
||||
ownerStore,
|
||||
});
|
||||
const cleanupB = await startGatewayTailscaleExposure({
|
||||
tailscaleMode: "serve",
|
||||
resetOnExit: true,
|
||||
port: 18789,
|
||||
logTailscale,
|
||||
ownerStore,
|
||||
describe.each(modeCases)(
|
||||
"startGatewayTailscaleExposure (%s)",
|
||||
({ mode, enableMock, disableMock }) => {
|
||||
beforeEach(() => {
|
||||
vi.restoreAllMocks();
|
||||
vi.clearAllMocks();
|
||||
});
|
||||
|
||||
await cleanupA?.();
|
||||
expect(tailscaleState.disableServe).not.toHaveBeenCalled();
|
||||
expect(logTailscale.info).toHaveBeenCalledWith("serve cleanup skipped: not the current owner");
|
||||
it("skips stale cleanup after a newer gateway takes ownership", async () => {
|
||||
const ownerStore = createOwnerStore();
|
||||
const logTailscale = {
|
||||
info: vi.fn(),
|
||||
warn: vi.fn(),
|
||||
};
|
||||
|
||||
await cleanupB?.();
|
||||
expect(tailscaleState.disableServe).toHaveBeenCalledTimes(1);
|
||||
});
|
||||
const cleanupA = await startGatewayTailscaleExposure({
|
||||
tailscaleMode: mode,
|
||||
resetOnExit: true,
|
||||
port: 18789,
|
||||
logTailscale,
|
||||
ownerStore,
|
||||
});
|
||||
const cleanupB = await startGatewayTailscaleExposure({
|
||||
tailscaleMode: mode,
|
||||
resetOnExit: true,
|
||||
port: 18789,
|
||||
logTailscale,
|
||||
ownerStore,
|
||||
});
|
||||
|
||||
it("restores the previous owner after a takeover startup failure", async () => {
|
||||
const ownerStore = createOwnerStore();
|
||||
const logTailscale = {
|
||||
info: vi.fn(),
|
||||
warn: vi.fn(),
|
||||
};
|
||||
await cleanupA?.();
|
||||
expect(disableMock).not.toHaveBeenCalled();
|
||||
expect(logTailscale.info).toHaveBeenCalledWith(
|
||||
`${mode} cleanup skipped: not the current owner`,
|
||||
);
|
||||
|
||||
const cleanupA = await startGatewayTailscaleExposure({
|
||||
tailscaleMode: "serve",
|
||||
resetOnExit: true,
|
||||
port: 18789,
|
||||
logTailscale,
|
||||
ownerStore,
|
||||
await cleanupB?.();
|
||||
expect(disableMock).toHaveBeenCalledTimes(1);
|
||||
});
|
||||
|
||||
tailscaleState.enableServe.mockRejectedValueOnce(new Error("boom"));
|
||||
it("restores the previous live owner after a takeover startup failure", async () => {
|
||||
const ownerStore = createOwnerStore();
|
||||
const logTailscale = {
|
||||
info: vi.fn(),
|
||||
warn: vi.fn(),
|
||||
};
|
||||
|
||||
const cleanupB = await startGatewayTailscaleExposure({
|
||||
tailscaleMode: "serve",
|
||||
resetOnExit: true,
|
||||
port: 18789,
|
||||
logTailscale,
|
||||
ownerStore,
|
||||
const cleanupA = await startGatewayTailscaleExposure({
|
||||
tailscaleMode: mode,
|
||||
resetOnExit: true,
|
||||
port: 18789,
|
||||
logTailscale,
|
||||
ownerStore,
|
||||
});
|
||||
|
||||
enableMock.mockRejectedValueOnce(new Error("boom"));
|
||||
|
||||
const cleanupB = await startGatewayTailscaleExposure({
|
||||
tailscaleMode: mode,
|
||||
resetOnExit: true,
|
||||
port: 18789,
|
||||
logTailscale,
|
||||
ownerStore,
|
||||
});
|
||||
|
||||
expect(cleanupB).not.toBeNull();
|
||||
expect(logTailscale.warn).toHaveBeenCalledWith(`${mode} failed: boom`);
|
||||
|
||||
await cleanupB?.();
|
||||
expect(disableMock).not.toHaveBeenCalled();
|
||||
expect(logTailscale.info).toHaveBeenCalledWith(
|
||||
`${mode} cleanup skipped: not the current owner`,
|
||||
);
|
||||
|
||||
await cleanupA?.();
|
||||
expect(disableMock).toHaveBeenCalledTimes(1);
|
||||
});
|
||||
|
||||
expect(cleanupB).not.toBeNull();
|
||||
expect(logTailscale.warn).toHaveBeenCalledWith("serve failed: boom");
|
||||
it("keeps the failed owner when the previous owner is already gone", async () => {
|
||||
const ownerStore = createOwnerStore();
|
||||
const logTailscale = {
|
||||
info: vi.fn(),
|
||||
warn: vi.fn(),
|
||||
};
|
||||
|
||||
await cleanupB?.();
|
||||
expect(tailscaleState.disableServe).not.toHaveBeenCalled();
|
||||
expect(logTailscale.info).toHaveBeenCalledWith("serve cleanup skipped: not the current owner");
|
||||
const cleanupA = await startGatewayTailscaleExposure({
|
||||
tailscaleMode: mode,
|
||||
resetOnExit: true,
|
||||
port: 18789,
|
||||
logTailscale,
|
||||
ownerStore,
|
||||
});
|
||||
|
||||
await cleanupA?.();
|
||||
expect(tailscaleState.disableServe).toHaveBeenCalledTimes(1);
|
||||
});
|
||||
});
|
||||
vi.spyOn(process, "kill").mockImplementation(((pid: number) => {
|
||||
if (pid === 1) {
|
||||
const err = new Error("gone") as NodeJS.ErrnoException;
|
||||
err.code = "ESRCH";
|
||||
throw err;
|
||||
}
|
||||
return true;
|
||||
}) as typeof process.kill);
|
||||
enableMock.mockRejectedValueOnce(new Error("boom"));
|
||||
|
||||
const cleanupB = await startGatewayTailscaleExposure({
|
||||
tailscaleMode: mode,
|
||||
resetOnExit: true,
|
||||
port: 18789,
|
||||
logTailscale,
|
||||
ownerStore,
|
||||
});
|
||||
|
||||
expect(cleanupB).not.toBeNull();
|
||||
expect(logTailscale.warn).toHaveBeenCalledWith(`${mode} failed: boom`);
|
||||
|
||||
await cleanupA?.();
|
||||
expect(disableMock).not.toHaveBeenCalled();
|
||||
expect(logTailscale.info).toHaveBeenCalledWith(
|
||||
`${mode} cleanup skipped: not the current owner`,
|
||||
);
|
||||
|
||||
await cleanupB?.();
|
||||
expect(disableMock).toHaveBeenCalledTimes(1);
|
||||
});
|
||||
},
|
||||
);
|
||||
|
||||
@@ -32,6 +32,15 @@ type TailscaleExposureOwnerStore = {
|
||||
runCleanupIfCurrentOwner(token: string, cleanup: () => Promise<void>): Promise<boolean>;
|
||||
};
|
||||
|
||||
function isPidAlive(pid: number): boolean {
|
||||
try {
|
||||
process.kill(pid, 0);
|
||||
return true;
|
||||
} catch (err) {
|
||||
return (err as NodeJS.ErrnoException | undefined)?.code !== "ESRCH";
|
||||
}
|
||||
}
|
||||
|
||||
function createTailscaleExposureOwnerStore(): TailscaleExposureOwnerStore {
|
||||
const ownerFilePath = path.join(resolveGatewayLockDir(), "tailscale-exposure-owner.json");
|
||||
const ownerLockPath = path.join(resolveGatewayLockDir(), "tailscale-exposure-owner.lock");
|
||||
@@ -64,29 +73,15 @@ function createTailscaleExposureOwnerStore(): TailscaleExposureOwnerStore {
|
||||
await new Promise((resolve) => setTimeout(resolve, ms));
|
||||
}
|
||||
|
||||
function isPidAlive(pid: number): boolean {
|
||||
try {
|
||||
process.kill(pid, 0);
|
||||
return true;
|
||||
} catch (err) {
|
||||
return (err as NodeJS.ErrnoException | undefined)?.code !== "ESRCH";
|
||||
}
|
||||
}
|
||||
|
||||
async function breakStaleLock() {
|
||||
try {
|
||||
const [raw, stat] = await Promise.all([
|
||||
fs.readFile(ownerLockPath, "utf8").catch(() => null),
|
||||
fs.stat(ownerLockPath),
|
||||
]);
|
||||
const stat = await fs.stat(ownerLockPath);
|
||||
if (Date.now() - stat.mtimeMs < lockStaleMs) {
|
||||
return;
|
||||
}
|
||||
const parsed = raw ? JSON.parse(raw) : null;
|
||||
const pid = typeof parsed?.pid === "number" ? parsed.pid : null;
|
||||
if (pid !== null && isPidAlive(pid)) {
|
||||
return;
|
||||
}
|
||||
// All lock holders only perform short file I/O plus the Tailscale CLI calls,
|
||||
// and those helpers already time out after 15s. If the lock still exists after
|
||||
// the wider stale window, assume the holder is wedged and break it.
|
||||
await fs.unlink(ownerLockPath).catch(() => {});
|
||||
} catch (err) {
|
||||
if ((err as NodeJS.ErrnoException | undefined)?.code !== "ENOENT") {
|
||||
@@ -202,7 +197,13 @@ export async function startGatewayTailscaleExposure(params: {
|
||||
params.logTailscale.info(`${params.tailscaleMode} enabled`);
|
||||
}
|
||||
} catch (err) {
|
||||
await ownerStore.replaceIfCurrent(owner.token, previousOwner).catch(() => {});
|
||||
const nextOwner =
|
||||
previousOwner && isPidAlive(previousOwner.pid)
|
||||
? previousOwner
|
||||
: params.resetOnExit
|
||||
? owner
|
||||
: null;
|
||||
await ownerStore.replaceIfCurrent(owner.token, nextOwner).catch(() => {});
|
||||
params.logTailscale.warn(
|
||||
`${params.tailscaleMode} failed: ${err instanceof Error ? err.message : String(err)}`,
|
||||
);
|
||||
|
||||
Reference in New Issue
Block a user