fix(gateway): harden tailscale lock recovery

This commit is contained in:
Nimrod Gutman
2026-03-16 11:23:33 +02:00
parent dd570f9b7a
commit f0a773a716
2 changed files with 145 additions and 77 deletions

View File

@@ -59,74 +59,141 @@ function createOwnerStore() {
};
}
describe("startGatewayTailscaleExposure", () => {
beforeEach(() => {
vi.clearAllMocks();
});
const modeCases = [
{
mode: "serve" as const,
enableMock: tailscaleState.enableServe,
disableMock: tailscaleState.disableServe,
},
{
mode: "funnel" as const,
enableMock: tailscaleState.enableFunnel,
disableMock: tailscaleState.disableFunnel,
},
];
it("skips stale serve cleanup after a newer gateway takes ownership", async () => {
const ownerStore = createOwnerStore();
const logTailscale = {
info: vi.fn(),
warn: vi.fn(),
};
const cleanupA = await startGatewayTailscaleExposure({
tailscaleMode: "serve",
resetOnExit: true,
port: 18789,
logTailscale,
ownerStore,
});
const cleanupB = await startGatewayTailscaleExposure({
tailscaleMode: "serve",
resetOnExit: true,
port: 18789,
logTailscale,
ownerStore,
describe.each(modeCases)(
"startGatewayTailscaleExposure (%s)",
({ mode, enableMock, disableMock }) => {
beforeEach(() => {
vi.restoreAllMocks();
vi.clearAllMocks();
});
await cleanupA?.();
expect(tailscaleState.disableServe).not.toHaveBeenCalled();
expect(logTailscale.info).toHaveBeenCalledWith("serve cleanup skipped: not the current owner");
it("skips stale cleanup after a newer gateway takes ownership", async () => {
const ownerStore = createOwnerStore();
const logTailscale = {
info: vi.fn(),
warn: vi.fn(),
};
await cleanupB?.();
expect(tailscaleState.disableServe).toHaveBeenCalledTimes(1);
});
const cleanupA = await startGatewayTailscaleExposure({
tailscaleMode: mode,
resetOnExit: true,
port: 18789,
logTailscale,
ownerStore,
});
const cleanupB = await startGatewayTailscaleExposure({
tailscaleMode: mode,
resetOnExit: true,
port: 18789,
logTailscale,
ownerStore,
});
it("restores the previous owner after a takeover startup failure", async () => {
const ownerStore = createOwnerStore();
const logTailscale = {
info: vi.fn(),
warn: vi.fn(),
};
await cleanupA?.();
expect(disableMock).not.toHaveBeenCalled();
expect(logTailscale.info).toHaveBeenCalledWith(
`${mode} cleanup skipped: not the current owner`,
);
const cleanupA = await startGatewayTailscaleExposure({
tailscaleMode: "serve",
resetOnExit: true,
port: 18789,
logTailscale,
ownerStore,
await cleanupB?.();
expect(disableMock).toHaveBeenCalledTimes(1);
});
tailscaleState.enableServe.mockRejectedValueOnce(new Error("boom"));
it("restores the previous live owner after a takeover startup failure", async () => {
const ownerStore = createOwnerStore();
const logTailscale = {
info: vi.fn(),
warn: vi.fn(),
};
const cleanupB = await startGatewayTailscaleExposure({
tailscaleMode: "serve",
resetOnExit: true,
port: 18789,
logTailscale,
ownerStore,
const cleanupA = await startGatewayTailscaleExposure({
tailscaleMode: mode,
resetOnExit: true,
port: 18789,
logTailscale,
ownerStore,
});
enableMock.mockRejectedValueOnce(new Error("boom"));
const cleanupB = await startGatewayTailscaleExposure({
tailscaleMode: mode,
resetOnExit: true,
port: 18789,
logTailscale,
ownerStore,
});
expect(cleanupB).not.toBeNull();
expect(logTailscale.warn).toHaveBeenCalledWith(`${mode} failed: boom`);
await cleanupB?.();
expect(disableMock).not.toHaveBeenCalled();
expect(logTailscale.info).toHaveBeenCalledWith(
`${mode} cleanup skipped: not the current owner`,
);
await cleanupA?.();
expect(disableMock).toHaveBeenCalledTimes(1);
});
expect(cleanupB).not.toBeNull();
expect(logTailscale.warn).toHaveBeenCalledWith("serve failed: boom");
it("keeps the failed owner when the previous owner is already gone", async () => {
const ownerStore = createOwnerStore();
const logTailscale = {
info: vi.fn(),
warn: vi.fn(),
};
await cleanupB?.();
expect(tailscaleState.disableServe).not.toHaveBeenCalled();
expect(logTailscale.info).toHaveBeenCalledWith("serve cleanup skipped: not the current owner");
const cleanupA = await startGatewayTailscaleExposure({
tailscaleMode: mode,
resetOnExit: true,
port: 18789,
logTailscale,
ownerStore,
});
await cleanupA?.();
expect(tailscaleState.disableServe).toHaveBeenCalledTimes(1);
});
});
vi.spyOn(process, "kill").mockImplementation(((pid: number) => {
if (pid === 1) {
const err = new Error("gone") as NodeJS.ErrnoException;
err.code = "ESRCH";
throw err;
}
return true;
}) as typeof process.kill);
enableMock.mockRejectedValueOnce(new Error("boom"));
const cleanupB = await startGatewayTailscaleExposure({
tailscaleMode: mode,
resetOnExit: true,
port: 18789,
logTailscale,
ownerStore,
});
expect(cleanupB).not.toBeNull();
expect(logTailscale.warn).toHaveBeenCalledWith(`${mode} failed: boom`);
await cleanupA?.();
expect(disableMock).not.toHaveBeenCalled();
expect(logTailscale.info).toHaveBeenCalledWith(
`${mode} cleanup skipped: not the current owner`,
);
await cleanupB?.();
expect(disableMock).toHaveBeenCalledTimes(1);
});
},
);

View File

@@ -32,6 +32,15 @@ type TailscaleExposureOwnerStore = {
runCleanupIfCurrentOwner(token: string, cleanup: () => Promise<void>): Promise<boolean>;
};
function isPidAlive(pid: number): boolean {
try {
process.kill(pid, 0);
return true;
} catch (err) {
return (err as NodeJS.ErrnoException | undefined)?.code !== "ESRCH";
}
}
function createTailscaleExposureOwnerStore(): TailscaleExposureOwnerStore {
const ownerFilePath = path.join(resolveGatewayLockDir(), "tailscale-exposure-owner.json");
const ownerLockPath = path.join(resolveGatewayLockDir(), "tailscale-exposure-owner.lock");
@@ -64,29 +73,15 @@ function createTailscaleExposureOwnerStore(): TailscaleExposureOwnerStore {
await new Promise((resolve) => setTimeout(resolve, ms));
}
function isPidAlive(pid: number): boolean {
try {
process.kill(pid, 0);
return true;
} catch (err) {
return (err as NodeJS.ErrnoException | undefined)?.code !== "ESRCH";
}
}
async function breakStaleLock() {
try {
const [raw, stat] = await Promise.all([
fs.readFile(ownerLockPath, "utf8").catch(() => null),
fs.stat(ownerLockPath),
]);
const stat = await fs.stat(ownerLockPath);
if (Date.now() - stat.mtimeMs < lockStaleMs) {
return;
}
const parsed = raw ? JSON.parse(raw) : null;
const pid = typeof parsed?.pid === "number" ? parsed.pid : null;
if (pid !== null && isPidAlive(pid)) {
return;
}
// All lock holders only perform short file I/O plus the Tailscale CLI calls,
// and those helpers already time out after 15s. If the lock still exists after
// the wider stale window, assume the holder is wedged and break it.
await fs.unlink(ownerLockPath).catch(() => {});
} catch (err) {
if ((err as NodeJS.ErrnoException | undefined)?.code !== "ENOENT") {
@@ -202,7 +197,13 @@ export async function startGatewayTailscaleExposure(params: {
params.logTailscale.info(`${params.tailscaleMode} enabled`);
}
} catch (err) {
await ownerStore.replaceIfCurrent(owner.token, previousOwner).catch(() => {});
const nextOwner =
previousOwner && isPidAlive(previousOwner.pid)
? previousOwner
: params.resetOnExit
? owner
: null;
await ownerStore.replaceIfCurrent(owner.token, nextOwner).catch(() => {});
params.logTailscale.warn(
`${params.tailscaleMode} failed: ${err instanceof Error ? err.message : String(err)}`,
);