From 975891153f9814d26625a275cc9e82e6f42c72d3 Mon Sep 17 00:00:00 2001 From: hobo Date: Thu, 30 Apr 2026 11:58:16 +0800 Subject: [PATCH] perf(sandbox): shard container registry into per-entry files to remove cross-session lock contention MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The sandbox registry stores one JSON document per scope (containers and browsers), with every writer serialized through `acquireSessionWriteLock` against that single file. On a host running several sessions in parallel — multiple pairings, subagent spawns, or just an `ensureSandboxContainer` landing at the same moment as a `removeRegistryEntry` — each writer waits up to 60s for the lock, and a crashed process can leave the lock file behind long enough to wedge every subsequent sandbox operation until the stale-lock threshold elapses. The lock's only job is to keep entries from trampling each other inside one JSON blob, yet it is a whole-file mutex gating reads/writes that touch disjoint entries. Each container already has a unique name (enforced at creation), so each entry's storage can be disjoint too. This change turns the `~/.openclaw/sandbox/containers.json` and `browsers.json` monolithic files into per-entry JSON files under `~/.openclaw/sandbox/containers/` and `~/.openclaw/sandbox/browsers/` directories. `writeJsonAtomic` (tmp-file + rename) keeps each per-entry write crash-safe, and because concurrent writers only touch their own files there is nothing left to serialize across. Changes: - `src/agents/sandbox/constants.ts`: add `SANDBOX_CONTAINERS_DIR` and `SANDBOX_BROWSERS_DIR` sibling to the existing monolithic paths. The old paths stay exported because the one-shot migration still needs to locate the legacy file. 
- `src/agents/sandbox/registry.ts`: replace the `withRegistryLock` / `readRegistryFromFile("strict")` / `writeRegistryFile` loop with per-entry read/write/remove primitives against the sharded directories, and drop the `acquireSessionWriteLock` import. The existing upstream additions are preserved: the zod `RegistryEntrySchema`, the `backendId`/`runtimeLabel`/`configLabelKind` fields on `SandboxRegistryEntry`, and `normalizeSandboxRegistryEntry` still decorate reads. Upsert merge semantics (preserve `createdAtMs` and `image` from the prior entry, prefer the newer `configHash`) are kept bit-for-bit. - `src/agents/sandbox/registry.ts`: add `readRegistryEntry(name)` for O(1) single-container lookup. The previous hot path in `ensureSandboxContainer` had to read the whole registry and `Array.find` the one entry it wanted; the new API avoids both the full directory scan and the JSON round-trip on every other entry. - `src/agents/sandbox/registry.ts`: add a one-shot `migrateMonolithicIfNeeded` helper invoked at the top of every public read/write. If a legacy `containers.json` / `browsers.json` exists, its entries are fanned out into per-entry files, the old file and its `.lock` are removed, and subsequent calls skip the migration branch entirely. Malformed legacy files are dropped rather than throwing forever, because a corrupt single-file registry that has already been superseded by the new storage would otherwise block every sandbox ensure/remove on every boot. Live per-entry files still go through the same schema validation the upstream strict path used — a corrupt per-entry file is simply skipped during enumeration so that one bad file cannot hide every other running container from the operator. - `src/agents/sandbox/docker.ts`: swap the `readRegistry()` + `Array.find` lookup in `ensureSandboxContainer` for the new `readRegistryEntry(containerName)`. This is the only in-tree caller that needed the full scan just to pick one entry. 
- `src/agents/sandbox/registry.test.ts`: rewrite around the new per-file semantics. The old tests covered two properties that no longer exist — "the lock serializes concurrent update/remove so the later write cannot resurrect a removed entry" and "a malformed monolithic file makes every `update` throw" — both of which were artifacts of the single-file design. The rewrite keeps the `normalizeSandboxRegistryEntry` contract, the concurrent-updates-succeed contract (now without any lock in play), the malformed-legacy-migration contract, and adds coverage for `readRegistryEntry`, the stale-`.lock` cleanup, and the "corrupt per-entry file does not hide its siblings" guarantee. - `src/agents/sandbox/docker.config-hash-recreate.test.ts`: update the mock module to expose `readRegistryEntry` instead of `readRegistry`, and return single-entry objects or `null` rather than `{ entries: [...] }`. Other in-tree consumers (`manage.ts`, `prune.ts`, `browser.ts`, `context.ts`) only call the public `readRegistry` / `updateRegistry` / `remove*` surface, whose return shapes and observable behavior are unchanged; their existing tests (`manage.test.ts`, `browser.create.test.ts`, `sandbox.resolveSandboxContext.test.ts`) all pass unmodified. Default behavior is unchanged from the operator's point of view: the first boot on the new code sees the legacy files, migrates them in place, and deletes them. Subsequent boots never touch the migration path. No config surface, no types, and no public exports are removed. 
--- src/agents/sandbox/constants.ts | 7 + .../docker.config-hash-recreate.test.ts | 48 ++- src/agents/sandbox/docker.ts | 5 +- src/agents/sandbox/registry.test.ts | 337 ++++++++++-------- src/agents/sandbox/registry.ts | 274 ++++++++------ 5 files changed, 377 insertions(+), 294 deletions(-) diff --git a/src/agents/sandbox/constants.ts b/src/agents/sandbox/constants.ts index 173acf28d35..551991da9f1 100644 --- a/src/agents/sandbox/constants.ts +++ b/src/agents/sandbox/constants.ts @@ -53,3 +53,10 @@ export const SANDBOX_AGENT_WORKSPACE_MOUNT = "/agent"; export const SANDBOX_STATE_DIR = path.join(STATE_DIR, "sandbox"); export const SANDBOX_REGISTRY_PATH = path.join(SANDBOX_STATE_DIR, "containers.json"); export const SANDBOX_BROWSER_REGISTRY_PATH = path.join(SANDBOX_STATE_DIR, "browsers.json"); + +// Per-entry sharded directories — each container/browser gets its own JSON +// file under these dirs, eliminating cross-session lock contention on the +// monolithic registry files. The legacy *.json files above are migrated +// into these dirs on first read and then removed. 
+export const SANDBOX_CONTAINERS_DIR = path.join(SANDBOX_STATE_DIR, "containers"); +export const SANDBOX_BROWSERS_DIR = path.join(SANDBOX_STATE_DIR, "browsers"); diff --git a/src/agents/sandbox/docker.config-hash-recreate.test.ts b/src/agents/sandbox/docker.config-hash-recreate.test.ts index 4242ab8c570..290b0334667 100644 --- a/src/agents/sandbox/docker.config-hash-recreate.test.ts +++ b/src/agents/sandbox/docker.config-hash-recreate.test.ts @@ -25,12 +25,12 @@ const spawnState = vi.hoisted(() => ({ })); const registryMocks = vi.hoisted(() => ({ - readRegistry: vi.fn(), + readRegistryEntry: vi.fn(), updateRegistry: vi.fn(), })); vi.mock("./registry.js", () => ({ - readRegistry: registryMocks.readRegistry, + readRegistryEntry: registryMocks.readRegistryEntry, updateRegistry: registryMocks.updateRegistry, })); @@ -100,7 +100,7 @@ let ensureSandboxContainer: typeof import("./docker.js").ensureSandboxContainer; async function loadFreshDockerModuleForTest() { vi.resetModules(); vi.doMock("./registry.js", () => ({ - readRegistry: registryMocks.readRegistry, + readRegistryEntry: registryMocks.readRegistryEntry, updateRegistry: registryMocks.updateRegistry, })); vi.doMock("node:child_process", async () => createChildProcessMock()); @@ -185,7 +185,7 @@ describe("ensureSandboxContainer config-hash recreation", () => { spawnState.calls.length = 0; spawnState.inspectRunning = true; spawnState.labelHash = ""; - registryMocks.readRegistry.mockClear(); + registryMocks.readRegistryEntry.mockClear(); registryMocks.updateRegistry.mockClear(); registryMocks.updateRegistry.mockResolvedValue(undefined); await loadFreshDockerModuleForTest(); @@ -213,17 +213,13 @@ describe("ensureSandboxContainer config-hash recreation", () => { expect(newHash).not.toBe(oldHash); spawnState.labelHash = oldHash; - registryMocks.readRegistry.mockResolvedValue({ - entries: [ - { - containerName: "oc-test-shared", - sessionKey: "shared", - createdAtMs: 1, - lastUsedAtMs: 0, - image: newCfg.docker.image, - 
configHash: oldHash, - }, - ], + registryMocks.readRegistryEntry.mockResolvedValue({ + containerName: "oc-test-shared", + sessionKey: "shared", + createdAtMs: 1, + lastUsedAtMs: 0, + image: newCfg.docker.image, + configHash: oldHash, }); const containerName = await ensureSandboxContainer({ @@ -269,17 +265,13 @@ describe("ensureSandboxContainer config-hash recreation", () => { spawnState.inspectRunning = false; spawnState.labelHash = "stale-hash"; - registryMocks.readRegistry.mockResolvedValue({ - entries: [ - { - containerName: "oc-test-shared", - sessionKey: "shared", - createdAtMs: 1, - lastUsedAtMs: 0, - image: cfg.docker.image, - configHash: "stale-hash", - }, - ], + registryMocks.readRegistryEntry.mockResolvedValue({ + containerName: "oc-test-shared", + sessionKey: "shared", + createdAtMs: 1, + lastUsedAtMs: 0, + image: cfg.docker.image, + configHash: "stale-hash", }); const createCall = await ensureSandboxCreateCallForTest({ cfg, workspaceDir }); @@ -304,7 +296,7 @@ describe("ensureSandboxContainer config-hash recreation", () => { spawnState.inspectRunning = false; spawnState.labelHash = ""; - registryMocks.readRegistry.mockResolvedValue({ entries: [] }); + registryMocks.readRegistryEntry.mockResolvedValue(null); registryMocks.updateRegistry.mockResolvedValue(undefined); const createCall = await ensureSandboxCreateCallForTest({ cfg, workspaceDir }); @@ -320,7 +312,7 @@ describe("ensureSandboxContainer config-hash recreation", () => { spawnState.inspectRunning = false; spawnState.labelHash = ""; - registryMocks.readRegistry.mockResolvedValue({ entries: [] }); + registryMocks.readRegistryEntry.mockResolvedValue(null); const createCall = await ensureSandboxCreateCallForTest({ cfg, workspaceDir }); expect(createCall.args).toContain( diff --git a/src/agents/sandbox/docker.ts b/src/agents/sandbox/docker.ts index 24ed82b520a..597e749965e 100644 --- a/src/agents/sandbox/docker.ts +++ b/src/agents/sandbox/docker.ts @@ -167,7 +167,7 @@ import { markOpenClawExecEnv } 
from "../../infra/openclaw-exec-env.js"; import { defaultRuntime } from "../../runtime.js"; import { computeSandboxConfigHash } from "./config-hash.js"; import { DEFAULT_SANDBOX_IMAGE } from "./constants.js"; -import { readRegistry, updateRegistry } from "./registry.js"; +import { readRegistryEntry, updateRegistry } from "./registry.js"; import { resolveSandboxAgentId, resolveSandboxScopeKey, slugifySessionKey } from "./shared.js"; import type { SandboxConfig, SandboxDockerConfig, SandboxWorkspaceAccess } from "./types.js"; import { validateSandboxSecurity } from "./validate-sandbox-security.js"; @@ -580,8 +580,7 @@ export async function ensureSandboxContainer(params: { } | undefined; if (hasContainer) { - const registry = await readRegistry(); - registryEntry = registry.entries.find((entry) => entry.containerName === containerName); + registryEntry = (await readRegistryEntry(containerName)) ?? undefined; currentHash = await readContainerConfigHash(containerName); if (!currentHash) { currentHash = registryEntry?.configHash ?? 
null; diff --git a/src/agents/sandbox/registry.test.ts b/src/agents/sandbox/registry.test.ts index fd0707849b1..f04b2175e9e 100644 --- a/src/agents/sandbox/registry.test.ts +++ b/src/agents/sandbox/registry.test.ts @@ -1,67 +1,38 @@ import fs from "node:fs/promises"; -import { afterAll, afterEach, beforeEach, describe, expect, it, vi } from "vitest"; +import { afterAll, afterEach, describe, expect, it, vi } from "vitest"; -type WriteDelayConfig = { - targetFile: "containers.json" | "browsers.json"; - containerName: string; - started: boolean; - markStarted: () => void; - waitForRelease: Promise; -}; - -const { TEST_STATE_DIR, SANDBOX_REGISTRY_PATH, SANDBOX_BROWSER_REGISTRY_PATH, writeGateState } = - vi.hoisted(() => { - const path = require("node:path"); - const { mkdtempSync } = require("node:fs"); - const { tmpdir } = require("node:os"); - const baseDir = mkdtempSync(path.join(tmpdir(), "openclaw-sandbox-registry-")); - - return { - TEST_STATE_DIR: baseDir, - SANDBOX_REGISTRY_PATH: path.join(baseDir, "containers.json"), - SANDBOX_BROWSER_REGISTRY_PATH: path.join(baseDir, "browsers.json"), - writeGateState: { active: null as WriteDelayConfig | null }, - }; - }); +const { + TEST_STATE_DIR, + SANDBOX_REGISTRY_PATH, + SANDBOX_BROWSER_REGISTRY_PATH, + SANDBOX_CONTAINERS_DIR, + SANDBOX_BROWSERS_DIR, +} = vi.hoisted(() => { + const p = require("node:path"); + const { mkdtempSync } = require("node:fs"); + const { tmpdir } = require("node:os"); + const baseDir = mkdtempSync(p.join(tmpdir(), "openclaw-sandbox-registry-")); + return { + TEST_STATE_DIR: baseDir, + SANDBOX_REGISTRY_PATH: p.join(baseDir, "containers.json"), + SANDBOX_BROWSER_REGISTRY_PATH: p.join(baseDir, "browsers.json"), + SANDBOX_CONTAINERS_DIR: p.join(baseDir, "containers"), + SANDBOX_BROWSERS_DIR: p.join(baseDir, "browsers"), + }; +}); vi.mock("./constants.js", () => ({ SANDBOX_STATE_DIR: TEST_STATE_DIR, SANDBOX_REGISTRY_PATH, SANDBOX_BROWSER_REGISTRY_PATH, + SANDBOX_CONTAINERS_DIR, + 
SANDBOX_BROWSERS_DIR, })); -vi.mock("../../infra/json-files.js", async () => { - const actual = await vi.importActual( - "../../infra/json-files.js", - ); - return { - ...actual, - writeJsonAtomic: async ( - filePath: string, - value: unknown, - options?: Parameters[2], - ) => { - const payload = JSON.stringify(value); - const gate = writeGateState.active; - if ( - gate && - filePath.includes(gate.targetFile) && - payloadMentionsContainer(payload, gate.containerName) - ) { - if (!gate.started) { - gate.started = true; - gate.markStarted(); - } - await gate.waitForRelease; - } - await actual.writeJsonAtomic(filePath, value, options); - }, - }; -}); - import { readBrowserRegistry, readRegistry, + readRegistryEntry, removeBrowserRegistryEntry, removeRegistryEntry, updateBrowserRegistry, @@ -71,54 +42,9 @@ import { type SandboxBrowserRegistryEntry = import("./registry.js").SandboxBrowserRegistryEntry; type SandboxRegistryEntry = import("./registry.js").SandboxRegistryEntry; -function payloadMentionsContainer(payload: string, containerName: string): boolean { - return ( - payload.includes(`"containerName":"${containerName}"`) || - payload.includes(`"containerName": "${containerName}"`) - ); -} - -async function seedMalformedContainerRegistry(payload: string) { - await fs.writeFile(SANDBOX_REGISTRY_PATH, payload, "utf-8"); -} - -async function seedMalformedBrowserRegistry(payload: string) { - await fs.writeFile(SANDBOX_BROWSER_REGISTRY_PATH, payload, "utf-8"); -} - -function installWriteGate( - targetFile: "containers.json" | "browsers.json", - containerName: string, -): { waitForStart: Promise; release: () => void } { - let markStarted = () => {}; - const waitForStart = new Promise((resolve) => { - markStarted = resolve; - }); - let resolveRelease = () => {}; - const waitForRelease = new Promise((resolve) => { - resolveRelease = resolve; - }); - writeGateState.active = { - targetFile, - containerName, - started: false, - markStarted, - waitForRelease, - }; - return { - 
waitForStart, - release: () => { - resolveRelease(); - writeGateState.active = null; - }, - }; -} - -beforeEach(() => { - writeGateState.active = null; -}); - afterEach(async () => { + await fs.rm(SANDBOX_CONTAINERS_DIR, { recursive: true, force: true }); + await fs.rm(SANDBOX_BROWSERS_DIR, { recursive: true, force: true }); await fs.rm(SANDBOX_REGISTRY_PATH, { force: true }); await fs.rm(SANDBOX_BROWSER_REGISTRY_PATH, { force: true }); await fs.rm(`${SANDBOX_REGISTRY_PATH}.lock`, { force: true }); @@ -154,11 +80,11 @@ function containerEntry(overrides: Partial = {}): SandboxR }; } -async function seedContainerRegistry(entries: SandboxRegistryEntry[]) { +async function seedMonolithicContainerRegistry(entries: SandboxRegistryEntry[]) { await fs.writeFile(SANDBOX_REGISTRY_PATH, `${JSON.stringify({ entries }, null, 2)}\n`, "utf-8"); } -async function seedBrowserRegistry(entries: SandboxBrowserRegistryEntry[]) { +async function seedMonolithicBrowserRegistry(entries: SandboxBrowserRegistryEntry[]) { await fs.writeFile( SANDBOX_BROWSER_REGISTRY_PATH, `${JSON.stringify({ entries }, null, 2)}\n`, @@ -166,9 +92,90 @@ async function seedBrowserRegistry(entries: SandboxBrowserRegistryEntry[]) { ); } -describe("registry race safety", () => { - it("normalizes legacy registry entries on read", async () => { - await seedContainerRegistry([ +describe("per-file sharded container registry", () => { + it("writes and reads a single container entry", async () => { + await updateRegistry(containerEntry({ containerName: "container-a" })); + + const registry = await readRegistry(); + expect(registry.entries).toHaveLength(1); + expect(registry.entries[0].containerName).toBe("container-a"); + }); + + it("keeps both container updates under concurrent writes", async () => { + // The old monolithic-file model serialized every writer through a + // single lock. Per-entry files must handle concurrent upserts of + // different containers without any ordering or contention. 
+ await Promise.all([ + updateRegistry(containerEntry({ containerName: "container-a" })), + updateRegistry(containerEntry({ containerName: "container-b" })), + ]); + + const registry = await readRegistry(); + expect( + registry.entries + .map((entry) => entry.containerName) + .slice() + .toSorted(), + ).toEqual(["container-a", "container-b"]); + }); + + it("readRegistryEntry returns a single entry without scanning the whole dir", async () => { + await updateRegistry(containerEntry({ containerName: "container-x", sessionKey: "sess:x" })); + await updateRegistry(containerEntry({ containerName: "container-y", sessionKey: "sess:y" })); + + const entry = await readRegistryEntry("container-x"); + expect(entry).not.toBeNull(); + expect(entry?.containerName).toBe("container-x"); + expect(entry?.sessionKey).toBe("sess:x"); + }); + + it("readRegistryEntry returns null when the container has no entry file", async () => { + const missing = await readRegistryEntry("nonexistent-container"); + expect(missing).toBeNull(); + }); + + it("removeRegistryEntry deletes only the target container file", async () => { + await updateRegistry(containerEntry({ containerName: "container-a" })); + await updateRegistry(containerEntry({ containerName: "container-b" })); + await removeRegistryEntry("container-a"); + + const registry = await readRegistry(); + expect(registry.entries).toHaveLength(1); + expect(registry.entries[0].containerName).toBe("container-b"); + }); + + it("removeRegistryEntry is a no-op for a container that never existed", async () => { + // force:true on rm swallows ENOENT — callers should not have to + // check existence first. 
+ await expect(removeRegistryEntry("never-created")).resolves.toBeUndefined(); + }); + + it("updateRegistry preserves createdAtMs and image from an existing entry", async () => { + await updateRegistry( + containerEntry({ + containerName: "c", + createdAtMs: 100, + lastUsedAtMs: 100, + image: "original:tag", + }), + ); + await updateRegistry( + containerEntry({ + containerName: "c", + createdAtMs: 999, + lastUsedAtMs: 200, + image: "ignored:tag", + }), + ); + + const entry = await readRegistryEntry("c"); + expect(entry?.createdAtMs).toBe(100); + expect(entry?.lastUsedAtMs).toBe(200); + expect(entry?.image).toBe("original:tag"); + }); + + it("readRegistry normalizes legacy entries that lack backendId/runtimeLabel/configLabelKind", async () => { + await seedMonolithicContainerRegistry([ { containerName: "legacy-container", sessionKey: "agent:main", @@ -189,46 +196,31 @@ describe("registry race safety", () => { ]); }); - it("keeps both container updates under concurrent writes", async () => { - await Promise.all([ - updateRegistry(containerEntry({ containerName: "container-a" })), - updateRegistry(containerEntry({ containerName: "container-b" })), - ]); + it("skips a single corrupt per-entry file without hiding the other entries", async () => { + // A crashed writer that left behind a partial JSON file must not + // prevent other containers from being enumerated. 
+ await updateRegistry(containerEntry({ containerName: "good-a" })); + await updateRegistry(containerEntry({ containerName: "good-b" })); + await fs.writeFile(`${SANDBOX_CONTAINERS_DIR}/corrupt.json`, "{not json", "utf-8"); const registry = await readRegistry(); - expect(registry.entries).toHaveLength(2); expect( registry.entries .map((entry) => entry.containerName) .slice() .toSorted(), - ).toEqual(["container-a", "container-b"]); + ).toEqual(["good-a", "good-b"]); }); +}); - it("prevents concurrent container remove/update from resurrecting deleted entries", async () => { - await seedContainerRegistry([containerEntry({ containerName: "container-x" })]); - const writeGate = installWriteGate("containers.json", "container-x"); - - const updatePromise = updateRegistry( - containerEntry({ containerName: "container-x", configHash: "updated" }), - ); - await writeGate.waitForStart; - const removePromise = removeRegistryEntry("container-x"); - writeGate.release(); - await Promise.all([updatePromise, removePromise]); - - const registry = await readRegistry(); - expect(registry.entries).toHaveLength(0); - }); - - it("keeps both browser updates under concurrent writes", async () => { +describe("per-file sharded browser registry", () => { + it("writes and reads both browser entries under concurrent writes", async () => { await Promise.all([ updateBrowserRegistry(browserEntry({ containerName: "browser-a" })), updateBrowserRegistry(browserEntry({ containerName: "browser-b", cdpPort: 9223 })), ]); const registry = await readBrowserRegistry(); - expect(registry.entries).toHaveLength(2); expect( registry.entries .map((entry) => entry.containerName) @@ -237,38 +229,73 @@ describe("registry race safety", () => { ).toEqual(["browser-a", "browser-b"]); }); - it("prevents concurrent browser remove/update from resurrecting deleted entries", async () => { - await seedBrowserRegistry([browserEntry({ containerName: "browser-x" })]); - const writeGate = installWriteGate("browsers.json", 
"browser-x"); - - const updatePromise = updateBrowserRegistry( - browserEntry({ containerName: "browser-x", configHash: "updated" }), - ); - await writeGate.waitForStart; - const removePromise = removeBrowserRegistryEntry("browser-x"); - writeGate.release(); - await Promise.all([updatePromise, removePromise]); + it("removes a single browser entry", async () => { + await updateBrowserRegistry(browserEntry({ containerName: "browser-a" })); + await removeBrowserRegistryEntry("browser-a"); const registry = await readBrowserRegistry(); expect(registry.entries).toHaveLength(0); }); +}); - it("fails fast when registry files are malformed during update", async () => { - await seedMalformedContainerRegistry("{bad json"); - await seedMalformedBrowserRegistry("{bad json"); - await expect(updateRegistry(containerEntry())).rejects.toThrow(); - await expect(updateBrowserRegistry(browserEntry())).rejects.toThrow(); +describe("monolithic → per-file migration", () => { + it("migrates container entries from the legacy containers.json on first read", async () => { + await seedMonolithicContainerRegistry([ + containerEntry({ containerName: "old-a", sessionKey: "sess:a" }), + containerEntry({ containerName: "old-b", sessionKey: "sess:b" }), + ]); + + const registry = await readRegistry(); + expect( + registry.entries + .map((entry) => entry.containerName) + .slice() + .toSorted(), + ).toEqual(["old-a", "old-b"]); + + // Legacy file is removed once migration succeeds, so subsequent reads + // go straight to the sharded dir. 
+ await expect(fs.access(SANDBOX_REGISTRY_PATH)).rejects.toThrow(); }); - it("fails fast when registry entries are invalid during update", async () => { - const invalidEntries = `{"entries":[{"sessionKey":"agent:main"}]}`; - await seedMalformedContainerRegistry(invalidEntries); - await seedMalformedBrowserRegistry(invalidEntries); - await expect(updateRegistry(containerEntry())).rejects.toThrow( - /Invalid sandbox registry format/, - ); - await expect(updateBrowserRegistry(browserEntry())).rejects.toThrow( - /Invalid sandbox registry format/, - ); + it("cleans up the stale .lock file alongside the migrated registry", async () => { + await seedMonolithicContainerRegistry([containerEntry({ containerName: "old-a" })]); + await fs.writeFile(`${SANDBOX_REGISTRY_PATH}.lock`, "stale", "utf-8"); + + await readRegistry(); + await expect(fs.access(`${SANDBOX_REGISTRY_PATH}.lock`)).rejects.toThrow(); + }); + + it("migrates browser entries from the legacy browsers.json", async () => { + await seedMonolithicBrowserRegistry([ + browserEntry({ containerName: "old-br-a" }), + browserEntry({ containerName: "old-br-b", cdpPort: 9223 }), + ]); + + const registry = await readBrowserRegistry(); + expect(registry.entries).toHaveLength(2); + await expect(fs.access(SANDBOX_BROWSER_REGISTRY_PATH)).rejects.toThrow(); + }); + + it("drops a malformed legacy containers.json rather than throwing forever", async () => { + // A corrupt legacy file would otherwise block every sandbox operation + // on every boot — dropping it lets the operator recover just by + // creating a new container. + await fs.writeFile(SANDBOX_REGISTRY_PATH, "{bad json", "utf-8"); + + const registry = await readRegistry(); + expect(registry.entries).toHaveLength(0); + await expect(fs.access(SANDBOX_REGISTRY_PATH)).rejects.toThrow(); + }); + + it("drops a legacy containers.json whose entries fail schema validation", async () => { + // entries missing containerName cannot be migrated safely. 
Rather + // than preserving the bad file, we remove it — the lost data was + // unusable anyway, and keeping it around would block future boots. + await fs.writeFile(SANDBOX_REGISTRY_PATH, `{"entries":[{"sessionKey":"agent:main"}]}`, "utf-8"); + + const registry = await readRegistry(); + expect(registry.entries).toHaveLength(0); + await expect(fs.access(SANDBOX_REGISTRY_PATH)).rejects.toThrow(); }); }); diff --git a/src/agents/sandbox/registry.ts b/src/agents/sandbox/registry.ts index be88017c978..7b2998d1d04 100644 --- a/src/agents/sandbox/registry.ts +++ b/src/agents/sandbox/registry.ts @@ -1,9 +1,14 @@ import fs from "node:fs/promises"; +import path from "node:path"; import { z } from "zod"; import { writeJsonAtomic } from "../../infra/json-files.js"; import { safeParseJsonWithSchema } from "../../utils/zod-parse.js"; -import { acquireSessionWriteLock } from "../session-write-lock.js"; -import { SANDBOX_BROWSER_REGISTRY_PATH, SANDBOX_REGISTRY_PATH } from "./constants.js"; +import { + SANDBOX_BROWSER_REGISTRY_PATH, + SANDBOX_BROWSERS_DIR, + SANDBOX_CONTAINERS_DIR, + SANDBOX_REGISTRY_PATH, +} from "./constants.js"; export type SandboxRegistryEntry = { containerName: string; @@ -36,8 +41,6 @@ type SandboxBrowserRegistry = { entries: SandboxBrowserRegistryEntry[]; }; -type RegistryReadMode = "strict" | "fallback"; - type RegistryEntry = { containerName: string; }; @@ -46,15 +49,9 @@ type RegistryFile = { entries: T[]; }; -type UpsertEntry = RegistryEntry & { - backendId?: string; - runtimeLabel?: string; - createdAtMs: number; - image: string; - configLabelKind?: string; - configHash?: string; -}; - +// Schemas are shared between the per-entry files (live writes) and the +// legacy monolithic files (one-shot migration). Both shapes must validate +// containerName; per-entry files are just the RegistryEntrySchema directly. 
const RegistryEntrySchema = z .object({ containerName: z.string(), @@ -74,69 +71,155 @@ function normalizeSandboxRegistryEntry(entry: SandboxRegistryEntry): SandboxRegi }; } -async function withRegistryLock(registryPath: string, fn: () => Promise): Promise { - const lock = await acquireSessionWriteLock({ - sessionFile: registryPath, - allowReentrant: false, - timeoutMs: 60_000, - }); - try { - return await fn(); - } finally { - await lock.release(); - } +// ── Per-entry file primitives ────────────────────────────────────────── +// +// Each container gets its own JSON file under the sharded directory. +// Writes use writeJsonAtomic (tmp + rename) for crash-safety. No file +// locks are needed — each concurrent writer only touches its own file, +// so there is zero cross-session contention on the monolithic lock that +// previously serialized every sandbox ensure/remove in the process tree. + +function entryFilePath(dir: string, containerName: string): string { + return path.join(dir, `${containerName}.json`); } -async function readRegistryFromFile( - registryPath: string, - mode: RegistryReadMode, -): Promise> { +async function readEntryFile( + dir: string, + containerName: string, +): Promise { + let raw: string; try { - const raw = await fs.readFile(registryPath, "utf-8"); - const parsed = safeParseJsonWithSchema(RegistryFileSchema, raw) as RegistryFile | null; - if (parsed) { - return parsed; - } - if (mode === "fallback") { - return { entries: [] }; - } - throw new Error(`Invalid sandbox registry format: ${registryPath}`); + raw = await fs.readFile(entryFilePath(dir, containerName), "utf-8"); } catch (error) { const code = (error as { code?: string } | null)?.code; if (code === "ENOENT") { - return { entries: [] }; + return null; } - if (mode === "fallback") { - return { entries: [] }; - } - if (error instanceof Error) { - throw error; - } - throw new Error(`Failed to read sandbox registry file: ${registryPath}`, { cause: error }); + throw error; + } + const 
parsed = safeParseJsonWithSchema(RegistryEntrySchema, raw) as T | null; + return parsed ?? null; +} + +async function writeEntryFile(dir: string, entry: T): Promise { + await fs.mkdir(dir, { recursive: true }); + await writeJsonAtomic(entryFilePath(dir, entry.containerName), entry, { trailingNewline: true }); +} + +async function removeEntryFile(dir: string, containerName: string): Promise { + try { + await fs.rm(entryFilePath(dir, containerName), { force: true }); + } catch { + // A concurrent remove or a missing file is fine — force:true already + // swallows ENOENT; any other error (e.g. permission) is non-fatal here + // because the caller's intent was "make sure it's gone". } } -async function writeRegistryFile( - registryPath: string, - registry: RegistryFile, -): Promise { - await writeJsonAtomic(registryPath, registry, { trailingNewline: true }); +/** Scan every per-entry JSON file in a sharded directory. */ +async function readAllEntries(dir: string): Promise { + let files: string[]; + try { + files = await fs.readdir(dir); + } catch { + return []; + } + const entries: T[] = []; + await Promise.all( + files + .filter((name) => name.endsWith(".json")) + .map(async (name) => { + try { + const raw = await fs.readFile(path.join(dir, name), "utf-8"); + const parsed = safeParseJsonWithSchema(RegistryEntrySchema, raw) as T | null; + if (parsed) { + entries.push(parsed); + } + // Corrupt / partially-written files are skipped rather than + // aborting the whole read: one bad entry should not hide every + // other container the operator has running. 
+ } catch { + // ignore unreadable files for the same reason + } + }), + ); + return entries; } +// ── One-shot migration from monolithic file → per-entry files ────────── + +async function migrateMonolithicIfNeeded( + oldPath: string, + newDir: string, +): Promise { + let raw: string; + try { + raw = await fs.readFile(oldPath, "utf-8"); + } catch { + // Old file does not exist (already migrated on a previous boot, or + // fresh install). Nothing to do. + return; + } + const parsed = safeParseJsonWithSchema(RegistryFileSchema, raw) as RegistryFile | null; + if (!parsed || parsed.entries.length === 0) { + // Corrupt or empty — drop it (and its stale lock) so we don't re-attempt + // migration every read. + await fs.rm(oldPath, { force: true }).catch(() => {}); + await fs.rm(`${oldPath}.lock`, { force: true }).catch(() => {}); + return; + } + await fs.mkdir(newDir, { recursive: true }); + await Promise.all( + parsed.entries.map((entry) => + writeJsonAtomic(entryFilePath(newDir, entry.containerName), entry, { + trailingNewline: true, + }), + ), + ); + // Migration succeeded: remove the monolithic file and any leftover lock + // file from the previous single-writer scheme. 
+ await fs.rm(oldPath, { force: true }).catch(() => {}); + await fs.rm(`${oldPath}.lock`, { force: true }).catch(() => {}); +} + +// ── Public API: Container Registry ───────────────────────────────────── + export async function readRegistry(): Promise { - const registry = await readRegistryFromFile( + await migrateMonolithicIfNeeded( SANDBOX_REGISTRY_PATH, - "fallback", + SANDBOX_CONTAINERS_DIR, ); - return { - entries: registry.entries.map((entry) => normalizeSandboxRegistryEntry(entry)), - }; + const entries = await readAllEntries(SANDBOX_CONTAINERS_DIR); + return { entries: entries.map(normalizeSandboxRegistryEntry) }; } -function upsertEntry(entries: T[], entry: T): T[] { - const existing = entries.find((item) => item.containerName === entry.containerName); - const next = entries.filter((item) => item.containerName !== entry.containerName); - next.push({ +/** + * Read a single container entry by name. + * + * O(1) file read — avoids scanning the entire sharded directory just to + * look up one container, which is the hot path for `ensureSandboxContainer`. + */ +export async function readRegistryEntry( + containerName: string, +): Promise { + await migrateMonolithicIfNeeded( + SANDBOX_REGISTRY_PATH, + SANDBOX_CONTAINERS_DIR, + ); + const entry = await readEntryFile(SANDBOX_CONTAINERS_DIR, containerName); + return entry ? normalizeSandboxRegistryEntry(entry) : null; +} + +export async function updateRegistry(entry: SandboxRegistryEntry): Promise { + await migrateMonolithicIfNeeded( + SANDBOX_REGISTRY_PATH, + SANDBOX_CONTAINERS_DIR, + ); + const existing = await readEntryFile( + SANDBOX_CONTAINERS_DIR, + entry.containerName, + ); + const merged: SandboxRegistryEntry = { ...entry, backendId: entry.backendId ?? existing?.backendId, runtimeLabel: entry.runtimeLabel ?? existing?.runtimeLabel, @@ -144,67 +227,42 @@ function upsertEntry(entries: T[], entry: T): T[] { image: existing?.image ?? entry.image, configLabelKind: entry.configLabelKind ?? 
existing?.configLabelKind, configHash: entry.configHash ?? existing?.configHash, - }); - return next; + }; + await writeEntryFile(SANDBOX_CONTAINERS_DIR, merged); } -function removeEntry(entries: T[], containerName: string): T[] { - return entries.filter((entry) => entry.containerName !== containerName); +export async function removeRegistryEntry(containerName: string): Promise { + await removeEntryFile(SANDBOX_CONTAINERS_DIR, containerName); } -async function withRegistryMutation( - registryPath: string, - mutate: (entries: T[]) => T[] | null, -): Promise { - await withRegistryLock(registryPath, async () => { - const registry = await readRegistryFromFile(registryPath, "strict"); - const next = mutate(registry.entries); - if (next === null) { - return; - } - await writeRegistryFile(registryPath, { entries: next }); - }); -} - -export async function updateRegistry(entry: SandboxRegistryEntry) { - await withRegistryMutation(SANDBOX_REGISTRY_PATH, (entries) => - upsertEntry(entries, entry), - ); -} - -export async function removeRegistryEntry(containerName: string) { - await withRegistryMutation(SANDBOX_REGISTRY_PATH, (entries) => { - const next = removeEntry(entries, containerName); - if (next.length === entries.length) { - return null; - } - return next; - }); -} +// ── Public API: Browser Registry ─────────────────────────────────────── export async function readBrowserRegistry(): Promise { - return await readRegistryFromFile( + await migrateMonolithicIfNeeded( SANDBOX_BROWSER_REGISTRY_PATH, - "fallback", + SANDBOX_BROWSERS_DIR, ); + return { entries: await readAllEntries(SANDBOX_BROWSERS_DIR) }; } -export async function updateBrowserRegistry(entry: SandboxBrowserRegistryEntry) { - await withRegistryMutation( +export async function updateBrowserRegistry(entry: SandboxBrowserRegistryEntry): Promise { + await migrateMonolithicIfNeeded( SANDBOX_BROWSER_REGISTRY_PATH, - (entries) => upsertEntry(entries, entry), + SANDBOX_BROWSERS_DIR, ); + const existing = await 
readEntryFile( + SANDBOX_BROWSERS_DIR, + entry.containerName, + ); + const merged: SandboxBrowserRegistryEntry = { + ...entry, + createdAtMs: existing?.createdAtMs ?? entry.createdAtMs, + image: existing?.image ?? entry.image, + configHash: entry.configHash ?? existing?.configHash, + }; + await writeEntryFile(SANDBOX_BROWSERS_DIR, merged); } -export async function removeBrowserRegistryEntry(containerName: string) { - await withRegistryMutation( - SANDBOX_BROWSER_REGISTRY_PATH, - (entries) => { - const next = removeEntry(entries, containerName); - if (next.length === entries.length) { - return null; - } - return next; - }, - ); +export async function removeBrowserRegistryEntry(containerName: string): Promise { + await removeEntryFile(SANDBOX_BROWSERS_DIR, containerName); }