From 31b135a5ad1af389f340c6e44877fa8d78c7b393 Mon Sep 17 00:00:00 2001
From: mingdideng
Date: Tue, 28 Apr 2026 16:09:28 +0800
Subject: [PATCH] fix: handle ENOSPC file watcher errors gracefully

Problem:
- Gateway crashes with "ENOSPC: System limit for number of file watchers reached"
  when inotify watch limits are exhausted during memory watcher initialization
- Sessions are interrupted mid-tool-call, leaving users unable to continue

Root cause:
- isTransientUnhandledRejectionError() only checked network and SQLite errors
- ENOSPC fell through to default handler which calls process.exit(1)
- No error handler on chokidar watcher to catch initialization failures

Solution:
- Add isTransientFileWatchError() to classify file watcher errors as transient
- ENOSPC requires both error code AND watch/inotify message indicator to avoid
  misclassifying general disk-full errors (similar to hasSqliteSignal pattern)
- Errors without an error code must name the watcher/inotify in their message;
  generic "no space left on device" / "ENOSPC" text alone is never treated as
  transient, so real disk-full failures still surface
- Add watcher error handler to memory manager to log and continue in degraded mode
- Fix TypeScript type errors with proper type guards for 'unknown' values

Changes:
- src/infra/unhandled-rejections.ts: Add isTransientFileWatchError() with strict
  message pattern matching ('inotify', 'watcher', 'file watcher', etc.)
- extensions/memory-core/src/memory/manager-sync-ops.ts: Add .on("error") handler
- Comprehensive test coverage (61 tests) for error classification

Impact:
- Gateway no longer crashes when file watcher limits are reached
- Memory search continues in degraded mode (manual sync) when watcher fails
- Users see warning log instead of hard failure

Fixes: Gateway crash on "System limit for number of file watchers reached"
---
 .../src/memory/manager-sync-ops.ts     |   6 ++
 src/infra/unhandled-rejections.test.ts | 101 +++++++++++++++++++
 src/infra/unhandled-rejections.ts      |  64 +++++++++++-
 3 files changed, 170 insertions(+), 1 deletion(-)

diff --git a/extensions/memory-core/src/memory/manager-sync-ops.ts b/extensions/memory-core/src/memory/manager-sync-ops.ts
index f32d52aba3d..ca0881f4713 100644
--- a/extensions/memory-core/src/memory/manager-sync-ops.ts
+++ b/extensions/memory-core/src/memory/manager-sync-ops.ts
@@ -446,6 +446,12 @@ export abstract class MemoryManagerSyncOps {
     this.watcher.on("change", markDirty);
     this.watcher.on("unlink", markDirty);
     this.watcher.on("unlinkDir", markDirty);
+    this.watcher.on("error", (err) => {
+      // File watcher errors (e.g., ENOSPC) should not crash the gateway.
+      // Log the error and continue - memory search still works without auto-sync.
+      const message = err instanceof Error ? err.message : String(err);
+      log.warn(`memory watcher error: ${message}`);
+    });
   }
 
   protected ensureSessionListener() {
diff --git a/src/infra/unhandled-rejections.test.ts b/src/infra/unhandled-rejections.test.ts
index c3731d89b5b..fbe5bcc02db 100644
--- a/src/infra/unhandled-rejections.test.ts
+++ b/src/infra/unhandled-rejections.test.ts
@@ -2,6 +2,7 @@ import { describe, expect, it } from "vitest";
 import {
   isAbortError,
   isBenignUncaughtExceptionError,
+  isTransientFileWatchError,
   isTransientNetworkError,
   isTransientSqliteError,
   isTransientUnhandledRejectionError,
@@ -258,6 +259,89 @@ describe("isTransientSqliteError", () => {
   });
 });
 
+describe("isTransientFileWatchError", () => {
+  it("returns true for ENOSPC with inotify message", () => {
+    const error = Object.assign(new Error("inotify watches exhausted"), { code: "ENOSPC" });
+    expect(isTransientFileWatchError(error)).toBe(true);
+  });
+
+  it("returns true for ENOSPC with file watcher message", () => {
+    const error = Object.assign(new Error("System limit for number of file watchers reached"), {
+      code: "ENOSPC",
+    });
+    expect(isTransientFileWatchError(error)).toBe(true);
+  });
+
+  it("returns true for ENOSPC with watcher error message", () => {
+    const error = Object.assign(new Error("watcher error: ENOSPC"), { code: "ENOSPC" });
+    expect(isTransientFileWatchError(error)).toBe(true);
+  });
+
+  it("returns false for ENOSPC without watch indicator (general disk full)", () => {
+    const error = Object.assign(new Error("write failed: no space left on device"), {
+      code: "ENOSPC",
+    });
+    expect(isTransientFileWatchError(error)).toBe(false);
+  });
+
+  it("returns false for ENOSPC with only 'disk full' message", () => {
+    const error = Object.assign(new Error("ENOSPC: disk full"), { code: "ENOSPC" });
+    expect(isTransientFileWatchError(error)).toBe(false);
+    expect(isTransientFileWatchError(new Error("ENOSPC: disk full"))).toBe(false);
+  });
+
+  it("returns true for 'no space left on device' message with watcher context", () => {
+    const error = new Error("file watcher: no space left on device");
+    expect(isTransientFileWatchError(error)).toBe(true);
+  });
+
+  it("returns true for inotify-related error messages", () => {
+    expect(isTransientFileWatchError(new Error("inotify watches exhausted"))).toBe(true);
+    expect(
+      isTransientFileWatchError(new Error("System limit for number of file watchers reached")),
+    ).toBe(true);
+  });
+
+  it("returns true for watcher-related error messages", () => {
+    expect(isTransientFileWatchError(new Error("watcher error: ENOSPC"))).toBe(true);
+    expect(isTransientFileWatchError(new Error("file watcher failed"))).toBe(true);
+  });
+
+  it("returns true for ENOSPC with cause chain containing watch indicator", () => {
+    const cause = Object.assign(new Error("inotify watches exhausted"), { code: "ENOSPC" });
+    const error = Object.assign(new Error("watcher failed"), { cause });
+    expect(isTransientFileWatchError(error)).toBe(true);
+  });
+
+  it("returns false for 'watchdog timeout' (unrelated watch error)", () => {
+    expect(isTransientFileWatchError(new Error("watchdog timeout"))).toBe(false);
+    expect(isTransientFileWatchError(new Error("cannot watch process"))).toBe(false);
+  });
+
+  it("returns false for regular errors without file watch indicators", () => {
+    expect(isTransientFileWatchError(new Error("Something went wrong"))).toBe(false);
+    expect(isTransientFileWatchError(new TypeError("Cannot read property"))).toBe(false);
+    expect(isTransientFileWatchError(new RangeError("Invalid array length"))).toBe(false);
+  });
+
+  it("returns false for other disk errors without ENOSPC", () => {
+    expect(isTransientFileWatchError(new Error("disk quota exceeded"))).toBe(false);
+    expect(isTransientFileWatchError(new Error("no space left on device"))).toBe(false);
+    expect(
+      isTransientFileWatchError(
+        Object.assign(new Error("read only file system"), { code: "EROFS" }),
+      ),
+    ).toBe(false);
+  });
+
+  it.each([null, undefined, "string error", 42, { message: "plain object" }])(
+    "returns false for non-file-watch input %#",
+    (value) => {
+      expect(isTransientFileWatchError(value)).toBe(false);
+    },
+  );
+});
+
 describe("isTransientUnhandledRejectionError", () => {
   it("treats raw pre-connect network uncaught exceptions as benign", () => {
     const epipe = Object.assign(new Error("write EPIPE"), { code: "EPIPE" });
@@ -287,4 +371,21 @@ describe("isTransientUnhandledRejectionError", () => {
 
     expect(isTransientUnhandledRejectionError(error)).toBe(true);
   });
+
+  it("returns true for transient file watcher errors (ENOSPC + inotify)", () => {
+    const error = Object.assign(new Error("inotify watches exhausted"), { code: "ENOSPC" });
+    expect(isTransientUnhandledRejectionError(error)).toBe(true);
+  });
+
+  it("returns true for file watcher errors with message only", () => {
+    const error = new Error("System limit for number of file watchers reached");
+    expect(isTransientUnhandledRejectionError(error)).toBe(true);
+  });
+
+  it("returns false for ENOSPC without watch indicator (general disk full)", () => {
+    const error = Object.assign(new Error("write failed: no space left on device"), {
+      code: "ENOSPC",
+    });
+    expect(isTransientUnhandledRejectionError(error)).toBe(false);
+  });
 });
diff --git a/src/infra/unhandled-rejections.ts b/src/infra/unhandled-rejections.ts
index 81f57a05af3..ec1e0800bc5 100644
--- a/src/infra/unhandled-rejections.ts
+++ b/src/infra/unhandled-rejections.ts
@@ -350,8 +350,70 @@ export function isTransientSqliteError(err: unknown): boolean {
   return false;
 }
 
+/**
+ * Checks if an error is a transient file watcher error that shouldn't crash the gateway.
+ * These are typically resource exhaustion issues (e.g., inotify watches exhausted) that
+ * can be recovered from by degrading to manual sync mode.
+ *
+ * Note: ENOSPC is a general POSIX error code (disk full, write failures, etc.).
+ * A candidate with the ENOSPC code must also carry a watch/inotify-related message, and a
+ * candidate without that code must name the watcher/inotify explicitly. Generic disk-full
+ * text alone ("no space left on device", "ENOSPC: ...") is never classified as transient.
+ */
+export function isTransientFileWatchError(err: unknown): boolean {
+  if (!err) {
+    return false;
+  }
+
+  for (const candidate of collectNestedUnhandledErrorCandidates(err)) {
+    // Skip non-object candidates early
+    if (!candidate || typeof candidate !== "object") {
+      continue;
+    }
+
+    const code = extractErrorCodeOrErrno(candidate);
+    const rawMessage =
+      "message" in candidate && typeof candidate.message === "string" ? candidate.message : "";
+    const message = normalizeLowercaseStringOrEmpty(rawMessage);
+
+    // ENOSPC requires both the code AND a watch/inotify message indicator
+    // to avoid misclassifying general disk-full errors as transient watcher errors.
+    // ("watcher" also covers the "file watcher(s)" wording, so it is not listed twice.)
+    if (code === "ENOSPC") {
+      if (
+        message.includes("inotify") ||
+        message.includes("watcher") ||
+        message.includes("watch limit") ||
+        message.includes("max watches")
+      ) {
+        return true;
+      }
+      // ENOSPC without watch indicator is not classified here
+      continue;
+    }
+
+    // Without the ENOSPC code, only messages that explicitly name the watcher/inotify
+    // qualify. Generic disk-full text ("no space left on device", "enospc") is not
+    // matched here, so real storage failures are never swallowed as transient.
+    if (!message) {
+      continue;
+    }
+    if (
+      message.includes("inotify watches") ||
+      message.includes("file watcher") ||
+      message.includes("watcher error")
+    ) {
+      return true;
+    }
+  }
+
+  return false;
+}
+
 export function isTransientUnhandledRejectionError(err: unknown): boolean {
-  return isTransientNetworkError(err) || isTransientSqliteError(err);
+  return (
+    isTransientNetworkError(err) || isTransientSqliteError(err) || isTransientFileWatchError(err)
+  );
 }
 
 function isBenignUncaughtNetworkException(err: unknown): boolean {