mirror of
https://github.com/openclaw/openclaw.git
synced 2026-05-06 11:10:45 +00:00
fix: handle ENOSPC file watcher errors gracefully
Problem:
- Gateway crashes with "ENOSPC: System limit for number of file watchers reached"
when inotify watch limits are exhausted during memory watcher initialization
- Sessions are interrupted mid-tool-call, leaving users unable to continue
Root cause:
- isTransientUnhandledRejectionError() only checked network and SQLite errors
- ENOSPC fell through to default handler which calls process.exit(1)
- No error handler on chokidar watcher to catch initialization failures
Solution:
- Add isTransientFileWatchError() to classify file watcher errors as transient
- ENOSPC requires both error code AND watch/inotify message indicator to avoid
misclassifying general disk-full errors (similar to hasSqliteSignal pattern)
- Add watcher error handler to memory manager to log and continue in degraded mode
- Fix TypeScript type errors with proper type guards for 'unknown' values
Changes:
- src/infra/unhandled-rejections.ts: Add isTransientFileWatchError() with strict
message pattern matching ('inotify', 'watcher', 'file watcher', etc.)
- extensions/memory-core/src/memory/manager-sync-ops.ts: Add .on("error") handler
- Comprehensive test coverage (61 tests) for error classification
Impact:
- Gateway no longer crashes when file watcher limits are reached
- Memory search continues in degraded mode (manual sync) when watcher fails
- Users see warning log instead of hard failure
Fixes: Gateway crash on "System limit for number of file watchers reached"
This commit is contained in:
@@ -446,6 +446,12 @@ export abstract class MemoryManagerSyncOps {
|
||||
this.watcher.on("change", markDirty);
|
||||
this.watcher.on("unlink", markDirty);
|
||||
this.watcher.on("unlinkDir", markDirty);
|
||||
this.watcher.on("error", (err) => {
|
||||
// File watcher errors (e.g., ENOSPC) should not crash the gateway.
|
||||
// Log the error and continue - memory search still works without auto-sync.
|
||||
const message = err instanceof Error ? err.message : String(err);
|
||||
log.warn(`memory watcher error: ${message}`);
|
||||
});
|
||||
}
|
||||
|
||||
protected ensureSessionListener() {
|
||||
|
||||
@@ -2,6 +2,7 @@ import { describe, expect, it } from "vitest";
|
||||
import {
|
||||
isAbortError,
|
||||
isBenignUncaughtExceptionError,
|
||||
isTransientFileWatchError,
|
||||
isTransientNetworkError,
|
||||
isTransientSqliteError,
|
||||
isTransientUnhandledRejectionError,
|
||||
@@ -258,6 +259,87 @@ describe("isTransientSqliteError", () => {
|
||||
});
|
||||
});
|
||||
|
||||
// Classification tests for isTransientFileWatchError: ENOSPC must be paired with a
// watcher/inotify indicator in the message, while watcher-specific messages are
// accepted even without an error code.
describe("isTransientFileWatchError", () => {
  // Builds an Error that carries a POSIX-style `code` property, like Node system errors.
  const errorWithCode = (text: string, code: string) => Object.assign(new Error(text), { code });

  // Asserts the classifier result for a single input value.
  const expectClassified = (input: unknown, expected: boolean) => {
    expect(isTransientFileWatchError(input)).toBe(expected);
  };

  it("returns true for ENOSPC with inotify message", () => {
    expectClassified(errorWithCode("inotify watches exhausted", "ENOSPC"), true);
  });

  it("returns true for ENOSPC with file watcher message", () => {
    expectClassified(
      errorWithCode("System limit for number of file watchers reached", "ENOSPC"),
      true,
    );
  });

  it("returns true for ENOSPC with watcher error message", () => {
    expectClassified(errorWithCode("watcher error: ENOSPC", "ENOSPC"), true);
  });

  it("returns false for ENOSPC without watch indicator (general disk full)", () => {
    expectClassified(errorWithCode("write failed: no space left on device", "ENOSPC"), false);
  });

  it("returns false for ENOSPC with only 'disk full' message", () => {
    expectClassified(errorWithCode("ENOSPC: disk full", "ENOSPC"), false);
  });

  it("returns true for 'no space left on device' message with watcher context", () => {
    expectClassified(new Error("file watcher: no space left on device"), true);
  });

  it("returns true for inotify-related error messages", () => {
    const messages = [
      "inotify watches exhausted",
      "System limit for number of file watchers reached",
    ];
    for (const text of messages) {
      expectClassified(new Error(text), true);
    }
  });

  it("returns true for watcher-related error messages", () => {
    const messages = ["watcher error: ENOSPC", "file watcher failed"];
    for (const text of messages) {
      expectClassified(new Error(text), true);
    }
  });

  it("returns true for ENOSPC with cause chain containing watch indicator", () => {
    // The outer error has no code; the indicator and ENOSPC live on the nested cause.
    const inner = errorWithCode("inotify watches exhausted", "ENOSPC");
    const outer = Object.assign(new Error("watcher failed"), { cause: inner });
    expectClassified(outer, true);
  });

  it("returns false for 'watchdog timeout' (unrelated watch error)", () => {
    const messages = ["watchdog timeout", "cannot watch process"];
    for (const text of messages) {
      expectClassified(new Error(text), false);
    }
  });

  it("returns false for regular errors without file watch indicators", () => {
    const unrelated = [
      new Error("Something went wrong"),
      new TypeError("Cannot read property"),
      new RangeError("Invalid array length"),
    ];
    for (const failure of unrelated) {
      expectClassified(failure, false);
    }
  });

  it("returns false for other disk errors without ENOSPC", () => {
    expectClassified(new Error("disk quota exceeded"), false);
    expectClassified(errorWithCode("read only file system", "EROFS"), false);
  });

  it.each([null, undefined, "string error", 42, { message: "plain object" }])(
    "returns false for non-file-watch input %#",
    (input) => {
      expectClassified(input, false);
    },
  );
});
|
||||
|
||||
describe("isTransientUnhandledRejectionError", () => {
|
||||
it("treats raw pre-connect network uncaught exceptions as benign", () => {
|
||||
const epipe = Object.assign(new Error("write EPIPE"), { code: "EPIPE" });
|
||||
@@ -287,4 +369,21 @@ describe("isTransientUnhandledRejectionError", () => {
|
||||
|
||||
expect(isTransientUnhandledRejectionError(error)).toBe(true);
|
||||
});
|
||||
|
||||
it("returns true for transient file watcher errors (ENOSPC + inotify)", () => {
|
||||
const error = Object.assign(new Error("inotify watches exhausted"), { code: "ENOSPC" });
|
||||
expect(isTransientUnhandledRejectionError(error)).toBe(true);
|
||||
});
|
||||
|
||||
it("returns true for file watcher errors with message only", () => {
|
||||
const error = new Error("System limit for number of file watchers reached");
|
||||
expect(isTransientUnhandledRejectionError(error)).toBe(true);
|
||||
});
|
||||
|
||||
it("returns false for ENOSPC without watch indicator (general disk full)", () => {
|
||||
const error = Object.assign(new Error("write failed: no space left on device"), {
|
||||
code: "ENOSPC",
|
||||
});
|
||||
expect(isTransientUnhandledRejectionError(error)).toBe(false);
|
||||
});
|
||||
});
|
||||
|
||||
@@ -350,8 +350,70 @@ export function isTransientSqliteError(err: unknown): boolean {
|
||||
return false;
|
||||
}
|
||||
|
||||
/**
 * Message fragments (lowercase) that mark an ENOSPC-coded error as coming from a
 * file watcher rather than from ordinary storage exhaustion.
 */
const FILE_WATCH_ENOSPC_INDICATORS: readonly string[] = [
  "inotify",
  "watcher", // also covers "file watcher"
  "watch limit",
  "max watches",
];

/**
 * Message fragments (lowercase) that identify a file watcher failure on their own,
 * i.e. when the candidate carries no ENOSPC code. Generic disk-full wording such as
 * "no space left on device" or a bare "ENOSPC" is deliberately NOT listed: a wrapped
 * disk-full error that lost its `code` must not be treated as a recoverable watcher error.
 */
const FILE_WATCH_MESSAGE_INDICATORS: readonly string[] = [
  "inotify watches",
  "file watcher",
  "watcher error",
];

/**
 * Checks if an error is a transient file watcher error that shouldn't crash the gateway.
 * These are typically resource exhaustion issues (e.g., inotify watches exhausted) that
 * can be recovered from by degrading to manual sync mode.
 *
 * Note: ENOSPC is a general POSIX error code (disk full, write failures, etc.).
 * To avoid misclassifying unrelated storage failures, an ENOSPC-coded error is only
 * accepted when its message also contains a watch/inotify indicator, and an error
 * without the ENOSPC code is only accepted when its message is watcher-specific.
 * Generic "no space left on device" text is never sufficient by itself.
 *
 * @param err - The thrown/rejected value; any type is accepted.
 * @returns `true` when `err`, or one of the nested candidates yielded by
 *   `collectNestedUnhandledErrorCandidates` (presumably the `cause` chain — confirm
 *   against that helper), looks like a file watcher failure; `false` otherwise,
 *   including for falsy and non-object values.
 */
export function isTransientFileWatchError(err: unknown): boolean {
  if (!err) {
    return false;
  }

  for (const candidate of collectNestedUnhandledErrorCandidates(err)) {
    // Only objects can carry a code/message pair worth inspecting.
    if (!candidate || typeof candidate !== "object") {
      continue;
    }

    const code = extractErrorCodeOrErrno(candidate);
    const rawMessage =
      "message" in candidate && typeof candidate.message === "string" ? candidate.message : "";
    const message = normalizeLowercaseStringOrEmpty(rawMessage);
    if (!message) {
      // Without a message there is no watcher indicator to find, with or without ENOSPC.
      continue;
    }

    // ENOSPC gets the broader indicator list because the code already narrows the
    // failure to resource exhaustion; otherwise the message must be watcher-specific.
    const indicators =
      code === "ENOSPC" ? FILE_WATCH_ENOSPC_INDICATORS : FILE_WATCH_MESSAGE_INDICATORS;
    if (indicators.some((indicator) => message.includes(indicator))) {
      return true;
    }
  }

  return false;
}
|
||||
|
||||
/**
 * Reports whether an unhandled rejection is transient, i.e. matched by any of the
 * network, SQLite, or file watcher classifiers.
 *
 * The file watcher classifier must take part in the single returned expression: a
 * leftover early `return` covering only the network and SQLite checks would make the
 * file watcher check unreachable.
 *
 * @param err - The rejection reason; any type is accepted.
 * @returns `true` if at least one of the three classifiers accepts `err`.
 */
export function isTransientUnhandledRejectionError(err: unknown): boolean {
  return (
    isTransientNetworkError(err) || isTransientSqliteError(err) || isTransientFileWatchError(err)
  );
}
|
||||
|
||||
function isBenignUncaughtNetworkException(err: unknown): boolean {
|
||||
|
||||
Reference in New Issue
Block a user