diff --git a/CHANGELOG.md b/CHANGELOG.md index 04625b3fee5..a0490132c22 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -231,6 +231,7 @@ Docs: https://docs.openclaw.ai - Config/agents: accept `agents.list[].contextTokens` in strict config validation so per-agent overrides survive hot reload, letting `/status` reflect the configured model window instead of the 200k fallback. Fixes #70692. (#71247) Thanks @statxc. - Heartbeat: include async exec completion details in heartbeat prompts so command-finished notifications relay the actual output. (#71213) Thanks @GodsBoy. - Memory search: apply session visibility and agent-to-agent policy to session transcript hits, and keep `corpus=sessions` ranking scoped to session collections before result limiting. (#70761) Thanks @nefainl. +- Agents/sessions: stop session write-lock timeouts from entering model failover, so local lock contention surfaces directly instead of cascading across providers. (#68700) Thanks @MonkeyLeeT. ## 2026.4.23 diff --git a/src/agents/failover-error.test.ts b/src/agents/failover-error.test.ts index 86d6f167cb2..62655860660 100644 --- a/src/agents/failover-error.test.ts +++ b/src/agents/failover-error.test.ts @@ -8,6 +8,7 @@ import { resolveFailoverStatus, } from "./failover-error.js"; import { classifyFailoverSignal } from "./pi-embedded-helpers/errors.js"; +import { SessionWriteLockTimeoutError } from "./session-write-lock-error.js"; // OpenAI 429 example shape: https://help.openai.com/en/articles/5955604-how-can-i-solve-429-too-many-requests-errors const OPENAI_RATE_LIMIT_MESSAGE = @@ -359,6 +360,87 @@ describe("failover-error", () => { ).toBe("overloaded"); }); + it("does not classify session lock wait errors as model timeout failover", () => { + const sessionLockError = new SessionWriteLockTimeoutError({ + timeoutMs: 10_000, + owner: "pid=37121", + lockPath: "/tmp/openclaw/session.jsonl.lock", + }); + expect(resolveFailoverReasonFromError(sessionLockError)).toBeNull(); + 
expect(isTimeoutError(sessionLockError)).toBe(false); + + const wrappedLockError = Object.assign(new Error("operation timed out"), { + name: "AbortError", + cause: sessionLockError, + }); + expect(resolveFailoverReasonFromError(wrappedLockError)).toBeNull(); + expect(isTimeoutError(wrappedLockError)).toBe(false); + + const abortWrappedLockError = Object.assign(new Error("request was aborted"), { + name: "AbortError", + cause: sessionLockError, + }); + expect(resolveFailoverReasonFromError(abortWrappedLockError)).toBeNull(); + expect(isTimeoutError(abortWrappedLockError)).toBe(false); + }); + + it("keeps explicit provider failover metadata authoritative over nested session lock text", () => { + expect( + resolveFailoverReasonFromError({ + status: 429, + code: "RESOURCE_EXHAUSTED", + message: "upstream quota pressure", + cause: new SessionWriteLockTimeoutError({ + timeoutMs: 10_000, + owner: "pid=37121", + lockPath: "/tmp/openclaw/session.jsonl.lock", + }), + }), + ).toBe("rate_limit"); + }); + + it("keeps inferred HTTP failover metadata authoritative over nested session lock text", () => { + expect( + resolveFailoverReasonFromError({ + message: "HTTP 429: upstream quota pressure", + cause: new SessionWriteLockTimeoutError({ + timeoutMs: 10_000, + owner: "pid=37121", + lockPath: "/tmp/openclaw/session.jsonl.lock", + }), + }), + ).toBe("rate_limit"); + }); + + it("does not treat generic abort codes as explicit failover metadata over nested session lock text", () => { + expect( + resolveFailoverReasonFromError({ + name: "AbortError", + code: "ABORT_ERR", + message: "The operation was aborted", + cause: new SessionWriteLockTimeoutError({ + timeoutMs: 10_000, + owner: "pid=37121", + lockPath: "/tmp/openclaw/session.jsonl.lock", + }), + }), + ).toBeNull(); + }); + + it("does not let cause-based failover classification bypass wrapper session lock suppression", () => { + expect( + resolveFailoverReasonFromError({ + message: "wrapper", + reason: new 
SessionWriteLockTimeoutError({ + timeoutMs: 10_000, + owner: "pid=37121", + lockPath: "/tmp/openclaw/session.jsonl.lock", + }), + cause: new Error("operation timed out"), + }), + ).toBeNull(); + }); + it("classifies provider-scoped generic upstream errors for failover", () => { expect( resolveFailoverReasonFromError({ diff --git a/src/agents/failover-error.ts b/src/agents/failover-error.ts index c66ab11ee3b..9d61dd861cb 100644 --- a/src/agents/failover-error.ts +++ b/src/agents/failover-error.ts @@ -1,12 +1,14 @@ import { readErrorName } from "../infra/errors.js"; import { classifyFailoverSignal, + inferSignalStatus, isUnclassifiedNoBodyHttpSignal, type FailoverClassification, type FailoverSignal, } from "./pi-embedded-helpers/errors.js"; import { isTimeoutErrorMessage } from "./pi-embedded-helpers/errors.js"; import type { FailoverReason } from "./pi-embedded-helpers/types.js"; +import { isSessionWriteLockTimeoutError } from "./session-write-lock-error.js"; const ABORT_TIMEOUT_RE = /request was aborted|request aborted/i; const MAX_FAILOVER_CAUSE_DEPTH = 25; @@ -198,10 +200,32 @@ function normalizeDirectErrorSignal(err: unknown): FailoverSignal { }; } +function hasSessionWriteLockTimeout(err: unknown, seen: Set<unknown> = new Set()): boolean { + if (isSessionWriteLockTimeoutError(err)) { + return true; + } + if (!err || typeof err !== "object") { + return false; + } + if (seen.has(err)) { + return false; + } + seen.add(err); + const candidate = err as { error?: unknown; cause?: unknown; reason?: unknown }; + return ( + hasSessionWriteLockTimeout(candidate.error, seen) || + hasSessionWriteLockTimeout(candidate.cause, seen) || + hasSessionWriteLockTimeout(candidate.reason, seen) + ); +} + function hasTimeoutHint(err: unknown): boolean { if (!err) { return false; } + if (hasSessionWriteLockTimeout(err)) { + return false; + } if (readErrorName(err) === "TimeoutError") { return true; } @@ -219,6 +243,9 @@ export function isTimeoutError(err: unknown): boolean { if 
(readErrorName(err) !== "AbortError") { return false; } + if (hasSessionWriteLockTimeout(err)) { + return false; + } const message = getErrorMessage(err); if (message && ABORT_TIMEOUT_RE.test(message)) { return true; @@ -316,8 +343,15 @@ function resolveFailoverClassificationFromErrorInternal( reason: err.reason, }; } - const signal = normalizeErrorSignal(err); + const codeReason = signal.code + ? failoverReasonFromClassification(classifyFailoverSignal({ code: signal.code })) + : null; + const hasExplicitFailoverMetadata = + typeof inferSignalStatus(signal) === "number" || + (codeReason !== null && codeReason !== "timeout"); + const hasSessionLock = hasSessionWriteLockTimeout(err); + const classification = classifyFailoverSignal(signal); const nestedCandidates = getNestedErrorCandidates(err); @@ -329,6 +363,9 @@ function resolveFailoverClassificationFromErrorInternal( depth + 1, ); if (nestedClassification) { + if (hasSessionLock && !hasExplicitFailoverMetadata) { + return null; + } return nestedClassification; } } @@ -352,9 +389,16 @@ function resolveFailoverClassificationFromErrorInternal( } if (classification) { + if (hasSessionLock && !hasExplicitFailoverMetadata) { + return null; + } return classification; } + if (hasSessionLock) { + return null; + } + if (isTimeoutError(err)) { return { kind: "reason", diff --git a/src/agents/pi-embedded-helpers/errors.ts b/src/agents/pi-embedded-helpers/errors.ts index 44086dba835..ea9a9b25eb0 100644 --- a/src/agents/pi-embedded-helpers/errors.ts +++ b/src/agents/pi-embedded-helpers/errors.ts @@ -337,7 +337,7 @@ function stripErrorPrefix(raw: string): string { return raw.replace(/^error:\s*/i, "").trim(); } -function inferSignalStatus(signal: FailoverSignal): number | undefined { +export function inferSignalStatus(signal: FailoverSignal): number | undefined { if (typeof signal.status === "number" && Number.isFinite(signal.status)) { return signal.status; } diff --git a/src/agents/session-write-lock-error.ts 
b/src/agents/session-write-lock-error.ts new file mode 100644 index 00000000000..8b1826ad457 --- /dev/null +++ b/src/agents/session-write-lock-error.ts @@ -0,0 +1,29 @@ +export const SESSION_WRITE_LOCK_TIMEOUT_CODE = "OPENCLAW_SESSION_WRITE_LOCK_TIMEOUT"; + +export class SessionWriteLockTimeoutError extends Error { + readonly code = SESSION_WRITE_LOCK_TIMEOUT_CODE; + readonly timeoutMs: number; + readonly owner: string; + readonly lockPath: string; + + constructor(params: { timeoutMs: number; owner: string; lockPath: string }) { + super( + `session file locked (timeout ${params.timeoutMs}ms): ${params.owner} ${params.lockPath}`, + ); + this.name = "SessionWriteLockTimeoutError"; + this.timeoutMs = params.timeoutMs; + this.owner = params.owner; + this.lockPath = params.lockPath; + } +} + +export function isSessionWriteLockTimeoutError(err: unknown): boolean { + return ( + err instanceof SessionWriteLockTimeoutError || + Boolean( + err && + typeof err === "object" && + (err as { code?: unknown }).code === SESSION_WRITE_LOCK_TIMEOUT_CODE, + ) + ); +} diff --git a/src/agents/session-write-lock.ts b/src/agents/session-write-lock.ts index e0113c9676e..7be469185fe 100644 --- a/src/agents/session-write-lock.ts +++ b/src/agents/session-write-lock.ts @@ -3,6 +3,7 @@ import fs from "node:fs/promises"; import path from "node:path"; import { getProcessStartTime, isPidAlive } from "../shared/pid-alive.js"; import { resolveProcessScopedMap } from "../shared/process-scoped-map.js"; +import { SessionWriteLockTimeoutError } from "./session-write-lock-error.js"; type LockFilePayload = { pid?: number; @@ -584,7 +585,7 @@ export async function acquireSessionWriteLock(params: { const payload = await readLockPayload(lockPath); const owner = typeof payload?.pid === "number" ? 
`pid=${payload.pid}` : "unknown"; - throw new Error(`session file locked (timeout ${timeoutMs}ms): ${owner} ${lockPath}`); + throw new SessionWriteLockTimeoutError({ timeoutMs, owner, lockPath }); } export const __testing = {