fix: stop session lock failover (#68700) (thanks @MonkeyLeeT)

* fix(agents): stop treating session lock waits as timeout

* fix(agents): ignore abort-wrapped session lock waits

* fix(agents): keep explicit failover metadata authoritative

* fix(agents): respect inferred failover metadata

* fix(agents): ignore generic abort codes for lock waits

* fix(agents): suppress cause-based lock wait fallback

* fix(agents): type session lock timeout errors

* fix: stop session lock failover (#68700) (thanks @MonkeyLeeT)

---------

Co-authored-by: Ayaan Zaidi <hi@obviy.us>
This commit is contained in:
Ted Li
2026-04-24 21:03:19 -07:00
committed by GitHub
parent c03e5b3c3a
commit 8cc38c1b86
6 changed files with 160 additions and 3 deletions

View File

@@ -231,6 +231,7 @@ Docs: https://docs.openclaw.ai
- Config/agents: accept `agents.list[].contextTokens` in strict config validation so per-agent overrides survive hot reload, letting `/status` reflect the configured model window instead of the 200k fallback. Fixes #70692. (#71247) Thanks @statxc.
- Heartbeat: include async exec completion details in heartbeat prompts so command-finished notifications relay the actual output. (#71213) Thanks @GodsBoy.
- Memory search: apply session visibility and agent-to-agent policy to session transcript hits, and keep `corpus=sessions` ranking scoped to session collections before result limiting. (#70761) Thanks @nefainl.
- Agents/sessions: stop session write-lock timeouts from entering model failover, so local lock contention surfaces directly instead of cascading across providers. (#68700) Thanks @MonkeyLeeT.
## 2026.4.23

View File

@@ -8,6 +8,7 @@ import {
resolveFailoverStatus,
} from "./failover-error.js";
import { classifyFailoverSignal } from "./pi-embedded-helpers/errors.js";
import { SessionWriteLockTimeoutError } from "./session-write-lock-error.js";
// OpenAI 429 example shape: https://help.openai.com/en/articles/5955604-how-can-i-solve-429-too-many-requests-errors
const OPENAI_RATE_LIMIT_MESSAGE =
@@ -359,6 +360,87 @@ describe("failover-error", () => {
).toBe("overloaded");
});
it("does not classify session lock wait errors as model timeout failover", () => {
  const lockError = new SessionWriteLockTimeoutError({
    timeoutMs: 10_000,
    owner: "pid=37121",
    lockPath: "/tmp/openclaw/session.jsonl.lock",
  });

  // Shared expectation: no failover reason, and not reported as a timeout.
  const expectNoTimeoutFailover = (candidate: unknown): void => {
    expect(resolveFailoverReasonFromError(candidate)).toBeNull();
    expect(isTimeoutError(candidate)).toBe(false);
  };

  // The bare lock timeout error itself.
  expectNoTimeoutFailover(lockError);

  // AbortError wrappers whose `cause` is the lock timeout, first with a
  // timeout-style message and then with an abort-style message.
  for (const wrapperMessage of ["operation timed out", "request was aborted"]) {
    const wrapped = Object.assign(new Error(wrapperMessage), {
      name: "AbortError",
      cause: lockError,
    });
    expectNoTimeoutFailover(wrapped);
  }
});
it("keeps explicit provider failover metadata authoritative over nested session lock text", () => {
  // The lock timeout only appears as the nested cause.
  const nestedLockError = new SessionWriteLockTimeoutError({
    timeoutMs: 10_000,
    owner: "pid=37121",
    lockPath: "/tmp/openclaw/session.jsonl.lock",
  });
  // The wrapper carries its own status and provider code.
  const providerError = {
    status: 429,
    code: "RESOURCE_EXHAUSTED",
    message: "upstream quota pressure",
    cause: nestedLockError,
  };

  const reason = resolveFailoverReasonFromError(providerError);

  expect(reason).toBe("rate_limit");
});
it("keeps inferred HTTP failover metadata authoritative over nested session lock text", () => {
  const nestedLockError = new SessionWriteLockTimeoutError({
    timeoutMs: 10_000,
    owner: "pid=37121",
    lockPath: "/tmp/openclaw/session.jsonl.lock",
  });
  // No `status` field here: the HTTP status is only present in the message text.
  const providerError = {
    message: "HTTP 429: upstream quota pressure",
    cause: nestedLockError,
  };

  const reason = resolveFailoverReasonFromError(providerError);

  expect(reason).toBe("rate_limit");
});
it("does not treat generic abort codes as explicit failover metadata over nested session lock text", () => {
  const nestedLockError = new SessionWriteLockTimeoutError({
    timeoutMs: 10_000,
    owner: "pid=37121",
    lockPath: "/tmp/openclaw/session.jsonl.lock",
  });
  // An abort-shaped wrapper (name + code + message) around the lock timeout.
  const abortWrapper = {
    name: "AbortError",
    code: "ABORT_ERR",
    message: "The operation was aborted",
    cause: nestedLockError,
  };

  const reason = resolveFailoverReasonFromError(abortWrapper);

  expect(reason).toBeNull();
});
it("does not let cause-based failover classification bypass wrapper session lock suppression", () => {
  // The lock timeout sits under `reason`, while `cause` is an unrelated
  // timeout-looking error.
  const lockError = new SessionWriteLockTimeoutError({
    timeoutMs: 10_000,
    owner: "pid=37121",
    lockPath: "/tmp/openclaw/session.jsonl.lock",
  });
  const timeoutLookingCause = new Error("operation timed out");
  const wrapper = {
    message: "wrapper",
    reason: lockError,
    cause: timeoutLookingCause,
  };

  const reason = resolveFailoverReasonFromError(wrapper);

  expect(reason).toBeNull();
});
it("classifies provider-scoped generic upstream errors for failover", () => {
expect(
resolveFailoverReasonFromError({

View File

@@ -1,12 +1,14 @@
import { readErrorName } from "../infra/errors.js";
import {
classifyFailoverSignal,
inferSignalStatus,
isUnclassifiedNoBodyHttpSignal,
type FailoverClassification,
type FailoverSignal,
} from "./pi-embedded-helpers/errors.js";
import { isTimeoutErrorMessage } from "./pi-embedded-helpers/errors.js";
import type { FailoverReason } from "./pi-embedded-helpers/types.js";
import { isSessionWriteLockTimeoutError } from "./session-write-lock-error.js";
// Case-insensitive pattern for abort-style messages; isTimeoutError tests an
// AbortError's message against it.
const ABORT_TIMEOUT_RE = /request was aborted|request aborted/i;
// NOTE(review): presumably caps how deep nested errors are followed during
// failover classification — confirm against its usage, which is not shown here.
const MAX_FAILOVER_CAUSE_DEPTH = 25;
@@ -198,10 +200,32 @@ function normalizeDirectErrorSignal(err: unknown): FailoverSignal {
};
}
/**
 * Reports whether `err`, or anything reachable from it through its `error`,
 * `cause`, or `reason` properties, is a session write-lock timeout error.
 *
 * @param err - Value to inspect; non-objects can only match directly.
 * @param seen - Objects already visited in this walk. A revisited object
 *   yields `false`, so cyclic chains terminate.
 * @returns `true` when a session write-lock timeout is found, else `false`.
 */
function hasSessionWriteLockTimeout(err: unknown, seen: Set<object> = new Set()): boolean {
  if (isSessionWriteLockTimeoutError(err)) {
    return true;
  }
  if (!err || typeof err !== "object" || seen.has(err)) {
    return false;
  }
  seen.add(err);

  const wrapper = err as { error?: unknown; cause?: unknown; reason?: unknown };
  const nestedKeys = ["error", "cause", "reason"] as const;
  // `some` stops at the first hit, so properties are read lazily and in order.
  return nestedKeys.some((key) => hasSessionWriteLockTimeout(wrapper[key], seen));
}
function hasTimeoutHint(err: unknown): boolean {
if (!err) {
return false;
}
if (hasSessionWriteLockTimeout(err)) {
return false;
}
if (readErrorName(err) === "TimeoutError") {
return true;
}
@@ -219,6 +243,9 @@ export function isTimeoutError(err: unknown): boolean {
if (readErrorName(err) !== "AbortError") {
return false;
}
if (hasSessionWriteLockTimeout(err)) {
return false;
}
const message = getErrorMessage(err);
if (message && ABORT_TIMEOUT_RE.test(message)) {
return true;
@@ -316,8 +343,15 @@ function resolveFailoverClassificationFromErrorInternal(
reason: err.reason,
};
}
const signal = normalizeErrorSignal(err);
const codeReason = signal.code
? failoverReasonFromClassification(classifyFailoverSignal({ code: signal.code }))
: null;
const hasExplicitFailoverMetadata =
typeof inferSignalStatus(signal) === "number" ||
(codeReason !== null && codeReason !== "timeout");
const hasSessionLock = hasSessionWriteLockTimeout(err);
const classification = classifyFailoverSignal(signal);
const nestedCandidates = getNestedErrorCandidates(err);
@@ -329,6 +363,9 @@ function resolveFailoverClassificationFromErrorInternal(
depth + 1,
);
if (nestedClassification) {
if (hasSessionLock && !hasExplicitFailoverMetadata) {
return null;
}
return nestedClassification;
}
}
@@ -352,9 +389,16 @@ function resolveFailoverClassificationFromErrorInternal(
}
if (classification) {
if (hasSessionLock && !hasExplicitFailoverMetadata) {
return null;
}
return classification;
}
if (hasSessionLock) {
return null;
}
if (isTimeoutError(err)) {
return {
kind: "reason",

View File

@@ -337,7 +337,7 @@ function stripErrorPrefix(raw: string): string {
return raw.replace(/^error:\s*/i, "").trim();
}
function inferSignalStatus(signal: FailoverSignal): number | undefined {
export function inferSignalStatus(signal: FailoverSignal): number | undefined {
if (typeof signal.status === "number" && Number.isFinite(signal.status)) {
return signal.status;
}

View File

@@ -0,0 +1,29 @@
/** Machine-readable code attached to every session write-lock timeout error. */
export const SESSION_WRITE_LOCK_TIMEOUT_CODE = "OPENCLAW_SESSION_WRITE_LOCK_TIMEOUT";

/** Builds the human-readable message for a session write-lock timeout. */
function formatSessionWriteLockTimeoutMessage(params: {
  timeoutMs: number;
  owner: string;
  lockPath: string;
}): string {
  return `session file locked (timeout ${params.timeoutMs}ms): ${params.owner} ${params.lockPath}`;
}

/**
 * Typed error describing a session write-lock timeout.
 *
 * Carries the timeout, the lock owner description, and the lock file path as
 * fields, plus a fixed `code` so the error can be recognized without parsing
 * its message.
 */
export class SessionWriteLockTimeoutError extends Error {
  readonly code = SESSION_WRITE_LOCK_TIMEOUT_CODE;
  readonly timeoutMs: number;
  readonly owner: string;
  readonly lockPath: string;

  /**
   * @param params.timeoutMs - Timeout in milliseconds, echoed in the message.
   * @param params.owner - Description of the lock owner, echoed in the message.
   * @param params.lockPath - Path of the lock file, echoed in the message.
   */
  constructor(params: { timeoutMs: number; owner: string; lockPath: string }) {
    super(formatSessionWriteLockTimeoutMessage(params));
    const { timeoutMs, owner, lockPath } = params;
    this.name = "SessionWriteLockTimeoutError";
    this.timeoutMs = timeoutMs;
    this.owner = owner;
    this.lockPath = lockPath;
  }
}
/**
 * Reports whether `err` is a session write-lock timeout.
 *
 * Matches either a `SessionWriteLockTimeoutError` instance or any object whose
 * `code` property equals `SESSION_WRITE_LOCK_TIMEOUT_CODE`.
 *
 * @param err - Value to test.
 * @returns `true` when `err` matches by class or by code, else `false`.
 */
export function isSessionWriteLockTimeoutError(err: unknown): boolean {
  if (err instanceof SessionWriteLockTimeoutError) {
    return true;
  }
  if (!err || typeof err !== "object") {
    return false;
  }
  // Duck-typed match on the code alone, for objects that are not instances of the class.
  return (err as { code?: unknown }).code === SESSION_WRITE_LOCK_TIMEOUT_CODE;
}

View File

@@ -3,6 +3,7 @@ import fs from "node:fs/promises";
import path from "node:path";
import { getProcessStartTime, isPidAlive } from "../shared/pid-alive.js";
import { resolveProcessScopedMap } from "../shared/process-scoped-map.js";
import { SessionWriteLockTimeoutError } from "./session-write-lock-error.js";
type LockFilePayload = {
pid?: number;
@@ -584,7 +585,7 @@ export async function acquireSessionWriteLock(params: {
const payload = await readLockPayload(lockPath);
const owner = typeof payload?.pid === "number" ? `pid=${payload.pid}` : "unknown";
throw new Error(`session file locked (timeout ${timeoutMs}ms): ${owner} ${lockPath}`);
throw new SessionWriteLockTimeoutError({ timeoutMs, owner, lockPath });
}
export const __testing = {