mirror of
https://github.com/openclaw/openclaw.git
synced 2026-05-06 16:30:57 +00:00
fix: stop session lock failover (#68700) (thanks @MonkeyLeeT)
* fix(agents): stop treating session lock waits as timeout
* fix(agents): ignore abort-wrapped session lock waits
* fix(agents): keep explicit failover metadata authoritative
* fix(agents): respect inferred failover metadata
* fix(agents): ignore generic abort codes for lock waits
* fix(agents): suppress cause-based lock wait fallback
* fix(agents): type session lock timeout errors
* fix: stop session lock failover (#68700) (thanks @MonkeyLeeT)

---------

Co-authored-by: Ayaan Zaidi <hi@obviy.us>
This commit is contained in:
@@ -231,6 +231,7 @@ Docs: https://docs.openclaw.ai
|
||||
- Config/agents: accept `agents.list[].contextTokens` in strict config validation so per-agent overrides survive hot reload, letting `/status` reflect the configured model window instead of the 200k fallback. Fixes #70692. (#71247) Thanks @statxc.
|
||||
- Heartbeat: include async exec completion details in heartbeat prompts so command-finished notifications relay the actual output. (#71213) Thanks @GodsBoy.
|
||||
- Memory search: apply session visibility and agent-to-agent policy to session transcript hits, and keep `corpus=sessions` ranking scoped to session collections before result limiting. (#70761) Thanks @nefainl.
|
||||
- Agents/sessions: stop session write-lock timeouts from entering model failover, so local lock contention surfaces directly instead of cascading across providers. (#68700) Thanks @MonkeyLeeT.
|
||||
|
||||
## 2026.4.23
|
||||
|
||||
|
||||
@@ -8,6 +8,7 @@ import {
|
||||
resolveFailoverStatus,
|
||||
} from "./failover-error.js";
|
||||
import { classifyFailoverSignal } from "./pi-embedded-helpers/errors.js";
|
||||
import { SessionWriteLockTimeoutError } from "./session-write-lock-error.js";
|
||||
|
||||
// OpenAI 429 example shape: https://help.openai.com/en/articles/5955604-how-can-i-solve-429-too-many-requests-errors
|
||||
const OPENAI_RATE_LIMIT_MESSAGE =
|
||||
@@ -359,6 +360,87 @@ describe("failover-error", () => {
|
||||
).toBe("overloaded");
|
||||
});
|
||||
|
||||
it("does not classify session lock wait errors as model timeout failover", () => {
  // The bare lock-wait error must yield neither a failover reason nor a timeout verdict.
  const lockTimeout = new SessionWriteLockTimeoutError({
    timeoutMs: 10_000,
    owner: "pid=37121",
    lockPath: "/tmp/openclaw/session.jsonl.lock",
  });
  expect(resolveFailoverReasonFromError(lockTimeout)).toBeNull();
  expect(isTimeoutError(lockTimeout)).toBe(false);

  // The same must hold when an AbortError wraps it via `cause`, whether the
  // wrapper text reads like a timeout or like an aborted request.
  for (const wrapperMessage of ["operation timed out", "request was aborted"]) {
    const wrapped = Object.assign(new Error(wrapperMessage), {
      name: "AbortError",
      cause: lockTimeout,
    });
    expect(resolveFailoverReasonFromError(wrapped)).toBeNull();
    expect(isTimeoutError(wrapped)).toBe(false);
  }
});
|
||||
|
||||
it("keeps explicit provider failover metadata authoritative over nested session lock text", () => {
  // Lock-wait error that only appears as the `cause` of a provider error.
  const nestedLockTimeout = new SessionWriteLockTimeoutError({
    timeoutMs: 10_000,
    owner: "pid=37121",
    lockPath: "/tmp/openclaw/session.jsonl.lock",
  });
  // The wrapper carries its own status and provider code; those are expected to decide the reason.
  const providerError = {
    status: 429,
    code: "RESOURCE_EXHAUSTED",
    message: "upstream quota pressure",
    cause: nestedLockTimeout,
  };

  const reason = resolveFailoverReasonFromError(providerError);

  expect(reason).toBe("rate_limit");
});
|
||||
|
||||
it("keeps inferred HTTP failover metadata authoritative over nested session lock text", () => {
  // Lock-wait error that only appears as the `cause` of the wrapper.
  const nestedLockTimeout = new SessionWriteLockTimeoutError({
    timeoutMs: 10_000,
    owner: "pid=37121",
    lockPath: "/tmp/openclaw/session.jsonl.lock",
  });
  // No `status` field here: the 429 is present only inside the message text.
  const httpError = {
    message: "HTTP 429: upstream quota pressure",
    cause: nestedLockTimeout,
  };

  const reason = resolveFailoverReasonFromError(httpError);

  expect(reason).toBe("rate_limit");
});
|
||||
|
||||
it("does not treat generic abort codes as explicit failover metadata over nested session lock text", () => {
  // Lock-wait error that only appears as the `cause` of an abort-shaped wrapper.
  const nestedLockTimeout = new SessionWriteLockTimeoutError({
    timeoutMs: 10_000,
    owner: "pid=37121",
    lockPath: "/tmp/openclaw/session.jsonl.lock",
  });
  // The wrapper has a `code`, but only the generic abort one; the expectation
  // below is that it does not override the nested lock-wait error.
  const abortError = {
    name: "AbortError",
    code: "ABORT_ERR",
    message: "The operation was aborted",
    cause: nestedLockTimeout,
  };

  const reason = resolveFailoverReasonFromError(abortError);

  expect(reason).toBeNull();
});
|
||||
|
||||
it("does not let cause-based failover classification bypass wrapper session lock suppression", () => {
  // The lock-wait error sits under `reason`, while `cause` holds an unrelated
  // error whose text reads like a timeout.
  const lockTimeoutReason = new SessionWriteLockTimeoutError({
    timeoutMs: 10_000,
    owner: "pid=37121",
    lockPath: "/tmp/openclaw/session.jsonl.lock",
  });
  const wrapper = {
    message: "wrapper",
    reason: lockTimeoutReason,
    cause: new Error("operation timed out"),
  };

  const reason = resolveFailoverReasonFromError(wrapper);

  expect(reason).toBeNull();
});
|
||||
|
||||
it("classifies provider-scoped generic upstream errors for failover", () => {
|
||||
expect(
|
||||
resolveFailoverReasonFromError({
|
||||
|
||||
@@ -1,12 +1,14 @@
|
||||
import { readErrorName } from "../infra/errors.js";
|
||||
import {
|
||||
classifyFailoverSignal,
|
||||
inferSignalStatus,
|
||||
isUnclassifiedNoBodyHttpSignal,
|
||||
type FailoverClassification,
|
||||
type FailoverSignal,
|
||||
} from "./pi-embedded-helpers/errors.js";
|
||||
import { isTimeoutErrorMessage } from "./pi-embedded-helpers/errors.js";
|
||||
import type { FailoverReason } from "./pi-embedded-helpers/types.js";
|
||||
import { isSessionWriteLockTimeoutError } from "./session-write-lock-error.js";
|
||||
|
||||
const ABORT_TIMEOUT_RE = /request was aborted|request aborted/i;
|
||||
const MAX_FAILOVER_CAUSE_DEPTH = 25;
|
||||
@@ -198,10 +200,32 @@ function normalizeDirectErrorSignal(err: unknown): FailoverSignal {
|
||||
};
|
||||
}
|
||||
|
||||
/**
 * Reports whether `err`, or any value reachable from it through `error`,
 * `cause`, or `reason` properties, is recognised by
 * `isSessionWriteLockTimeoutError`.
 *
 * @param err - Value to inspect. Non-object values that are not themselves a match are ignored.
 * @param seen - Objects already examined; each visited object is added so cyclic chains terminate.
 * @returns `true` as soon as a matching value is found, otherwise `false`.
 */
function hasSessionWriteLockTimeout(err: unknown, seen: Set<object> = new Set()): boolean {
  // Explicit depth-first stack. Children are pushed in reverse so that `error`
  // is explored before `cause`, and `cause` before `reason`.
  const pending: unknown[] = [err];
  while (pending.length > 0) {
    const current = pending.pop();
    if (isSessionWriteLockTimeoutError(current)) {
      return true;
    }
    if (typeof current !== "object" || current === null || seen.has(current)) {
      continue;
    }
    seen.add(current);
    const { error, cause, reason } = current as {
      error?: unknown;
      cause?: unknown;
      reason?: unknown;
    };
    pending.push(reason, cause, error);
  }
  return false;
}
|
||||
|
||||
function hasTimeoutHint(err: unknown): boolean {
|
||||
if (!err) {
|
||||
return false;
|
||||
}
|
||||
if (hasSessionWriteLockTimeout(err)) {
|
||||
return false;
|
||||
}
|
||||
if (readErrorName(err) === "TimeoutError") {
|
||||
return true;
|
||||
}
|
||||
@@ -219,6 +243,9 @@ export function isTimeoutError(err: unknown): boolean {
|
||||
if (readErrorName(err) !== "AbortError") {
|
||||
return false;
|
||||
}
|
||||
if (hasSessionWriteLockTimeout(err)) {
|
||||
return false;
|
||||
}
|
||||
const message = getErrorMessage(err);
|
||||
if (message && ABORT_TIMEOUT_RE.test(message)) {
|
||||
return true;
|
||||
@@ -316,8 +343,15 @@ function resolveFailoverClassificationFromErrorInternal(
|
||||
reason: err.reason,
|
||||
};
|
||||
}
|
||||
|
||||
const signal = normalizeErrorSignal(err);
|
||||
const codeReason = signal.code
|
||||
? failoverReasonFromClassification(classifyFailoverSignal({ code: signal.code }))
|
||||
: null;
|
||||
const hasExplicitFailoverMetadata =
|
||||
typeof inferSignalStatus(signal) === "number" ||
|
||||
(codeReason !== null && codeReason !== "timeout");
|
||||
const hasSessionLock = hasSessionWriteLockTimeout(err);
|
||||
|
||||
const classification = classifyFailoverSignal(signal);
|
||||
const nestedCandidates = getNestedErrorCandidates(err);
|
||||
|
||||
@@ -329,6 +363,9 @@ function resolveFailoverClassificationFromErrorInternal(
|
||||
depth + 1,
|
||||
);
|
||||
if (nestedClassification) {
|
||||
if (hasSessionLock && !hasExplicitFailoverMetadata) {
|
||||
return null;
|
||||
}
|
||||
return nestedClassification;
|
||||
}
|
||||
}
|
||||
@@ -352,9 +389,16 @@ function resolveFailoverClassificationFromErrorInternal(
|
||||
}
|
||||
|
||||
if (classification) {
|
||||
if (hasSessionLock && !hasExplicitFailoverMetadata) {
|
||||
return null;
|
||||
}
|
||||
return classification;
|
||||
}
|
||||
|
||||
if (hasSessionLock) {
|
||||
return null;
|
||||
}
|
||||
|
||||
if (isTimeoutError(err)) {
|
||||
return {
|
||||
kind: "reason",
|
||||
|
||||
@@ -337,7 +337,7 @@ function stripErrorPrefix(raw: string): string {
|
||||
return raw.replace(/^error:\s*/i, "").trim();
|
||||
}
|
||||
|
||||
function inferSignalStatus(signal: FailoverSignal): number | undefined {
|
||||
export function inferSignalStatus(signal: FailoverSignal): number | undefined {
|
||||
if (typeof signal.status === "number" && Number.isFinite(signal.status)) {
|
||||
return signal.status;
|
||||
}
|
||||
|
||||
29
src/agents/session-write-lock-error.ts
Normal file
29
src/agents/session-write-lock-error.ts
Normal file
@@ -0,0 +1,29 @@
|
||||
/** Value stored in the `code` property of every {@link SessionWriteLockTimeoutError}. */
export const SESSION_WRITE_LOCK_TIMEOUT_CODE = "OPENCLAW_SESSION_WRITE_LOCK_TIMEOUT";

/**
 * Error describing a timed-out wait on a session file lock.
 *
 * The timeout, owner description, and lock path are kept as fields and are
 * also embedded in the message.
 */
export class SessionWriteLockTimeoutError extends Error {
  /** Always {@link SESSION_WRITE_LOCK_TIMEOUT_CODE}. */
  readonly code = SESSION_WRITE_LOCK_TIMEOUT_CODE;
  /** Timeout in milliseconds, as passed to the constructor. */
  readonly timeoutMs: number;
  /** Owner description, as passed to the constructor. */
  readonly owner: string;
  /** Lock file path, as passed to the constructor. */
  readonly lockPath: string;

  /**
   * @param params - Timeout in milliseconds, owner description, and lock file path.
   */
  constructor(params: { timeoutMs: number; owner: string; lockPath: string }) {
    super(
      `session file locked (timeout ${params.timeoutMs}ms): ${params.owner} ${params.lockPath}`,
    );
    const { timeoutMs, owner, lockPath } = params;
    this.name = "SessionWriteLockTimeoutError";
    this.timeoutMs = timeoutMs;
    this.owner = owner;
    this.lockPath = lockPath;
  }
}

/**
 * Checks whether a value is a session write-lock timeout error.
 *
 * @param err - Any value.
 * @returns `true` for instances of {@link SessionWriteLockTimeoutError} and for
 *   any other object whose `code` equals {@link SESSION_WRITE_LOCK_TIMEOUT_CODE};
 *   `false` otherwise.
 */
export function isSessionWriteLockTimeoutError(err: unknown): boolean {
  if (err instanceof SessionWriteLockTimeoutError) {
    return true;
  }
  if (typeof err !== "object" || err === null) {
    return false;
  }
  // Also match by code, so objects that are not instances of the class still count.
  return (err as { code?: unknown }).code === SESSION_WRITE_LOCK_TIMEOUT_CODE;
}
|
||||
@@ -3,6 +3,7 @@ import fs from "node:fs/promises";
|
||||
import path from "node:path";
|
||||
import { getProcessStartTime, isPidAlive } from "../shared/pid-alive.js";
|
||||
import { resolveProcessScopedMap } from "../shared/process-scoped-map.js";
|
||||
import { SessionWriteLockTimeoutError } from "./session-write-lock-error.js";
|
||||
|
||||
type LockFilePayload = {
|
||||
pid?: number;
|
||||
@@ -584,7 +585,7 @@ export async function acquireSessionWriteLock(params: {
|
||||
|
||||
const payload = await readLockPayload(lockPath);
|
||||
const owner = typeof payload?.pid === "number" ? `pid=${payload.pid}` : "unknown";
|
||||
throw new Error(`session file locked (timeout ${timeoutMs}ms): ${owner} ${lockPath}`);
|
||||
throw new SessionWriteLockTimeoutError({ timeoutMs, owner, lockPath });
|
||||
}
|
||||
|
||||
export const __testing = {
|
||||
|
||||
Reference in New Issue
Block a user