mirror of
https://github.com/openclaw/openclaw.git
synced 2026-05-06 05:20:43 +00:00
fix(agents): reclaim untracked self-owned session locks (#75822)
Summary: - The PR refactors session-lock inspection to reclaim untracked current-process locks with matching starttime during acquisition and startup cleanup, adds regression tests, and adds a changelog entry. - Reproducibility: yes. A high-confidence code-level reproduction is to create a fresh `.jsonl.lock` with `pid ... eLock or cleanStaleLockFiles on current main and observe that acquisition waits or cleanup leaves the lock. ClawSweeper fixups: - Included follow-up commit: docs: add session lock changelog entry - Included follow-up commit: refactor(agents): distill session lock reclaim policy Validation: - ClawSweeper review passed for head2eae2c93b1. - Required merge gates passed before the squash merge. Prepared head SHA:2eae2c93b1Review: https://github.com/openclaw/openclaw/pull/75822#issuecomment-4361741599 Co-authored-by: Cedric <86914379+cdznho@users.noreply.github.com> Co-authored-by: Ayaan Zaidi <hi@obviy.us>
This commit is contained in:
@@ -54,6 +54,7 @@ Docs: https://docs.openclaw.ai
|
||||
- Slack/setup: print the generated app manifest as plain JSON instead of embedding it inside the framed setup note, so it can be copied into Slack without deleting border characters. Fixes #65751. Thanks @theDanielJLewis.
|
||||
- Channels/WhatsApp: route CLI logout through the live Gateway and stop runtime-backed listeners before channel removal, so removing a WhatsApp account does not leave the old socket replying until restart. Fixes #67746. Thanks @123Mismail.
|
||||
- Voice Call/Twilio: honor TTS directive text and provider voice/model overrides during telephony synthesis, so `[[tts:...]]` tags are not spoken literally and voiceId overrides reach OpenAI/ElevenLabs calls. Fixes #58114. Thanks @legonhilltech-jpg.
|
||||
- Agents/session-locks: reclaim untracked current-process session locks with matching starttime during acquisition and startup cleanup, so Gateway restarts recover from self-owned orphan `.jsonl.lock` files. Fixes #75805; refs #49603. Thanks @cdznho.
|
||||
- Agents/subagents: initialize built-in context engines before native `sessions_spawn` resolves spawn preparation, so cliBackend-only cold starts no longer fail with an unregistered `legacy` context engine. Fixes #73095. (#73904) Thanks @brokemac79.
|
||||
- Agents/Codex: stop prompting message-tool-only source turns to finish with `NO_REPLY`, so quiet turns are represented by not calling the visible message tool instead of conflicting final-text instructions. Thanks @pashpashpash.
|
||||
- Gateway/config: report failed backup restores as failed in logs and config observe audit records instead of marking them valid. (#70515) Thanks @davidangularme.
|
||||
|
||||
@@ -401,6 +401,43 @@ describe("acquireSessionWriteLock", () => {
|
||||
}
|
||||
});
|
||||
|
||||
it("cleans untracked current-process .jsonl lock files with matching starttime", async () => {
|
||||
const root = await fs.mkdtemp(path.join(os.tmpdir(), "openclaw-lock-"));
|
||||
const sessionsDir = path.join(root, "sessions");
|
||||
await fs.mkdir(sessionsDir, { recursive: true });
|
||||
|
||||
const nowMs = Date.now();
|
||||
const orphanSelfLock = path.join(sessionsDir, "orphan-self.jsonl.lock");
|
||||
|
||||
try {
|
||||
await fs.writeFile(
|
||||
orphanSelfLock,
|
||||
JSON.stringify({
|
||||
pid: process.pid,
|
||||
createdAt: new Date(nowMs).toISOString(),
|
||||
starttime: FAKE_STARTTIME,
|
||||
}),
|
||||
"utf8",
|
||||
);
|
||||
|
||||
const result = await cleanStaleLockFiles({
|
||||
sessionsDir,
|
||||
staleMs: 30_000,
|
||||
nowMs,
|
||||
removeStale: true,
|
||||
});
|
||||
|
||||
expect(result.locks).toHaveLength(1);
|
||||
expect(result.cleaned.map((entry) => path.basename(entry.lockPath))).toEqual([
|
||||
"orphan-self.jsonl.lock",
|
||||
]);
|
||||
expect(result.cleaned[0]?.staleReasons).toContain("orphan-self-pid");
|
||||
await expect(fs.access(orphanSelfLock)).rejects.toThrow();
|
||||
} finally {
|
||||
await fs.rm(root, { recursive: true, force: true });
|
||||
}
|
||||
});
|
||||
|
||||
it("removes held locks on termination signals", async () => {
|
||||
const signals = ["SIGINT", "SIGTERM", "SIGQUIT", "SIGABRT"] as const;
|
||||
const originalKill = process.kill.bind(process);
|
||||
@@ -456,6 +493,14 @@ describe("acquireSessionWriteLock", () => {
|
||||
});
|
||||
});
|
||||
|
||||
it("reclaims untracked current-process lock files with matching starttime", async () => {
|
||||
await withTempSessionLockFile(async ({ sessionFile, lockPath }) => {
|
||||
await writeCurrentProcessLock(lockPath, { starttime: FAKE_STARTTIME });
|
||||
|
||||
await expectCurrentPidOwnsLock({ sessionFile, timeoutMs: 500 });
|
||||
});
|
||||
});
|
||||
|
||||
it("does not reclaim active in-process lock files without starttime", async () => {
|
||||
await expectActiveInProcessLockIsNotReclaimed();
|
||||
});
|
||||
@@ -464,6 +509,10 @@ describe("acquireSessionWriteLock", () => {
|
||||
await expectActiveInProcessLockIsNotReclaimed({ legacyStarttime: 123.5 });
|
||||
});
|
||||
|
||||
it("does not reclaim active in-process lock files with matching starttime", async () => {
|
||||
await expectActiveInProcessLockIsNotReclaimed({ legacyStarttime: FAKE_STARTTIME });
|
||||
});
|
||||
|
||||
it("registers cleanup for SIGQUIT and SIGABRT", () => {
|
||||
expect(__testing.cleanupSignals).toContain("SIGQUIT");
|
||||
expect(__testing.cleanupSignals).toContain("SIGABRT");
|
||||
|
||||
@@ -348,6 +348,17 @@ async function readLockPayload(lockPath: string): Promise<LockFilePayload | null
|
||||
}
|
||||
}
|
||||
|
||||
async function resolveNormalizedSessionFile(sessionFile: string): Promise<string> {
|
||||
const resolvedSessionFile = path.resolve(sessionFile);
|
||||
const sessionDir = path.dirname(resolvedSessionFile);
|
||||
try {
|
||||
const normalizedDir = await fs.realpath(sessionDir);
|
||||
return path.join(normalizedDir, path.basename(resolvedSessionFile));
|
||||
} catch {
|
||||
return resolvedSessionFile;
|
||||
}
|
||||
}
|
||||
|
||||
function inspectLockPayload(
|
||||
payload: LockFilePayload | null,
|
||||
staleMs: number,
|
||||
@@ -429,16 +440,51 @@ async function shouldReclaimContendedLockFile(
|
||||
function shouldTreatAsOrphanSelfLock(params: {
|
||||
payload: LockFilePayload | null;
|
||||
normalizedSessionFile: string;
|
||||
reclaimLockWithoutStarttime: boolean;
|
||||
}): boolean {
|
||||
const pid = isValidLockNumber(params.payload?.pid) ? params.payload.pid : null;
|
||||
if (pid !== process.pid) {
|
||||
return false;
|
||||
}
|
||||
const hasValidStarttime = isValidLockNumber(params.payload?.starttime);
|
||||
if (hasValidStarttime) {
|
||||
if (HELD_LOCKS.has(params.normalizedSessionFile)) {
|
||||
return false;
|
||||
}
|
||||
return !HELD_LOCKS.has(params.normalizedSessionFile);
|
||||
|
||||
const storedStarttime = isValidLockNumber(params.payload?.starttime)
|
||||
? params.payload.starttime
|
||||
: null;
|
||||
if (storedStarttime === null) {
|
||||
return params.reclaimLockWithoutStarttime;
|
||||
}
|
||||
|
||||
const currentStarttime = getProcessStartTime(process.pid);
|
||||
return currentStarttime !== null && currentStarttime === storedStarttime;
|
||||
}
|
||||
|
||||
function inspectLockPayloadForSession(params: {
|
||||
payload: LockFilePayload | null;
|
||||
staleMs: number;
|
||||
nowMs: number;
|
||||
normalizedSessionFile: string;
|
||||
reclaimLockWithoutStarttime: boolean;
|
||||
}): LockInspectionDetails {
|
||||
const inspected = inspectLockPayload(params.payload, params.staleMs, params.nowMs);
|
||||
if (
|
||||
!shouldTreatAsOrphanSelfLock({
|
||||
payload: params.payload,
|
||||
normalizedSessionFile: params.normalizedSessionFile,
|
||||
reclaimLockWithoutStarttime: params.reclaimLockWithoutStarttime,
|
||||
})
|
||||
) {
|
||||
return inspected;
|
||||
}
|
||||
return {
|
||||
...inspected,
|
||||
stale: true,
|
||||
staleReasons: inspected.staleReasons.includes("orphan-self-pid")
|
||||
? inspected.staleReasons
|
||||
: [...inspected.staleReasons, "orphan-self-pid"],
|
||||
};
|
||||
}
|
||||
|
||||
export async function cleanStaleLockFiles(params: {
|
||||
@@ -476,7 +522,15 @@ export async function cleanStaleLockFiles(params: {
|
||||
for (const entry of lockEntries) {
|
||||
const lockPath = path.join(sessionsDir, entry.name);
|
||||
const payload = await readLockPayload(lockPath);
|
||||
const inspected = inspectLockPayload(payload, staleMs, nowMs);
|
||||
const sessionFile = lockPath.slice(0, -".lock".length);
|
||||
const normalizedSessionFile = await resolveNormalizedSessionFile(sessionFile);
|
||||
const inspected = inspectLockPayloadForSession({
|
||||
payload,
|
||||
staleMs,
|
||||
nowMs,
|
||||
normalizedSessionFile,
|
||||
reclaimLockWithoutStarttime: false,
|
||||
});
|
||||
const lockInfo: SessionLockInspection = {
|
||||
lockPath,
|
||||
...inspected,
|
||||
@@ -515,13 +569,7 @@ export async function acquireSessionWriteLock(params: {
|
||||
const sessionFile = path.resolve(params.sessionFile);
|
||||
const sessionDir = path.dirname(sessionFile);
|
||||
await fs.mkdir(sessionDir, { recursive: true });
|
||||
let normalizedDir = sessionDir;
|
||||
try {
|
||||
normalizedDir = await fs.realpath(sessionDir);
|
||||
} catch {
|
||||
// Fall back to the resolved path if realpath fails (permissions, transient FS).
|
||||
}
|
||||
const normalizedSessionFile = path.join(normalizedDir, path.basename(sessionFile));
|
||||
const normalizedSessionFile = await resolveNormalizedSessionFile(sessionFile);
|
||||
const lockPath = `${normalizedSessionFile}.lock`;
|
||||
|
||||
const held = HELD_LOCKS.get(normalizedSessionFile);
|
||||
@@ -587,21 +635,14 @@ export async function acquireSessionWriteLock(params: {
|
||||
}
|
||||
const payload = await readLockPayload(lockPath);
|
||||
const nowMs = Date.now();
|
||||
const inspected = inspectLockPayload(payload, staleMs, nowMs);
|
||||
const orphanSelfLock = shouldTreatAsOrphanSelfLock({
|
||||
const inspected = inspectLockPayloadForSession({
|
||||
payload,
|
||||
staleMs,
|
||||
nowMs,
|
||||
normalizedSessionFile,
|
||||
reclaimLockWithoutStarttime: true,
|
||||
});
|
||||
const reclaimDetails = orphanSelfLock
|
||||
? {
|
||||
...inspected,
|
||||
stale: true,
|
||||
staleReasons: inspected.staleReasons.includes("orphan-self-pid")
|
||||
? inspected.staleReasons
|
||||
: [...inspected.staleReasons, "orphan-self-pid"],
|
||||
}
|
||||
: inspected;
|
||||
if (await shouldReclaimContendedLockFile(lockPath, reclaimDetails, staleMs, nowMs)) {
|
||||
if (await shouldReclaimContendedLockFile(lockPath, inspected, staleMs, nowMs)) {
|
||||
await fs.rm(lockPath, { force: true });
|
||||
continue;
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user