From 4b4cde71874f1140346456640b8913042e560661 Mon Sep 17 00:00:00 2001 From: Peter Steinberger Date: Tue, 28 Apr 2026 04:16:03 +0100 Subject: [PATCH] fix(memory): back off qmd open failures --- CHANGELOG.md | 1 + docs/concepts/memory-qmd.md | 3 + .../src/memory/search-manager.test.ts | 50 ++++++++ .../memory-core/src/memory/search-manager.ts | 109 +++++++++++++++--- 4 files changed, 147 insertions(+), 16 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 4a567a49cc9..76fb0a32ce6 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -41,6 +41,7 @@ Docs: https://docs.openclaw.ai - Backup: skip installed plugin `extensions/*/node_modules` dependency trees while keeping plugin manifests and source files in archives, so local backups avoid rebuildable npm payload bloat. Fixes #64144. Thanks @BrilliantWang. - Cron/models: fail isolated cron runs closed when an explicit `payload.model` is not allowed or cannot be resolved, so scheduled jobs do not silently fall back to an unrelated agent default or paid route before configured provider proxies such as LiteLLM can run. Fixes #73146. Thanks @oneandrewwang. +- Memory/QMD: back off repeated chat-turn QMD open failures while still letting memory status and CLI probes recheck immediately, so a broken sidecar dependency cannot trigger active-memory or cron retry storms. Fixes #73188 and #73176. Thanks @leonlushgit and @w3i-William. - Memory/Ollama: resolve `memorySearch.provider` custom provider ids through their configured `models.providers..api` owner, so multi-GPU Ollama setups can dedicate embeddings to providers such as `ollama-5080` without losing the Ollama adapter or local auth semantics. Fixes #73150. Thanks @oneandrewwang. - CLI/memory: skip eager context-window warmup for `openclaw memory` commands so memory search does not race unrelated model metadata discovery. Fixes #73123. Thanks @oalansilva and @neeravmakwana. - CLI/Telegram: route Telegram `message send` and poll actions through the running Gateway when available, so packaged installs use the staged `grammy` runtime deps and CLI sends return instead of hanging after the Telegram channel is active. Fixes #73140. Thanks @oalansilva. diff --git a/docs/concepts/memory-qmd.md b/docs/concepts/memory-qmd.md index daa425c92d4..502a672e157 100644 --- a/docs/concepts/memory-qmd.md +++ b/docs/concepts/memory-qmd.md @@ -64,6 +64,9 @@ present. same-source collections into one QMD search invocation. Older QMD releases keep the compatible per-collection fallback. - If QMD fails entirely, OpenClaw falls back to the builtin SQLite engine. + Repeated chat-turn attempts back off briefly after an open failure so a + missing binary or broken sidecar dependency does not create a retry storm; + `openclaw memory status` and one-shot CLI probes still recheck QMD directly. The first search may be slow -- QMD auto-downloads GGUF models (~2 GB) for diff --git a/extensions/memory-core/src/memory/search-manager.test.ts b/extensions/memory-core/src/memory/search-manager.test.ts index c3d315c7f58..2052c53a3a3 100644 --- a/extensions/memory-core/src/memory/search-manager.test.ts +++ b/extensions/memory-core/src/memory/search-manager.test.ts @@ -280,6 +280,56 @@ describe("getMemorySearchManager caching", () => { expect(searchResults).toHaveLength(1); }); + it("backs off repeated full qmd open failures until the cooldown expires", async () => { + const agentId = "qmd-open-cooldown"; + const cfg = createQmdCfg(agentId); + const nowSpy = vi.spyOn(Date, "now").mockReturnValue(1_000); + createQmdManagerMock.mockRejectedValueOnce(new Error("Cannot find package 'chokidar'")); + + try { + const first = await getMemorySearchManager({ cfg, agentId }); + const second = await getMemorySearchManager({ cfg, agentId }); + + expect(first.manager).toBe(fallbackManager); + expect(second.manager).toBe(fallbackManager); + expect(createQmdManagerMock).toHaveBeenCalledTimes(1); + expect(checkQmdBinaryAvailability).toHaveBeenCalledTimes(1); + + nowSpy.mockReturnValue(62_001); + const third = await getMemorySearchManager({ cfg, agentId }); + const thirdManager = requireManager(third); + + expect(thirdManager.status()).toMatchObject({ backend: "qmd" }); + expect(createQmdManagerMock).toHaveBeenCalledTimes(2); + expect(checkQmdBinaryAvailability).toHaveBeenCalledTimes(2); + } finally { + nowSpy.mockRestore(); + } + }); + + it("lets status probes bypass and clear a full qmd open-failure cooldown", async () => { + const agentId = "qmd-open-status-bypass"; + const cfg = createQmdCfg(agentId); + const nowSpy = vi.spyOn(Date, "now").mockReturnValue(1_000); + createQmdManagerMock.mockRejectedValueOnce(new Error("Cannot find package 'chokidar'")); + + try { + const first = await getMemorySearchManager({ cfg, agentId }); + expect(first.manager).toBe(fallbackManager); + expect(createQmdManagerMock).toHaveBeenCalledTimes(1); + + const status = await getMemorySearchManager({ cfg, agentId, purpose: "status" }); + expect(requireManager(status).status()).toMatchObject({ backend: "qmd" }); + expect(createQmdManagerMock).toHaveBeenCalledTimes(2); + + const full = await getMemorySearchManager({ cfg, agentId }); + expect(requireManager(full).status()).toMatchObject({ backend: "qmd" }); + expect(createQmdManagerMock).toHaveBeenCalledTimes(3); + } finally { + nowSpy.mockRestore(); + } + }); + it("probes qmd availability from the agent workspace", async () => { const agentId = "workspace-probe"; const cfg = createQmdCfg(agentId); diff --git a/extensions/memory-core/src/memory/search-manager.ts b/extensions/memory-core/src/memory/search-manager.ts index 8aff35f4575..5146ebefb3a 100644 --- a/extensions/memory-core/src/memory/search-manager.ts +++ b/extensions/memory-core/src/memory/search-manager.ts @@ -38,15 +38,25 @@ type PendingQmdManagerCreate = { promise: Promise>; }; +type QmdManagerOpenFailure = { + identityKey: string; + reason: string; + retryAfterMs: number; +}; + type MemorySearchManagerCacheStore = { qmdManagerCache: Map; pendingQmdManagerCreates: Map; + qmdManagerOpenFailures: Map; }; +const QMD_MANAGER_OPEN_FAILURE_COOLDOWN_MS = 60_000; + function createMemorySearchManagerCacheStore(): MemorySearchManagerCacheStore { return { qmdManagerCache: new Map(), pendingQmdManagerCreates: new Map(), + qmdManagerOpenFailures: new Map(), }; } @@ -62,7 +72,11 @@ function getMemorySearchManagerCacheStore(): MemorySearchManagerCacheStore { (resolved as Partial).qmdManagerCache instanceof Map && (resolved as Partial).pendingQmdManagerCreates instanceof Map ) { - return resolved as MemorySearchManagerCacheStore; + const cacheStore = resolved as Partial; + if (!(cacheStore.qmdManagerOpenFailures instanceof Map)) { + cacheStore.qmdManagerOpenFailures = new Map(); + } + return cacheStore as MemorySearchManagerCacheStore; } const repaired = createMemorySearchManagerCacheStore(); (globalThis as Record)[MEMORY_SEARCH_MANAGER_CACHE_KEY] = repaired; @@ -73,6 +87,7 @@ const log = createSubsystemLogger("memory"); const { qmdManagerCache: QMD_MANAGER_CACHE, pendingQmdManagerCreates: PENDING_QMD_MANAGER_CREATES, + qmdManagerOpenFailures: QMD_MANAGER_OPEN_FAILURES, } = getMemorySearchManagerCacheStore(); let managerRuntimePromise: Promise | null = null; let qmdManagerModulePromise: Promise | null = null; @@ -94,6 +109,42 @@ export type MemorySearchManagerResult = { export type MemorySearchManagerPurpose = "default" | "status" | "cli"; +function getActiveQmdManagerOpenFailure( + scopeKey: string, + identityKey: string, + nowMs = Date.now(), +): QmdManagerOpenFailure | null { + const failure = QMD_MANAGER_OPEN_FAILURES.get(scopeKey); + if (!failure) { + return null; + } + if (failure.identityKey !== identityKey || failure.retryAfterMs <= nowMs) { + QMD_MANAGER_OPEN_FAILURES.delete(scopeKey); + return null; + } + return failure; +} + +function recordQmdManagerOpenFailure( + scopeKey: string, + identityKey: string, + reason: string, + nowMs = Date.now(), +): void { + QMD_MANAGER_OPEN_FAILURES.set(scopeKey, { + identityKey, + reason, + retryAfterMs: nowMs + QMD_MANAGER_OPEN_FAILURE_COOLDOWN_MS, + }); +} + +function clearQmdManagerOpenFailure(scopeKey: string, identityKey: string): void { + const failure = QMD_MANAGER_OPEN_FAILURES.get(scopeKey); + if (failure?.identityKey === identityKey) { + QMD_MANAGER_OPEN_FAILURES.delete(scopeKey); + } +} + export async function getMemorySearchManager(params: { cfg: OpenClawConfig; agentId: string; @@ -111,14 +162,18 @@ export async function getMemorySearchManager(params: { const createPrimaryQmdManager = async ( mode: "full" | "status" | "cli", - ): Promise> => { + ): Promise<{ manager: Maybe; failureReason?: string }> => { try { await fs.mkdir(workspaceDir, { recursive: true }); } catch (err) { + const message = formatErrorMessage(err); log.warn( - `qmd workspace unavailable (${workspaceDir}); falling back to builtin: ${formatErrorMessage(err)}`, + `qmd workspace unavailable (${workspaceDir}); falling back to builtin: ${message}`, ); - return null; + return { + manager: null, + failureReason: `qmd workspace unavailable (${workspaceDir}): ${message}`, + }; } const qmdBinary = await checkQmdBinaryAvailability({ @@ -127,10 +182,14 @@ export async function getMemorySearchManager(params: { cwd: workspaceDir, }); if (!qmdBinary.available) { + const message = qmdBinary.error ?? "unknown error"; log.warn( - `qmd binary unavailable (${qmdResolved.command}); falling back to builtin: ${qmdBinary.error ?? "unknown error"}`, + `qmd binary unavailable (${qmdResolved.command}); falling back to builtin: ${message}`, ); - return null; + return { + manager: null, + failureReason: `qmd binary unavailable (${qmdResolved.command}): ${message}`, + }; } try { const { QmdMemoryManager } = await loadQmdManagerModule(); @@ -142,21 +201,23 @@ export async function getMemorySearchManager(params: { runtimeConfig, }); if (primary) { - return primary; + clearQmdManagerOpenFailure(scopeKey, identityKey); + return { manager: primary }; } } catch (err) { const message = formatErrorMessage(err); log.warn(`qmd memory unavailable; falling back to builtin: ${message}`); + return { manager: null, failureReason: `qmd memory unavailable: ${message}` }; } - return null; + return { manager: null, failureReason: "qmd memory unavailable: no manager returned" }; }; const createFullQmdManager = async ( expectedIdentityKey: string, - ): Promise> => { - const primary = await createPrimaryQmdManager("full"); + ): Promise<{ entry: Maybe; failureReason?: string }> => { + const { manager: primary, failureReason } = await createPrimaryQmdManager("full"); if (!primary) { - return null; + return { entry: null, failureReason }; } let cacheEntry!: CachedQmdManagerEntry; const wrapper = new FallbackMemoryManager( @@ -178,7 +239,7 @@ export async function getMemorySearchManager(params: { identityKey: expectedIdentityKey, manager: wrapper, }; - return cacheEntry; + return { entry: cacheEntry }; }; while (true) { @@ -197,10 +258,20 @@ export async function getMemorySearchManager(params: { } if (transient) { - const manager = await createPrimaryQmdManager(params.purpose === "cli" ? "cli" : "status"); + const { manager } = await createPrimaryQmdManager( + params.purpose === "cli" ? "cli" : "status", + ); return manager ? { manager } : await getBuiltinMemorySearchManager(params); } + const recentFailure = getActiveQmdManagerOpenFailure(scopeKey, identityKey); + if (recentFailure) { + log.debug?.( + `qmd memory unavailable; using builtin during cooldown: ${recentFailure.reason}`, + ); + return await getBuiltinMemorySearchManager(params); + } + const pending = PENDING_QMD_MANAGER_CREATES.get(scopeKey); if (pending) { await pending.promise; @@ -211,16 +282,21 @@ export async function getMemorySearchManager(params: { identityKey, promise: (async () => { const created = await createFullQmdManager(identityKey); - if (!created) { + if (!created.entry) { + recordQmdManagerOpenFailure( + scopeKey, + identityKey, + created.failureReason ?? "qmd memory unavailable", + ); return null; } - QMD_MANAGER_CACHE.set(scopeKey, created); + QMD_MANAGER_CACHE.set(scopeKey, created.entry); if (cached) { await closeQmdManagerForReplacement(cached.manager).catch((err) => { log.warn(`failed to retire replaced qmd memory manager: ${formatErrorMessage(err)}`); }); } - return created.manager; + return created.entry.manager; })().finally(() => { const currentPending = PENDING_QMD_MANAGER_CREATES.get(scopeKey); if (currentPending === pendingCreate) { @@ -307,6 +383,7 @@ export async function closeAllMemorySearchManagers(): Promise { const managers = Array.from(QMD_MANAGER_CACHE.values(), (entry) => entry.manager); PENDING_QMD_MANAGER_CREATES.clear(); QMD_MANAGER_CACHE.clear(); + QMD_MANAGER_OPEN_FAILURES.clear(); for (const manager of managers) { try { await manager.close?.();