From b555214c9626ae881a2a35a4901a7af41a9cfd2d Mon Sep 17 00:00:00 2001
From: Xan Torres
Date: Wed, 15 Apr 2026 23:18:09 +0200
Subject: [PATCH] Extensions/lmstudio: back off inference preload after consecutive failures

---
 CHANGELOG.md                           |   1 +
 extensions/lmstudio/src/stream.test.ts | 119 ++++++++++++++++++++-
 extensions/lmstudio/src/stream.ts      | 138 +++++++++++++++++++++----
 3 files changed, 237 insertions(+), 21 deletions(-)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index 1e669362076..b7ee0eb4c44 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -39,6 +39,7 @@ Docs: https://docs.openclaw.ai
 - Gateway/skills: bump the cached skills-snapshot version whenever a config write touches `skills.*` (for example `skills.allowBundled`, `skills.entries..enabled`, or `skills.profile`). Existing agent sessions persist a `skillsSnapshot` in `sessions.json` that reuses the skill list frozen at session creation; without this invalidation, removing a bundled skill from the allowlist left the old snapshot live and the model kept calling the disabled tool, producing `Tool not found` loops that ran until the embedded-run timeout. (#67401) Thanks @xantorres.
 - Agents/tool-loop: enable the unknown-tool stream guard by default. Previously `resolveUnknownToolGuardThreshold` returned `undefined` unless `tools.loopDetection.enabled` was explicitly set to `true`, which left the protection off in the default configuration. A hallucinated or removed tool (for example `himalaya` after it was dropped from `skills.allowBundled`) would then loop "Tool X not found" attempts until the full embedded-run timeout. The guard has no false-positive surface because it only triggers on tools that are objectively not registered in the run, so it now stays on regardless of `tools.loopDetection.enabled` and still accepts `tools.loopDetection.unknownToolThreshold` as a per-run override (default 10). (#67401) Thanks @xantorres.
 - TUI/streaming: add a client-side streaming watchdog to `tui-event-handlers` so the `streaming · Xm Ys` activity indicator resets to `idle` after 30s of delta silence on the active run. Guards against lost or late `state: "final"` chat events (WS reconnects, gateway restarts, etc.) leaving the TUI stuck on `streaming` indefinitely; a new system log line surfaces the reset so users know to send a new message to resync. The window is configurable via the new `streamingWatchdogMs` context option (set to `0` to disable), and the handler now exposes a `dispose()` that clears the pending timer on shutdown. (#67401) Thanks @xantorres.
+- Extensions/lmstudio: add exponential backoff to the inference-preload wrapper so an LM Studio model-load failure (for example the built-in memory guardrail rejecting a load because the swap is saturated) no longer produces a WARN line every ~2s for every chat request. The wrapper now records consecutive preload failures per `(baseUrl, modelKey, contextLength)` tuple with a 5s → 10s → 20s → … → 5min cooldown and skips the preload step entirely while a cooldown is active, letting chat requests proceed directly to the stream (the model is often already loaded via the LM Studio UI). The combined `preload failed` log line now reports consecutive-failure count and remaining cooldown so operators can act on the real issue instead of drowning in repeated warnings. (#67401) Thanks @xantorres.
 
 ## 2026.4.15-beta.1
 
diff --git a/extensions/lmstudio/src/stream.test.ts b/extensions/lmstudio/src/stream.test.ts
index 7b0a02312c6..73098d90618 100644
--- a/extensions/lmstudio/src/stream.test.ts
+++ b/extensions/lmstudio/src/stream.test.ts
@@ -1,7 +1,10 @@
 import type { StreamFn } from "@mariozechner/pi-agent-core";
 import { createAssistantMessageEventStream } from "@mariozechner/pi-ai";
-import { afterEach, describe, expect, it, vi } from "vitest";
-import { wrapLmstudioInferencePreload } from "./stream.js";
+import { afterEach, beforeEach, describe, expect, it, vi } from "vitest";
+import {
+  __resetLmstudioPreloadCooldownForTest,
+  wrapLmstudioInferencePreload,
+} from "./stream.js";
 
 const ensureLmstudioModelLoadedMock = vi.hoisted(() => vi.fn());
 const resolveLmstudioProviderHeadersMock = vi.hoisted(() =>
@@ -51,12 +54,17 @@ function buildDoneStreamFn(): StreamFn {
 }
 
 describe("lmstudio stream wrapper", () => {
+  beforeEach(() => {
+    __resetLmstudioPreloadCooldownForTest();
+  });
+
   afterEach(() => {
     ensureLmstudioModelLoadedMock.mockReset();
     resolveLmstudioProviderHeadersMock.mockReset();
     resolveLmstudioRuntimeApiKeyMock.mockReset();
     resolveLmstudioProviderHeadersMock.mockResolvedValue(undefined);
     resolveLmstudioRuntimeApiKeyMock.mockResolvedValue(undefined);
+    __resetLmstudioPreloadCooldownForTest();
   });
 
   it("preloads LM Studio model before inference using model context window", async () => {
@@ -243,6 +251,113 @@
     expect(ensureLmstudioModelLoadedMock).toHaveBeenCalledTimes(1);
   });
 
+  it("skips preload on the second attempt while the failure backoff is active", async () => {
+    ensureLmstudioModelLoadedMock.mockRejectedValue(new Error("out of memory"));
+    const baseStream = buildDoneStreamFn();
+    const wrapped = wrapLmstudioInferencePreload({
+      provider: "lmstudio",
+      modelId: "qwen3-8b-instruct",
+      config: {
+        models: {
+          providers: {
+            lmstudio: {
+              baseUrl: "http://localhost:1234",
+              models: [],
+            },
+          },
+        },
+      },
+      streamFn: baseStream,
+    } as never);
+
+    const firstEvents = await collectEvents(
+      wrapped(
+        {
+          provider: "lmstudio",
+          api: "openai-completions",
+          id: "qwen3-8b-instruct",
+        } as never,
+        { messages: [] } as never,
+        undefined as never,
+      ),
+    );
+    expect(firstEvents).toEqual([expect.objectContaining({ type: "done" })]);
+    expect(ensureLmstudioModelLoadedMock).toHaveBeenCalledTimes(1);
+
+    const secondEvents = await collectEvents(
+      wrapped(
+        {
+          provider: "lmstudio",
+          api: "openai-completions",
+          id: "qwen3-8b-instruct",
+        } as never,
+        { messages: [] } as never,
+        undefined as never,
+      ),
+    );
+    expect(secondEvents).toEqual([expect.objectContaining({ type: "done" })]);
+    // The second call must NOT retry preload because cooldown is active, but
+    // the underlying stream must still run so the user gets a response.
+    expect(ensureLmstudioModelLoadedMock).toHaveBeenCalledTimes(1);
+    expect(baseStream).toHaveBeenCalledTimes(2);
+  });
+
+  it("retries preload once the cooldown expires", async () => {
+    ensureLmstudioModelLoadedMock.mockRejectedValueOnce(new Error("out of memory"));
+    ensureLmstudioModelLoadedMock.mockResolvedValueOnce(undefined);
+    const baseStream = buildDoneStreamFn();
+    const wrapped = wrapLmstudioInferencePreload({
+      provider: "lmstudio",
+      modelId: "qwen3-8b-instruct",
+      config: {
+        models: {
+          providers: {
+            lmstudio: {
+              baseUrl: "http://localhost:1234",
+              models: [],
+            },
+          },
+        },
+      },
+      streamFn: baseStream,
+    } as never);
+
+    // Freeze Date.now at a known base so we can jump past the first backoff
+    // window (5s by default) between the two preload attempts.
+    const baseTime = 1_000_000;
+    const nowSpy = vi.spyOn(Date, "now");
+    nowSpy.mockReturnValue(baseTime);
+    await collectEvents(
+      wrapped(
+        {
+          provider: "lmstudio",
+          api: "openai-completions",
+          id: "qwen3-8b-instruct",
+        } as never,
+        { messages: [] } as never,
+        undefined as never,
+      ),
+    );
+    expect(ensureLmstudioModelLoadedMock).toHaveBeenCalledTimes(1);
+
+    // Move the clock past the initial 5s cooldown window so the next call is
+    // allowed to retry preload.
+    nowSpy.mockReturnValue(baseTime + 6_000);
+    await collectEvents(
+      wrapped(
+        {
+          provider: "lmstudio",
+          api: "openai-completions",
+          id: "qwen3-8b-instruct",
+        } as never,
+        { messages: [] } as never,
+        undefined as never,
+      ),
+    );
+    expect(ensureLmstudioModelLoadedMock).toHaveBeenCalledTimes(2);
+    nowSpy.mockRestore();
+  });
+
   it("forces supportsUsageInStreaming compat before calling the underlying stream", async () => {
     const baseStream = buildDoneStreamFn();
     const wrapped = wrapLmstudioInferencePreload({
diff --git a/extensions/lmstudio/src/stream.ts b/extensions/lmstudio/src/stream.ts
index b0a63c25494..cbfd2674301 100644
--- a/extensions/lmstudio/src/stream.ts
+++ b/extensions/lmstudio/src/stream.ts
@@ -15,6 +15,68 @@ type StreamModel = Parameters[0];
 
 const preloadInFlight = new Map<string, Promise<void>>();
 
+/**
+ * Cooldown state for the LM Studio preload endpoint.
+ *
+ * Without this, every chat request would retry preload ~every 2s even when
+ * LM Studio has rejected the load (for example the memory guardrail will keep
+ * rejecting until the user adjusts the setting or frees RAM). That produced
+ * hundreds of `LM Studio inference preload failed` WARN lines per hour without
+ * actually helping the user. The cooldown applies an exponential backoff per
+ * preloadKey and, while the cooldown is active, the wrapper skips the preload
+ * step entirely and proceeds directly to streaming — the model is often
+ * already loaded from the user's LM Studio UI, so inference can succeed even
+ * when preload keeps being rejected.
+ */
+type PreloadCooldownEntry = {
+  untilMs: number;
+  consecutiveFailures: number;
+};
+
+const preloadCooldown = new Map<string, PreloadCooldownEntry>();
+
+const PRELOAD_BACKOFF_BASE_MS = 5_000;
+const PRELOAD_BACKOFF_MAX_MS = 300_000;
+
+function computePreloadBackoffMs(consecutiveFailures: number): number {
+  const exponent = Math.max(0, consecutiveFailures - 1);
+  const raw = PRELOAD_BACKOFF_BASE_MS * 2 ** exponent;
+  return Math.min(PRELOAD_BACKOFF_MAX_MS, raw);
+}
+
+function recordPreloadSuccess(preloadKey: string): void {
+  preloadCooldown.delete(preloadKey);
+}
+
+function recordPreloadFailure(preloadKey: string, now: number): PreloadCooldownEntry {
+  const existing = preloadCooldown.get(preloadKey);
+  const consecutiveFailures = (existing?.consecutiveFailures ?? 0) + 1;
+  const entry: PreloadCooldownEntry = {
+    consecutiveFailures,
+    untilMs: now + computePreloadBackoffMs(consecutiveFailures),
+  };
+  preloadCooldown.set(preloadKey, entry);
+  return entry;
+}
+
+function isPreloadCoolingDown(preloadKey: string, now: number): PreloadCooldownEntry | undefined {
+  const entry = preloadCooldown.get(preloadKey);
+  if (!entry) {
+    return undefined;
+  }
+  if (entry.untilMs <= now) {
+    preloadCooldown.delete(preloadKey);
+    return undefined;
+  }
+  return entry;
+}
+
+/** Test-only hook for clearing preload cooldown state between cases. */
+export function __resetLmstudioPreloadCooldownForTest(): void {
+  preloadCooldown.clear();
+  preloadInFlight.clear();
+}
+
 function normalizeLmstudioModelKey(modelId: string): string {
   const trimmed = modelId.trim();
   if (trimmed.toLowerCase().startsWith("lmstudio/")) {
@@ -131,29 +193,67 @@ export function wrapLmstudioInferencePreload(ctx: ProviderWrapStreamFnContext):
       modelKey,
       requestedContextLength,
     });
+
+    const cooldownEntry = isPreloadCoolingDown(preloadKey, Date.now());
     const existing = preloadInFlight.get(preloadKey);
-    const preloadPromise =
+    const preloadPromise: Promise<void> | undefined =
       existing ??
-      ensureLmstudioModelLoadedBestEffort({
-        baseUrl: resolvedBaseUrl,
-        modelKey,
-        requestedContextLength,
-        options,
-        ctx,
-        modelHeaders: resolveModelHeaders(model),
-      }).finally(() => {
-        preloadInFlight.delete(preloadKey);
-      });
-    if (!existing) {
-      preloadInFlight.set(preloadKey, preloadPromise);
-    }
+      (cooldownEntry
+        ? undefined
+        : (() => {
+            const created = ensureLmstudioModelLoadedBestEffort({
+              baseUrl: resolvedBaseUrl,
+              modelKey,
+              requestedContextLength,
+              options,
+              ctx,
+              modelHeaders: resolveModelHeaders(model),
+            })
+              .then(
+                () => {
+                  recordPreloadSuccess(preloadKey);
+                },
+                (error) => {
+                  const entry = recordPreloadFailure(preloadKey, Date.now());
+                  throw Object.assign(new Error("preload-failed"), {
+                    cause: error,
+                    consecutiveFailures: entry.consecutiveFailures,
+                    cooldownMs: entry.untilMs - Date.now(),
+                  });
+                },
+              )
+              .finally(() => {
+                preloadInFlight.delete(preloadKey);
+              });
+            preloadInFlight.set(preloadKey, created);
+            return created;
+          })());
     return (async () => {
-      try {
-        await preloadPromise;
-      } catch (error) {
-        log.warn(
-          `LM Studio inference preload failed for "${modelKey}"; continuing without preload: ${String(error)}`,
+      if (preloadPromise) {
+        try {
+          await preloadPromise;
+        } catch (error) {
+          const annotated = error as {
+            cause?: unknown;
+            consecutiveFailures?: number;
+            cooldownMs?: number;
+          };
+          const cause = annotated.cause ?? error;
+          const failures = annotated.consecutiveFailures ?? 1;
+          const cooldownSec = Math.max(
+            0,
+            Math.round((annotated.cooldownMs ?? 0) / 1000),
+          );
+          log.warn(
+            `LM Studio inference preload failed for "${modelKey}" (${failures} consecutive failure${
+              failures === 1 ? "" : "s"
+            }, next preload attempt skipped for ~${cooldownSec}s); continuing without preload: ${String(cause)}`,
+          );
+        }
+      } else if (cooldownEntry) {
+        log.debug(
+          `LM Studio inference preload for "${modelKey}" skipped while backoff active (${cooldownEntry.consecutiveFailures} prior failures)`,
         );
       }
       // LM Studio uses OpenAI-compatible streaming usage payloads when requested via
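
Reviewer note (not part of the patch): a quick way to sanity-check the cooldown schedule the changelog entry describes (5s → 10s → 20s → … capped at 5 minutes) is to run the small standalone TypeScript sketch below. It only reuses the two constants and the formula visible in the diff; the loop and log output are illustrative and not taken from the repository.

// Standalone sketch mirroring computePreloadBackoffMs from extensions/lmstudio/src/stream.ts:
// base delay 5s, doubled for each additional consecutive failure, capped at 5 minutes.
const PRELOAD_BACKOFF_BASE_MS = 5_000;
const PRELOAD_BACKOFF_MAX_MS = 300_000;

function computePreloadBackoffMs(consecutiveFailures: number): number {
  const exponent = Math.max(0, consecutiveFailures - 1);
  return Math.min(PRELOAD_BACKOFF_MAX_MS, PRELOAD_BACKOFF_BASE_MS * 2 ** exponent);
}

// Prints 5s, 10s, 20s, 40s, 80s, 160s, 300s, 300s; the 5-minute cap is reached on the 7th failure.
for (let failures = 1; failures <= 8; failures += 1) {
  console.log(`failure ${failures}: skip preload for ${computePreloadBackoffMs(failures) / 1000}s`);
}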