Extensions/lmstudio: back off inference preload after consecutive failures

Xan Torres
2026-04-15 23:18:09 +02:00
committed by Ayaan Zaidi
parent f44ab20d4d
commit b555214c96
3 changed files with 237 additions and 21 deletions


@@ -39,6 +39,7 @@ Docs: https://docs.openclaw.ai
- Gateway/skills: bump the cached skills-snapshot version whenever a config write touches `skills.*` (for example `skills.allowBundled`, `skills.entries.<id>.enabled`, or `skills.profile`). Existing agent sessions persist a `skillsSnapshot` in `sessions.json` that reuses the skill list frozen at session creation; without this invalidation, removing a bundled skill from the allowlist left the old snapshot live and the model kept calling the disabled tool, producing `Tool <name> not found` loops that ran until the embedded-run timeout. (#67401) Thanks @xantorres.
- Agents/tool-loop: enable the unknown-tool stream guard by default. Previously `resolveUnknownToolGuardThreshold` returned `undefined` unless `tools.loopDetection.enabled` was explicitly set to `true`, which left the protection off in the default configuration. A hallucinated or removed tool (for example `himalaya` after it was dropped from `skills.allowBundled`) would then loop "Tool X not found" attempts until the full embedded-run timeout. The guard has no false-positive surface because it only triggers on tools that are objectively not registered in the run, so it now stays on regardless of `tools.loopDetection.enabled` and still accepts `tools.loopDetection.unknownToolThreshold` as a per-run override (default 10). (#67401) Thanks @xantorres.
- TUI/streaming: add a client-side streaming watchdog to `tui-event-handlers` so the `streaming · Xm Ys` activity indicator resets to `idle` after 30s of delta silence on the active run. Guards against lost or late `state: "final"` chat events (WS reconnects, gateway restarts, etc.) leaving the TUI stuck on `streaming` indefinitely; a new system log line surfaces the reset so users know to send a new message to resync. The window is configurable via the new `streamingWatchdogMs` context option (set to `0` to disable), and the handler now exposes a `dispose()` that clears the pending timer on shutdown. (#67401) Thanks @xantorres.
- Extensions/lmstudio: add exponential backoff to the inference-preload wrapper so an LM Studio model-load failure (for example the built-in memory guardrail rejecting a load because the swap is saturated) no longer produces a WARN line every ~2s for every chat request. The wrapper now records consecutive preload failures per `(baseUrl, modelKey, contextLength)` tuple with a 5s → 10s → 20s → … → 5min cooldown and skips the preload step entirely while a cooldown is active, letting chat requests proceed directly to the stream (the model is often already loaded via the LM Studio UI). The combined `preload failed` log line now reports consecutive-failure count and remaining cooldown so operators can act on the real issue instead of drowning in repeated warnings. (#67401) Thanks @xantorres.
## 2026.4.15-beta.1
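The cooldown schedule described in the `Extensions/lmstudio` entry above is plain exponential doubling. The following sketch mirrors the `computePreloadBackoffMs` helper from the `stream.ts` diff further down (constants and formula are taken from that diff; the demo loop is illustrative only):

```typescript
const PRELOAD_BACKOFF_BASE_MS = 5_000; // first cooldown: 5s
const PRELOAD_BACKOFF_MAX_MS = 300_000; // hard cap: 5 minutes

// Failure n cools down for 5s * 2^(n-1), capped at 5 minutes.
function computePreloadBackoffMs(consecutiveFailures: number): number {
  const exponent = Math.max(0, consecutiveFailures - 1);
  return Math.min(PRELOAD_BACKOFF_MAX_MS, PRELOAD_BACKOFF_BASE_MS * 2 ** exponent);
}

for (let n = 1; n <= 8; n++) {
  console.log(`failure ${n}: ${computePreloadBackoffMs(n) / 1000}s cooldown`);
}
// failure 1: 5s · 2: 10s · 3: 20s · 4: 40s · 5: 80s · 6: 160s · 7: 300s · 8: 300s
```

Because failures are keyed per `(baseUrl, modelKey, contextLength)` tuple, a different model or context length starts its own schedule from 5s rather than inheriting another tuple's cooldown.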


@@ -1,7 +1,10 @@
import type { StreamFn } from "@mariozechner/pi-agent-core";
import { createAssistantMessageEventStream } from "@mariozechner/pi-ai";
import { afterEach, describe, expect, it, vi } from "vitest";
import { wrapLmstudioInferencePreload } from "./stream.js";
import { afterEach, beforeEach, describe, expect, it, vi } from "vitest";
import {
__resetLmstudioPreloadCooldownForTest,
wrapLmstudioInferencePreload,
} from "./stream.js";
const ensureLmstudioModelLoadedMock = vi.hoisted(() => vi.fn());
const resolveLmstudioProviderHeadersMock = vi.hoisted(() =>
@@ -51,12 +54,17 @@ function buildDoneStreamFn(): StreamFn {
}
describe("lmstudio stream wrapper", () => {
beforeEach(() => {
__resetLmstudioPreloadCooldownForTest();
});
afterEach(() => {
ensureLmstudioModelLoadedMock.mockReset();
resolveLmstudioProviderHeadersMock.mockReset();
resolveLmstudioRuntimeApiKeyMock.mockReset();
resolveLmstudioProviderHeadersMock.mockResolvedValue(undefined);
resolveLmstudioRuntimeApiKeyMock.mockResolvedValue(undefined);
__resetLmstudioPreloadCooldownForTest();
});
it("preloads LM Studio model before inference using model context window", async () => {
@@ -243,6 +251,113 @@ describe("lmstudio stream wrapper", () => {
expect(ensureLmstudioModelLoadedMock).toHaveBeenCalledTimes(1);
});
it("skips preload on the second attempt while the failure backoff is active", async () => {
ensureLmstudioModelLoadedMock.mockRejectedValue(new Error("out of memory"));
const baseStream = buildDoneStreamFn();
const wrapped = wrapLmstudioInferencePreload({
provider: "lmstudio",
modelId: "qwen3-8b-instruct",
config: {
models: {
providers: {
lmstudio: {
baseUrl: "http://localhost:1234",
models: [],
},
},
},
},
streamFn: baseStream,
} as never);
const firstEvents = await collectEvents(
wrapped(
{
provider: "lmstudio",
api: "openai-completions",
id: "qwen3-8b-instruct",
} as never,
{ messages: [] } as never,
undefined as never,
),
);
expect(firstEvents).toEqual([expect.objectContaining({ type: "done" })]);
expect(ensureLmstudioModelLoadedMock).toHaveBeenCalledTimes(1);
const secondEvents = await collectEvents(
wrapped(
{
provider: "lmstudio",
api: "openai-completions",
id: "qwen3-8b-instruct",
} as never,
{ messages: [] } as never,
undefined as never,
),
);
expect(secondEvents).toEqual([expect.objectContaining({ type: "done" })]);
// The second call must NOT retry preload because cooldown is active, but
// the underlying stream must still run so the user gets a response.
expect(ensureLmstudioModelLoadedMock).toHaveBeenCalledTimes(1);
expect(baseStream).toHaveBeenCalledTimes(2);
});
it("retries preload once the cooldown expires", async () => {
ensureLmstudioModelLoadedMock.mockRejectedValueOnce(new Error("out of memory"));
ensureLmstudioModelLoadedMock.mockResolvedValueOnce(undefined);
const baseStream = buildDoneStreamFn();
const wrapped = wrapLmstudioInferencePreload({
provider: "lmstudio",
modelId: "qwen3-8b-instruct",
config: {
models: {
providers: {
lmstudio: {
baseUrl: "http://localhost:1234",
models: [],
},
},
},
},
streamFn: baseStream,
} as never);
// Freeze Date.now at a known base so we can jump past the first backoff
// window (5s by default) between the two preload attempts.
const baseTime = 1_000_000;
const nowSpy = vi.spyOn(Date, "now");
nowSpy.mockReturnValue(baseTime);
await collectEvents(
wrapped(
{
provider: "lmstudio",
api: "openai-completions",
id: "qwen3-8b-instruct",
} as never,
{ messages: [] } as never,
undefined as never,
),
);
expect(ensureLmstudioModelLoadedMock).toHaveBeenCalledTimes(1);
// Move the clock past the initial 5s cooldown window so the next call is
// allowed to retry preload.
nowSpy.mockReturnValue(baseTime + 6_000);
await collectEvents(
wrapped(
{
provider: "lmstudio",
api: "openai-completions",
id: "qwen3-8b-instruct",
} as never,
{ messages: [] } as never,
undefined as never,
),
);
expect(ensureLmstudioModelLoadedMock).toHaveBeenCalledTimes(2);
nowSpy.mockRestore();
});
it("forces supportsUsageInStreaming compat before calling the underlying stream", async () => {
const baseStream = buildDoneStreamFn();
const wrapped = wrapLmstudioInferencePreload({


@@ -15,6 +15,68 @@ type StreamModel = Parameters<StreamFn>[0];
const preloadInFlight = new Map<string, Promise<void>>();
/**
* Cooldown state for the LM Studio preload endpoint.
*
* Without this, every chat request would retry preload ~every 2s even when
* LM Studio has rejected the load (for example the memory guardrail will keep
* rejecting until the user adjusts the setting or frees RAM). That produced
* hundreds of `LM Studio inference preload failed` WARN lines per hour without
* actually helping the user. The cooldown applies an exponential backoff per
* preloadKey and, while the cooldown is active, the wrapper skips the preload
* step entirely and proceeds directly to streaming — the model is often
* already loaded from the user's LM Studio UI, so inference can succeed even
* when preload keeps being rejected.
*/
type PreloadCooldownEntry = {
untilMs: number;
consecutiveFailures: number;
};
const preloadCooldown = new Map<string, PreloadCooldownEntry>();
const PRELOAD_BACKOFF_BASE_MS = 5_000;
const PRELOAD_BACKOFF_MAX_MS = 300_000;
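// Backoff doubles per consecutive failure: 5s, 10s, 20s, ..., capped at PRELOAD_BACKOFF_MAX_MS.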
function computePreloadBackoffMs(consecutiveFailures: number): number {
const exponent = Math.max(0, consecutiveFailures - 1);
const raw = PRELOAD_BACKOFF_BASE_MS * 2 ** exponent;
return Math.min(PRELOAD_BACKOFF_MAX_MS, raw);
}
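// A successful preload clears any accumulated backoff state for the key.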
function recordPreloadSuccess(preloadKey: string): void {
preloadCooldown.delete(preloadKey);
}
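// Bump the consecutive-failure count and extend the cooldown window accordingly.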
function recordPreloadFailure(preloadKey: string, now: number): PreloadCooldownEntry {
const existing = preloadCooldown.get(preloadKey);
const consecutiveFailures = (existing?.consecutiveFailures ?? 0) + 1;
const entry: PreloadCooldownEntry = {
consecutiveFailures,
untilMs: now + computePreloadBackoffMs(consecutiveFailures),
};
preloadCooldown.set(preloadKey, entry);
return entry;
}
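// Return the active cooldown entry for the key, lazily deleting it once expired.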
function isPreloadCoolingDown(preloadKey: string, now: number): PreloadCooldownEntry | undefined {
const entry = preloadCooldown.get(preloadKey);
if (!entry) {
return undefined;
}
if (entry.untilMs <= now) {
preloadCooldown.delete(preloadKey);
return undefined;
}
return entry;
}
/** Test-only hook for clearing preload cooldown state between cases. */
export function __resetLmstudioPreloadCooldownForTest(): void {
preloadCooldown.clear();
preloadInFlight.clear();
}
function normalizeLmstudioModelKey(modelId: string): string {
const trimmed = modelId.trim();
if (trimmed.toLowerCase().startsWith("lmstudio/")) {
@@ -131,29 +193,67 @@ export function wrapLmstudioInferencePreload(ctx: ProviderWrapStreamFnContext):
modelKey,
requestedContextLength,
});
const cooldownEntry = isPreloadCoolingDown(preloadKey, Date.now());
const existing = preloadInFlight.get(preloadKey);
const preloadPromise =
const preloadPromise: Promise<void> | undefined =
existing ??
ensureLmstudioModelLoadedBestEffort({
baseUrl: resolvedBaseUrl,
modelKey,
requestedContextLength,
options,
ctx,
modelHeaders: resolveModelHeaders(model),
}).finally(() => {
preloadInFlight.delete(preloadKey);
});
if (!existing) {
preloadInFlight.set(preloadKey, preloadPromise);
}
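// While a cooldown is active, skip preload entirely; otherwise start (and dedupe via preloadInFlight) a fresh attempt.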
(cooldownEntry
? undefined
: (() => {
const created = ensureLmstudioModelLoadedBestEffort({
baseUrl: resolvedBaseUrl,
modelKey,
requestedContextLength,
options,
ctx,
modelHeaders: resolveModelHeaders(model),
})
.then(
() => {
recordPreloadSuccess(preloadKey);
},
(error) => {
const entry = recordPreloadFailure(preloadKey, Date.now());
throw Object.assign(new Error("preload-failed"), {
cause: error,
consecutiveFailures: entry.consecutiveFailures,
cooldownMs: entry.untilMs - Date.now(),
});
},
)
.finally(() => {
preloadInFlight.delete(preloadKey);
});
preloadInFlight.set(preloadKey, created);
return created;
})());
return (async () => {
try {
await preloadPromise;
} catch (error) {
log.warn(
`LM Studio inference preload failed for "${modelKey}"; continuing without preload: ${String(error)}`,
if (preloadPromise) {
try {
await preloadPromise;
} catch (error) {
const annotated = error as {
cause?: unknown;
consecutiveFailures?: number;
cooldownMs?: number;
};
const cause = annotated.cause ?? error;
const failures = annotated.consecutiveFailures ?? 1;
const cooldownSec = Math.max(
0,
Math.round((annotated.cooldownMs ?? 0) / 1000),
);
log.warn(
`LM Studio inference preload failed for "${modelKey}" (${failures} consecutive failure${
failures === 1 ? "" : "s"
}, next preload attempt skipped for ~${cooldownSec}s); continuing without preload: ${String(cause)}`,
);
}
} else if (cooldownEntry) {
log.debug(
`LM Studio inference preload for "${modelKey}" skipped while backoff active (${cooldownEntry.consecutiveFailures} prior failures)`,
);
}
// LM Studio uses OpenAI-compatible streaming usage payloads when requested via