mirror of
https://github.com/openclaw/openclaw.git
synced 2026-05-06 15:40:44 +00:00
Extensions/lmstudio: back off inference preload after consecutive failures
This commit is contained in:
@@ -39,6 +39,7 @@ Docs: https://docs.openclaw.ai
|
||||
- Gateway/skills: bump the cached skills-snapshot version whenever a config write touches `skills.*` (for example `skills.allowBundled`, `skills.entries.<id>.enabled`, or `skills.profile`). Existing agent sessions persist a `skillsSnapshot` in `sessions.json` that reuses the skill list frozen at session creation; without this invalidation, removing a bundled skill from the allowlist left the old snapshot live and the model kept calling the disabled tool, producing `Tool <name> not found` loops that ran until the embedded-run timeout. (#67401) Thanks @xantorres.
|
||||
- Agents/tool-loop: enable the unknown-tool stream guard by default. Previously `resolveUnknownToolGuardThreshold` returned `undefined` unless `tools.loopDetection.enabled` was explicitly set to `true`, which left the protection off in the default configuration. A hallucinated or removed tool (for example `himalaya` after it was dropped from `skills.allowBundled`) would then loop "Tool X not found" attempts until the full embedded-run timeout. The guard has no false-positive surface because it only triggers on tools that are objectively not registered in the run, so it now stays on regardless of `tools.loopDetection.enabled` and still accepts `tools.loopDetection.unknownToolThreshold` as a per-run override (default 10). (#67401) Thanks @xantorres.
|
||||
- TUI/streaming: add a client-side streaming watchdog to `tui-event-handlers` so the `streaming · Xm Ys` activity indicator resets to `idle` after 30s of delta silence on the active run. Guards against lost or late `state: "final"` chat events (WS reconnects, gateway restarts, etc.) leaving the TUI stuck on `streaming` indefinitely; a new system log line surfaces the reset so users know to send a new message to resync. The window is configurable via the new `streamingWatchdogMs` context option (set to `0` to disable), and the handler now exposes a `dispose()` that clears the pending timer on shutdown. (#67401) Thanks @xantorres.
|
||||
- Extensions/lmstudio: add exponential backoff to the inference-preload wrapper so an LM Studio model-load failure (for example the built-in memory guardrail rejecting a load because the swap is saturated) no longer produces a WARN line every ~2s for every chat request. The wrapper now records consecutive preload failures per `(baseUrl, modelKey, contextLength)` tuple with a 5s → 10s → 20s → … → 5min cooldown and skips the preload step entirely while a cooldown is active, letting chat requests proceed directly to the stream (the model is often already loaded via the LM Studio UI). The combined `preload failed` log line now reports consecutive-failure count and remaining cooldown so operators can act on the real issue instead of drowning in repeated warnings. (#67401) Thanks @xantorres.
|
||||
|
||||
## 2026.4.15-beta.1
|
||||
|
||||
|
||||
@@ -1,7 +1,10 @@
|
||||
import type { StreamFn } from "@mariozechner/pi-agent-core";
|
||||
import { createAssistantMessageEventStream } from "@mariozechner/pi-ai";
|
||||
import { afterEach, describe, expect, it, vi } from "vitest";
|
||||
import { wrapLmstudioInferencePreload } from "./stream.js";
|
||||
import { afterEach, beforeEach, describe, expect, it, vi } from "vitest";
|
||||
import {
|
||||
__resetLmstudioPreloadCooldownForTest,
|
||||
wrapLmstudioInferencePreload,
|
||||
} from "./stream.js";
|
||||
|
||||
const ensureLmstudioModelLoadedMock = vi.hoisted(() => vi.fn());
|
||||
const resolveLmstudioProviderHeadersMock = vi.hoisted(() =>
|
||||
@@ -51,12 +54,17 @@ function buildDoneStreamFn(): StreamFn {
|
||||
}
|
||||
|
||||
describe("lmstudio stream wrapper", () => {
|
||||
// Preload cooldown state lives at module level in stream.ts, so each test
// must start from a clean slate or one case's recorded failures would leak a
// live backoff window into the next case.
beforeEach(() => {
  __resetLmstudioPreloadCooldownForTest();
});

afterEach(() => {
  ensureLmstudioModelLoadedMock.mockReset();
  resolveLmstudioProviderHeadersMock.mockReset();
  resolveLmstudioRuntimeApiKeyMock.mockReset();
  // mockReset() also wipes configured resolved values, so re-seed the default
  // "no extra headers / no runtime API key" behavior the fixtures expect.
  resolveLmstudioProviderHeadersMock.mockResolvedValue(undefined);
  resolveLmstudioRuntimeApiKeyMock.mockResolvedValue(undefined);
  // Cleared again on teardown (symmetric with beforeEach) so state cannot
  // leak even if a later suite forgets its own beforeEach reset.
  __resetLmstudioPreloadCooldownForTest();
});
|
||||
|
||||
it("preloads LM Studio model before inference using model context window", async () => {
|
||||
@@ -243,6 +251,113 @@ describe("lmstudio stream wrapper", () => {
|
||||
expect(ensureLmstudioModelLoadedMock).toHaveBeenCalledTimes(1);
|
||||
});
|
||||
|
||||
it("skips preload on the second attempt while the failure backoff is active", async () => {
  // Every preload attempt fails (mockRejectedValue, not ...Once), so the
  // first call should record a failure and arm the 5s cooldown window.
  ensureLmstudioModelLoadedMock.mockRejectedValue(new Error("out of memory"));
  const baseStream = buildDoneStreamFn();
  const wrapped = wrapLmstudioInferencePreload({
    provider: "lmstudio",
    modelId: "qwen3-8b-instruct",
    config: {
      models: {
        providers: {
          lmstudio: {
            baseUrl: "http://localhost:1234",
            models: [],
          },
        },
      },
    },
    streamFn: baseStream,
  } as never);

  // First request: preload is attempted (and fails), but the stream still
  // runs to completion — preload failures are best-effort, never fatal.
  const firstEvents = await collectEvents(
    wrapped(
      {
        provider: "lmstudio",
        api: "openai-completions",
        id: "qwen3-8b-instruct",
      } as never,
      { messages: [] } as never,
      undefined as never,
    ),
  );
  expect(firstEvents).toEqual([expect.objectContaining({ type: "done" })]);
  expect(ensureLmstudioModelLoadedMock).toHaveBeenCalledTimes(1);

  // Second request arrives while the cooldown from the first failure is
  // still active (no clock movement between the two calls).
  const secondEvents = await collectEvents(
    wrapped(
      {
        provider: "lmstudio",
        api: "openai-completions",
        id: "qwen3-8b-instruct",
      } as never,
      { messages: [] } as never,
      undefined as never,
    ),
  );
  expect(secondEvents).toEqual([expect.objectContaining({ type: "done" })]);
  // The second call must NOT retry preload because cooldown is active, but
  // the underlying stream must still run so the user gets a response.
  expect(ensureLmstudioModelLoadedMock).toHaveBeenCalledTimes(1);
  expect(baseStream).toHaveBeenCalledTimes(2);
});
|
||||
|
||||
it("retries preload once the cooldown expires", async () => {
  // First preload fails (arming the cooldown); the second is expected to be
  // retried after the cooldown window has elapsed, and succeeds.
  ensureLmstudioModelLoadedMock.mockRejectedValueOnce(new Error("out of memory"));
  ensureLmstudioModelLoadedMock.mockResolvedValueOnce(undefined);
  const baseStream = buildDoneStreamFn();
  const wrapped = wrapLmstudioInferencePreload({
    provider: "lmstudio",
    modelId: "qwen3-8b-instruct",
    config: {
      models: {
        providers: {
          lmstudio: {
            baseUrl: "http://localhost:1234",
            models: [],
          },
        },
      },
    },
    streamFn: baseStream,
  } as never);

  // Freeze Date.now at a known base so we can jump past the first backoff
  // window (5s by default) between the two preload attempts.
  const baseTime = 1_000_000;
  const nowSpy = vi.spyOn(Date, "now");
  nowSpy.mockReturnValue(baseTime);
  await collectEvents(
    wrapped(
      {
        provider: "lmstudio",
        api: "openai-completions",
        id: "qwen3-8b-instruct",
      } as never,
      { messages: [] } as never,
      undefined as never,
    ),
  );
  expect(ensureLmstudioModelLoadedMock).toHaveBeenCalledTimes(1);

  // Move the clock past the initial 5s cooldown window so the next call is
  // allowed to retry preload.
  nowSpy.mockReturnValue(baseTime + 6_000);
  await collectEvents(
    wrapped(
      {
        provider: "lmstudio",
        api: "openai-completions",
        id: "qwen3-8b-instruct",
      } as never,
      { messages: [] } as never,
      undefined as never,
    ),
  );
  // A second preload attempt proves the expired cooldown entry was dropped.
  expect(ensureLmstudioModelLoadedMock).toHaveBeenCalledTimes(2);
  // Restore the real clock so later cases are unaffected by the spy.
  nowSpy.mockRestore();
});
|
||||
|
||||
it("forces supportsUsageInStreaming compat before calling the underlying stream", async () => {
|
||||
const baseStream = buildDoneStreamFn();
|
||||
const wrapped = wrapLmstudioInferencePreload({
|
||||
|
||||
@@ -15,6 +15,68 @@ type StreamModel = Parameters<StreamFn>[0];
|
||||
|
||||
const preloadInFlight = new Map<string, Promise<void>>();
|
||||
|
||||
/**
|
||||
* Cooldown state for the LM Studio preload endpoint.
|
||||
*
|
||||
* Without this, every chat request would retry preload ~every 2s even when
|
||||
* LM Studio has rejected the load (for example the memory guardrail will keep
|
||||
* rejecting until the user adjusts the setting or frees RAM). That produced
|
||||
* hundreds of `LM Studio inference preload failed` WARN lines per hour without
|
||||
* actually helping the user. The cooldown applies an exponential backoff per
|
||||
* preloadKey and, while the cooldown is active, the wrapper skips the preload
|
||||
* step entirely and proceeds directly to streaming — the model is often
|
||||
* already loaded from the user's LM Studio UI, so inference can succeed even
|
||||
* when preload keeps being rejected.
|
||||
*/
|
||||
type PreloadCooldownEntry = {
|
||||
untilMs: number;
|
||||
consecutiveFailures: number;
|
||||
};
|
||||
|
||||
const preloadCooldown = new Map<string, PreloadCooldownEntry>();
|
||||
|
||||
const PRELOAD_BACKOFF_BASE_MS = 5_000;
|
||||
const PRELOAD_BACKOFF_MAX_MS = 300_000;
|
||||
|
||||
function computePreloadBackoffMs(consecutiveFailures: number): number {
|
||||
const exponent = Math.max(0, consecutiveFailures - 1);
|
||||
const raw = PRELOAD_BACKOFF_BASE_MS * 2 ** exponent;
|
||||
return Math.min(PRELOAD_BACKOFF_MAX_MS, raw);
|
||||
}
|
||||
|
||||
/**
 * Forget all backoff state for `preloadKey` after a successful preload, so a
 * future failure restarts the exponential ladder at the 5s base window.
 */
function recordPreloadSuccess(preloadKey: string): void {
  preloadCooldown.delete(preloadKey);
}
|
||||
|
||||
function recordPreloadFailure(preloadKey: string, now: number): PreloadCooldownEntry {
|
||||
const existing = preloadCooldown.get(preloadKey);
|
||||
const consecutiveFailures = (existing?.consecutiveFailures ?? 0) + 1;
|
||||
const entry: PreloadCooldownEntry = {
|
||||
consecutiveFailures,
|
||||
untilMs: now + computePreloadBackoffMs(consecutiveFailures),
|
||||
};
|
||||
preloadCooldown.set(preloadKey, entry);
|
||||
return entry;
|
||||
}
|
||||
|
||||
function isPreloadCoolingDown(preloadKey: string, now: number): PreloadCooldownEntry | undefined {
|
||||
const entry = preloadCooldown.get(preloadKey);
|
||||
if (!entry) {
|
||||
return undefined;
|
||||
}
|
||||
if (entry.untilMs <= now) {
|
||||
preloadCooldown.delete(preloadKey);
|
||||
return undefined;
|
||||
}
|
||||
return entry;
|
||||
}
|
||||
|
||||
/** Test-only hook for clearing preload cooldown state between cases. */
|
||||
export function __resetLmstudioPreloadCooldownForTest(): void {
|
||||
preloadCooldown.clear();
|
||||
preloadInFlight.clear();
|
||||
}
|
||||
|
||||
function normalizeLmstudioModelKey(modelId: string): string {
|
||||
const trimmed = modelId.trim();
|
||||
if (trimmed.toLowerCase().startsWith("lmstudio/")) {
|
||||
@@ -131,29 +193,67 @@ export function wrapLmstudioInferencePreload(ctx: ProviderWrapStreamFnContext):
|
||||
modelKey,
|
||||
requestedContextLength,
|
||||
});
|
||||
|
||||
const cooldownEntry = isPreloadCoolingDown(preloadKey, Date.now());
|
||||
const existing = preloadInFlight.get(preloadKey);
|
||||
const preloadPromise =
|
||||
const preloadPromise: Promise<void> | undefined =
|
||||
existing ??
|
||||
ensureLmstudioModelLoadedBestEffort({
|
||||
baseUrl: resolvedBaseUrl,
|
||||
modelKey,
|
||||
requestedContextLength,
|
||||
options,
|
||||
ctx,
|
||||
modelHeaders: resolveModelHeaders(model),
|
||||
}).finally(() => {
|
||||
preloadInFlight.delete(preloadKey);
|
||||
});
|
||||
if (!existing) {
|
||||
preloadInFlight.set(preloadKey, preloadPromise);
|
||||
}
|
||||
(cooldownEntry
|
||||
? undefined
|
||||
: (() => {
|
||||
const created = ensureLmstudioModelLoadedBestEffort({
|
||||
baseUrl: resolvedBaseUrl,
|
||||
modelKey,
|
||||
requestedContextLength,
|
||||
options,
|
||||
ctx,
|
||||
modelHeaders: resolveModelHeaders(model),
|
||||
})
|
||||
.then(
|
||||
() => {
|
||||
recordPreloadSuccess(preloadKey);
|
||||
},
|
||||
(error) => {
|
||||
const entry = recordPreloadFailure(preloadKey, Date.now());
|
||||
throw Object.assign(new Error("preload-failed"), {
|
||||
cause: error,
|
||||
consecutiveFailures: entry.consecutiveFailures,
|
||||
cooldownMs: entry.untilMs - Date.now(),
|
||||
});
|
||||
},
|
||||
)
|
||||
.finally(() => {
|
||||
preloadInFlight.delete(preloadKey);
|
||||
});
|
||||
preloadInFlight.set(preloadKey, created);
|
||||
return created;
|
||||
})());
|
||||
|
||||
return (async () => {
|
||||
try {
|
||||
await preloadPromise;
|
||||
} catch (error) {
|
||||
log.warn(
|
||||
`LM Studio inference preload failed for "${modelKey}"; continuing without preload: ${String(error)}`,
|
||||
if (preloadPromise) {
|
||||
try {
|
||||
await preloadPromise;
|
||||
} catch (error) {
|
||||
const annotated = error as {
|
||||
cause?: unknown;
|
||||
consecutiveFailures?: number;
|
||||
cooldownMs?: number;
|
||||
};
|
||||
const cause = annotated.cause ?? error;
|
||||
const failures = annotated.consecutiveFailures ?? 1;
|
||||
const cooldownSec = Math.max(
|
||||
0,
|
||||
Math.round((annotated.cooldownMs ?? 0) / 1000),
|
||||
);
|
||||
log.warn(
|
||||
`LM Studio inference preload failed for "${modelKey}" (${failures} consecutive failure${
|
||||
failures === 1 ? "" : "s"
|
||||
}, next preload attempt skipped for ~${cooldownSec}s); continuing without preload: ${String(cause)}`,
|
||||
);
|
||||
}
|
||||
} else if (cooldownEntry) {
|
||||
log.debug(
|
||||
`LM Studio inference preload for "${modelKey}" skipped while backoff active (${cooldownEntry.consecutiveFailures} prior failures)`,
|
||||
);
|
||||
}
|
||||
// LM Studio uses OpenAI-compatible streaming usage payloads when requested via
|
||||
|
||||
Reference in New Issue
Block a user