fix(providers): read nested llama cpp props context

This commit is contained in:
brokemac79
2026-05-05 23:23:23 +01:00
committed by Peter Steinberger
parent 7c7d19ec84
commit f4be39c4f4
3 changed files with 173 additions and 26 deletions

View File

@@ -146,7 +146,129 @@ describe("discoverOpenAICompatibleLocalModels", () => {
expect(propsRelease).toHaveBeenCalledOnce();
});
it("uses llama.cpp /props n_ctx as the runtime context cap", async () => {
// Single-model case: the n_ctx nested under default_generation_settings in the
// /props payload is expected to surface as contextTokens, while the
// n_ctx_train value listed by /v1/models is expected as contextWindow.
it("uses llama.cpp nested /props n_ctx as the runtime context cap", async () => {
  const releaseModelsFetch = vi.fn(async () => undefined);
  const releasePropsFetch = vi.fn(async () => undefined);

  // First guarded fetch: the model listing.
  const modelsPayload = {
    data: [
      {
        id: "qwen3.6-mxfp4-moe",
        meta: { n_ctx_train: 262_144 },
      },
    ],
  };
  fetchWithSsrFGuardMock.mockResolvedValueOnce({
    response: new Response(JSON.stringify(modelsPayload), { status: 200 }),
    finalUrl: "http://127.0.0.1:8080/v1/models",
    release: releaseModelsFetch,
  });

  // Second guarded fetch: the /props payload carrying the nested n_ctx.
  const propsPayload = { default_generation_settings: { n_ctx: 65_536 } };
  fetchWithSsrFGuardMock.mockResolvedValueOnce({
    response: new Response(JSON.stringify(propsPayload), { status: 200 }),
    finalUrl: "http://127.0.0.1:8080/props",
    release: releasePropsFetch,
  });

  const discovered = await discoverOpenAICompatibleLocalModels({
    baseUrl: "http://127.0.0.1:8080/v1",
    label: "llama.cpp",
    env: {},
  });

  const expectedModel = expect.objectContaining({
    id: "qwen3.6-mxfp4-moe",
    contextWindow: 262_144,
    contextTokens: 65_536,
  });
  expect(discovered).toEqual([expectedModel]);

  // With only one model listed, the props request is expected without a model query.
  const expectedPropsRequest = expect.objectContaining({
    url: "http://127.0.0.1:8080/props",
  });
  expect(fetchWithSsrFGuardMock).toHaveBeenNthCalledWith(2, expectedPropsRequest);

  // Both guarded fetches must release their handle exactly once.
  expect(releaseModelsFetch).toHaveBeenCalledOnce();
  expect(releasePropsFetch).toHaveBeenCalledOnce();
});
// Multi-model case: each listed model is expected to get its own
// /props?model=<id> lookup (id URL-encoded) and its own contextTokens value.
it("scopes llama.cpp /props runtime caps to each discovered model", async () => {
  const releaseModelsFetch = vi.fn(async () => undefined);
  const releasePropsA = vi.fn(async () => undefined);
  const releasePropsB = vi.fn(async () => undefined);

  // Queues one JSON 200 response on the guarded-fetch mock; call order defines
  // which fetch (1st, 2nd, 3rd) receives which payload.
  const queueJsonResponse = (
    body: unknown,
    finalUrl: string,
    release: typeof releaseModelsFetch,
  ) => {
    fetchWithSsrFGuardMock.mockResolvedValueOnce({
      response: new Response(JSON.stringify(body), { status: 200 }),
      finalUrl,
      release,
    });
  };

  queueJsonResponse(
    {
      data: [
        {
          id: "qwen/router-a",
          meta: { n_ctx_train: 262_144 },
        },
        {
          id: "qwen/router-b",
          meta: { n_ctx_train: 131_072 },
        },
      ],
    },
    "http://127.0.0.1:8080/v1/models",
    releaseModelsFetch,
  );
  queueJsonResponse(
    { default_generation_settings: { n_ctx: 65_536 } },
    "http://127.0.0.1:8080/props?model=qwen%2Frouter-a",
    releasePropsA,
  );
  queueJsonResponse(
    { default_generation_settings: { n_ctx: 32_768 } },
    "http://127.0.0.1:8080/props?model=qwen%2Frouter-b",
    releasePropsB,
  );

  const discovered = await discoverOpenAICompatibleLocalModels({
    baseUrl: "http://127.0.0.1:8080/v1",
    label: "llama.cpp",
    env: {},
  });

  const expectedRouterA = expect.objectContaining({
    id: "qwen/router-a",
    contextWindow: 262_144,
    contextTokens: 65_536,
  });
  const expectedRouterB = expect.objectContaining({
    id: "qwen/router-b",
    contextWindow: 131_072,
    contextTokens: 32_768,
  });
  expect(discovered).toEqual([expectedRouterA, expectedRouterB]);

  // Calls 2 and 3 are the per-model props lookups, in listing order.
  expect(fetchWithSsrFGuardMock).toHaveBeenNthCalledWith(
    2,
    expect.objectContaining({
      url: "http://127.0.0.1:8080/props?model=qwen%2Frouter-a",
    }),
  );
  expect(fetchWithSsrFGuardMock).toHaveBeenNthCalledWith(
    3,
    expect.objectContaining({
      url: "http://127.0.0.1:8080/props?model=qwen%2Frouter-b",
    }),
  );

  // Every guarded fetch must release its handle exactly once.
  expect(releaseModelsFetch).toHaveBeenCalledOnce();
  expect(releasePropsA).toHaveBeenCalledOnce();
  expect(releasePropsB).toHaveBeenCalledOnce();
});
it("keeps top-level llama.cpp /props n_ctx as a compatibility fallback", async () => {
const modelsRelease = vi.fn(async () => undefined);
const propsRelease = vi.fn(async () => undefined);
fetchWithSsrFGuardMock.mockResolvedValueOnce({
@@ -183,12 +305,6 @@ describe("discoverOpenAICompatibleLocalModels", () => {
contextTokens: 65_536,
}),
]);
expect(fetchWithSsrFGuardMock).toHaveBeenNthCalledWith(
2,
expect.objectContaining({
url: "http://127.0.0.1:8080/props",
}),
);
expect(modelsRelease).toHaveBeenCalledOnce();
expect(propsRelease).toHaveBeenCalledOnce();
});

View File

@@ -42,6 +42,9 @@ type OpenAICompatModelsResponse = {
};
/**
 * Subset of the llama.cpp `/props` JSON payload that this module reads.
 *
 * Both fields are typed `unknown` because the payload is untrusted JSON; they
 * are validated with `readPositiveInteger` before use.
 */
type LlamaCppPropsResponse = {
  /** Top-level context size, read only when the nested value is unusable. */
  n_ctx?: unknown;
  /** Nested generation settings; its `n_ctx` is consulted first. */
  default_generation_settings?: { n_ctx?: unknown };
};
@@ -76,23 +79,28 @@ function readPositiveInteger(value: unknown): number | undefined {
return Math.trunc(value);
}
/**
 * Builds the llama.cpp `/props` URL that corresponds to an OpenAI-compatible base URL.
 *
 * Trailing slashes and a trailing `/v1` segment are removed from the base path,
 * `/props` is appended, and any query string or fragment on the base URL is
 * discarded. For example `http://127.0.0.1:8080/v1` becomes
 * `http://127.0.0.1:8080/props`.
 *
 * @param baseUrl - Absolute base URL of the server.
 * @param modelId - Optional model id. When `normalizeOptionalString(modelId)`
 *   yields a truthy value, it is sent as the `model` query parameter
 *   (URL-encoded by `URLSearchParams`).
 * @returns The absolute `/props` URL as a string.
 * @throws TypeError when `baseUrl` cannot be parsed by `new URL`.
 */
function resolveLlamaCppPropsUrl(baseUrl: string, modelId?: string): string {
  const parsed = new URL(baseUrl);
  // Normalize "/v1/" and "/v1" to the same thing before checking the suffix.
  const pathname = parsed.pathname.replace(/\/+$/, "");
  const rootPathname = pathname.endsWith("/v1") ? pathname.slice(0, -3) || "/" : pathname;
  // Strip trailing slashes again so a bare "/" root does not produce "//props".
  parsed.pathname = `${rootPathname.replace(/\/+$/, "")}/props`;
  // Clear inherited query/fragment first; the model query is added afterwards.
  parsed.search = "";
  parsed.hash = "";
  const normalizedModelId = normalizeOptionalString(modelId);
  if (normalizedModelId) {
    parsed.searchParams.set("model", normalizedModelId);
  }
  return parsed.toString();
}
async function discoverLlamaCppRuntimeContextTokens(params: {
baseUrl: string;
apiKey?: string;
modelId?: string;
}): Promise<number | undefined> {
let url: string;
try {
url = resolveLlamaCppPropsUrl(params.baseUrl);
url = resolveLlamaCppPropsUrl(params.baseUrl, params.modelId);
} catch {
return undefined;
}
@@ -111,7 +119,10 @@ async function discoverLlamaCppRuntimeContextTokens(params: {
return undefined;
}
const data = (await response.json()) as LlamaCppPropsResponse;
return readPositiveInteger(data.n_ctx);
return (
readPositiveInteger(data.default_generation_settings?.n_ctx) ??
readPositiveInteger(data.n_ctx)
);
} finally {
await release();
}
@@ -158,23 +169,41 @@ export async function discoverOpenAICompatibleLocalModels(params: {
return [];
}
const runtimeContextTokens =
params.contextWindow === undefined
? await discoverLlamaCppRuntimeContextTokens({
baseUrl: trimmedBaseUrl,
apiKey: params.apiKey,
})
: undefined;
return models.flatMap((model) => {
const discoveredModels = models.flatMap((model) => {
const modelId = normalizeOptionalString(model.id);
if (!modelId) {
return [];
}
return [{ id: modelId, meta: model.meta }];
});
const runtimeContextTokensByModelId = new Map<string, number>();
if (params.contextWindow === undefined) {
const uniqueModelIds = [...new Set(discoveredModels.map((model) => model.id))];
const runtimeContextTokenResults = await Promise.all(
uniqueModelIds.map(
async (modelId) =>
[
modelId,
await discoverLlamaCppRuntimeContextTokens({
baseUrl: trimmedBaseUrl,
apiKey: params.apiKey,
modelId: uniqueModelIds.length > 1 ? modelId : undefined,
}),
] as const,
),
);
for (const [modelId, runtimeContextTokens] of runtimeContextTokenResults) {
if (runtimeContextTokens) {
runtimeContextTokensByModelId.set(modelId, runtimeContextTokens);
}
}
}
return discoveredModels.map((model) => {
const modelConfig: ModelDefinitionConfig = {
id: modelId,
name: modelId,
reasoning: isReasoningModelHeuristic(modelId),
id: model.id,
name: model.id,
reasoning: isReasoningModelHeuristic(model.id),
input: ["text"],
cost: SELF_HOSTED_DEFAULT_COST,
contextWindow:
@@ -183,10 +212,11 @@ export async function discoverOpenAICompatibleLocalModels(params: {
SELF_HOSTED_DEFAULT_CONTEXT_WINDOW,
maxTokens: params.maxTokens ?? SELF_HOSTED_DEFAULT_MAX_TOKENS,
};
const runtimeContextTokens = runtimeContextTokensByModelId.get(model.id);
if (runtimeContextTokens) {
modelConfig.contextTokens = runtimeContextTokens;
}
return [modelConfig];
return modelConfig;
});
} finally {
await release();