fix(providers): read nested llama.cpp /props context

2026-07-11 05:46:10 +00:00 · 2026-05-05 23:23:23 +01:00
parent 7c7d19ec84
commit f4be39c4f4
3 changed files with 173 additions and 26 deletions
--- a/src/plugins/provider-self-hosted-setup.test.ts
+++ b/src/plugins/provider-self-hosted-setup.test.ts
@@ -146,7 +146,129 @@ describe("discoverOpenAICompatibleLocalModels", () => {
    expect(propsRelease).toHaveBeenCalledOnce();
  });

-  it("uses llama.cpp /props n_ctx as the runtime context cap", async () => {
+  it("uses llama.cpp nested /props n_ctx as the runtime context cap", async () => {
+    const modelsRelease = vi.fn(async () => undefined);
+    const propsRelease = vi.fn(async () => undefined);
+    fetchWithSsrFGuardMock.mockResolvedValueOnce({
+      response: new Response(
+        JSON.stringify({
+          data: [
+            {
+              id: "qwen3.6-mxfp4-moe",
+              meta: { n_ctx_train: 262_144 },
+            },
+          ],
+        }),
+        { status: 200 },
+      ),
+      finalUrl: "http://127.0.0.1:8080/v1/models",
+      release: modelsRelease,
+    });
+    fetchWithSsrFGuardMock.mockResolvedValueOnce({
+      response: new Response(JSON.stringify({ default_generation_settings: { n_ctx: 65_536 } }), {
+        status: 200,
+      }),
+      finalUrl: "http://127.0.0.1:8080/props",
+      release: propsRelease,
+    });
+
+    const models = await discoverOpenAICompatibleLocalModels({
+      baseUrl: "http://127.0.0.1:8080/v1",
+      label: "llama.cpp",
+      env: {},
+    });
+
+    expect(models).toEqual([
+      expect.objectContaining({
+        id: "qwen3.6-mxfp4-moe",
+        contextWindow: 262_144,
+        contextTokens: 65_536,
+      }),
+    ]);
+    expect(fetchWithSsrFGuardMock).toHaveBeenNthCalledWith(
+      2,
+      expect.objectContaining({
+        url: "http://127.0.0.1:8080/props",
+      }),
+    );
+    expect(modelsRelease).toHaveBeenCalledOnce();
+    expect(propsRelease).toHaveBeenCalledOnce();
+  });
+
+  it("scopes llama.cpp /props runtime caps to each discovered model", async () => {
+    const modelsRelease = vi.fn(async () => undefined);
+    const firstPropsRelease = vi.fn(async () => undefined);
+    const secondPropsRelease = vi.fn(async () => undefined);
+    fetchWithSsrFGuardMock.mockResolvedValueOnce({
+      response: new Response(
+        JSON.stringify({
+          data: [
+            {
+              id: "qwen/router-a",
+              meta: { n_ctx_train: 262_144 },
+            },
+            {
+              id: "qwen/router-b",
+              meta: { n_ctx_train: 131_072 },
+            },
+          ],
+        }),
+        { status: 200 },
+      ),
+      finalUrl: "http://127.0.0.1:8080/v1/models",
+      release: modelsRelease,
+    });
+    fetchWithSsrFGuardMock.mockResolvedValueOnce({
+      response: new Response(JSON.stringify({ default_generation_settings: { n_ctx: 65_536 } }), {
+        status: 200,
+      }),
+      finalUrl: "http://127.0.0.1:8080/props?model=qwen%2Frouter-a",
+      release: firstPropsRelease,
+    });
+    fetchWithSsrFGuardMock.mockResolvedValueOnce({
+      response: new Response(JSON.stringify({ default_generation_settings: { n_ctx: 32_768 } }), {
+        status: 200,
+      }),
+      finalUrl: "http://127.0.0.1:8080/props?model=qwen%2Frouter-b",
+      release: secondPropsRelease,
+    });
+
+    const models = await discoverOpenAICompatibleLocalModels({
+      baseUrl: "http://127.0.0.1:8080/v1",
+      label: "llama.cpp",
+      env: {},
+    });
+
+    expect(models).toEqual([
+      expect.objectContaining({
+        id: "qwen/router-a",
+        contextWindow: 262_144,
+        contextTokens: 65_536,
+      }),
+      expect.objectContaining({
+        id: "qwen/router-b",
+        contextWindow: 131_072,
+        contextTokens: 32_768,
+      }),
+    ]);
+    expect(fetchWithSsrFGuardMock).toHaveBeenNthCalledWith(
+      2,
+      expect.objectContaining({
+        url: "http://127.0.0.1:8080/props?model=qwen%2Frouter-a",
+      }),
+    );
+    expect(fetchWithSsrFGuardMock).toHaveBeenNthCalledWith(
+      3,
+      expect.objectContaining({
+        url: "http://127.0.0.1:8080/props?model=qwen%2Frouter-b",
+      }),
+    );
+    expect(modelsRelease).toHaveBeenCalledOnce();
+    expect(firstPropsRelease).toHaveBeenCalledOnce();
+    expect(secondPropsRelease).toHaveBeenCalledOnce();
+  });
+
+  it("keeps top-level llama.cpp /props n_ctx as a compatibility fallback", async () => {
    const modelsRelease = vi.fn(async () => undefined);
    const propsRelease = vi.fn(async () => undefined);
    fetchWithSsrFGuardMock.mockResolvedValueOnce({
@@ -183,12 +305,6 @@ describe("discoverOpenAICompatibleLocalModels", () => {
        contextTokens: 65_536,
      }),
    ]);
-    expect(fetchWithSsrFGuardMock).toHaveBeenNthCalledWith(
-      2,
-      expect.objectContaining({
-        url: "http://127.0.0.1:8080/props",
-      }),
-    );
    expect(modelsRelease).toHaveBeenCalledOnce();
    expect(propsRelease).toHaveBeenCalledOnce();
  });
--- a/src/plugins/provider-self-hosted-setup.ts
+++ b/src/plugins/provider-self-hosted-setup.ts
@@ -42,6 +42,9 @@ type OpenAICompatModelsResponse = {
 };

 type LlamaCppPropsResponse = {
+  default_generation_settings?: {
+    n_ctx?: unknown;
+  };
  n_ctx?: unknown;
 };

@@ -76,23 +79,28 @@ function readPositiveInteger(value: unknown): number | undefined {
  return Math.trunc(value);
 }

-function resolveLlamaCppPropsUrl(baseUrl: string): string {
+function resolveLlamaCppPropsUrl(baseUrl: string, modelId?: string): string {
  const parsed = new URL(baseUrl);
  const pathname = parsed.pathname.replace(/\/+$/, "");
-  parsed.pathname = pathname.endsWith("/v1") ? pathname.slice(0, -3) || "/" : pathname;
+  const rootPathname = pathname.endsWith("/v1") ? pathname.slice(0, -3) || "/" : pathname;
+  parsed.pathname = `${rootPathname.replace(/\/+$/, "")}/props`;
  parsed.search = "";
  parsed.hash = "";
-  const root = parsed.toString().replace(/\/+$/, "");
-  return `${root}/props`;
+  const normalizedModelId = normalizeOptionalString(modelId);
+  if (normalizedModelId) {
+    parsed.searchParams.set("model", normalizedModelId);
+  }
+  return parsed.toString();
 }

 async function discoverLlamaCppRuntimeContextTokens(params: {
  baseUrl: string;
  apiKey?: string;
+  modelId?: string;
 }): Promise<number | undefined> {
  let url: string;
  try {
-    url = resolveLlamaCppPropsUrl(params.baseUrl);
+    url = resolveLlamaCppPropsUrl(params.baseUrl, params.modelId);
  } catch {
    return undefined;
  }
@@ -111,7 +119,10 @@ async function discoverLlamaCppRuntimeContextTokens(params: {
        return undefined;
      }
      const data = (await response.json()) as LlamaCppPropsResponse;
-      return readPositiveInteger(data.n_ctx);
+      return (
+        readPositiveInteger(data.default_generation_settings?.n_ctx) ??
+        readPositiveInteger(data.n_ctx)
+      );
    } finally {
      await release();
    }
@@ -158,23 +169,41 @@ export async function discoverOpenAICompatibleLocalModels(params: {
        return [];
      }

-      const runtimeContextTokens =
-        params.contextWindow === undefined
-          ? await discoverLlamaCppRuntimeContextTokens({
-              baseUrl: trimmedBaseUrl,
-              apiKey: params.apiKey,
-            })
-          : undefined;
-
-      return models.flatMap((model) => {
+      const discoveredModels = models.flatMap((model) => {
        const modelId = normalizeOptionalString(model.id);
        if (!modelId) {
          return [];
        }
+        return [{ id: modelId, meta: model.meta }];
+      });
+      const runtimeContextTokensByModelId = new Map<string, number>();
+      if (params.contextWindow === undefined) {
+        const uniqueModelIds = [...new Set(discoveredModels.map((model) => model.id))];
+        const runtimeContextTokenResults = await Promise.all(
+          uniqueModelIds.map(
+            async (modelId) =>
+              [
+                modelId,
+                await discoverLlamaCppRuntimeContextTokens({
+                  baseUrl: trimmedBaseUrl,
+                  apiKey: params.apiKey,
+                  modelId: uniqueModelIds.length > 1 ? modelId : undefined,
+                }),
+              ] as const,
+          ),
+        );
+        for (const [modelId, runtimeContextTokens] of runtimeContextTokenResults) {
+          if (runtimeContextTokens) {
+            runtimeContextTokensByModelId.set(modelId, runtimeContextTokens);
+          }
+        }
+      }
+
+      return discoveredModels.map((model) => {
        const modelConfig: ModelDefinitionConfig = {
-          id: modelId,
-          name: modelId,
-          reasoning: isReasoningModelHeuristic(modelId),
+          id: model.id,
+          name: model.id,
+          reasoning: isReasoningModelHeuristic(model.id),
          input: ["text"],
          cost: SELF_HOSTED_DEFAULT_COST,
          contextWindow:
@@ -183,10 +212,11 @@ export async function discoverOpenAICompatibleLocalModels(params: {
            SELF_HOSTED_DEFAULT_CONTEXT_WINDOW,
          maxTokens: params.maxTokens ?? SELF_HOSTED_DEFAULT_MAX_TOKENS,
        };
+        const runtimeContextTokens = runtimeContextTokensByModelId.get(model.id);
        if (runtimeContextTokens) {
          modelConfig.contextTokens = runtimeContextTokens;
        }
-        return [modelConfig];
+        return modelConfig;
      });
    } finally {
      await release();