diff --git a/src/agents/pi-embedded-helpers/provider-error-patterns.test.ts b/src/agents/pi-embedded-helpers/provider-error-patterns.test.ts
index 86dc42759c5..0e7bfbed0e4 100644
--- a/src/agents/pi-embedded-helpers/provider-error-patterns.test.ts
+++ b/src/agents/pi-embedded-helpers/provider-error-patterns.test.ts
@@ -50,6 +50,11 @@ describe("matchesProviderContextOverflow", () => {
     // Cohere
     "total tokens exceeds the model's maximum limit of 4096",
 
+    // llama.cpp HTTP server (slot ctx-size overflow)
+    "400 request (66202 tokens) exceeds the available context size (65536 tokens), try increasing it",
+    "request (130000 tokens) exceeds available context size (131072 tokens)",
+    "prompt (8500 tokens) exceeds the available context size (8192 tokens), try increasing it",
+
     // Generic
     "input is too long for model gpt-5.4",
   ])("matches provider-specific overflow: %s", (msg) => {
@@ -113,6 +118,15 @@ describe("isContextOverflowError with provider patterns", () => {
     expect(isContextOverflowError("ollama error: context length exceeded")).toBe(true);
   });
 
+  it("detects llama.cpp slot ctx-size overflow", () => {
+    // Native llama.cpp HTTP server overflow surfaced through openai-completions providers.
+    expect(
+      isContextOverflowError(
+        "400 request (66202 tokens) exceeds the available context size (65536 tokens), try increasing it",
+      ),
+    ).toBe(true);
+  });
+
   it("still detects standard context overflow patterns", () => {
     expect(isContextOverflowError("context length exceeded")).toBe(true);
     expect(isContextOverflowError("prompt is too long: 150000 tokens > 128000 maximum")).toBe(true);
diff --git a/src/agents/pi-embedded-helpers/provider-error-patterns.ts b/src/agents/pi-embedded-helpers/provider-error-patterns.ts
index 63c464a0fa3..96e16d99bd5 100644
--- a/src/agents/pi-embedded-helpers/provider-error-patterns.ts
+++ b/src/agents/pi-embedded-helpers/provider-error-patterns.ts
@@ -35,6 +35,13 @@ export const PROVIDER_CONTEXT_OVERFLOW_PATTERNS: readonly RegExp[] = [
   // Cohere does not currently ship a bundled provider hook.
   /\btotal tokens?.*exceeds? (?:the )?(?:model(?:'s)? )?(?:max|maximum|limit)/i,
 
+  // llama.cpp HTTP server (often used directly or behind an OpenAI-compatible
+  // shim) returns "request (N tokens) exceeds the available context size
+  // (M tokens), try increasing it" when the prompt overshoots a slot's
+  // ctx-size. Wording is from the upstream slot manager and is stable.
+  // Example: "400 request (66202 tokens) exceeds the available context size (65536 tokens), try increasing it"
+  /\b(?:request|prompt) \(\d[\d,]*\s*tokens?\) exceeds (?:the )?available context size\b/i,
+
   // Generic "input too long" pattern that isn't covered by existing checks
   /\binput (?:is )?too long for (?:the )?model\b/i,
 ];
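
Reviewer note (not part of the patch): a minimal standalone sketch of how the new llama.cpp pattern behaves. The regex literal is copied verbatim from the PROVIDER_CONTEXT_OVERFLOW_PATTERNS entry above, and the sample strings are the ones added to the test table; the script itself is illustrative only (runnable with tsx or ts-node) and makes no assumptions about the module's internals.

// Standalone sanity check for the llama.cpp overflow pattern added above.
// Regex copied verbatim from the diff; samples are the new test-table strings.
const llamaCppOverflow =
  /\b(?:request|prompt) \(\d[\d,]*\s*tokens?\) exceeds (?:the )?available context size\b/i;

const samples: ReadonlyArray<[string, boolean]> = [
  // The three llama.cpp messages added to the test table: all should match.
  ["400 request (66202 tokens) exceeds the available context size (65536 tokens), try increasing it", true],
  ["request (130000 tokens) exceeds available context size (131072 tokens)", true],
  ["prompt (8500 tokens) exceeds the available context size (8192 tokens), try increasing it", true],
  // A control covered by a different pattern in the file: should not match here.
  ["context length exceeded", false],
];

for (const [message, expected] of samples) {
  const matched = llamaCppOverflow.test(message);
  console.log(matched === expected ? "ok  " : "FAIL", JSON.stringify(message));
}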