From 57e6aeca840b2556c8cc74684e7cf6665d25f5eb Mon Sep 17 00:00:00 2001
From: Alexander Bunn
Date: Fri, 10 Apr 2026 18:36:05 +1000
Subject: [PATCH] fix(agents): detect llama.cpp slot overflow as context overflow

Auto-compaction never triggered for self-hosted llama.cpp HTTP servers
(used directly or behind an OpenAI-compatible shim configured with
`api: "openai-completions"`) because llama.cpp's native overflow wording
isn't covered by any existing pattern in `isContextOverflowError()` or
`matchesProviderContextOverflow()`.

When the prompt overshoots a slot's `--ctx-size`, llama.cpp returns:

    400 request (66202 tokens) exceeds the available context size
    (65536 tokens), try increasing it

That message uses "context size" rather than "context length", says
"request (N tokens)" instead of "input/prompt is too long", and the
status code is 400 (not 413), so it slips past every existing string
check and every regex in `PROVIDER_CONTEXT_OVERFLOW_PATTERNS`. The
generic candidate pre-check passes, but the concrete provider regexes
all miss, so the agent runner reports `surface_error reason=...` and
the user gets the raw upstream error instead of compaction + retry.

This commit adds a llama.cpp-shaped pattern next to the existing
Bedrock / Vertex / Ollama / Cohere ones in
`PROVIDER_CONTEXT_OVERFLOW_PATTERNS`, plus four test cases (three
parameterised messages exercising the new regex directly, and one
end-to-end assertion that `isContextOverflowError()` now returns true
for the verbatim message produced by llama.cpp's slot manager). The
pattern is anchored to llama.cpp's stable slot-manager wording
(`(?:request|prompt) (N tokens) exceeds (the )?available context size`)
so it won't accidentally swallow unrelated provider errors.

Closes #64180

AI-assisted: drafted with Claude Code (Opus 4.6, 1M context).

Testing: targeted tests pass via
`pnpm vitest run src/agents/pi-embedded-helpers/provider-error-patterns.test.ts`
(26/26). A broader vitest run shows 2 unrelated failures in
`group-policy.fallback.contract.test.ts` that are not touched by this
change.
---
 .../provider-error-patterns.test.ts                | 14 ++++++++++++++
 .../pi-embedded-helpers/provider-error-patterns.ts |  7 +++++++
 2 files changed, 21 insertions(+)

diff --git a/src/agents/pi-embedded-helpers/provider-error-patterns.test.ts b/src/agents/pi-embedded-helpers/provider-error-patterns.test.ts
index 86dc42759c5..0e7bfbed0e4 100644
--- a/src/agents/pi-embedded-helpers/provider-error-patterns.test.ts
+++ b/src/agents/pi-embedded-helpers/provider-error-patterns.test.ts
@@ -50,6 +50,11 @@ describe("matchesProviderContextOverflow", () => {
     // Cohere
     "total tokens exceeds the model's maximum limit of 4096",
 
+    // llama.cpp HTTP server (slot ctx-size overflow)
+    "400 request (66202 tokens) exceeds the available context size (65536 tokens), try increasing it",
+    "request (135000 tokens) exceeds available context size (131072 tokens)",
+    "prompt (8500 tokens) exceeds the available context size (8192 tokens), try increasing it",
+
     // Generic
     "input is too long for model gpt-5.4",
   ])("matches provider-specific overflow: %s", (msg) => {
@@ -113,6 +118,15 @@ describe("isContextOverflowError with provider patterns", () => {
     expect(isContextOverflowError("ollama error: context length exceeded")).toBe(true);
   });
 
+  it("detects llama.cpp slot ctx-size overflow", () => {
+    // Native llama.cpp HTTP server overflow surfaced through openai-completions providers.
+    expect(
+      isContextOverflowError(
+        "400 request (66202 tokens) exceeds the available context size (65536 tokens), try increasing it",
+      ),
+    ).toBe(true);
+  });
+
   it("still detects standard context overflow patterns", () => {
     expect(isContextOverflowError("context length exceeded")).toBe(true);
     expect(isContextOverflowError("prompt is too long: 150000 tokens > 128000 maximum")).toBe(true);
diff --git a/src/agents/pi-embedded-helpers/provider-error-patterns.ts b/src/agents/pi-embedded-helpers/provider-error-patterns.ts
index 63c464a0fa3..96e16d99bd5 100644
--- a/src/agents/pi-embedded-helpers/provider-error-patterns.ts
+++ b/src/agents/pi-embedded-helpers/provider-error-patterns.ts
@@ -35,6 +35,13 @@ export const PROVIDER_CONTEXT_OVERFLOW_PATTERNS: readonly RegExp[] = [
   // Cohere does not currently ship a bundled provider hook.
   /\btotal tokens?.*exceeds? (?:the )?(?:model(?:'s)? )?(?:max|maximum|limit)/i,
 
+  // llama.cpp HTTP server (often used directly or behind an OpenAI-compatible
+  // shim) returns "request (N tokens) exceeds the available context size
+  // (M tokens), try increasing it" when the prompt overshoots a slot's
+  // ctx-size. Wording is from the upstream slot manager and is stable.
+  // Example: "400 request (66202 tokens) exceeds the available context size (65536 tokens), try increasing it"
+  /\b(?:request|prompt) \(\d[\d,]*\s*tokens?\) exceeds (?:the )?available context size\b/i,
+
   // Generic "input too long" pattern that isn't covered by existing checks
   /\binput (?:is )?too long for (?:the )?model\b/i,
 ];
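
Reviewer note (not part of the patch): a minimal standalone sketch for
sanity-checking the new regex outside vitest. The regex literal is copied
verbatim from the hunk above; the scratch-file name, the `npx tsx` invocation,
and the negative samples are illustrative assumptions, not part of this change.

    // check-llamacpp-pattern.ts (hypothetical scratch file; run with e.g. `npx tsx check-llamacpp-pattern.ts`)
    // Regex copied verbatim from PROVIDER_CONTEXT_OVERFLOW_PATTERNS in the hunk above.
    const llamaCppOverflow =
      /\b(?:request|prompt) \(\d[\d,]*\s*tokens?\) exceeds (?:the )?available context size\b/i;

    // Messages shaped like llama.cpp's slot-manager overflow error (should match).
    const overflowSamples = [
      "400 request (66202 tokens) exceeds the available context size (65536 tokens), try increasing it",
      "request (135000 tokens) exceeds available context size (131072 tokens)",
      "prompt (8500 tokens) exceeds the available context size (8192 tokens), try increasing it",
    ];

    // Unrelated 4xx-style errors (illustrative; should NOT match).
    const unrelatedSamples = [
      "400 invalid request: missing required field 'messages'",
      "rate limit exceeded, please retry after 30 seconds",
    ];

    for (const msg of overflowSamples) {
      console.assert(llamaCppOverflow.test(msg), `expected match: ${msg}`);
    }
    for (const msg of unrelatedSamples) {
      console.assert(!llamaCppOverflow.test(msg), `expected no match: ${msg}`);
    }
    console.log("llama.cpp overflow pattern sanity check done");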