fix: support string-only completions for inferrs backends

This commit is contained in:
Peter Steinberger
2026-04-07 15:52:41 +01:00
parent ea9efc0e81
commit 9d4b0d551d
18 changed files with 435 additions and 5 deletions

View File

@@ -26,6 +26,7 @@ Docs: https://docs.openclaw.ai
- Memory/wiki: use compiled digest artifacts as the first-pass wiki index for search/get flows, and resolve claim ids back to owning pages so agents can retrieve knowledge by belief identity instead of only by file path. Thanks @vincentkoc.
- Memory/wiki: add an opt-in `context.includeCompiledDigestPrompt` flag so memory prompt supplements can append a compact compiled wiki snapshot for legacy prompt assembly and context engines that explicitly consume memory prompt sections. Thanks @vincentkoc.
- Plugin SDK/context engines: pass `availableTools` and `citationsMode` into `assemble()`, and expose `buildMemorySystemPromptAddition(...)` so non-legacy context engines can adopt the active memory prompt path without reimplementing it. Thanks @vincentkoc.
- Providers/inferrs: add string-content compatibility for stricter OpenAI-compatible chat backends, document `inferrs` setup with a full config example, and add troubleshooting guidance for local backends that pass direct probes but fail on full agent-runtime prompts.
### Fixes

View File

@@ -1,4 +1,4 @@
838e3c2f798321d47ccafd132b07a94a676ecf01ec128550c85cea9c2cacf0f5 config-baseline.json
531ad785e7877e8d426985df5074b958a09ea61da5557061f8762272ef9e1d46 config-baseline.core.json
af24bd5a2a86e8bb481302211b35c440e82636585c46f57050648c0290b1d4ee config-baseline.json
73bda77ebf7d70609c57f394655332536eb5ff55516a6b7db06243bd4e8e44a5 config-baseline.core.json
d22f4414b79ee03d896e58d875c80523bcc12303cbacb1700261e6ec73945187 config-baseline.channel.json
d32b286c554e8fe7a53b01dde23987fa6eb2140f021297bf029aed5542d721af config-baseline.plugin.json
d42cee3dea4668bdb7daf6ff5e6f87f326fdef56a8c3716d73079b92cab6e7b2 config-baseline.plugin.json

View File

@@ -2349,6 +2349,7 @@ OpenClaw uses the built-in model catalog. Add custom providers via `models.provi
- `models.providers.*.models.*.contextWindow`: native model context window metadata.
- `models.providers.*.models.*.contextTokens`: optional runtime context cap. Use this when you want a smaller effective context budget than the model's native `contextWindow`.
- `models.providers.*.models.*.compat.supportsDeveloperRole`: optional compatibility hint. For `api: "openai-completions"` with a non-empty non-native `baseUrl` (host not `api.openai.com`), OpenClaw forces this to `false` at runtime. Empty/omitted `baseUrl` keeps default OpenAI behavior.
- `models.providers.*.models.*.compat.requiresStringContent`: optional compatibility hint for string-only OpenAI-compatible chat endpoints. When `true`, OpenClaw flattens pure text `messages[].content` arrays into plain strings before sending the request.
- `plugins.entries.amazon-bedrock.config.discovery`: Bedrock auto-discovery settings root.
- `plugins.entries.amazon-bedrock.config.discovery.enabled`: turn implicit discovery on/off.
- `plugins.entries.amazon-bedrock.config.discovery.region`: AWS region for discovery.

View File

@@ -155,9 +155,30 @@ Behavior note for local/proxied `/v1` backends:
- hidden OpenClaw attribution headers (`originator`, `version`, `User-Agent`)
are not injected on these custom proxy URLs
Compatibility notes for stricter OpenAI-compatible backends:
- Some servers accept only string `messages[].content` on Chat Completions, not
structured content-part arrays. Set
`models.providers.<provider>.models[].compat.requiresStringContent: true` for
those endpoints.
- Some smaller or stricter local backends are unstable with OpenClaw's full
agent-runtime prompt shape, especially when tool schemas are included. If the
backend works for tiny direct `/v1/chat/completions` calls but fails on normal
OpenClaw agent turns, try
`models.providers.<provider>.models[].compat.supportsTools: false` first.
- If the backend still fails only on larger OpenClaw runs, the remaining issue
is usually upstream model/server capacity or a backend bug, not OpenClaw's
transport layer.
## Troubleshooting
- Gateway can reach the proxy? `curl http://127.0.0.1:1234/v1/models`.
- LM Studio model unloaded? Reload; cold start is a common “hanging” cause.
- Context errors? Lower `contextWindow` or raise your server limit.
- OpenAI-compatible server returns `messages[].content ... expected a string`?
Add `compat.requiresStringContent: true` on that model entry.
- Direct tiny `/v1/chat/completions` calls work, but `openclaw infer model run`
fails on Gemma or another local model? Disable tool schemas first with
`compat.supportsTools: false`, then retest. If the server still crashes only
on larger OpenClaw prompts, treat it as an upstream server/model limitation.
- Safety: local models skip provider-side filters; keep agents narrow and compaction on to limit prompt injection blast radius.

View File

@@ -59,6 +59,61 @@ Related:
- [/reference/token-use](/reference/token-use)
- [/help/faq#why-am-i-seeing-http-429-ratelimiterror-from-anthropic](/help/faq#why-am-i-seeing-http-429-ratelimiterror-from-anthropic)
## Local OpenAI-compatible backend passes direct probes but agent runs fail
Use this when:
- `curl ... /v1/models` works
- tiny direct `/v1/chat/completions` calls work
- OpenClaw model runs fail only on normal agent turns
```bash
curl http://127.0.0.1:1234/v1/models
curl http://127.0.0.1:1234/v1/chat/completions \
-H 'content-type: application/json' \
-d '{"model":"<id>","messages":[{"role":"user","content":"hi"}],"stream":false}'
openclaw infer model run --model <provider/model> --prompt "hi" --json
openclaw logs --follow
```
Look for:
- direct tiny calls succeed, but OpenClaw runs fail only on larger prompts
- backend errors about `messages[].content` expecting a string
- backend crashes that appear only with larger prompt-token counts or full agent
runtime prompts
Common signatures:
- `messages[...].content: invalid type: sequence, expected a string` → backend
rejects structured Chat Completions content parts. Fix: set
`models.providers.<provider>.models[].compat.requiresStringContent: true`.
- direct tiny requests succeed, but OpenClaw agent runs fail with backend/model
crashes (for example Gemma on some `inferrs` builds) → OpenClaw transport is
likely already correct; the backend is failing on the larger agent-runtime
prompt shape.
- failures shrink after disabling tools but do not disappear → tool schemas were
part of the pressure, but the remaining issue is still upstream model/server
capacity or a backend bug.
Fix options:
1. Set `compat.requiresStringContent: true` for string-only Chat Completions backends.
2. Set `compat.supportsTools: false` for models/backends that cannot handle
OpenClaw's tool schema surface reliably.
3. Lower prompt pressure where possible: smaller workspace bootstrap, shorter
session history, lighter local model, or a backend with stronger long-context
support.
4. If tiny direct requests keep passing while OpenClaw agent turns still crash
inside the backend, treat it as an upstream server/model limitation and file
a repro there with the accepted payload shape.
Related:
- [/gateway/local-models](/gateway/local-models)
- [/gateway/configuration#models](/gateway/configuration#models)
- [/gateway/configuration-reference#openai-compatible-endpoints](/gateway/configuration-reference#openai-compatible-endpoints)
## No replies
If channels are up but nothing answers, check routing and policy before reconnecting anything.

View File

@@ -42,6 +42,21 @@ If you see:
`HTTP 429: rate_limit_error: Extra usage is required for long context requests`,
go to [/gateway/troubleshooting#anthropic-429-extra-usage-required-for-long-context](/gateway/troubleshooting#anthropic-429-extra-usage-required-for-long-context).
## Local OpenAI-compatible backend works directly but fails in OpenClaw
If your local or self-hosted `/v1` backend answers small direct
`/v1/chat/completions` probes but fails on `openclaw infer model run` or normal
agent turns:
1. If the error mentions `messages[].content` expecting a string, set
`models.providers.<provider>.models[].compat.requiresStringContent: true`.
2. If the backend still fails only on OpenClaw agent turns, set
`models.providers.<provider>.models[].compat.supportsTools: false` and retry.
3. If tiny direct calls still work but larger OpenClaw prompts crash the
backend, treat the remaining issue as an upstream model/server limitation and
continue in the deep runbook:
[/gateway/troubleshooting#local-openai-compatible-backend-passes-direct-probes-but-agent-runs-fail](/gateway/troubleshooting#local-openai-compatible-backend-passes-direct-probes-but-agent-runs-fail)
## Plugin install fails with missing openclaw extensions
If install fails with `package.json missing openclaw.extensions`, the plugin package

View File

@@ -42,6 +42,7 @@ Looking for chat channel docs (WhatsApp/Telegram/Discord/Slack/Mattermost (plugi
- [Google (Gemini)](/providers/google)
- [Groq (LPU inference)](/providers/groq)
- [Hugging Face (Inference)](/providers/huggingface)
- [inferrs (local models)](/providers/inferrs)
- [Kilocode](/providers/kilocode)
- [LiteLLM (unified gateway)](/providers/litellm)
- [MiniMax](/providers/minimax)

173
docs/providers/inferrs.md Normal file
View File

@@ -0,0 +1,173 @@
---
summary: "Run OpenClaw through inferrs (OpenAI-compatible local server)"
read_when:
- You want to run OpenClaw against a local inferrs server
- You are serving Gemma or another model through inferrs
- You need the exact OpenClaw compat flags for inferrs
title: "inferrs"
---
# inferrs
[inferrs](https://github.com/ericcurtin/inferrs) can serve local models behind an
OpenAI-compatible `/v1` API. OpenClaw works with `inferrs` through the generic
`openai-completions` path.
`inferrs` is currently best treated as a custom self-hosted OpenAI-compatible
backend, not a dedicated OpenClaw provider plugin.
## Quick start
1. Start `inferrs` with a model.
Example:
```bash
inferrs serve gg-hf-gg/gemma-4-E2B-it \
--host 127.0.0.1 \
--port 8080 \
--device metal
```
2. Verify the server is reachable.
```bash
curl http://127.0.0.1:8080/health
curl http://127.0.0.1:8080/v1/models
```
3. Add an explicit OpenClaw provider entry and point your default model at it.
## Full config example
This example uses Gemma 4 on a local `inferrs` server.
```json5
{
agents: {
defaults: {
model: { primary: "inferrs/gg-hf-gg/gemma-4-E2B-it" },
models: {
"inferrs/gg-hf-gg/gemma-4-E2B-it": {
alias: "Gemma 4 (inferrs)",
},
},
},
},
models: {
mode: "merge",
providers: {
inferrs: {
baseUrl: "http://127.0.0.1:8080/v1",
apiKey: "inferrs-local",
api: "openai-completions",
models: [
{
id: "gg-hf-gg/gemma-4-E2B-it",
name: "Gemma 4 E2B (inferrs)",
reasoning: false,
input: ["text"],
cost: { input: 0, output: 0, cacheRead: 0, cacheWrite: 0 },
contextWindow: 131072,
maxTokens: 4096,
compat: {
requiresStringContent: true,
},
},
],
},
},
},
}
```
## Why `requiresStringContent` matters
Some `inferrs` Chat Completions routes accept only string
`messages[].content`, not structured content-part arrays.
If OpenClaw runs fail with an error like:
```text
messages[1].content: invalid type: sequence, expected a string
```
set:
```json5
compat: {
requiresStringContent: true
}
```
OpenClaw will flatten pure text content parts into plain strings before sending
the request.
## Gemma and tool-schema caveat
Some current `inferrs` + Gemma combinations accept small direct
`/v1/chat/completions` requests but still fail on full OpenClaw agent-runtime
turns.
If that happens, try this first:
```json5
compat: {
requiresStringContent: true,
supportsTools: false
}
```
That disables OpenClaw's tool schema surface for the model and can reduce prompt
pressure on stricter local backends.
If tiny direct requests still work but normal OpenClaw agent turns continue to
crash inside `inferrs`, the remaining issue is usually upstream model/server
behavior rather than OpenClaw's transport layer.
## Manual smoke test
Once configured, test both layers:
```bash
curl http://127.0.0.1:8080/v1/chat/completions \
-H 'content-type: application/json' \
-d '{"model":"gg-hf-gg/gemma-4-E2B-it","messages":[{"role":"user","content":"What is 2 + 2?"}],"stream":false}'
openclaw infer model run \
--model inferrs/gg-hf-gg/gemma-4-E2B-it \
--prompt "What is 2 + 2? Reply with one short sentence." \
--json
```
If the first command works but the second fails, use the troubleshooting notes
below.
## Troubleshooting
- `curl /v1/models` fails: `inferrs` is not running, not reachable, or not
bound to the expected host/port.
- `messages[].content ... expected a string`: set
`compat.requiresStringContent: true`.
- Direct tiny `/v1/chat/completions` calls pass, but `openclaw infer model run`
fails: try `compat.supportsTools: false`.
- OpenClaw no longer gets schema errors, but `inferrs` still crashes on larger
agent turns: treat it as an upstream `inferrs` or model limitation and reduce
prompt pressure or switch local backend/model.
## Proxy-style behavior
`inferrs` is treated as a proxy-style OpenAI-compatible `/v1` backend, not a
native OpenAI endpoint.
- native OpenAI-only request shaping does not apply here
- no `service_tier`, no Responses `store`, no prompt-cache hints, and no
OpenAI reasoning-compat payload shaping
- hidden OpenClaw attribution headers (`originator`, `version`, `User-Agent`)
are not injected on custom `inferrs` base URLs
## See also
- [Local models](/gateway/local-models)
- [Gateway troubleshooting](/gateway/troubleshooting#local-openai-compatible-backend-passes-direct-probes-but-agent-runs-fail)
- [Model providers](/concepts/model-providers)

View File

@@ -0,0 +1,35 @@
/**
 * Collapses an array of pure `{ type: "text", text: string }` content parts
 * into a single newline-joined string, for OpenAI-compatible backends that
 * only accept string `messages[].content`.
 *
 * Returns the input untouched when it is not an array, or when any element
 * is not a well-formed text part — mixed/structured content must survive
 * unchanged so richer backends keep working.
 */
export function flattenStringOnlyCompletionContent(content: unknown): unknown {
  if (!Array.isArray(content)) {
    return content;
  }
  const isTextPart = (part: unknown): part is { type: "text"; text: string } =>
    typeof part === "object" &&
    part !== null &&
    (part as { type?: unknown }).type === "text" &&
    typeof (part as { text?: unknown }).text === "string";
  if (!content.every(isTextPart)) {
    // At least one non-text part: leave the structured array as-is.
    return content;
  }
  return content.map((part) => part.text).join("\n");
}
/**
 * Maps each chat message's `content` through
 * `flattenStringOnlyCompletionContent`, producing a new message object only
 * when flattening actually changed the content. Messages that are not
 * objects, or whose content was left alone, pass through by reference.
 */
export function flattenCompletionMessagesToStringContent(messages: unknown[]): unknown[] {
  return messages.map((message) => {
    if (message === null || typeof message !== "object") {
      return message;
    }
    const originalContent = (message as { content?: unknown }).content;
    const flattened = flattenStringOnlyCompletionContent(originalContent);
    // Preserve referential identity when nothing changed, so callers can
    // cheaply detect no-op flattening.
    return flattened === originalContent ? message : { ...message, content: flattened };
  });
}

View File

@@ -1079,6 +1079,41 @@ describe("openai transport stream", () => {
expect(params.tools?.[0]?.function).not.toHaveProperty("strict");
});
it("flattens pure text content arrays for string-only completions backends when opted in", () => {
const params = buildOpenAICompletionsParams(
{
id: "gg-hf-gg/gemma-4-E2B-it",
name: "Gemma 4 E2B",
api: "openai-completions",
provider: "inferrs",
baseUrl: "http://127.0.0.1:8080/v1",
reasoning: false,
input: ["text"],
cost: { input: 0, output: 0, cacheRead: 0, cacheWrite: 0 },
contextWindow: 131072,
maxTokens: 4096,
compat: {
requiresStringContent: true,
} as Record<string, unknown>,
} satisfies Model<"openai-completions">,
{
systemPrompt: "system",
messages: [
{
role: "user",
content: [{ type: "text", text: "What is 2 + 2?" }],
timestamp: Date.now(),
},
],
tools: [],
} as never,
undefined,
) as { messages?: Array<{ role?: string; content?: unknown }> };
expect(params.messages?.[0]).toMatchObject({ role: "system", content: "system" });
expect(params.messages?.[1]).toMatchObject({ role: "user", content: "What is 2 + 2?" });
});
it("uses max_tokens for Chutes default-route completions providers without relying on baseUrl host sniffing", () => {
const params = buildOpenAICompletionsParams(
{

View File

@@ -23,6 +23,7 @@ import { resolveProviderTransportTurnStateWithPlugin } from "../plugins/provider
import type { ProviderRuntimeModel } from "../plugins/types.js";
import { buildCopilotDynamicHeaders, hasCopilotVisionInput } from "./copilot-dynamic-headers.js";
import { detectOpenAICompletionsCompat } from "./openai-completions-compat.js";
import { flattenCompletionMessagesToStringContent } from "./openai-completions-string-content.js";
import {
applyOpenAIResponsesPayloadPolicy,
resolveOpenAIResponsesPayloadPolicy,
@@ -1164,6 +1165,7 @@ function getCompat(model: OpenAIModeModel): {
openRouterRouting: Record<string, unknown>;
vercelGatewayRouting: Record<string, unknown>;
supportsStrictMode: boolean;
requiresStringContent: boolean;
} {
const detected = detectCompat(model);
const compat = model.compat ?? {};
@@ -1198,6 +1200,7 @@ function getCompat(model: OpenAIModeModel): {
detected.vercelGatewayRouting,
supportsStrictMode:
(compat.supportsStrictMode as boolean | undefined) ?? detected.supportsStrictMode,
requiresStringContent: (compat.requiresStringContent as boolean | undefined) ?? false,
};
}
@@ -1261,9 +1264,12 @@ export function buildOpenAICompletionsParams(
systemPrompt: stripSystemPromptCacheBoundary(context.systemPrompt),
}
: context;
const messages = convertMessages(model as never, completionsContext, compat as never);
const params: Record<string, unknown> = {
model: model.id,
messages: convertMessages(model as never, completionsContext, compat as never),
messages: compat.requiresStringContent
? flattenCompletionMessagesToStringContent(messages)
: messages,
stream: true,
};
if (compat.supportsUsageInStreaming) {

View File

@@ -132,6 +132,7 @@ import {
createOpenAIReasoningCompatibilityWrapper,
createOpenAIResponsesContextManagementWrapper,
createOpenAIServiceTierWrapper,
createOpenAIStringContentWrapper,
createOpenAITextVerbosityWrapper,
resolveOpenAIFastMode,
resolveOpenAIServiceTier,
@@ -170,6 +171,7 @@ function createTestOpenAIProviderWrapper(
config: params.context.config,
agentDir: params.context.agentDir,
});
streamFn = createOpenAIStringContentWrapper(streamFn);
return createOpenAIResponsesContextManagementWrapper(
createOpenAIReasoningCompatibilityWrapper(streamFn),
params.context.extraParams,
@@ -562,6 +564,54 @@ describe("applyExtraParamsToAgent", () => {
expect(payload.parallel_tool_calls).toBe(false);
});
it("flattens pure text OpenAI completions message arrays for string-only compat models", () => {
const payload = runResponsesPayloadMutationCase({
applyProvider: "inferrs",
applyModelId: "gg-hf-gg/gemma-4-E2B-it",
model: {
api: "openai-completions",
provider: "inferrs",
id: "gg-hf-gg/gemma-4-E2B-it",
name: "Gemma 4 E2B (inferrs)",
baseUrl: "http://127.0.0.1:8080/v1",
reasoning: false,
input: ["text"],
cost: { input: 0, output: 0, cacheRead: 0, cacheWrite: 0 },
contextWindow: 131072,
maxTokens: 4096,
compat: {
requiresStringContent: true,
} as Record<string, unknown>,
} as unknown as Model<"openai-completions">,
payload: {
messages: [
{
role: "system",
content: [{ type: "text", text: "System text" }],
},
{
role: "user",
content: [
{ type: "text", text: "Line one" },
{ type: "text", text: "Line two" },
],
},
],
},
});
expect(payload.messages).toEqual([
{
role: "system",
content: "System text",
},
{
role: "user",
content: "Line one\nLine two",
},
]);
});
it("injects parallel_tool_calls for openai-responses payloads when configured", () => {
const payload = runParallelToolCallsPayloadMutationCase({
applyProvider: "openai",

View File

@@ -16,7 +16,10 @@ import {
createSiliconFlowThinkingWrapper,
shouldApplySiliconFlowThinkingOffCompat,
} from "./moonshot-stream-wrappers.js";
import { createOpenAIResponsesContextManagementWrapper } from "./openai-stream-wrappers.js";
import {
createOpenAIResponsesContextManagementWrapper,
createOpenAIStringContentWrapper,
} from "./openai-stream-wrappers.js";
import { resolveCacheRetention } from "./prompt-cache-retention.js";
import { createOpenRouterSystemCacheWrapper } from "./proxy-stream-wrappers.js";
import { streamWithPayloadPatch } from "./stream-payload-utils.js";
@@ -389,6 +392,7 @@ function applyPostPluginStreamWrappers(
ctx: ApplyExtraParamsContext & { providerWrapperHandled: boolean },
): void {
ctx.agent.streamFn = createOpenRouterSystemCacheWrapper(ctx.agent.streamFn);
ctx.agent.streamFn = createOpenAIStringContentWrapper(ctx.agent.streamFn);
if (!ctx.providerWrapperHandled) {
// Guard Google-family payloads against invalid negative thinking budgets

View File

@@ -7,6 +7,7 @@ import {
patchCodexNativeWebSearchPayload,
resolveCodexNativeSearchActivation,
} from "../codex-native-web-search.js";
import { flattenCompletionMessagesToStringContent } from "../openai-completions-string-content.js";
import {
applyOpenAIResponsesPayloadPolicy,
resolveOpenAIResponsesPayloadPolicy,
@@ -66,6 +67,17 @@ function shouldApplyOpenAIReasoningCompatibility(model: {
return resolveOpenAIRequestCapabilities(model).supportsOpenAIReasoningCompatPayload;
}
/**
 * True only for `openai-completions` models that explicitly opt in via
 * `compat.requiresStringContent: true` (strict boolean check — truthy
 * non-boolean values do not enable flattening).
 */
function shouldFlattenOpenAICompletionMessages(model: {
  api?: unknown;
  compat?: unknown;
}): boolean {
  if (model.api !== "openai-completions") {
    return false;
  }
  const compat = model.compat;
  if (!compat || typeof compat !== "object") {
    return false;
  }
  return (compat as { requiresStringContent?: unknown }).requiresStringContent === true;
}
function normalizeOpenAIServiceTier(value: unknown): OpenAIServiceTier | undefined {
if (typeof value !== "string") {
return undefined;
@@ -219,6 +231,21 @@ export function createOpenAIReasoningCompatibilityWrapper(
};
}
/**
 * Wraps a stream function so that, for models opted into
 * `compat.requiresStringContent`, outgoing Chat Completions payloads have
 * pure-text `messages[].content` arrays flattened to plain strings.
 * Models without the opt-in are passed straight through to the underlying
 * stream function.
 */
export function createOpenAIStringContentWrapper(baseStreamFn: StreamFn | undefined): StreamFn {
  const delegate = baseStreamFn ?? streamSimple;
  return (model, context, options) => {
    if (!shouldFlattenOpenAICompletionMessages(model)) {
      return delegate(model, context, options);
    }
    return streamWithPayloadPatch(delegate, model, context, options, (payloadObj) => {
      const { messages } = payloadObj;
      if (Array.isArray(messages)) {
        payloadObj.messages = flattenCompletionMessagesToStringContent(messages);
      }
    });
  };
}
export function createOpenAIFastModeWrapper(baseStreamFn: StreamFn | undefined): StreamFn {
const underlying = baseStreamFn ?? streamSimple;
return (model, context, options) => {

View File

@@ -391,6 +391,7 @@ describe("model compat config schema", () => {
compat: {
supportsUsageInStreaming: true,
supportsStrictMode: false,
requiresStringContent: true,
thinkingFormat: "qwen",
requiresToolResultName: true,
requiresAssistantAfterToolResult: false,

View File

@@ -2807,6 +2807,9 @@ export const GENERATED_BASE_CONFIG_SCHEMA: BaseConfigSchemaResponse = {
supportsStrictMode: {
type: "boolean",
},
requiresStringContent: {
type: "boolean",
},
maxTokensField: {
anyOf: [
{

View File

@@ -37,6 +37,7 @@ type SupportedThinkingFormat =
export type ModelCompatConfig = SupportedOpenAICompatFields & {
thinkingFormat?: SupportedThinkingFormat;
supportsTools?: boolean;
requiresStringContent?: boolean;
toolSchemaProfile?: string;
unsupportedToolSchemaKeywords?: string[];
nativeWebSearchTool?: boolean;

View File

@@ -189,6 +189,7 @@ export const ModelCompatSchema = z
supportsUsageInStreaming: z.boolean().optional(),
supportsTools: z.boolean().optional(),
supportsStrictMode: z.boolean().optional(),
requiresStringContent: z.boolean().optional(),
maxTokensField: z
.union([z.literal("max_completion_tokens"), z.literal("max_tokens")])
.optional(),