feat(memory): configurable local embedding contextSize (default 4096) (#70544)

node-llama-cpp defaults contextSize to "auto", which on large embedding
models like Qwen3-Embedding-8B (trained context 40,960) inflates gateway
VRAM from ~8.8 GB to ~32 GB and causes OOM on single-GPU hosts that share
the gateway with an LLM runtime.

Expose memorySearch.local.contextSize in openclaw.json (number | "auto"),
defaulting to 4096, which comfortably covers typical memory-search chunks
(128–512 tokens) while keeping non-weight VRAM bounded.
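
Example override in openclaw.json for a VRAM-constrained host (sketch only;
the agents.defaults nesting mirrors the schema keys in this change, and the
provider field and 2048 value are illustrative):

```json
{
  "agents": {
    "defaults": {
      "memorySearch": {
        "provider": "local",
        "local": { "contextSize": 2048 }
      }
    }
  }
}
```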

Closes #69667.
Author: aalekh-sarvam
Date: 2026-04-24 02:51:53 +05:30
Committed by: GitHub
Parent: 88b3fa14f0
Commit: d40dd9088e
11 changed files with 97 additions and 6 deletions

View File

@@ -198,10 +198,11 @@ arn:aws:bedrock:*::foundation-model/amazon.titan-embed-text-v2:0
## Local embedding config
| Key | Type | Default | Description |
| --------------------- | ------------------ | ---------------------- | ----------- |
| `local.modelPath` | `string` | auto-downloaded | Path to GGUF model file |
| `local.modelCacheDir` | `string` | node-llama-cpp default | Cache dir for downloaded models |
| `local.contextSize` | `number \| "auto"` | `4096` | Context window size for the embedding context. 4096 covers typical chunks (128–512 tokens) while bounding non-weight VRAM. Lower to 1024–2048 on constrained hosts. `"auto"` uses the model's trained maximum — not recommended for 8B+ models (Qwen3-Embedding-8B: 40 960 tokens → ~32 GB VRAM vs ~8.8 GB at 4096). |
Default model: `embeddinggemma-300m-qat-Q8_0.gguf` (~0.6 GB, auto-downloaded).
Requires native build: `pnpm approve-builds` then `pnpm rebuild node-llama-cpp`.

View File

@@ -39,6 +39,7 @@ export type ResolvedMemorySearchConfig = {
  local: {
    modelPath?: string;
    modelCacheDir?: string;
    contextSize?: number | "auto";
  };
  store: {
    driver: "sqlite";
@@ -195,6 +196,7 @@ function mergeConfig(
  const local = {
    modelPath: overrides?.local?.modelPath ?? defaults?.local?.modelPath,
    modelCacheDir: overrides?.local?.modelCacheDir ?? defaults?.local?.modelCacheDir,
    contextSize: overrides?.local?.contextSize ?? defaults?.local?.contextSize,
  };
  const sources = normalizeSources(overrides?.sources ?? defaults?.sources, sessionMemory);
  const rawPaths = [...(defaults?.extraPaths ?? []), ...(overrides?.extraPaths ?? [])]

View File

@@ -4170,6 +4170,15 @@ export const GENERATED_BASE_CONFIG_SCHEMA: BaseConfigSchemaResponse = {
    modelCacheDir: {
      type: "string",
    },
    contextSize: {
      anyOf: [
        { type: "integer", exclusiveMinimum: 0, maximum: 9007199254740991 },
        { type: "string", const: "auto" },
      ],
      title: "Local Embedding Context Size",
      description:
        'Context window size passed to node-llama-cpp when creating the embedding context (default: 4096). 4096 safely covers typical memory-search chunks (128\u2013512 tokens) while keeping non-weight VRAM bounded. Lower to 1024\u20132048 on resource-constrained hosts. Set to "auto" to let node-llama-cpp use the model\'s trained maximum \u2014 not recommended for large models (e.g. Qwen3-Embedding-8B trained on 40\u202f960 tokens can push VRAM from ~8.8\u202fGB to ~32\u202fGB).',
    },
  },
  additionalProperties: false,
},
@@ -6056,6 +6065,12 @@ export const GENERATED_BASE_CONFIG_SCHEMA: BaseConfigSchemaResponse = {
    modelCacheDir: {
      type: "string",
    },
    contextSize: {
      anyOf: [
        { type: "integer", exclusiveMinimum: 0, maximum: 9007199254740991 },
        { type: "string", const: "auto" },
      ],
    },
  },
  additionalProperties: false,
},
@@ -25150,6 +25165,11 @@ export const GENERATED_BASE_CONFIG_SCHEMA: BaseConfigSchemaResponse = {
help: "Specifies the local embedding model source for local memory search, such as a GGUF file path or `hf:` URI. Use this only when provider is `local`, and verify model compatibility before large index rebuilds.",
tags: ["storage"],
},
"agents.defaults.memorySearch.local.contextSize": {
label: "Local Embedding Context Size",
help: 'Context window size passed to node-llama-cpp when creating the embedding context (default: 4096). 4096 safely covers typical memory-search chunks (128\u2013512 tokens) while keeping non-weight VRAM bounded. Lower to 1024\u20132048 on resource-constrained hosts. Set to "auto" to let node-llama-cpp use the model\'s trained maximum \u2014 not recommended for large models (e.g. Qwen3-Embedding-8B trained on 40\u202f960 tokens can push VRAM from ~8.8\u202fGB to ~32\u202fGB).',
tags: ["advanced"],
},
"agents.defaults.memorySearch.store.path": {
label: "Memory Search Index Path",
help: "Sets where the SQLite memory index is stored on disk for each agent. Keep the default `~/.openclaw/memory/{agentId}.sqlite` unless you need custom storage placement or backup policy alignment.",

View File

@@ -958,6 +958,8 @@ export const FIELD_HELP: Record<string, string> = {
"Sets the maximum wait time for a full embedding batch operation in minutes (default: 60). Increase for very large corpora or slower providers, and lower it to fail fast in automation-heavy flows.",
"agents.defaults.memorySearch.local.modelPath":
"Specifies the local embedding model source for local memory search, such as a GGUF file path or `hf:` URI. Use this only when provider is `local`, and verify model compatibility before large index rebuilds.",
"agents.defaults.memorySearch.local.contextSize":
'Context window size passed to node-llama-cpp when creating the embedding context (default: 4096). 4096 safely covers typical memory-search chunks (128\u2013512 tokens) while keeping non-weight VRAM bounded. Lower to 1024\u20132048 on resource-constrained hosts. Set to "auto" to let node-llama-cpp use the model\'s trained maximum \u2014 not recommended for large models (e.g. Qwen3-Embedding-8B trained on 40\u202f960 tokens can push VRAM from ~8.8\u202fGB to ~32\u202fGB).',
"agents.defaults.memorySearch.fallback":
'Backup provider used when primary embeddings fail: "openai", "gemini", "voyage", "mistral", "bedrock", "lmstudio", "ollama", "local", or "none". Set a real fallback for production reliability; use "none" only if you prefer explicit failures.',
"agents.defaults.memorySearch.store.path":

View File

@@ -399,6 +399,7 @@ export const FIELD_LABELS: Record<string, string> = {
"agents.defaults.memorySearch.outputDimensionality": "Memory Search Output Dimensionality",
"agents.defaults.memorySearch.fallback": "Memory Search Fallback",
"agents.defaults.memorySearch.local.modelPath": "Local Embedding Model Path",
"agents.defaults.memorySearch.local.contextSize": "Local Embedding Context Size",
"agents.defaults.memorySearch.store.path": "Memory Search Index Path",
"agents.defaults.memorySearch.store.vector.enabled": "Memory Search Vector Index",
"agents.defaults.memorySearch.store.vector.extensionPath": "Memory Search Vector Extension Path",

View File

@@ -393,6 +393,12 @@ export type MemorySearchConfig = {
    modelPath?: string;
    /** Optional cache directory for local models. */
    modelCacheDir?: string;
    /**
     * Context window size for the local embedding context (default: 4096).
     * Use `"auto"` to defer to node-llama-cpp, which picks up to the model's
     * trained maximum — not recommended for 8B+ models.
     */
    contextSize?: number | "auto";
  };
  /** Index storage configuration. */
  store?: {

View File

@@ -684,6 +684,7 @@ export const MemorySearchSchema = z
    .object({
      modelPath: z.string().optional(),
      modelCacheDir: z.string().optional(),
      contextSize: z.union([z.number().int().positive(), z.literal("auto")]).optional(),
    })
    .strict()
    .optional(),

View File

@@ -44,6 +44,53 @@ describe("local embedding provider", () => {
    expect(runtime.getEmbeddingFor).toHaveBeenCalledWith("test query");
  });

  it("passes default contextSize (4096) to createEmbeddingContext when not configured", async () => {
    const runtime = mockLocalEmbeddingRuntime();
    const provider = await createLocalEmbeddingProvider({
      config: {} as never,
      provider: "local",
      model: "",
      fallback: "none",
    });
    await provider.embedQuery("context size default test");
    expect(runtime.createEmbeddingContext).toHaveBeenCalledWith({ contextSize: 4096 });
  });

  it("passes configured contextSize to createEmbeddingContext", async () => {
    const runtime = mockLocalEmbeddingRuntime();
    const provider = await createLocalEmbeddingProvider({
      config: {} as never,
      provider: "local",
      model: "",
      fallback: "none",
      local: { contextSize: 2048 },
    });
    await provider.embedQuery("context size custom test");
    expect(runtime.createEmbeddingContext).toHaveBeenCalledWith({ contextSize: 2048 });
  });

  it('passes "auto" contextSize to createEmbeddingContext when explicitly set', async () => {
    const runtime = mockLocalEmbeddingRuntime();
    const provider = await createLocalEmbeddingProvider({
      config: {} as never,
      provider: "local",
      model: "",
      fallback: "none",
      local: { contextSize: "auto" },
    });
    await provider.embedQuery("context size auto test");
    expect(runtime.createEmbeddingContext).toHaveBeenCalledWith({ contextSize: "auto" });
  });

  it("trims explicit local model paths and cache directories", async () => {
    const runtime = mockLocalEmbeddingRuntime(new Float32Array([1, 0]));

View File

@@ -25,6 +25,7 @@ export async function createLocalEmbeddingProvider(
): Promise<EmbeddingProvider> {
  const modelPath = normalizeOptionalString(options.local?.modelPath) || DEFAULT_LOCAL_MODEL;
  const modelCacheDir = normalizeOptionalString(options.local?.modelCacheDir);
  const contextSize: number | "auto" = options.local?.contextSize ?? 4096;

  // Lazy-load node-llama-cpp to keep startup light unless local is enabled.
  const { getLlama, resolveModelFile, LlamaLogLevel } = await importNodeLlamaCpp();
@@ -51,7 +52,7 @@ export async function createLocalEmbeddingProvider(
      embeddingModel = await llama.loadModel({ modelPath: resolved });
    }
    if (!embeddingContext) {
      embeddingContext = await embeddingModel.createEmbeddingContext({ contextSize });
    }
    return embeddingContext;
  } catch (err) {
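
For reference, a minimal standalone sketch of the embedding path this change configures, assuming node-llama-cpp's documented `getLlama` / `loadModel` / `createEmbeddingContext` / `getEmbeddingFor` API; the model path is illustrative:

```ts
import { getLlama } from "node-llama-cpp";

// Sketch only: mirrors what the provider above does, with an explicit
// contextSize instead of node-llama-cpp's "auto" default.
const llama = await getLlama();
const model = await llama.loadModel({
  modelPath: "/models/embeddinggemma-300m-qat-Q8_0.gguf", // illustrative path
});
// 4096 bounds non-weight VRAM; "auto" would use the model's trained maximum.
const context = await model.createEmbeddingContext({ contextSize: 4096 });
const embedding = await context.getEmbeddingFor("memory search query");
console.log(embedding.vector.length); // embedding dimensionality
```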

View File

@@ -38,6 +38,14 @@ export type EmbeddingProviderOptions = {
  local?: {
    modelPath?: string;
    modelCacheDir?: string;
    /**
     * Context size passed to node-llama-cpp `createEmbeddingContext`.
     * Default: 4096, chosen to cover typical memory-search chunks (128–512 tokens)
     * while keeping non-weight VRAM bounded.
     * Set `"auto"` to let node-llama-cpp use the model's trained maximum — not
     * recommended for 8B+ models (e.g. Qwen3-Embedding-8B: up to 40 960 tokens → ~32 GB VRAM).
     */
    contextSize?: number | "auto";
  };
  /** Provider-specific output vector dimensions for supported embedding families. */
  outputDimensionality?: number;

View File

@@ -7,7 +7,9 @@ export type LlamaEmbeddingContext = {
};
export type LlamaModel = {
  createEmbeddingContext: (options?: {
    contextSize?: number | "auto";
  }) => Promise<LlamaEmbeddingContext>;
};
export type Llama = {