From d40dd9088ee2708c15968dcaf5d5ce1e7b9fe6d3 Mon Sep 17 00:00:00 2001
From: aalekh-sarvam
Date: Fri, 24 Apr 2026 02:51:53 +0530
Subject: [PATCH] feat(memory): configurable local embedding contextSize
 (default 4096) (#70544)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

node-llama-cpp defaults contextSize to "auto", which on large embedding models
like Qwen3-Embedding-8B (trained context 40,960) inflates gateway VRAM from
~8.8 GB to ~32 GB and causes OOM on single-GPU hosts that share the gateway
with an LLM runtime.

Expose memorySearch.local.contextSize in openclaw.json (number | "auto"),
defaulting to 4096, which comfortably covers typical memory-search chunks
(128–512 tokens) while keeping non-weight VRAM bounded.

Closes #69667.
---
 docs/reference/memory-config.md              |  9 ++--
 src/agents/memory-search.ts                  |  2 +
 src/config/schema.base.generated.ts          | 20 +++++++++
 src/config/schema.help.ts                    |  2 +
 src/config/schema.labels.ts                  |  1 +
 src/config/types.tools.ts                    |  6 +++
 src/config/zod-schema.agent-runtime.ts       |  1 +
 src/memory-host-sdk/host/embeddings.test.ts  | 47 ++++++++++++++++++++
 src/memory-host-sdk/host/embeddings.ts       |  3 +-
 src/memory-host-sdk/host/embeddings.types.ts |  8 ++++
 src/memory-host-sdk/host/node-llama.ts       |  4 +-
 11 files changed, 97 insertions(+), 6 deletions(-)

diff --git a/docs/reference/memory-config.md b/docs/reference/memory-config.md
index 8f094a5276d..9dd7fbfa56a 100644
--- a/docs/reference/memory-config.md
+++ b/docs/reference/memory-config.md
@@ -198,10 +198,11 @@ arn:aws:bedrock:*::foundation-model/amazon.titan-embed-text-v2:0

 ## Local embedding config

-| Key                   | Type     | Default                | Description                     |
-| --------------------- | -------- | ---------------------- | ------------------------------- |
-| `local.modelPath`     | `string` | auto-downloaded        | Path to GGUF model file         |
-| `local.modelCacheDir` | `string` | node-llama-cpp default | Cache dir for downloaded models |
+| Key                   | Type               | Default                | Description                                                                                                                                                                                                                                                                                                          |
+| --------------------- | ------------------ | ---------------------- | -------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
+| `local.modelPath`     | `string`           | auto-downloaded        | Path to GGUF model file                                                                                                                                                                                                                                                                                                |
+| `local.modelCacheDir` | `string`           | node-llama-cpp default | Cache dir for downloaded models                                                                                                                                                                                                                                                                                        |
+| `local.contextSize`   | `number \| "auto"` | `4096`                 | Context window size for the embedding context. 4096 covers typical chunks (128–512 tokens) while bounding non-weight VRAM. Lower to 1024–2048 on constrained hosts. `"auto"` uses the model's trained maximum — not recommended for 8B+ models (Qwen3-Embedding-8B: 40 960 tokens → ~32 GB VRAM vs ~8.8 GB at 4096). |

 Default model: `embeddinggemma-300m-qat-Q8_0.gguf` (~0.6 GB, auto-downloaded).
 Requires native build: `pnpm approve-builds` then `pnpm rebuild node-llama-cpp`.

diff --git a/src/agents/memory-search.ts b/src/agents/memory-search.ts
index 021e1674e9e..a509d49b63c 100644
--- a/src/agents/memory-search.ts
+++ b/src/agents/memory-search.ts
@@ -39,6 +39,7 @@ export type ResolvedMemorySearchConfig = {
   local: {
     modelPath?: string;
     modelCacheDir?: string;
+    contextSize?: number | "auto";
   };
   store: {
     driver: "sqlite";
@@ -195,6 +196,7 @@ function mergeConfig(
   const local = {
     modelPath: overrides?.local?.modelPath ?? defaults?.local?.modelPath,
    modelCacheDir: overrides?.local?.modelCacheDir ?? defaults?.local?.modelCacheDir,
+    contextSize: overrides?.local?.contextSize ?? defaults?.local?.contextSize,
   };
   const sources = normalizeSources(overrides?.sources ?? defaults?.sources, sessionMemory);
   const rawPaths = [...(defaults?.extraPaths ?? []), ...(overrides?.extraPaths ?? [])]
diff --git a/src/config/schema.base.generated.ts b/src/config/schema.base.generated.ts
index 8e32bf31bc2..2236e57f718 100644
--- a/src/config/schema.base.generated.ts
+++ b/src/config/schema.base.generated.ts
@@ -4170,6 +4170,15 @@ export const GENERATED_BASE_CONFIG_SCHEMA: BaseConfigSchemaResponse = {
             modelCacheDir: {
               type: "string",
             },
+            contextSize: {
+              anyOf: [
+                { type: "integer", exclusiveMinimum: 0, maximum: 9007199254740991 },
+                { type: "string", const: "auto" },
+              ],
+              title: "Local Embedding Context Size",
+              description:
+                'Context window size passed to node-llama-cpp when creating the embedding context (default: 4096). 4096 safely covers typical memory-search chunks (128\u2013512 tokens) while keeping non-weight VRAM bounded. Lower to 1024\u20132048 on resource-constrained hosts. Set to "auto" to let node-llama-cpp use the model\'s trained maximum \u2014 not recommended for large models (e.g. Qwen3-Embedding-8B trained on 40\u202f960 tokens can push VRAM from ~8.8\u202fGB to ~32\u202fGB).',
+            },
           },
           additionalProperties: false,
         },
@@ -6056,6 +6065,12 @@
             modelCacheDir: {
               type: "string",
             },
+            contextSize: {
+              anyOf: [
+                { type: "integer", exclusiveMinimum: 0, maximum: 9007199254740991 },
+                { type: "string", const: "auto" },
+              ],
+            },
           },
           additionalProperties: false,
         },
@@ -25150,6 +25165,11 @@
     help: "Specifies the local embedding model source for local memory search, such as a GGUF file path or `hf:` URI. Use this only when provider is `local`, and verify model compatibility before large index rebuilds.",
     tags: ["storage"],
   },
+  "agents.defaults.memorySearch.local.contextSize": {
+    label: "Local Embedding Context Size",
+    help: 'Context window size passed to node-llama-cpp when creating the embedding context (default: 4096). 4096 safely covers typical memory-search chunks (128\u2013512 tokens) while keeping non-weight VRAM bounded. Lower to 1024\u20132048 on resource-constrained hosts. Set to "auto" to let node-llama-cpp use the model\'s trained maximum \u2014 not recommended for large models (e.g. Qwen3-Embedding-8B trained on 40\u202f960 tokens can push VRAM from ~8.8\u202fGB to ~32\u202fGB).',
+    tags: ["advanced"],
+  },
   "agents.defaults.memorySearch.store.path": {
     label: "Memory Search Index Path",
     help: "Sets where the SQLite memory index is stored on disk for each agent. Keep the default `~/.openclaw/memory/{agentId}.sqlite` unless you need custom storage placement or backup policy alignment.",
diff --git a/src/config/schema.help.ts b/src/config/schema.help.ts
index 98358dceb5f..465265307bd 100644
--- a/src/config/schema.help.ts
+++ b/src/config/schema.help.ts
@@ -958,6 +958,8 @@ export const FIELD_HELP: Record<string, string> = {
     "Sets the maximum wait time for a full embedding batch operation in minutes (default: 60). Increase for very large corpora or slower providers, and lower it to fail fast in automation-heavy flows.",
   "agents.defaults.memorySearch.local.modelPath":
     "Specifies the local embedding model source for local memory search, such as a GGUF file path or `hf:` URI. Use this only when provider is `local`, and verify model compatibility before large index rebuilds.",
+  "agents.defaults.memorySearch.local.contextSize":
+    'Context window size passed to node-llama-cpp when creating the embedding context (default: 4096). 4096 safely covers typical memory-search chunks (128\u2013512 tokens) while keeping non-weight VRAM bounded. Lower to 1024\u20132048 on resource-constrained hosts. Set to "auto" to let node-llama-cpp use the model\'s trained maximum \u2014 not recommended for large models (e.g. Qwen3-Embedding-8B trained on 40\u202f960 tokens can push VRAM from ~8.8\u202fGB to ~32\u202fGB).',
   "agents.defaults.memorySearch.fallback":
     'Backup provider used when primary embeddings fail: "openai", "gemini", "voyage", "mistral", "bedrock", "lmstudio", "ollama", "local", or "none". Set a real fallback for production reliability; use "none" only if you prefer explicit failures.',
   "agents.defaults.memorySearch.store.path":
diff --git a/src/config/schema.labels.ts b/src/config/schema.labels.ts
index 868b615cbf5..3abf869e3d6 100644
--- a/src/config/schema.labels.ts
+++ b/src/config/schema.labels.ts
@@ -399,6 +399,7 @@ export const FIELD_LABELS: Record<string, string> = {
   "agents.defaults.memorySearch.outputDimensionality": "Memory Search Output Dimensionality",
   "agents.defaults.memorySearch.fallback": "Memory Search Fallback",
   "agents.defaults.memorySearch.local.modelPath": "Local Embedding Model Path",
+  "agents.defaults.memorySearch.local.contextSize": "Local Embedding Context Size",
   "agents.defaults.memorySearch.store.path": "Memory Search Index Path",
   "agents.defaults.memorySearch.store.vector.enabled": "Memory Search Vector Index",
   "agents.defaults.memorySearch.store.vector.extensionPath": "Memory Search Vector Extension Path",
diff --git a/src/config/types.tools.ts b/src/config/types.tools.ts
index 66bb686d6b5..2ad59d2557b 100644
--- a/src/config/types.tools.ts
+++ b/src/config/types.tools.ts
@@ -393,6 +393,12 @@ export type MemorySearchConfig = {
     modelPath?: string;
     /** Optional cache directory for local models. */
     modelCacheDir?: string;
+    /**
+     * Context window size for the local embedding context (default: 4096).
+     * Use `"auto"` to defer to node-llama-cpp, which picks up to the model's
+     * trained maximum — not recommended for 8B+ models.
+     */
+    contextSize?: number | "auto";
   };
   /** Index storage configuration.
*/ store?: { diff --git a/src/config/zod-schema.agent-runtime.ts b/src/config/zod-schema.agent-runtime.ts index 5401abeb342..b2037b3bcb5 100644 --- a/src/config/zod-schema.agent-runtime.ts +++ b/src/config/zod-schema.agent-runtime.ts @@ -684,6 +684,7 @@ export const MemorySearchSchema = z .object({ modelPath: z.string().optional(), modelCacheDir: z.string().optional(), + contextSize: z.union([z.number().int().positive(), z.literal("auto")]).optional(), }) .strict() .optional(), diff --git a/src/memory-host-sdk/host/embeddings.test.ts b/src/memory-host-sdk/host/embeddings.test.ts index a6073c413e7..cabb5db93fc 100644 --- a/src/memory-host-sdk/host/embeddings.test.ts +++ b/src/memory-host-sdk/host/embeddings.test.ts @@ -44,6 +44,53 @@ describe("local embedding provider", () => { expect(runtime.getEmbeddingFor).toHaveBeenCalledWith("test query"); }); + it("passes default contextSize (4096) to createEmbeddingContext when not configured", async () => { + const runtime = mockLocalEmbeddingRuntime(); + + const provider = await createLocalEmbeddingProvider({ + config: {} as never, + provider: "local", + model: "", + fallback: "none", + }); + + await provider.embedQuery("context size default test"); + + expect(runtime.createEmbeddingContext).toHaveBeenCalledWith({ contextSize: 4096 }); + }); + + it("passes configured contextSize to createEmbeddingContext", async () => { + const runtime = mockLocalEmbeddingRuntime(); + + const provider = await createLocalEmbeddingProvider({ + config: {} as never, + provider: "local", + model: "", + fallback: "none", + local: { contextSize: 2048 }, + }); + + await provider.embedQuery("context size custom test"); + + expect(runtime.createEmbeddingContext).toHaveBeenCalledWith({ contextSize: 2048 }); + }); + + it('passes "auto" contextSize to createEmbeddingContext when explicitly set', async () => { + const runtime = mockLocalEmbeddingRuntime(); + + const provider = await createLocalEmbeddingProvider({ + config: {} as never, + provider: "local", + model: "", + fallback: "none", + local: { contextSize: "auto" }, + }); + + await provider.embedQuery("context size auto test"); + + expect(runtime.createEmbeddingContext).toHaveBeenCalledWith({ contextSize: "auto" }); + }); + it("trims explicit local model paths and cache directories", async () => { const runtime = mockLocalEmbeddingRuntime(new Float32Array([1, 0])); diff --git a/src/memory-host-sdk/host/embeddings.ts b/src/memory-host-sdk/host/embeddings.ts index dbf98f261af..08d5bd65df0 100644 --- a/src/memory-host-sdk/host/embeddings.ts +++ b/src/memory-host-sdk/host/embeddings.ts @@ -25,6 +25,7 @@ export async function createLocalEmbeddingProvider( ): Promise { const modelPath = normalizeOptionalString(options.local?.modelPath) || DEFAULT_LOCAL_MODEL; const modelCacheDir = normalizeOptionalString(options.local?.modelCacheDir); + const contextSize: number | "auto" = options.local?.contextSize ?? 4096; // Lazy-load node-llama-cpp to keep startup light unless local is enabled. 
  const { getLlama, resolveModelFile, LlamaLogLevel } = await importNodeLlamaCpp();
@@ -51,7 +52,7 @@ export async function createLocalEmbeddingProvider(
       embeddingModel = await llama.loadModel({ modelPath: resolved });
     }
     if (!embeddingContext) {
-      embeddingContext = await embeddingModel.createEmbeddingContext();
+      embeddingContext = await embeddingModel.createEmbeddingContext({ contextSize });
     }
     return embeddingContext;
   } catch (err) {
diff --git a/src/memory-host-sdk/host/embeddings.types.ts b/src/memory-host-sdk/host/embeddings.types.ts
index d83c5305b0a..218e3c91508 100644
--- a/src/memory-host-sdk/host/embeddings.types.ts
+++ b/src/memory-host-sdk/host/embeddings.types.ts
@@ -38,6 +38,14 @@ export type EmbeddingProviderOptions = {
   local?: {
     modelPath?: string;
     modelCacheDir?: string;
+    /**
+     * Context size passed to node-llama-cpp `createEmbeddingContext`.
+     * Default: 4096, chosen to cover typical memory-search chunks (128–512 tokens)
+     * while keeping non-weight VRAM bounded.
+     * Set `"auto"` to let node-llama-cpp use the model's trained maximum — not
+     * recommended for 8B+ models (e.g. Qwen3-Embedding-8B: up to 40 960 tokens → ~32 GB VRAM).
+     */
+    contextSize?: number | "auto";
   };
   /** Provider-specific output vector dimensions for supported embedding families. */
   outputDimensionality?: number;
diff --git a/src/memory-host-sdk/host/node-llama.ts b/src/memory-host-sdk/host/node-llama.ts
index 7b54e3fed2f..8871b65da2e 100644
--- a/src/memory-host-sdk/host/node-llama.ts
+++ b/src/memory-host-sdk/host/node-llama.ts
@@ -7,7 +7,9 @@ export type LlamaEmbeddingContext = {
 };

 export type LlamaModel = {
-  createEmbeddingContext: () => Promise<LlamaEmbeddingContext>;
+  createEmbeddingContext: (options?: {
+    contextSize?: number | "auto";
+  }) => Promise<LlamaEmbeddingContext>;
 };

 export type Llama = {
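
For reviewers who want to try this locally, a minimal openclaw.json fragment
exercising the new key is sketched below. The
"agents.defaults.memorySearch.local.contextSize" path matches the field
registered in schema.labels.ts above; the surrounding "provider" key is assumed
from the existing memorySearch config and is shown only for orientation, so
treat the exact shape as illustrative rather than normative:

    {
      "agents": {
        "defaults": {
          "memorySearch": {
            "provider": "local",
            "local": {
              "contextSize": 2048
            }
          }
        }
      }
    }

With "contextSize": 2048 the embedding context is half the new 4096 default,
which should shrink non-weight VRAM further on constrained hosts; setting
"contextSize": "auto" restores the pre-patch behavior of deferring to the
model's trained maximum.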