From d40dd9088ee2708c15968dcaf5d5ce1e7b9fe6d3 Mon Sep 17 00:00:00 2001
From: aalekh-sarvam
Date: Fri, 24 Apr 2026 02:51:53 +0530
Subject: [PATCH] feat(memory): configurable local embedding contextSize
 (default 4096) (#70544)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

node-llama-cpp defaults contextSize to "auto", which on large embedding models
like Qwen3-Embedding-8B (trained context 40,960) inflates gateway VRAM from
~8.8 GB to ~32 GB and causes OOM on single-GPU hosts that share the gateway
with an LLM runtime.

Expose memorySearch.local.contextSize in openclaw.json (number | "auto"),
defaulting to 4096, which comfortably covers typical memory-search chunks
(128–512 tokens) while keeping non-weight VRAM bounded.

Closes #69667.
---
 docs/reference/memory-config.md              |  9 ++--
 src/agents/memory-search.ts                  |  2 +
 src/config/schema.base.generated.ts          | 20 +++++++++
 src/config/schema.help.ts                    |  2 +
 src/config/schema.labels.ts                  |  1 +
 src/config/types.tools.ts                    |  6 +++
 src/config/zod-schema.agent-runtime.ts       |  1 +
 src/memory-host-sdk/host/embeddings.test.ts  | 47 ++++++++++++++++++++
 src/memory-host-sdk/host/embeddings.ts       |  3 +-
 src/memory-host-sdk/host/embeddings.types.ts |  8 ++++
 src/memory-host-sdk/host/node-llama.ts       |  4 +-
 11 files changed, 97 insertions(+), 6 deletions(-)

diff --git a/docs/reference/memory-config.md b/docs/reference/memory-config.md
index 8f094a5276d..9dd7fbfa56a 100644
--- a/docs/reference/memory-config.md
+++ b/docs/reference/memory-config.md
@@ -198,10 +198,11 @@ arn:aws:bedrock:*::foundation-model/amazon.titan-embed-text-v2:0

 ## Local embedding config

-| Key                   | Type     | Default                | Description                     |
-| --------------------- | -------- | ---------------------- | ------------------------------- |
-| `local.modelPath`     | `string` | auto-downloaded        | Path to GGUF model file         |
-| `local.modelCacheDir` | `string` | node-llama-cpp default | Cache dir for downloaded models |
+| Key                   | Type               | Default                | Description                                                                                                                                                                                                                                                                                                          |
+| --------------------- | ------------------ | ---------------------- | -------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
+| `local.modelPath`     | `string`           | auto-downloaded        | Path to GGUF model file                                                                                                                                                                                                                                                                                                |
+| `local.modelCacheDir` | `string`           | node-llama-cpp default | Cache dir for downloaded models                                                                                                                                                                                                                                                                                        |
+| `local.contextSize`   | `number \| "auto"` | `4096`                 | Context window size for the embedding context. 4096 covers typical chunks (128–512 tokens) while bounding non-weight VRAM. Lower to 1024–2048 on constrained hosts. `"auto"` uses the model's trained maximum — not recommended for 8B+ models (Qwen3-Embedding-8B: 40 960 tokens → ~32 GB VRAM vs ~8.8 GB at 4096). |

 Default model: `embeddinggemma-300m-qat-Q8_0.gguf` (~0.6 GB, auto-downloaded).
 Requires native build: `pnpm approve-builds` then `pnpm rebuild node-llama-cpp`.

diff --git a/src/agents/memory-search.ts b/src/agents/memory-search.ts
index 021e1674e9e..a509d49b63c 100644
--- a/src/agents/memory-search.ts
+++ b/src/agents/memory-search.ts
@@ -39,6 +39,7 @@ export type ResolvedMemorySearchConfig = {
   local: {
     modelPath?: string;
     modelCacheDir?: string;
+    contextSize?: number | "auto";
   };
   store: {
     driver: "sqlite";
@@ -195,6 +196,7 @@ function mergeConfig(
   const local = {
     modelPath: overrides?.local?.modelPath ?? defaults?.local?.modelPath,
    modelCacheDir: overrides?.local?.modelCacheDir ?? defaults?.local?.modelCacheDir,
+    contextSize: overrides?.local?.contextSize ?? defaults?.local?.contextSize,
   };
   const sources = normalizeSources(overrides?.sources ?? defaults?.sources, sessionMemory);
   const rawPaths = [...(defaults?.extraPaths ?? []), ...(overrides?.extraPaths ?? [])]
diff --git a/src/config/schema.base.generated.ts b/src/config/schema.base.generated.ts
index 8e32bf31bc2..2236e57f718 100644
--- a/src/config/schema.base.generated.ts
+++ b/src/config/schema.base.generated.ts
@@ -4170,6 +4170,15 @@ export const GENERATED_BASE_CONFIG_SCHEMA: BaseConfigSchemaResponse = {
             modelCacheDir: {
               type: "string",
             },
+            contextSize: {
+              anyOf: [
+                { type: "integer", exclusiveMinimum: 0, maximum: 9007199254740991 },
+                { type: "string", const: "auto" },
+              ],
+              title: "Local Embedding Context Size",
+              description:
+                'Context window size passed to node-llama-cpp when creating the embedding context (default: 4096). 4096 safely covers typical memory-search chunks (128\u2013512 tokens) while keeping non-weight VRAM bounded. Lower to 1024\u20132048 on resource-constrained hosts. Set to "auto" to let node-llama-cpp use the model\'s trained maximum \u2014 not recommended for large models (e.g. Qwen3-Embedding-8B trained on 40\u202f960 tokens can push VRAM from ~8.8\u202fGB to ~32\u202fGB).',
+            },
           },
           additionalProperties: false,
         },
@@ -6056,6 +6065,12 @@
             modelCacheDir: {
               type: "string",
             },
+            contextSize: {
+              anyOf: [
+                { type: "integer", exclusiveMinimum: 0, maximum: 9007199254740991 },
+                { type: "string", const: "auto" },
+              ],
+            },
           },
           additionalProperties: false,
         },
@@ -25150,6 +25165,11 @@
     help: "Specifies the local embedding model source for local memory search, such as a GGUF file path or `hf:` URI. Use this only when provider is `local`, and verify model compatibility before large index rebuilds.",
     tags: ["storage"],
   },
+  "agents.defaults.memorySearch.local.contextSize": {
+    label: "Local Embedding Context Size",
+    help: 'Context window size passed to node-llama-cpp when creating the embedding context (default: 4096). 4096 safely covers typical memory-search chunks (128\u2013512 tokens) while keeping non-weight VRAM bounded. Lower to 1024\u20132048 on resource-constrained hosts. Set to "auto" to let node-llama-cpp use the model\'s trained maximum \u2014 not recommended for large models (e.g. Qwen3-Embedding-8B trained on 40\u202f960 tokens can push VRAM from ~8.8\u202fGB to ~32\u202fGB).',
+    tags: ["advanced"],
+  },
   "agents.defaults.memorySearch.store.path": {
     label: "Memory Search Index Path",
     help: "Sets where the SQLite memory index is stored on disk for each agent. Keep the default `~/.openclaw/memory/{agentId}.sqlite` unless you need custom storage placement or backup policy alignment.",
diff --git a/src/config/schema.help.ts b/src/config/schema.help.ts
index 98358dceb5f..465265307bd 100644
--- a/src/config/schema.help.ts
+++ b/src/config/schema.help.ts
@@ -958,6 +958,8 @@ export const FIELD_HELP: Record<string, string> = {
     "Sets the maximum wait time for a full embedding batch operation in minutes (default: 60). Increase for very large corpora or slower providers, and lower it to fail fast in automation-heavy flows.",
   "agents.defaults.memorySearch.local.modelPath":
     "Specifies the local embedding model source for local memory search, such as a GGUF file path or `hf:` URI. Use this only when provider is `local`, and verify model compatibility before large index rebuilds.",
+  "agents.defaults.memorySearch.local.contextSize":
+    'Context window size passed to node-llama-cpp when creating the embedding context (default: 4096). 4096 safely covers typical memory-search chunks (128\u2013512 tokens) while keeping non-weight VRAM bounded. Lower to 1024\u20132048 on resource-constrained hosts. Set to "auto" to let node-llama-cpp use the model\'s trained maximum \u2014 not recommended for large models (e.g. Qwen3-Embedding-8B trained on 40\u202f960 tokens can push VRAM from ~8.8\u202fGB to ~32\u202fGB).',
   "agents.defaults.memorySearch.fallback":
     'Backup provider used when primary embeddings fail: "openai", "gemini", "voyage", "mistral", "bedrock", "lmstudio", "ollama", "local", or "none". Set a real fallback for production reliability; use "none" only if you prefer explicit failures.',
   "agents.defaults.memorySearch.store.path":
diff --git a/src/config/schema.labels.ts b/src/config/schema.labels.ts
index 868b615cbf5..3abf869e3d6 100644
--- a/src/config/schema.labels.ts
+++ b/src/config/schema.labels.ts
@@ -399,6 +399,7 @@ export const FIELD_LABELS: Record<string, string> = {
   "agents.defaults.memorySearch.outputDimensionality": "Memory Search Output Dimensionality",
   "agents.defaults.memorySearch.fallback": "Memory Search Fallback",
   "agents.defaults.memorySearch.local.modelPath": "Local Embedding Model Path",
+  "agents.defaults.memorySearch.local.contextSize": "Local Embedding Context Size",
   "agents.defaults.memorySearch.store.path": "Memory Search Index Path",
   "agents.defaults.memorySearch.store.vector.enabled": "Memory Search Vector Index",
   "agents.defaults.memorySearch.store.vector.extensionPath": "Memory Search Vector Extension Path",
diff --git a/src/config/types.tools.ts b/src/config/types.tools.ts
index 66bb686d6b5..2ad59d2557b 100644
--- a/src/config/types.tools.ts
+++ b/src/config/types.tools.ts
@@ -393,6 +393,12 @@ export type MemorySearchConfig = {
     modelPath?: string;
     /** Optional cache directory for local models. */
     modelCacheDir?: string;
+    /**
+     * Context window size for the local embedding context (default: 4096).
+     * Use `"auto"` to defer to node-llama-cpp, which picks up to the model's
+     * trained maximum — not recommended for 8B+ models.
+     */
+    contextSize?: number | "auto";
   };
   /** Index storage configuration.
*/ store?: { diff --git a/src/config/zod-schema.agent-runtime.ts b/src/config/zod-schema.agent-runtime.ts index 5401abeb342..b2037b3bcb5 100644 --- a/src/config/zod-schema.agent-runtime.ts +++ b/src/config/zod-schema.agent-runtime.ts @@ -684,6 +684,7 @@ export const MemorySearchSchema = z .object({ modelPath: z.string().optional(), modelCacheDir: z.string().optional(), + contextSize: z.union([z.number().int().positive(), z.literal("auto")]).optional(), }) .strict() .optional(), diff --git a/src/memory-host-sdk/host/embeddings.test.ts b/src/memory-host-sdk/host/embeddings.test.ts index a6073c413e7..cabb5db93fc 100644 --- a/src/memory-host-sdk/host/embeddings.test.ts +++ b/src/memory-host-sdk/host/embeddings.test.ts @@ -44,6 +44,53 @@ describe("local embedding provider", () => { expect(runtime.getEmbeddingFor).toHaveBeenCalledWith("test query"); }); + it("passes default contextSize (4096) to createEmbeddingContext when not configured", async () => { + const runtime = mockLocalEmbeddingRuntime(); + + const provider = await createLocalEmbeddingProvider({ + config: {} as never, + provider: "local", + model: "", + fallback: "none", + }); + + await provider.embedQuery("context size default test"); + + expect(runtime.createEmbeddingContext).toHaveBeenCalledWith({ contextSize: 4096 }); + }); + + it("passes configured contextSize to createEmbeddingContext", async () => { + const runtime = mockLocalEmbeddingRuntime(); + + const provider = await createLocalEmbeddingProvider({ + config: {} as never, + provider: "local", + model: "", + fallback: "none", + local: { contextSize: 2048 }, + }); + + await provider.embedQuery("context size custom test"); + + expect(runtime.createEmbeddingContext).toHaveBeenCalledWith({ contextSize: 2048 }); + }); + + it('passes "auto" contextSize to createEmbeddingContext when explicitly set', async () => { + const runtime = mockLocalEmbeddingRuntime(); + + const provider = await createLocalEmbeddingProvider({ + config: {} as never, + provider: "local", + model: "", + fallback: "none", + local: { contextSize: "auto" }, + }); + + await provider.embedQuery("context size auto test"); + + expect(runtime.createEmbeddingContext).toHaveBeenCalledWith({ contextSize: "auto" }); + }); + it("trims explicit local model paths and cache directories", async () => { const runtime = mockLocalEmbeddingRuntime(new Float32Array([1, 0])); diff --git a/src/memory-host-sdk/host/embeddings.ts b/src/memory-host-sdk/host/embeddings.ts index dbf98f261af..08d5bd65df0 100644 --- a/src/memory-host-sdk/host/embeddings.ts +++ b/src/memory-host-sdk/host/embeddings.ts @@ -25,6 +25,7 @@ export async function createLocalEmbeddingProvider( ): Promise { const modelPath = normalizeOptionalString(options.local?.modelPath) || DEFAULT_LOCAL_MODEL; const modelCacheDir = normalizeOptionalString(options.local?.modelCacheDir); + const contextSize: number | "auto" = options.local?.contextSize ?? 4096; // Lazy-load node-llama-cpp to keep startup light unless local is enabled. 
  const { getLlama, resolveModelFile, LlamaLogLevel } = await importNodeLlamaCpp();
@@ -51,7 +52,7 @@ export async function createLocalEmbeddingProvider(
       embeddingModel = await llama.loadModel({ modelPath: resolved });
     }
     if (!embeddingContext) {
-      embeddingContext = await embeddingModel.createEmbeddingContext();
+      embeddingContext = await embeddingModel.createEmbeddingContext({ contextSize });
     }
     return embeddingContext;
   } catch (err) {
diff --git a/src/memory-host-sdk/host/embeddings.types.ts b/src/memory-host-sdk/host/embeddings.types.ts
index d83c5305b0a..218e3c91508 100644
--- a/src/memory-host-sdk/host/embeddings.types.ts
+++ b/src/memory-host-sdk/host/embeddings.types.ts
@@ -38,6 +38,14 @@ export type EmbeddingProviderOptions = {
   local?: {
     modelPath?: string;
     modelCacheDir?: string;
+    /**
+     * Context size passed to node-llama-cpp `createEmbeddingContext`.
+     * Default: 4096, chosen to cover typical memory-search chunks (128–512 tokens)
+     * while keeping non-weight VRAM bounded.
+     * Set `"auto"` to let node-llama-cpp use the model's trained maximum — not
+     * recommended for 8B+ models (e.g. Qwen3-Embedding-8B: up to 40 960 tokens → ~32 GB VRAM).
+     */
+    contextSize?: number | "auto";
   };
   /** Provider-specific output vector dimensions for supported embedding families. */
   outputDimensionality?: number;
diff --git a/src/memory-host-sdk/host/node-llama.ts b/src/memory-host-sdk/host/node-llama.ts
index 7b54e3fed2f..8871b65da2e 100644
--- a/src/memory-host-sdk/host/node-llama.ts
+++ b/src/memory-host-sdk/host/node-llama.ts
@@ -7,7 +7,9 @@ export type LlamaEmbeddingContext = {
 };

 export type LlamaModel = {
-  createEmbeddingContext: () => Promise<LlamaEmbeddingContext>;
+  createEmbeddingContext: (options?: {
+    contextSize?: number | "auto";
+  }) => Promise<LlamaEmbeddingContext>;
 };

 export type Llama = {
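
For reviewers who want to try this locally, a minimal openclaw.json fragment
exercising the new key is sketched below. The
"agents.defaults.memorySearch.local.contextSize" path matches the field
registered in schema.labels.ts above; the surrounding "provider" key is assumed
from the existing memorySearch config and is shown only for orientation, so
treat the exact shape as illustrative rather than normative:

    {
      "agents": {
        "defaults": {
          "memorySearch": {
            "provider": "local",
            "local": {
              "contextSize": 2048
            }
          }
        }
      }
    }

With "contextSize": 2048 the embedding context is half the new 4096 default,
which should shrink non-weight VRAM further on constrained hosts; setting
"contextSize": "auto" restores the pre-patch behavior of deferring to the
model's trained maximum.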