feat(memory): configurable local embedding contextSize (default 4096) (#70544)

node-llama-cpp defaults contextSize to "auto", which on large embedding
models like Qwen3-Embedding-8B (trained context 40,960) inflates gateway
VRAM from ~8.8 GB to ~32 GB and causes OOM on single-GPU hosts that share
the gateway with an LLM runtime.

Expose memorySearch.local.contextSize in openclaw.json (number | "auto"),
defaulting to 4096, which comfortably covers typical memory-search chunks
(128–512 tokens) while keeping non-weight VRAM bounded.
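
Example override in openclaw.json for a VRAM-constrained host (sketch only;
the agents.defaults nesting mirrors the schema keys in this change, and the
provider field and 2048 value are illustrative):

```json
{
  "agents": {
    "defaults": {
      "memorySearch": {
        "provider": "local",
        "local": { "contextSize": 2048 }
      }
    }
  }
}
```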

Closes #69667.
Author: aalekh-sarvam
Date: 2026-04-24 02:51:53 +05:30
Committed by: GitHub
Parent: 88b3fa14f0
Commit: d40dd9088e
11 changed files with 97 additions and 6 deletions

View File

@@ -198,10 +198,11 @@ arn:aws:bedrock:*::foundation-model/amazon.titan-embed-text-v2:0
## Local embedding config
| Key | Type | Default | Description |
| --------------------- | ------------------ | ---------------------- | ----------- |
| `local.modelPath` | `string` | auto-downloaded | Path to GGUF model file |
| `local.modelCacheDir` | `string` | node-llama-cpp default | Cache dir for downloaded models |
| `local.contextSize` | `number \| "auto"` | `4096` | Context window size for the embedding context. 4096 covers typical chunks (128–512 tokens) while bounding non-weight VRAM. Lower to 1024–2048 on constrained hosts. `"auto"` uses the model's trained maximum — not recommended for 8B+ models (Qwen3-Embedding-8B: 40 960 tokens → ~32 GB VRAM vs ~8.8 GB at 4096). |
Default model: `embeddinggemma-300m-qat-Q8_0.gguf` (~0.6 GB, auto-downloaded).
Requires native build: `pnpm approve-builds` then `pnpm rebuild node-llama-cpp`.

View File

@@ -39,6 +39,7 @@ export type ResolvedMemorySearchConfig = {
  local: {
    modelPath?: string;
    modelCacheDir?: string;
    contextSize?: number | "auto";
  };
  store: {
    driver: "sqlite";
@@ -195,6 +196,7 @@ function mergeConfig(
  const local = {
    modelPath: overrides?.local?.modelPath ?? defaults?.local?.modelPath,
    modelCacheDir: overrides?.local?.modelCacheDir ?? defaults?.local?.modelCacheDir,
    contextSize: overrides?.local?.contextSize ?? defaults?.local?.contextSize,
  };
  const sources = normalizeSources(overrides?.sources ?? defaults?.sources, sessionMemory);
  const rawPaths = [...(defaults?.extraPaths ?? []), ...(overrides?.extraPaths ?? [])]

View File

@@ -4170,6 +4170,15 @@ export const GENERATED_BASE_CONFIG_SCHEMA: BaseConfigSchemaResponse = {
    modelCacheDir: {
      type: "string",
    },
    contextSize: {
      anyOf: [
        { type: "integer", exclusiveMinimum: 0, maximum: 9007199254740991 },
        { type: "string", const: "auto" },
      ],
      title: "Local Embedding Context Size",
      description:
        'Context window size passed to node-llama-cpp when creating the embedding context (default: 4096). 4096 safely covers typical memory-search chunks (128\u2013512 tokens) while keeping non-weight VRAM bounded. Lower to 1024\u20132048 on resource-constrained hosts. Set to "auto" to let node-llama-cpp use the model\'s trained maximum \u2014 not recommended for large models (e.g. Qwen3-Embedding-8B trained on 40\u202f960 tokens can push VRAM from ~8.8\u202fGB to ~32\u202fGB).',
    },
  },
  additionalProperties: false,
},
@@ -6056,6 +6065,12 @@ export const GENERATED_BASE_CONFIG_SCHEMA: BaseConfigSchemaResponse = {
    modelCacheDir: {
      type: "string",
    },
    contextSize: {
      anyOf: [
        { type: "integer", exclusiveMinimum: 0, maximum: 9007199254740991 },
        { type: "string", const: "auto" },
      ],
    },
  },
  additionalProperties: false,
},
@@ -25150,6 +25165,11 @@ export const GENERATED_BASE_CONFIG_SCHEMA: BaseConfigSchemaResponse = {
help: "Specifies the local embedding model source for local memory search, such as a GGUF file path or `hf:` URI. Use this only when provider is `local`, and verify model compatibility before large index rebuilds.",
tags: ["storage"],
},
"agents.defaults.memorySearch.local.contextSize": {
label: "Local Embedding Context Size",
help: 'Context window size passed to node-llama-cpp when creating the embedding context (default: 4096). 4096 safely covers typical memory-search chunks (128\u2013512 tokens) while keeping non-weight VRAM bounded. Lower to 1024\u20132048 on resource-constrained hosts. Set to "auto" to let node-llama-cpp use the model\'s trained maximum \u2014 not recommended for large models (e.g. Qwen3-Embedding-8B trained on 40\u202f960 tokens can push VRAM from ~8.8\u202fGB to ~32\u202fGB).',
tags: ["advanced"],
},
"agents.defaults.memorySearch.store.path": {
label: "Memory Search Index Path",
help: "Sets where the SQLite memory index is stored on disk for each agent. Keep the default `~/.openclaw/memory/{agentId}.sqlite` unless you need custom storage placement or backup policy alignment.",

View File

@@ -958,6 +958,8 @@ export const FIELD_HELP: Record<string, string> = {
"Sets the maximum wait time for a full embedding batch operation in minutes (default: 60). Increase for very large corpora or slower providers, and lower it to fail fast in automation-heavy flows.",
"agents.defaults.memorySearch.local.modelPath":
"Specifies the local embedding model source for local memory search, such as a GGUF file path or `hf:` URI. Use this only when provider is `local`, and verify model compatibility before large index rebuilds.",
"agents.defaults.memorySearch.local.contextSize":
'Context window size passed to node-llama-cpp when creating the embedding context (default: 4096). 4096 safely covers typical memory-search chunks (128\u2013512 tokens) while keeping non-weight VRAM bounded. Lower to 1024\u20132048 on resource-constrained hosts. Set to "auto" to let node-llama-cpp use the model\'s trained maximum \u2014 not recommended for large models (e.g. Qwen3-Embedding-8B trained on 40\u202f960 tokens can push VRAM from ~8.8\u202fGB to ~32\u202fGB).',
"agents.defaults.memorySearch.fallback":
'Backup provider used when primary embeddings fail: "openai", "gemini", "voyage", "mistral", "bedrock", "lmstudio", "ollama", "local", or "none". Set a real fallback for production reliability; use "none" only if you prefer explicit failures.',
"agents.defaults.memorySearch.store.path":

View File

@@ -399,6 +399,7 @@ export const FIELD_LABELS: Record<string, string> = {
"agents.defaults.memorySearch.outputDimensionality": "Memory Search Output Dimensionality",
"agents.defaults.memorySearch.fallback": "Memory Search Fallback",
"agents.defaults.memorySearch.local.modelPath": "Local Embedding Model Path",
"agents.defaults.memorySearch.local.contextSize": "Local Embedding Context Size",
"agents.defaults.memorySearch.store.path": "Memory Search Index Path",
"agents.defaults.memorySearch.store.vector.enabled": "Memory Search Vector Index",
"agents.defaults.memorySearch.store.vector.extensionPath": "Memory Search Vector Extension Path",

View File

@@ -393,6 +393,12 @@ export type MemorySearchConfig = {
    modelPath?: string;
    /** Optional cache directory for local models. */
    modelCacheDir?: string;
    /**
     * Context window size for the local embedding context (default: 4096).
     * Use `"auto"` to defer to node-llama-cpp, which picks up to the model's
     * trained maximum — not recommended for 8B+ models.
     */
    contextSize?: number | "auto";
  };
  /** Index storage configuration. */
  store?: {

View File

@@ -684,6 +684,7 @@ export const MemorySearchSchema = z
    .object({
      modelPath: z.string().optional(),
      modelCacheDir: z.string().optional(),
      contextSize: z.union([z.number().int().positive(), z.literal("auto")]).optional(),
    })
    .strict()
    .optional(),

View File

@@ -44,6 +44,53 @@ describe("local embedding provider", () => {
    expect(runtime.getEmbeddingFor).toHaveBeenCalledWith("test query");
  });

  it("passes default contextSize (4096) to createEmbeddingContext when not configured", async () => {
    const runtime = mockLocalEmbeddingRuntime();
    const provider = await createLocalEmbeddingProvider({
      config: {} as never,
      provider: "local",
      model: "",
      fallback: "none",
    });
    await provider.embedQuery("context size default test");
    expect(runtime.createEmbeddingContext).toHaveBeenCalledWith({ contextSize: 4096 });
  });

  it("passes configured contextSize to createEmbeddingContext", async () => {
    const runtime = mockLocalEmbeddingRuntime();
    const provider = await createLocalEmbeddingProvider({
      config: {} as never,
      provider: "local",
      model: "",
      fallback: "none",
      local: { contextSize: 2048 },
    });
    await provider.embedQuery("context size custom test");
    expect(runtime.createEmbeddingContext).toHaveBeenCalledWith({ contextSize: 2048 });
  });

  it('passes "auto" contextSize to createEmbeddingContext when explicitly set', async () => {
    const runtime = mockLocalEmbeddingRuntime();
    const provider = await createLocalEmbeddingProvider({
      config: {} as never,
      provider: "local",
      model: "",
      fallback: "none",
      local: { contextSize: "auto" },
    });
    await provider.embedQuery("context size auto test");
    expect(runtime.createEmbeddingContext).toHaveBeenCalledWith({ contextSize: "auto" });
  });

  it("trims explicit local model paths and cache directories", async () => {
    const runtime = mockLocalEmbeddingRuntime(new Float32Array([1, 0]));

View File

@@ -25,6 +25,7 @@ export async function createLocalEmbeddingProvider(
): Promise<EmbeddingProvider> {
  const modelPath = normalizeOptionalString(options.local?.modelPath) || DEFAULT_LOCAL_MODEL;
  const modelCacheDir = normalizeOptionalString(options.local?.modelCacheDir);
  const contextSize: number | "auto" = options.local?.contextSize ?? 4096;

  // Lazy-load node-llama-cpp to keep startup light unless local is enabled.
  const { getLlama, resolveModelFile, LlamaLogLevel } = await importNodeLlamaCpp();
@@ -51,7 +52,7 @@ export async function createLocalEmbeddingProvider(
      embeddingModel = await llama.loadModel({ modelPath: resolved });
    }
    if (!embeddingContext) {
      embeddingContext = await embeddingModel.createEmbeddingContext({ contextSize });
    }
    return embeddingContext;
  } catch (err) {
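
For reference, a minimal standalone sketch of the embedding path this change configures, assuming node-llama-cpp's documented `getLlama` / `loadModel` / `createEmbeddingContext` / `getEmbeddingFor` API; the model path is illustrative:

```ts
import { getLlama } from "node-llama-cpp";

// Sketch only: mirrors what the provider above does, with an explicit
// contextSize instead of node-llama-cpp's "auto" default.
const llama = await getLlama();
const model = await llama.loadModel({
  modelPath: "/models/embeddinggemma-300m-qat-Q8_0.gguf", // illustrative path
});
// 4096 bounds non-weight VRAM; "auto" would use the model's trained maximum.
const context = await model.createEmbeddingContext({ contextSize: 4096 });
const embedding = await context.getEmbeddingFor("memory search query");
console.log(embedding.vector.length); // embedding dimensionality
```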

View File

@@ -38,6 +38,14 @@ export type EmbeddingProviderOptions = {
  local?: {
    modelPath?: string;
    modelCacheDir?: string;
    /**
     * Context size passed to node-llama-cpp `createEmbeddingContext`.
     * Default: 4096, chosen to cover typical memory-search chunks (128–512 tokens)
     * while keeping non-weight VRAM bounded.
     * Set `"auto"` to let node-llama-cpp use the model's trained maximum — not
     * recommended for 8B+ models (e.g. Qwen3-Embedding-8B: up to 40 960 tokens → ~32 GB VRAM).
     */
    contextSize?: number | "auto";
  };
  /** Provider-specific output vector dimensions for supported embedding families. */
  outputDimensionality?: number;

View File

@@ -7,7 +7,9 @@ export type LlamaEmbeddingContext = {
};
export type LlamaModel = {
  createEmbeddingContext: (options?: {
    contextSize?: number | "auto";
  }) => Promise<LlamaEmbeddingContext>;
};
export type Llama = {