mirror of
https://github.com/openclaw/openclaw.git
synced 2026-05-06 07:40:44 +00:00
feat(memory): configurable local embedding contextSize (default 4096) (#70544)
node-llama-cpp defaults contextSize to "auto", which on large embedding models like Qwen3-Embedding-8B (trained context 40,960 tokens) inflates gateway VRAM from ~8.8 GB to ~32 GB and causes OOM on single-GPU hosts that share the gateway with an LLM runtime. Expose memorySearch.local.contextSize in openclaw.json (number | "auto"), defaulting to 4096, which comfortably covers typical memory-search chunks (128–512 tokens) while keeping non-weight VRAM bounded. Closes #69667.
This commit is contained in:
@@ -198,10 +198,11 @@ arn:aws:bedrock:*::foundation-model/amazon.titan-embed-text-v2:0
|
||||
|
||||
## Local embedding config
|
||||
|
||||
| Key | Type | Default | Description |
|
||||
| --------------------- | -------- | ---------------------- | ------------------------------- |
|
||||
| `local.modelPath` | `string` | auto-downloaded | Path to GGUF model file |
|
||||
| `local.modelCacheDir` | `string` | node-llama-cpp default | Cache dir for downloaded models |
|
||||
| Key | Type | Default | Description |
|
||||
| --------------------- | ------------------ | ---------------------- | -------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
|
||||
| `local.modelPath` | `string` | auto-downloaded | Path to GGUF model file |
|
||||
| `local.modelCacheDir` | `string` | node-llama-cpp default | Cache dir for downloaded models |
|
||||
| `local.contextSize` | `number \| "auto"` | `4096` | Context window size for the embedding context. 4096 covers typical chunks (128–512 tokens) while bounding non-weight VRAM. Lower to 1024–2048 on constrained hosts. `"auto"` uses the model's trained maximum — not recommended for 8B+ models (Qwen3-Embedding-8B: 40 960 tokens → ~32 GB VRAM vs ~8.8 GB at 4096). |
|
||||
|
||||
Default model: `embeddinggemma-300m-qat-Q8_0.gguf` (~0.6 GB, auto-downloaded).
|
||||
Requires native build: `pnpm approve-builds` then `pnpm rebuild node-llama-cpp`.
|
||||
|
||||
@@ -39,6 +39,7 @@ export type ResolvedMemorySearchConfig = {
|
||||
local: {
|
||||
modelPath?: string;
|
||||
modelCacheDir?: string;
|
||||
contextSize?: number | "auto";
|
||||
};
|
||||
store: {
|
||||
driver: "sqlite";
|
||||
@@ -195,6 +196,7 @@ function mergeConfig(
|
||||
const local = {
|
||||
modelPath: overrides?.local?.modelPath ?? defaults?.local?.modelPath,
|
||||
modelCacheDir: overrides?.local?.modelCacheDir ?? defaults?.local?.modelCacheDir,
|
||||
contextSize: overrides?.local?.contextSize ?? defaults?.local?.contextSize,
|
||||
};
|
||||
const sources = normalizeSources(overrides?.sources ?? defaults?.sources, sessionMemory);
|
||||
const rawPaths = [...(defaults?.extraPaths ?? []), ...(overrides?.extraPaths ?? [])]
|
||||
|
||||
@@ -4170,6 +4170,15 @@ export const GENERATED_BASE_CONFIG_SCHEMA: BaseConfigSchemaResponse = {
|
||||
modelCacheDir: {
|
||||
type: "string",
|
||||
},
|
||||
contextSize: {
|
||||
anyOf: [
|
||||
{ type: "integer", exclusiveMinimum: 0, maximum: 9007199254740991 },
|
||||
{ type: "string", const: "auto" },
|
||||
],
|
||||
title: "Local Embedding Context Size",
|
||||
description:
|
||||
'Context window size passed to node-llama-cpp when creating the embedding context (default: 4096). 4096 safely covers typical memory-search chunks (128\u2013512 tokens) while keeping non-weight VRAM bounded. Lower to 1024\u20132048 on resource-constrained hosts. Set to "auto" to let node-llama-cpp use the model\'s trained maximum \u2014 not recommended for large models (e.g. Qwen3-Embedding-8B trained on 40\u202f960 tokens can push VRAM from ~8.8\u202fGB to ~32\u202fGB).',
|
||||
},
|
||||
},
|
||||
additionalProperties: false,
|
||||
},
|
||||
@@ -6056,6 +6065,12 @@ export const GENERATED_BASE_CONFIG_SCHEMA: BaseConfigSchemaResponse = {
|
||||
modelCacheDir: {
|
||||
type: "string",
|
||||
},
|
||||
contextSize: {
|
||||
anyOf: [
|
||||
{ type: "integer", exclusiveMinimum: 0, maximum: 9007199254740991 },
|
||||
{ type: "string", const: "auto" },
|
||||
],
|
||||
},
|
||||
},
|
||||
additionalProperties: false,
|
||||
},
|
||||
@@ -25150,6 +25165,11 @@ export const GENERATED_BASE_CONFIG_SCHEMA: BaseConfigSchemaResponse = {
|
||||
help: "Specifies the local embedding model source for local memory search, such as a GGUF file path or `hf:` URI. Use this only when provider is `local`, and verify model compatibility before large index rebuilds.",
|
||||
tags: ["storage"],
|
||||
},
|
||||
"agents.defaults.memorySearch.local.contextSize": {
|
||||
label: "Local Embedding Context Size",
|
||||
help: 'Context window size passed to node-llama-cpp when creating the embedding context (default: 4096). 4096 safely covers typical memory-search chunks (128\u2013512 tokens) while keeping non-weight VRAM bounded. Lower to 1024\u20132048 on resource-constrained hosts. Set to "auto" to let node-llama-cpp use the model\'s trained maximum \u2014 not recommended for large models (e.g. Qwen3-Embedding-8B trained on 40\u202f960 tokens can push VRAM from ~8.8\u202fGB to ~32\u202fGB).',
|
||||
tags: ["advanced"],
|
||||
},
|
||||
"agents.defaults.memorySearch.store.path": {
|
||||
label: "Memory Search Index Path",
|
||||
help: "Sets where the SQLite memory index is stored on disk for each agent. Keep the default `~/.openclaw/memory/{agentId}.sqlite` unless you need custom storage placement or backup policy alignment.",
|
||||
|
||||
@@ -958,6 +958,8 @@ export const FIELD_HELP: Record<string, string> = {
|
||||
"Sets the maximum wait time for a full embedding batch operation in minutes (default: 60). Increase for very large corpora or slower providers, and lower it to fail fast in automation-heavy flows.",
|
||||
"agents.defaults.memorySearch.local.modelPath":
|
||||
"Specifies the local embedding model source for local memory search, such as a GGUF file path or `hf:` URI. Use this only when provider is `local`, and verify model compatibility before large index rebuilds.",
|
||||
"agents.defaults.memorySearch.local.contextSize":
|
||||
'Context window size passed to node-llama-cpp when creating the embedding context (default: 4096). 4096 safely covers typical memory-search chunks (128\u2013512 tokens) while keeping non-weight VRAM bounded. Lower to 1024\u20132048 on resource-constrained hosts. Set to "auto" to let node-llama-cpp use the model\'s trained maximum \u2014 not recommended for large models (e.g. Qwen3-Embedding-8B trained on 40\u202f960 tokens can push VRAM from ~8.8\u202fGB to ~32\u202fGB).',
|
||||
"agents.defaults.memorySearch.fallback":
|
||||
'Backup provider used when primary embeddings fail: "openai", "gemini", "voyage", "mistral", "bedrock", "lmstudio", "ollama", "local", or "none". Set a real fallback for production reliability; use "none" only if you prefer explicit failures.',
|
||||
"agents.defaults.memorySearch.store.path":
|
||||
|
||||
@@ -399,6 +399,7 @@ export const FIELD_LABELS: Record<string, string> = {
|
||||
"agents.defaults.memorySearch.outputDimensionality": "Memory Search Output Dimensionality",
|
||||
"agents.defaults.memorySearch.fallback": "Memory Search Fallback",
|
||||
"agents.defaults.memorySearch.local.modelPath": "Local Embedding Model Path",
|
||||
"agents.defaults.memorySearch.local.contextSize": "Local Embedding Context Size",
|
||||
"agents.defaults.memorySearch.store.path": "Memory Search Index Path",
|
||||
"agents.defaults.memorySearch.store.vector.enabled": "Memory Search Vector Index",
|
||||
"agents.defaults.memorySearch.store.vector.extensionPath": "Memory Search Vector Extension Path",
|
||||
|
||||
@@ -393,6 +393,12 @@ export type MemorySearchConfig = {
|
||||
modelPath?: string;
|
||||
/** Optional cache directory for local models. */
|
||||
modelCacheDir?: string;
|
||||
/**
|
||||
* Context window size for the local embedding context (default: 4096).
|
||||
* Use `"auto"` to defer to node-llama-cpp, which sizes the context up to the model's
|
||||
* trained maximum — not recommended for 8B+ models.
|
||||
*/
|
||||
contextSize?: number | "auto";
|
||||
};
|
||||
/** Index storage configuration. */
|
||||
store?: {
|
||||
|
||||
@@ -684,6 +684,7 @@ export const MemorySearchSchema = z
|
||||
.object({
|
||||
modelPath: z.string().optional(),
|
||||
modelCacheDir: z.string().optional(),
|
||||
contextSize: z.union([z.number().int().positive(), z.literal("auto")]).optional(),
|
||||
})
|
||||
.strict()
|
||||
.optional(),
|
||||
|
||||
@@ -44,6 +44,53 @@ describe("local embedding provider", () => {
|
||||
expect(runtime.getEmbeddingFor).toHaveBeenCalledWith("test query");
|
||||
});
|
||||
|
||||
it("passes default contextSize (4096) to createEmbeddingContext when not configured", async () => {
|
||||
const runtime = mockLocalEmbeddingRuntime();
|
||||
|
||||
const provider = await createLocalEmbeddingProvider({
|
||||
config: {} as never,
|
||||
provider: "local",
|
||||
model: "",
|
||||
fallback: "none",
|
||||
});
|
||||
|
||||
await provider.embedQuery("context size default test");
|
||||
|
||||
expect(runtime.createEmbeddingContext).toHaveBeenCalledWith({ contextSize: 4096 });
|
||||
});
|
||||
|
||||
it("passes configured contextSize to createEmbeddingContext", async () => {
|
||||
const runtime = mockLocalEmbeddingRuntime();
|
||||
|
||||
const provider = await createLocalEmbeddingProvider({
|
||||
config: {} as never,
|
||||
provider: "local",
|
||||
model: "",
|
||||
fallback: "none",
|
||||
local: { contextSize: 2048 },
|
||||
});
|
||||
|
||||
await provider.embedQuery("context size custom test");
|
||||
|
||||
expect(runtime.createEmbeddingContext).toHaveBeenCalledWith({ contextSize: 2048 });
|
||||
});
|
||||
|
||||
it('passes "auto" contextSize to createEmbeddingContext when explicitly set', async () => {
|
||||
const runtime = mockLocalEmbeddingRuntime();
|
||||
|
||||
const provider = await createLocalEmbeddingProvider({
|
||||
config: {} as never,
|
||||
provider: "local",
|
||||
model: "",
|
||||
fallback: "none",
|
||||
local: { contextSize: "auto" },
|
||||
});
|
||||
|
||||
await provider.embedQuery("context size auto test");
|
||||
|
||||
expect(runtime.createEmbeddingContext).toHaveBeenCalledWith({ contextSize: "auto" });
|
||||
});
|
||||
|
||||
it("trims explicit local model paths and cache directories", async () => {
|
||||
const runtime = mockLocalEmbeddingRuntime(new Float32Array([1, 0]));
|
||||
|
||||
|
||||
@@ -25,6 +25,7 @@ export async function createLocalEmbeddingProvider(
|
||||
): Promise<EmbeddingProvider> {
|
||||
const modelPath = normalizeOptionalString(options.local?.modelPath) || DEFAULT_LOCAL_MODEL;
|
||||
const modelCacheDir = normalizeOptionalString(options.local?.modelCacheDir);
|
||||
const contextSize: number | "auto" = options.local?.contextSize ?? 4096;
|
||||
|
||||
// Lazy-load node-llama-cpp to keep startup light unless local is enabled.
|
||||
const { getLlama, resolveModelFile, LlamaLogLevel } = await importNodeLlamaCpp();
|
||||
@@ -51,7 +52,7 @@ export async function createLocalEmbeddingProvider(
|
||||
embeddingModel = await llama.loadModel({ modelPath: resolved });
|
||||
}
|
||||
if (!embeddingContext) {
|
||||
embeddingContext = await embeddingModel.createEmbeddingContext();
|
||||
embeddingContext = await embeddingModel.createEmbeddingContext({ contextSize });
|
||||
}
|
||||
return embeddingContext;
|
||||
} catch (err) {
|
||||
|
||||
@@ -38,6 +38,14 @@ export type EmbeddingProviderOptions = {
|
||||
local?: {
|
||||
modelPath?: string;
|
||||
modelCacheDir?: string;
|
||||
/**
|
||||
* Context size passed to node-llama-cpp `createEmbeddingContext`.
|
||||
* Default: 4096, chosen to cover typical memory-search chunks (128–512 tokens)
|
||||
* while keeping non-weight VRAM bounded.
|
||||
* Set `"auto"` to let node-llama-cpp use the model's trained maximum — not
|
||||
* recommended for 8B+ models (e.g. Qwen3-Embedding-8B: up to 40 960 tokens → ~32 GB VRAM).
|
||||
*/
|
||||
contextSize?: number | "auto";
|
||||
};
|
||||
/** Provider-specific output vector dimensions for supported embedding families. */
|
||||
outputDimensionality?: number;
|
||||
|
||||
@@ -7,7 +7,9 @@ export type LlamaEmbeddingContext = {
|
||||
};
|
||||
|
||||
export type LlamaModel = {
|
||||
createEmbeddingContext: () => Promise<LlamaEmbeddingContext>;
|
||||
createEmbeddingContext: (options?: {
|
||||
contextSize?: number | "auto";
|
||||
}) => Promise<LlamaEmbeddingContext>;
|
||||
};
|
||||
|
||||
export type Llama = {
|
||||
|
||||
Reference in New Issue
Block a user