mirror of
https://github.com/openclaw/openclaw.git
synced 2026-05-03 20:10:20 +00:00
fix(memory): account for CJK characters in QMD memory chunking
The QMD memory system uses a fixed 4:1 chars-to-tokens ratio for chunk sizing, which severely underestimates the token count of CJK (Chinese/Japanese/Korean) text, where each character is roughly 1 token. This causes oversized chunks for CJK users, degrading vector search quality and wasting context window space.

Changes:
- Add shared src/utils/cjk-chars.ts module with CJK-aware character counting (estimateStringChars) and token estimation helpers
- Update chunkMarkdown() in src/memory/internal.ts to use weighted character lengths for chunk boundary decisions and overlap calculation
- Replace hardcoded estimateTokensFromChars in the context report command with the shared utility
- Add 13 unit tests for the CJK estimation module and 5 new tests for CJK-aware memory chunking behavior

Backward compatible: pure ASCII/Latin text behavior is unchanged.

Closes #39965
Related: #40216
This commit is contained in:
committed by
Peter Steinberger
parent
7f46b03de0
commit
971ecabe80
@@ -5,14 +5,11 @@ import {
|
||||
} from "../../agents/pi-embedded-helpers.js";
|
||||
import { buildSystemPromptReport } from "../../agents/system-prompt-report.js";
|
||||
import type { SessionSystemPromptReport } from "../../config/sessions/types.js";
|
||||
import { estimateTokensFromChars } from "../../utils/cjk-chars.js";
|
||||
import type { ReplyPayload } from "../types.js";
|
||||
import { resolveCommandsSystemPromptBundle } from "./commands-system-prompt.js";
|
||||
import type { HandleCommandsParams } from "./commands-types.js";
|
||||
|
||||
/**
 * Estimates a token count from a character count using a fixed ratio of
 * 4 characters per token, rounded up. Negative counts are clamped to 0
 * before dividing, so the result is never negative.
 *
 * NOTE(review): this local helper appears to be the removed side of the
 * diff — the same name is imported from "../../utils/cjk-chars.js" above;
 * confirm against the actual commit.
 *
 * @param chars - Number of characters to convert.
 * @returns The estimated number of tokens.
 */
function estimateTokensFromChars(chars: number): number {
|
||||
return Math.ceil(Math.max(0, chars) / 4);
|
||||
}
|
||||
|
||||
/**
 * Formats a number for display using the "en-US" locale's default
 * `Intl.NumberFormat` settings.
 *
 * @param n - The number to format.
 * @returns The formatted string.
 */
function formatInt(n: number): string {
|
||||
return new Intl.NumberFormat("en-US").format(n);
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user