mirror of
https://github.com/openclaw/openclaw.git
synced 2026-04-25 08:02:04 +00:00
The QMD memory system uses a fixed 4:1 chars-to-tokens ratio for chunk sizing, which severely underestimates CJK (Chinese/Japanese/Korean) text where each character is roughly 1 token. This causes oversized chunks for CJK users, degrading vector search quality and wasting context window space. Changes: - Add shared src/utils/cjk-chars.ts module with CJK-aware character counting (estimateStringChars) and token estimation helpers - Update chunkMarkdown() in src/memory/internal.ts to use weighted character lengths for chunk boundary decisions and overlap calculation - Replace hardcoded estimateTokensFromChars in the context report command with the shared utility - Add 13 unit tests for the CJK estimation module and 5 new tests for CJK-aware memory chunking behavior Backward compatible: pure ASCII/Latin text behavior is unchanged. Closes #39965 Related: #40216
82 lines
3.3 KiB
TypeScript
82 lines
3.3 KiB
TypeScript
/**
 * CJK-aware character counting for accurate token estimation.
 *
 * Most LLM tokenizers encode CJK (Chinese, Japanese, Korean) characters as
 * roughly 1 token per character, whereas Latin/ASCII text averages ~1 token
 * per 4 characters. When the codebase estimates tokens as `chars / 4`, CJK
 * content is underestimated by 2–4×.
 *
 * This module provides a shared helper that inflates the character count of
 * CJK text so that the standard `chars / 4` formula yields an accurate
 * token estimate for any script.
 */

/**
 * Default characters-per-token ratio.
 *
 * Latin text ≈ 4 chars/token; CJK ≈ 1 char/token. Within this module the
 * value is used both as the weight given to each non-Latin character and as
 * the divisor when converting a character count into a token estimate.
 */
export const CHARS_PER_TOKEN_ESTIMATE = 4;

/**
 * Matches a single code point from the scripts that are treated as costing
 * roughly one token per character:
 *
 * - U+2E80–U+9FFF: CJK radicals, CJK symbols and punctuation, Hiragana,
 *   Katakana, Bopomofo, CJK Extension A and the CJK Unified Ideographs.
 * - U+A000–U+A4FF: Yi syllables, Yi radicals and Lisu.
 * - U+AC00–U+D7AF: Hangul Syllables.
 * - U+F900–U+FAFF: CJK Compatibility Ideographs.
 * - U+20000–U+2FA1F: CJK Extension B through the CJK Compatibility
 *   Ideographs Supplement (supplementary plane, 2 UTF-16 units each).
 *
 * The `u` flag makes the character class operate on code points, so a
 * supplementary-plane character yields one match rather than two. The `g`
 * flag lets `String.prototype.match` return every occurrence.
 */
const NON_LATIN_RE = /[\u2E80-\u9FFF\uA000-\uA4FF\uAC00-\uD7AF\uF900-\uFAFF\u{20000}-\u{2FA1F}]/gu;

/**
 * Return an adjusted character length that accounts for non-Latin (CJK, etc.)
 * characters. Each non-Latin character is counted as
 * {@link CHARS_PER_TOKEN_ESTIMATE} chars so that the downstream
 * `chars / CHARS_PER_TOKEN_ESTIMATE` token estimate remains accurate.
 *
 * For pure ASCII/Latin text the return value equals `text.length` (no change).
 *
 * @param text - The string to measure.
 * @returns The weighted character count; `0` for an empty string.
 */
export function estimateStringChars(text: string): number {
  if (text.length === 0) {
    return 0;
  }

  // `match` with a global regex returns all matches, or null when there are none.
  const nonLatinCount = text.match(NON_LATIN_RE)?.length ?? 0;

  // Use a code-point-aware length so that supplementary-plane CJK characters
  // (U+20000–U+2FA1F, stored as surrogate pairs) count as 1 character, not 2.
  const codePointLength = countCodePoints(text, nonLatinCount);

  // Every non-Latin character already contributes 1 to codePointLength, so
  // only the remaining (CHARS_PER_TOKEN_ESTIMATE - 1) of its weight is added.
  return codePointLength + nonLatinCount * (CHARS_PER_TOKEN_ESTIMATE - 1);
}

/**
 * Matches UTF-16 surrogate pairs whose code point lies in U+20000–U+2FA1F,
 * i.e. exactly the supplementary range that {@link NON_LATIN_RE} matches.
 * Other surrogate pairs (emoji, symbols) are deliberately not matched,
 * because they are not counted as non-Latin characters either.
 *
 * U+20000 encodes as D840 DC00 and U+2FA1F as D87E DE1F. High surrogates
 * D840–D87D therefore accept any low surrogate, while D87E is capped at
 * DE1F so that U+2FA20–U+2FBFF are left alone.
 *
 * No `u` flag: the pattern intentionally works on individual UTF-16 units.
 */
const CJK_SURROGATE_HIGH_RE = /[\uD840-\uD87D][\uDC00-\uDFFF]|\uD87E[\uDC00-\uDE1F]/g;

/**
 * Return the code-point-aware length of the string, adjusting only for
 * CJK surrogate pairs in U+20000–U+2FA1F. For text without such characters
 * this returns `text.length` unchanged.
 *
 * @param text - The string to measure.
 * @param nonLatinCount - Number of {@link NON_LATIN_RE} matches in `text`;
 *   when it is `0` the surrogate scan is skipped entirely.
 * @returns `text.length` minus one for every matched CJK surrogate pair.
 */
function countCodePoints(text: string, nonLatinCount: number): number {
  if (nonLatinCount === 0) {
    return text.length;
  }

  // Each matched pair occupies 2 UTF-16 units but is a single code point
  // (and a single NON_LATIN_RE match), so subtract one unit per pair.
  const cjkSurrogatePairs = text.match(CJK_SURROGATE_HIGH_RE)?.length ?? 0;
  return text.length - cjkSurrogatePairs;
}

/**
 * Estimate the number of tokens from a raw character count.
 *
 * Negative counts are clamped to `0`, and the quotient is rounded up so any
 * positive count yields at least one token.
 *
 * For a more accurate estimate when the source text is available, prefer
 * `estimateStringChars(text) / CHARS_PER_TOKEN_ESTIMATE` instead.
 *
 * @param chars - Character count to convert.
 * @returns The estimated token count.
 */
export function estimateTokensFromChars(chars: number): number {
  const nonNegativeChars = Math.max(0, chars);
  return Math.ceil(nonNegativeChars / CHARS_PER_TOKEN_ESTIMATE);
}