From 971ecabe80b632180edddb81f5652436de13ffed Mon Sep 17 00:00:00 2001 From: AaronLuo00 Date: Sun, 8 Mar 2026 17:16:55 -0400 Subject: [PATCH] fix(memory): account for CJK characters in QMD memory chunking The QMD memory system uses a fixed 4:1 chars-to-tokens ratio for chunk sizing, which severely underestimates CJK (Chinese/Japanese/Korean) text where each character is roughly 1 token. This causes oversized chunks for CJK users, degrading vector search quality and wasting context window space. Changes: - Add shared src/utils/cjk-chars.ts module with CJK-aware character counting (estimateStringChars) and token estimation helpers - Update chunkMarkdown() in packages/memory-host-sdk/src/host/internal.ts to use weighted character lengths for chunk boundary decisions and overlap calculation - Replace hardcoded estimateTokensFromChars in the context report command with the shared utility - Add 17 unit tests for the CJK estimation module and 7 new tests for CJK-aware memory chunking behavior Backward compatible: pure ASCII/Latin text behavior is unchanged. 
Closes #39965 Related: #40216 --- .../memory-host-sdk/src/host/internal.test.ts | 106 ++++++++++++++++++ packages/memory-host-sdk/src/host/internal.ts | 11 +- .../reply/commands-context-report.ts | 5 +- src/utils/cjk-chars.test.ts | 105 +++++++++++++++++ src/utils/cjk-chars.ts | 81 +++++++++++++ 5 files changed, 299 insertions(+), 9 deletions(-) create mode 100644 src/utils/cjk-chars.test.ts create mode 100644 src/utils/cjk-chars.ts diff --git a/packages/memory-host-sdk/src/host/internal.test.ts b/packages/memory-host-sdk/src/host/internal.test.ts index d18120b413a..764fbc24bf8 100644 --- a/packages/memory-host-sdk/src/host/internal.test.ts +++ b/packages/memory-host-sdk/src/host/internal.test.ts @@ -249,6 +249,112 @@ describe("chunkMarkdown", () => { expect(chunk.text.length).toBeLessThanOrEqual(maxChars); } }); + + it("produces more chunks for CJK text than for equal-length ASCII text", () => { + // CJK chars ≈ 1 token each; ASCII chars ≈ 0.25 tokens each. + // For the same raw character count, CJK content should produce more chunks + // because each character "weighs" ~4× more in token estimation. + const chunkTokens = 50; + + // 400 ASCII chars → ~100 tokens → fits in ~2 chunks + const asciiLines = Array.from({ length: 20 }, () => "a".repeat(20)).join("\n"); + const asciiChunks = chunkMarkdown(asciiLines, { tokens: chunkTokens, overlap: 0 }); + + // 400 CJK chars → ~400 tokens → needs ~8 chunks + const cjkLines = Array.from({ length: 20 }, () => "你".repeat(20)).join("\n"); + const cjkChunks = chunkMarkdown(cjkLines, { tokens: chunkTokens, overlap: 0 }); + + expect(cjkChunks.length).toBeGreaterThan(asciiChunks.length); + }); + + it("respects token budget for Chinese text", () => { + // With tokens=100, each CJK char ≈ 1 token, so chunks should hold ~100 CJK chars. 
+ const chunkTokens = 100; + const lines: string[] = []; + for (let i = 0; i < 50; i++) { + lines.push("这是一个测试句子用来验证分块逻辑是否正确处理中文文本内容"); + } + const content = lines.join("\n"); + const chunks = chunkMarkdown(content, { tokens: chunkTokens, overlap: 0 }); + + expect(chunks.length).toBeGreaterThan(1); + // Each chunk's CJK content should not vastly exceed the token budget. + // With CJK-aware estimation, each char ≈ 1 token, so chunk text length + // (in CJK chars) should be roughly <= tokens budget (with some tolerance + // for line boundaries). + for (const chunk of chunks) { + // Count actual CJK characters in the chunk + const cjkCount = (chunk.text.match(/[\u4e00-\u9fff]/g) ?? []).length; + // Allow 2× tolerance for line-boundary rounding + expect(cjkCount).toBeLessThanOrEqual(chunkTokens * 2); + } + }); + + it("keeps English chunking behavior unchanged", () => { + const chunkTokens = 100; + const maxChars = chunkTokens * 4; // 400 chars + const content = "hello world this is a test. ".repeat(50); + const chunks = chunkMarkdown(content, { tokens: chunkTokens, overlap: 0 }); + expect(chunks.length).toBeGreaterThan(1); + for (const chunk of chunks) { + expect(chunk.text.length).toBeLessThanOrEqual(maxChars); + } + }); + + it("handles mixed CJK and ASCII content correctly", () => { + const chunkTokens = 50; + const lines: string[] = []; + for (let i = 0; i < 30; i++) { + lines.push(`Line ${i}: 这是中英文混合的测试内容 with some English text`); + } + const content = lines.join("\n"); + const chunks = chunkMarkdown(content, { tokens: chunkTokens, overlap: 0 }); + // Should produce multiple chunks and not crash + expect(chunks.length).toBeGreaterThan(1); + // Verify all content is preserved + const reconstructed = chunks.map((c) => c.text).join("\n"); + // Due to overlap=0, the concatenated chunks should cover all lines + expect(reconstructed).toContain("Line 0"); + expect(reconstructed).toContain("Line 29"); + }); + + it("splits very long CJK lines into budget-sized segments", () 
=> { + // A single line of 2000 CJK characters (no newlines). + // With tokens=200, each CJK char ≈ 1 token. + const longCjkLine = "中".repeat(2000); + const chunks = chunkMarkdown(longCjkLine, { tokens: 200, overlap: 0 }); + expect(chunks.length).toBeGreaterThanOrEqual(8); + for (const chunk of chunks) { + const cjkCount = (chunk.text.match(/[\u4E00-\u9FFF]/g) ?? []).length; + expect(cjkCount).toBeLessThanOrEqual(200 * 2); + } + }); + + it("does not break surrogate pairs when splitting long CJK lines", () => { + // "𠀀" (U+20000) is a surrogate pair: 2 UTF-16 code units per character. + // With tokens=99 (odd), the fine-split must not cut inside a pair. + const surrogateChar = "\u{20000}"; + const longLine = surrogateChar.repeat(500); + const chunks = chunkMarkdown(longLine, { tokens: 99, overlap: 0 }); + for (const chunk of chunks) { + expect(chunk.text).not.toContain("\uFFFD"); + for (let i = 0; i < chunk.text.length; i += 1) { + const code = chunk.text.charCodeAt(i); + if (code >= 0xd800 && code <= 0xdbff) { + const next = chunk.text.charCodeAt(i + 1); + expect(next).toBeGreaterThanOrEqual(0xdc00); + expect(next).toBeLessThanOrEqual(0xdfff); + } + } + } + }); + + it("does not over-split long Latin lines (backward compat)", () => { + // 2000 ASCII chars / 800 maxChars -> about 3 segments, not 10 tiny ones. 
+ const longLatinLine = "a".repeat(2000); + const chunks = chunkMarkdown(longLatinLine, { tokens: 200, overlap: 0 }); + expect(chunks.length).toBeLessThanOrEqual(5); + }); }); describe("remapChunkLines", () => { diff --git a/packages/memory-host-sdk/src/host/internal.ts b/packages/memory-host-sdk/src/host/internal.ts index 730eaf052f0..89fd0030a94 100644 --- a/packages/memory-host-sdk/src/host/internal.ts +++ b/packages/memory-host-sdk/src/host/internal.ts @@ -3,6 +3,7 @@ import fsSync from "node:fs"; import fs from "node:fs/promises"; import path from "node:path"; import { detectMime } from "../../../../src/media/mime.js"; +import { CHARS_PER_TOKEN_ESTIMATE, estimateStringChars } from "../../../../src/utils/cjk-chars.js"; import { runTasksWithConcurrency } from "../../../../src/utils/run-with-concurrency.js"; import { estimateStructuredEmbeddingInputBytes } from "./embedding-input-limits.js"; import { buildTextEmbeddingInput, type EmbeddingInput } from "./embedding-inputs.js"; @@ -339,8 +340,8 @@ export function chunkMarkdown( if (lines.length === 0) { return []; } - const maxChars = Math.max(32, chunking.tokens * 4); - const overlapChars = Math.max(0, chunking.overlap * 4); + const maxChars = Math.max(32, chunking.tokens * CHARS_PER_TOKEN_ESTIMATE); + const overlapChars = Math.max(0, chunking.overlap * CHARS_PER_TOKEN_ESTIMATE); const chunks: MemoryChunk[] = []; let current: Array<{ line: string; lineNo: number }> = []; @@ -380,14 +381,14 @@ export function chunkMarkdown( if (!entry) { continue; } - acc += entry.line.length + 1; + acc += estimateStringChars(entry.line) + 1; kept.unshift(entry); if (acc >= overlapChars) { break; } } current = kept; - currentChars = kept.reduce((sum, entry) => sum + entry.line.length + 1, 0); + currentChars = kept.reduce((sum, entry) => sum + estimateStringChars(entry.line) + 1, 0); }; for (let i = 0; i < lines.length; i += 1) { @@ -402,7 +403,7 @@ export function chunkMarkdown( } } for (const segment of segments) { - const 
lineSize = segment.length + 1; + const lineSize = estimateStringChars(segment) + 1; if (currentChars + lineSize > maxChars && current.length > 0) { flush(); carryOverlap(); diff --git a/src/auto-reply/reply/commands-context-report.ts b/src/auto-reply/reply/commands-context-report.ts index cbf190c4c88..085fe6f7baf 100644 --- a/src/auto-reply/reply/commands-context-report.ts +++ b/src/auto-reply/reply/commands-context-report.ts @@ -5,14 +5,11 @@ import { } from "../../agents/pi-embedded-helpers.js"; import { buildSystemPromptReport } from "../../agents/system-prompt-report.js"; import type { SessionSystemPromptReport } from "../../config/sessions/types.js"; +import { estimateTokensFromChars } from "../../utils/cjk-chars.js"; import type { ReplyPayload } from "../types.js"; import { resolveCommandsSystemPromptBundle } from "./commands-system-prompt.js"; import type { HandleCommandsParams } from "./commands-types.js"; -function estimateTokensFromChars(chars: number): number { - return Math.ceil(Math.max(0, chars) / 4); -} - function formatInt(n: number): string { return new Intl.NumberFormat("en-US").format(n); } diff --git a/src/utils/cjk-chars.test.ts b/src/utils/cjk-chars.test.ts new file mode 100644 index 00000000000..0e8327d0a7d --- /dev/null +++ b/src/utils/cjk-chars.test.ts @@ -0,0 +1,105 @@ +import { describe, expect, it } from "vitest"; +import { + CHARS_PER_TOKEN_ESTIMATE, + estimateStringChars, + estimateTokensFromChars, +} from "./cjk-chars.js"; + +describe("estimateStringChars", () => { + it("returns plain string length for ASCII text", () => { + expect(estimateStringChars("hello world")).toBe(11); + }); + + it("returns 0 for empty string", () => { + expect(estimateStringChars("")).toBe(0); + }); + + it("counts Chinese characters with extra weight", () => { + // "你好世" = 3 CJK chars + // Each CJK char counted as CHARS_PER_TOKEN_ESTIMATE (4) chars + // .length = 3, adjusted = 3 + 3 * (4 - 1) = 12 + expect(estimateStringChars("你好世")).toBe(12); + }); + + 
it("handles mixed ASCII and CJK text", () => { + // "hi你好" = 2 ASCII + 2 CJK + // .length = 4, adjusted = 4 + 2 * 3 = 10 + expect(estimateStringChars("hi你好")).toBe(10); + }); + + it("handles Japanese hiragana", () => { + // "こんにちは" = 5 hiragana chars + // .length = 5, adjusted = 5 + 5 * 3 = 20 + expect(estimateStringChars("こんにちは")).toBe(20); + }); + + it("handles Japanese katakana", () => { + // "カタカナ" = 4 katakana chars + // .length = 4, adjusted = 4 + 4 * 3 = 16 + expect(estimateStringChars("カタカナ")).toBe(16); + }); + + it("handles Korean hangul", () => { + // "안녕하세요" = 5 hangul chars + // .length = 5, adjusted = 5 + 5 * 3 = 20 + expect(estimateStringChars("안녕하세요")).toBe(20); + }); + + it("handles CJK punctuation and symbols in the extended range", () => { + // "⺀" (U+2E80) is in CJK Radicals Supplement range + expect(estimateStringChars("⺀")).toBe(CHARS_PER_TOKEN_ESTIMATE); + }); + + it("does not inflate standard Latin characters", () => { + const latin = "The quick brown fox jumps over the lazy dog"; + expect(estimateStringChars(latin)).toBe(latin.length); + }); + + it("does not inflate numbers and basic punctuation", () => { + const text = "123.45, hello! @#$%"; + expect(estimateStringChars(text)).toBe(text.length); + }); + + it("counts CJK Extension B characters as one code point", () => { + // "𠀀" (U+20000) is represented as a surrogate pair in UTF-16. + // Result = 1 + 1 * 3 = 4 (exactly CHARS_PER_TOKEN_ESTIMATE) + expect(estimateStringChars("𠀀")).toBe(CHARS_PER_TOKEN_ESTIMATE); + }); + + it("handles mixed BMP and Extension B CJK consistently", () => { + // 3 CJK code points total: 你 + 𠀀 + 好 => 3 * 4 = 12 + expect(estimateStringChars("你𠀀好")).toBe(12); + }); + + it("does not collapse non-CJK surrogate pairs like emoji", () => { + // Emoji is a surrogate pair in UTF-16, but not matched by NON_LATIN_RE. + // Its weighted length should remain the UTF-16 length (2). 
+ expect(estimateStringChars("😀")).toBe(2); + }); + + it("keeps mixed CJK and emoji weighting consistent", () => { + // "你" counts as 4, emoji remains 2 => total 6 + expect(estimateStringChars("你😀")).toBe(6); + }); + + it("yields ~1 token per CJK char when divided by CHARS_PER_TOKEN_ESTIMATE", () => { + // 10 CJK chars should estimate as ~10 tokens + const cjk = "这是一个测试用的句子呢"; + const estimated = estimateStringChars(cjk); + const tokens = Math.ceil(estimated / CHARS_PER_TOKEN_ESTIMATE); + // Each CJK char ≈ 1 token, so tokens should be close to string length + expect(tokens).toBe(cjk.length); + }); +}); + +describe("estimateTokensFromChars", () => { + it("divides by CHARS_PER_TOKEN_ESTIMATE and rounds up", () => { + expect(estimateTokensFromChars(8)).toBe(2); + expect(estimateTokensFromChars(9)).toBe(3); + expect(estimateTokensFromChars(0)).toBe(0); + }); + + it("clamps negative values to 0", () => { + expect(estimateTokensFromChars(-10)).toBe(0); + }); +}); diff --git a/src/utils/cjk-chars.ts b/src/utils/cjk-chars.ts new file mode 100644 index 00000000000..483076749f7 --- /dev/null +++ b/src/utils/cjk-chars.ts @@ -0,0 +1,81 @@ +/** + * CJK-aware character counting for accurate token estimation. + * + * Most LLM tokenizers encode CJK (Chinese, Japanese, Korean) characters as + * roughly 1 token per character, whereas Latin/ASCII text averages ~1 token + * per 4 characters. When the codebase estimates tokens as `chars / 4`, CJK + * content is underestimated by 2–4×. + * + * This module provides a shared helper that inflates the character count of + * CJK text so that the standard `chars / 4` formula yields an accurate + * token estimate for any script. + */ + +/** + * Default characters-per-token ratio used throughout the codebase. + * Latin text ≈ 4 chars/token; CJK ≈ 1 char/token. 
+ */ +export const CHARS_PER_TOKEN_ESTIMATE = 4; + +/** + * Matches CJK Unified Ideographs, CJK Extension A/B, CJK Compatibility + * Ideographs, Hangul Syllables, Hiragana, Katakana, and other non-Latin + * scripts that typically use ~1 token per character. + */ +const NON_LATIN_RE = /[\u2E80-\u9FFF\uA000-\uA4FF\uAC00-\uD7AF\uF900-\uFAFF\u{20000}-\u{2FA1F}]/gu; + +/** + * Return an adjusted character length that accounts for non-Latin (CJK, etc.) + * characters. Each non-Latin character is counted as + * {@link CHARS_PER_TOKEN_ESTIMATE} chars so that the downstream + * `chars / CHARS_PER_TOKEN_ESTIMATE` token estimate remains accurate. + * + * For pure ASCII/Latin text the return value equals `text.length` (no change). + */ +export function estimateStringChars(text: string): number { + if (text.length === 0) { + return 0; + } + const nonLatinCount = (text.match(NON_LATIN_RE) ?? []).length; + // Use code-point length instead of UTF-16 length so that surrogate pairs + // (CJK Extension B+, U+20000–U+2FA1F) are counted as 1 character, not 2. + const codePointLength = countCodePoints(text, nonLatinCount); + // Non-Latin chars already contribute 1 to codePointLength, so add the extra weight. + return codePointLength + nonLatinCount * (CHARS_PER_TOKEN_ESTIMATE - 1); +} + +/** + * Matches surrogate pairs whose code point falls in the CJK Extension B+ + * range (U+20000–U+2FA1F). Only these surrogates need adjustment because + * they are matched by {@link NON_LATIN_RE} and already counted in + * `nonLatinCount`. Other surrogates (emoji, symbols) are not matched by + * that regex, so collapsing them would create an inconsistency. + * + * High-surrogate range for U+20000–U+2FA1F is D840–D87E. + */ +const CJK_SURROGATE_HIGH_RE = /[\uD840-\uD87E][\uDC00-\uDFFF]/g; + +/** + * Return the code-point-aware length of the string, adjusting only for + * CJK Extension B+ surrogate pairs. 
For text without such characters + * (the vast majority of inputs) this returns `text.length` unchanged. + */ +function countCodePoints(text: string, nonLatinCount: number): number { + if (nonLatinCount === 0) { + return text.length; + } + // Count only CJK-range surrogate pairs — each occupies 2 UTF-16 units + // but represents 1 code point (and 1 regex match in NON_LATIN_RE). + const cjkSurrogates = (text.match(CJK_SURROGATE_HIGH_RE) ?? []).length; + return text.length - cjkSurrogates; +} + +/** + * Estimate the number of tokens from a raw character count. + * + * For a more accurate estimate when the source text is available, prefer + * `estimateStringChars(text) / CHARS_PER_TOKEN_ESTIMATE` instead. + */ +export function estimateTokensFromChars(chars: number): number { + return Math.ceil(Math.max(0, chars) / CHARS_PER_TOKEN_ESTIMATE); +}