fix(memory): account for CJK characters in QMD memory chunking

The QMD memory system uses a fixed 4:1 chars-to-tokens ratio for chunk
sizing, which severely underestimates CJK (Chinese/Japanese/Korean) text
where each character is roughly 1 token. This causes oversized chunks for
CJK users, degrading vector search quality and wasting context window space.

Changes:
- Add shared src/utils/cjk-chars.ts module with CJK-aware character
  counting (estimateStringChars) and token estimation helpers
- Update chunkMarkdown() in src/memory/internal.ts to use weighted
  character lengths for chunk boundary decisions and overlap calculation
- Replace hardcoded estimateTokensFromChars in the context report
  command with the shared utility
- Add 17 unit tests for the CJK estimation module and 7 new tests for
  CJK-aware memory chunking behavior

Backward compatible: pure ASCII/Latin text behavior is unchanged.

Closes #39965
Related: #40216
This commit is contained in:
AaronLuo00
2026-03-08 17:16:55 -04:00
committed by Peter Steinberger
parent 7f46b03de0
commit 971ecabe80
5 changed files with 299 additions and 9 deletions

View File

@@ -249,6 +249,112 @@ describe("chunkMarkdown", () => {
expect(chunk.text.length).toBeLessThanOrEqual(maxChars);
}
});
it("produces more chunks for CJK text than for equal-length ASCII text", () => {
  // Token weighting: a CJK char ≈ 1 token while an ASCII char ≈ 0.25 tokens,
  // so identical raw character counts should yield more chunks for CJK.
  const chunkTokens = 50;
  // 400 ASCII chars → roughly 100 tokens → only a couple of chunks.
  const ascii = Array.from({ length: 20 }, () => "a".repeat(20)).join("\n");
  // 400 CJK chars → roughly 400 tokens → several times as many chunks.
  const cjk = Array.from({ length: 20 }, () => "你".repeat(20)).join("\n");
  const asciiChunks = chunkMarkdown(ascii, { tokens: chunkTokens, overlap: 0 });
  const cjkChunks = chunkMarkdown(cjk, { tokens: chunkTokens, overlap: 0 });
  expect(cjkChunks.length).toBeGreaterThan(asciiChunks.length);
});
it("respects token budget for Chinese text", () => {
  // Each CJK char ≈ 1 token, so a tokens=100 budget holds roughly 100 CJK chars.
  const chunkTokens = 100;
  const sentence = "这是一个测试句子用来验证分块逻辑是否正确处理中文文本内容";
  const content = Array.from({ length: 50 }, () => sentence).join("\n");
  const chunks = chunkMarkdown(content, { tokens: chunkTokens, overlap: 0 });
  expect(chunks.length).toBeGreaterThan(1);
  // With CJK-aware estimation each chunk should hold roughly <= budget CJK
  // characters; allow 2x slack for line-boundary rounding.
  for (const chunk of chunks) {
    const cjkCount = (chunk.text.match(/[\u4e00-\u9fff]/g) ?? []).length;
    expect(cjkCount).toBeLessThanOrEqual(chunkTokens * 2);
  }
});
it("keeps English chunking behavior unchanged", () => {
  const chunkTokens = 100;
  const charBudget = chunkTokens * 4; // 400 chars at the 4:1 Latin ratio
  const content = "hello world this is a test. ".repeat(50);
  const chunks = chunkMarkdown(content, { tokens: chunkTokens, overlap: 0 });
  expect(chunks.length).toBeGreaterThan(1);
  chunks.forEach((chunk) => {
    expect(chunk.text.length).toBeLessThanOrEqual(charBudget);
  });
});
it("handles mixed CJK and ASCII content correctly", () => {
  const chunkTokens = 50;
  const lines = Array.from(
    { length: 30 },
    (_, i) => `Line ${i}: 这是中英文混合的测试内容 with some English text`,
  );
  const chunks = chunkMarkdown(lines.join("\n"), { tokens: chunkTokens, overlap: 0 });
  // Multiple chunks, no crash.
  expect(chunks.length).toBeGreaterThan(1);
  // With overlap=0 the joined chunks should still cover the full line range.
  const reconstructed = chunks.map((c) => c.text).join("\n");
  expect(reconstructed).toContain("Line 0");
  expect(reconstructed).toContain("Line 29");
});
it("splits very long CJK lines into budget-sized segments", () => {
  // One newline-free line of 2000 CJK chars; at tokens=200 each char ≈ 1 token,
  // so the fine splitter should produce at least ~8 segments.
  const chunks = chunkMarkdown("中".repeat(2000), { tokens: 200, overlap: 0 });
  expect(chunks.length).toBeGreaterThanOrEqual(8);
  for (const chunk of chunks) {
    const cjkCount = (chunk.text.match(/[\u4E00-\u9FFF]/g) ?? []).length;
    expect(cjkCount).toBeLessThanOrEqual(200 * 2);
  }
});
it("does not break surrogate pairs when splitting long CJK lines", () => {
  // "𠀀" (U+20000) occupies two UTF-16 code units; with an odd budget the
  // fine-grained splitter must never cut between a high and low surrogate.
  const longLine = "\u{20000}".repeat(500);
  const chunks = chunkMarkdown(longLine, { tokens: 99, overlap: 0 });
  for (const chunk of chunks) {
    expect(chunk.text).not.toContain("\uFFFD");
    const text = chunk.text;
    for (let i = 0; i < text.length; i += 1) {
      const unit = text.charCodeAt(i);
      if (unit >= 0xd800 && unit <= 0xdbff) {
        // Every high surrogate must be immediately followed by a low surrogate.
        const low = text.charCodeAt(i + 1);
        expect(low).toBeGreaterThanOrEqual(0xdc00);
        expect(low).toBeLessThanOrEqual(0xdfff);
      }
    }
  }
});
it("does not over-split long Latin lines (backward compat)", () => {
  // 2000 ASCII chars against an 800-char budget should yield roughly 3
  // segments, not a pile of tiny ones.
  const result = chunkMarkdown("a".repeat(2000), { tokens: 200, overlap: 0 });
  expect(result.length).toBeLessThanOrEqual(5);
});
});
describe("remapChunkLines", () => {

View File

@@ -3,6 +3,7 @@ import fsSync from "node:fs";
import fs from "node:fs/promises";
import path from "node:path";
import { detectMime } from "../../../../src/media/mime.js";
import { CHARS_PER_TOKEN_ESTIMATE, estimateStringChars } from "../../../../src/utils/cjk-chars.js";
import { runTasksWithConcurrency } from "../../../../src/utils/run-with-concurrency.js";
import { estimateStructuredEmbeddingInputBytes } from "./embedding-input-limits.js";
import { buildTextEmbeddingInput, type EmbeddingInput } from "./embedding-inputs.js";
@@ -339,8 +340,8 @@ export function chunkMarkdown(
if (lines.length === 0) {
return [];
}
const maxChars = Math.max(32, chunking.tokens * 4);
const overlapChars = Math.max(0, chunking.overlap * 4);
const maxChars = Math.max(32, chunking.tokens * CHARS_PER_TOKEN_ESTIMATE);
const overlapChars = Math.max(0, chunking.overlap * CHARS_PER_TOKEN_ESTIMATE);
const chunks: MemoryChunk[] = [];
let current: Array<{ line: string; lineNo: number }> = [];
@@ -380,14 +381,14 @@ export function chunkMarkdown(
if (!entry) {
continue;
}
acc += entry.line.length + 1;
acc += estimateStringChars(entry.line) + 1;
kept.unshift(entry);
if (acc >= overlapChars) {
break;
}
}
current = kept;
currentChars = kept.reduce((sum, entry) => sum + entry.line.length + 1, 0);
currentChars = kept.reduce((sum, entry) => sum + estimateStringChars(entry.line) + 1, 0);
};
for (let i = 0; i < lines.length; i += 1) {
@@ -402,7 +403,7 @@ export function chunkMarkdown(
}
}
for (const segment of segments) {
const lineSize = segment.length + 1;
const lineSize = estimateStringChars(segment) + 1;
if (currentChars + lineSize > maxChars && current.length > 0) {
flush();
carryOverlap();

View File

@@ -5,14 +5,11 @@ import {
} from "../../agents/pi-embedded-helpers.js";
import { buildSystemPromptReport } from "../../agents/system-prompt-report.js";
import type { SessionSystemPromptReport } from "../../config/sessions/types.js";
import { estimateTokensFromChars } from "../../utils/cjk-chars.js";
import type { ReplyPayload } from "../types.js";
import { resolveCommandsSystemPromptBundle } from "./commands-system-prompt.js";
import type { HandleCommandsParams } from "./commands-types.js";
// Estimate tokens from a character count using the 4-chars-per-token
// heuristic; negatives clamp to zero and partial tokens round up.
function estimateTokensFromChars(chars: number): number {
  const clamped = Math.max(0, chars);
  return Math.ceil(clamped / 4);
}
// Render an integer with en-US thousands grouping (e.g. 1234567 -> "1,234,567").
function formatInt(n: number): string {
  const formatter = new Intl.NumberFormat("en-US");
  return formatter.format(n);
}

105
src/utils/cjk-chars.test.ts Normal file
View File

@@ -0,0 +1,105 @@
import { describe, expect, it } from "vitest";
import {
CHARS_PER_TOKEN_ESTIMATE,
estimateStringChars,
estimateTokensFromChars,
} from "./cjk-chars.js";
describe("estimateStringChars", () => {
  it("returns plain string length for ASCII text", () => {
    expect(estimateStringChars("hello world")).toBe(11);
  });
  it("returns 0 for empty string", () => {
    expect(estimateStringChars("")).toBe(0);
  });
  it("counts Chinese characters with extra weight", () => {
    // 3 CJK chars, each weighted as CHARS_PER_TOKEN_ESTIMATE (4) chars:
    // length 3 + 3 * (4 - 1) = 12.
    expect(estimateStringChars("你好世")).toBe(12);
  });
  it("handles mixed ASCII and CJK text", () => {
    // 2 ASCII + 2 CJK: length 4 + 2 * 3 = 10.
    expect(estimateStringChars("hi你好")).toBe(10);
  });
  it("handles Japanese hiragana", () => {
    // 5 hiragana: length 5 + 5 * 3 = 20.
    expect(estimateStringChars("こんにちは")).toBe(20);
  });
  it("handles Japanese katakana", () => {
    // 4 katakana: length 4 + 4 * 3 = 16.
    expect(estimateStringChars("カタカナ")).toBe(16);
  });
  it("handles Korean hangul", () => {
    // 5 hangul: length 5 + 5 * 3 = 20.
    expect(estimateStringChars("안녕하세요")).toBe(20);
  });
  it("handles CJK punctuation and symbols in the extended range", () => {
    // U+2E80 sits in the CJK Radicals Supplement block.
    expect(estimateStringChars("⺀")).toBe(CHARS_PER_TOKEN_ESTIMATE);
  });
  it("does not inflate standard Latin characters", () => {
    const pangram = "The quick brown fox jumps over the lazy dog";
    expect(estimateStringChars(pangram)).toBe(pangram.length);
  });
  it("does not inflate numbers and basic punctuation", () => {
    const sample = "123.45, hello! @#$%";
    expect(estimateStringChars(sample)).toBe(sample.length);
  });
  it("counts CJK Extension B characters as one code point", () => {
    // U+20000 is a UTF-16 surrogate pair but a single CJK code point:
    // 1 + 1 * 3 = 4 = CHARS_PER_TOKEN_ESTIMATE.
    expect(estimateStringChars("𠀀")).toBe(CHARS_PER_TOKEN_ESTIMATE);
  });
  it("handles mixed BMP and Extension B CJK consistently", () => {
    // Three CJK code points (你, 𠀀, 好) at 4 apiece.
    expect(estimateStringChars("你𠀀好")).toBe(12);
  });
  it("does not collapse non-CJK surrogate pairs like emoji", () => {
    // An emoji is a surrogate pair that NON_LATIN_RE does not match, so its
    // weighted length stays at the UTF-16 length of 2.
    expect(estimateStringChars("😀")).toBe(2);
  });
  it("keeps mixed CJK and emoji weighting consistent", () => {
    // 你 weighs 4 and the emoji keeps its UTF-16 length of 2.
    expect(estimateStringChars("你😀")).toBe(6);
  });
  it("yields ~1 token per CJK char when divided by CHARS_PER_TOKEN_ESTIMATE", () => {
    const sentence = "这是一个测试用的句子呢";
    const weighted = estimateStringChars(sentence);
    const tokens = Math.ceil(weighted / CHARS_PER_TOKEN_ESTIMATE);
    // Each CJK char ≈ 1 token, so the estimate matches the char count.
    expect(tokens).toBe(sentence.length);
  });
});
describe("estimateTokensFromChars", () => {
  it("divides by CHARS_PER_TOKEN_ESTIMATE and rounds up", () => {
    expect(estimateTokensFromChars(0)).toBe(0);
    expect(estimateTokensFromChars(8)).toBe(2);
    expect(estimateTokensFromChars(9)).toBe(3);
  });
  it("clamps negative values to 0", () => {
    expect(estimateTokensFromChars(-10)).toBe(0);
  });
});

81
src/utils/cjk-chars.ts Normal file
View File

@@ -0,0 +1,81 @@
/**
 * CJK-aware character counting for accurate token estimation.
 *
 * Most LLM tokenizers encode CJK (Chinese, Japanese, Korean) characters as
 * roughly 1 token per character, whereas Latin/ASCII text averages ~1 token
 * per 4 characters. When the codebase estimates tokens as `chars / 4`, CJK
 * token counts are underestimated by roughly 4×.
 *
 * This module provides a shared helper that inflates the character count of
 * CJK text so that the standard `chars / 4` formula yields an accurate
 * token estimate for any script.
 */
/**
 * Default characters-per-token ratio used throughout the codebase.
 * Latin text ≈ 4 chars/token; CJK ≈ 1 char/token.
 */
export const CHARS_PER_TOKEN_ESTIMATE = 4;
/**
 * Matches CJK Unified Ideographs, CJK Extension A/B, CJK Compatibility
 * Ideographs, Hangul Syllables, Hiragana, Katakana, and other non-Latin
 * scripts that typically use ~1 token per character.
 */
const NON_LATIN_RE = /[\u2E80-\u9FFF\uA000-\uA4FF\uAC00-\uD7AF\uF900-\uFAFF\u{20000}-\u{2FA1F}]/gu;
/**
 * Return an adjusted character length that accounts for non-Latin (CJK, etc.)
 * characters. Each non-Latin character is counted as
 * {@link CHARS_PER_TOKEN_ESTIMATE} chars so that the downstream
 * `chars / CHARS_PER_TOKEN_ESTIMATE` token estimate remains accurate.
 *
 * For pure ASCII/Latin text the return value equals `text.length` (no change).
 */
export function estimateStringChars(text: string): number {
  if (text.length === 0) {
    return 0;
  }
  const nonLatinCount = (text.match(NON_LATIN_RE) ?? []).length;
  // Use code-point length instead of UTF-16 length so that surrogate pairs
  // (CJK Extension B+, U+20000–U+2FA1F) are counted as 1 character, not 2.
  const codePointLength = countCodePoints(text, nonLatinCount);
  // Non-Latin chars already contribute 1 each to codePointLength; add the
  // remaining weight so each totals CHARS_PER_TOKEN_ESTIMATE.
  return codePointLength + nonLatinCount * (CHARS_PER_TOKEN_ESTIMATE - 1);
}
/**
 * Matches surrogate pairs whose code point falls exactly in the CJK
 * Extension B+ range (U+20000–U+2FA1F). Only these surrogates need
 * adjustment because they are matched by {@link NON_LATIN_RE} (one match
 * per pair) yet occupy two UTF-16 code units. Other surrogates (emoji,
 * symbols) are not matched by that regex, so collapsing them would create
 * an inconsistency.
 *
 * U+20000 encodes as \uD840\uDC00 and U+2FA1F as \uD87E\uDE1F. The final
 * high surrogate \uD87E is restricted to low surrogates <= \uDE1F so that
 * code points just past the range (U+2FA20–U+2FBFF) keep their full UTF-16
 * length, consistent with NON_LATIN_RE not matching them.
 */
const CJK_SURROGATE_PAIR_RE = /[\uD840-\uD87D][\uDC00-\uDFFF]|\uD87E[\uDC00-\uDE1F]/g;
/**
 * Return the code-point-aware length of the string, adjusting only for
 * CJK Extension B+ surrogate pairs. For text without such characters
 * (the vast majority of inputs) this returns `text.length` unchanged.
 */
function countCodePoints(text: string, nonLatinCount: number): number {
  if (nonLatinCount === 0) {
    return text.length;
  }
  // Count only CJK-range surrogate pairs — each occupies 2 UTF-16 units
  // but represents 1 code point (and 1 regex match in NON_LATIN_RE).
  const cjkSurrogates = (text.match(CJK_SURROGATE_PAIR_RE) ?? []).length;
  return text.length - cjkSurrogates;
}
/**
 * Estimate the number of tokens from a raw character count.
 *
 * For a more accurate estimate when the source text is available, prefer
 * `estimateStringChars(text) / CHARS_PER_TOKEN_ESTIMATE` instead.
 */
export function estimateTokensFromChars(chars: number): number {
  return Math.ceil(Math.max(0, chars) / CHARS_PER_TOKEN_ESTIMATE);
}