Files
openclaw/src/utils/cjk-chars.ts
AaronLuo00 3b95aa8804 fix: address second-round review — Latin backward compat and emoji consistency
- Two-pass line splitting: first slice at maxChars (unchanged for Latin),
  then re-split only CJK-heavy segments at chunking.tokens. This preserves
  the original ~800-char segments for ASCII lines while keeping CJK chunks
  within the token budget.

- Narrow surrogate-pair adjustment to CJK Extension B+ range (D840–D87E)
  only, so emoji surrogate pairs are not affected. Mixed CJK+emoji text
  is now handled consistently regardless of composition.

- Add tests: emoji handling (2), Latin backward-compat long-line (1).

Addresses Codex P1 (oversized CJK segments) and P2s (Latin over-splitting,
emoji surrogate inconsistency).
2026-03-29 10:22:43 +09:00

82 lines
3.3 KiB
TypeScript
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
/**
* CJK-aware character counting for accurate token estimation.
*
* Most LLM tokenizers encode CJK (Chinese, Japanese, Korean) characters as
* roughly 1 token per character, whereas Latin/ASCII text averages ~1 token
* per 4 characters. When the codebase estimates tokens as `chars / 4`, CJK
* content is underestimated by 2–4×.
*
* This module provides a shared helper that inflates the character count of
* CJK text so that the standard `chars / 4` formula yields an accurate
* token estimate for any script.
*/
/**
 * Default characters-per-token ratio: a token estimate is computed as
 * `chars / CHARS_PER_TOKEN_ESTIMATE`.
 *
 * Latin text ≈ 4 chars/token; CJK ≈ 1 char/token. Because of that gap,
 * `estimateStringChars` counts every non-Latin character as this many
 * characters, so dividing by the same constant still yields roughly one
 * token per CJK character.
 */
export const CHARS_PER_TOKEN_ESTIMATE = 4;
/**
 * Matches a single non-Latin character that is assumed to cost ~1 token.
 *
 * Covered code-point ranges:
 * - U+2E80–U+9FFF: CJK radicals, CJK symbols and punctuation, Hiragana,
 *   Katakana, CJK Extension A and CJK Unified Ideographs (plus everything
 *   else that sits between those blocks)
 * - U+A000–U+A4FF: Yi syllables/radicals and Lisu
 * - U+AC00–U+D7AF: Hangul Syllables
 * - U+F900–U+FAFF: CJK Compatibility Ideographs
 * - U+20000–U+2FA1F: CJK Extension B through the CJK Compatibility
 *   Ideographs Supplement (astral plane, i.e. surrogate pairs in UTF-16)
 *
 * Flags: `g` so that `String.prototype.match` returns every occurrence
 * (used for counting), `u` so that `\u{…}` escapes work and an astral
 * character is matched as one code point rather than two code units.
 */
const NON_LATIN_RE = /[\u2E80-\u9FFF\uA000-\uA4FF\uAC00-\uD7AF\uF900-\uFAFF\u{20000}-\u{2FA1F}]/gu;
/**
 * Compute a weighted character length for `text` in which every non-Latin
 * (CJK, etc.) character counts as {@link CHARS_PER_TOKEN_ESTIMATE}
 * characters instead of one. Feeding the result into the usual
 * `chars / CHARS_PER_TOKEN_ESTIMATE` formula then gives about one token per
 * non-Latin character.
 *
 * Text without any non-Latin characters is returned at its plain
 * `text.length`, so ASCII/Latin input is unaffected.
 *
 * @param text - The string to measure.
 * @returns The weighted character count; `0` for an empty string.
 */
export function estimateStringChars(text: string): number {
  if (text.length === 0) {
    return 0;
  }

  // With the `g` flag, `match` yields every hit, or `null` when there is none.
  const nonLatinMatches = text.match(NON_LATIN_RE);
  const weightedCharCount = nonLatinMatches === null ? 0 : nonLatinMatches.length;

  // Base length: UTF-16 length with CJK surrogate pairs (U+20000–U+2FA1F)
  // collapsed to a single unit, so each of them is counted once here.
  const baseLength = countCodePoints(text, weightedCharCount);

  // Every non-Latin character is already worth 1 in `baseLength`; top it up
  // to the full per-token weight.
  const extraPerChar = CHARS_PER_TOKEN_ESTIMATE - 1;
  return baseLength + weightedCharCount * extraPerChar;
}
/**
 * Matches UTF-16 surrogate pairs whose code point falls in the CJK
 * Extension B+ range (U+20000–U+2FA1F), i.e. exactly the astral range that
 * {@link NON_LATIN_RE} matches. Only these pairs need adjustment because
 * they are already counted once in `nonLatinCount`. Other surrogate pairs
 * (emoji, symbols) are not matched by that regex, so collapsing them would
 * create an inconsistency.
 *
 * U+20000 encodes as D840 DC00 and U+2FA1F as D87E DE1F. High surrogates
 * D840–D87D may therefore be followed by any low surrogate, but D87E must
 * be limited to low surrogates DC00–DE1F: allowing the full DC00–DFFF range
 * there would also match U+2FA20–U+2FBFF, which NON_LATIN_RE does not cover.
 *
 * No `u` flag on purpose: the pattern works on individual UTF-16 code units.
 */
const CJK_SURROGATE_HIGH_RE = /[\uD840-\uD87D][\uDC00-\uDFFF]|\uD87E[\uDC00-\uDE1F]/g;
/**
 * Length of `text` in UTF-16 code units, minus one for every surrogate pair
 * matched by CJK_SURROGATE_HIGH_RE, so that each such pair contributes 1
 * instead of 2. All other surrogate pairs (e.g. emoji) keep contributing 2,
 * which means this is not a general code-point count.
 *
 * @param text - The string to measure.
 * @param nonLatinCount - Number of NON_LATIN_RE matches in `text`. When it
 *   is 0 the surrogate scan is skipped and `text.length` is returned as is.
 * @returns The adjusted length.
 */
function countCodePoints(text: string, nonLatinCount: number): number {
  // No non-Latin characters means no CJK surrogate pairs can be present,
  // so the second regex pass is only run when it can find something.
  const collapsedPairs =
    nonLatinCount === 0 ? 0 : (text.match(CJK_SURROGATE_HIGH_RE)?.length ?? 0);

  // Each matched pair spans 2 code units but stands for a single character.
  return text.length - collapsedPairs;
}
/**
 * Convert a raw character count into a token estimate by dividing by
 * {@link CHARS_PER_TOKEN_ESTIMATE} and rounding up. Negative counts are
 * treated as 0.
 *
 * When the source text itself is available, prefer
 * `estimateStringChars(text) / CHARS_PER_TOKEN_ESTIMATE`, which is more
 * accurate because it weights non-Latin characters.
 *
 * @param chars - Number of characters.
 * @returns The estimated number of tokens.
 */
export function estimateTokensFromChars(chars: number): number {
  const nonNegativeChars = Math.max(0, chars);
  const fractionalTokens = nonNegativeChars / CHARS_PER_TOKEN_ESTIMATE;
  return Math.ceil(fractionalTokens);
}