mirror of
https://github.com/openclaw/openclaw.git
synced 2026-03-30 19:32:27 +00:00
fix: address bot review — surrogate-pair counting and CJK line splitting
- Use code-point length instead of UTF-16 length in estimateStringChars() so that CJK Extension B+ surrogate pairs (U+20000+) are counted as 1 character, not 2 (fixes ~25% overestimate for rare characters).
- Change the long-line split step from maxChars to chunking.tokens so that CJK lines are sliced into token-budget-sized segments instead of char-budget-sized segments that produce ~4x oversized chunks.
- Add tests for both fixes: surrogate-pair handling and long CJK line splitting.

Addresses review feedback from the Greptile and Codex bots.
This commit is contained in:
committed by
Peter Steinberger
parent
971ecabe80
commit
a5147d4d88
@@ -398,8 +398,13 @@ export function chunkMarkdown(
     if (line.length === 0) {
       segments.push("");
     } else {
-      for (let start = 0; start < line.length; start += maxChars) {
-        segments.push(line.slice(start, start + maxChars));
+      // Use token count (not maxChars) as the split step so that CJK lines
+      // – where 1 char ≈ 1 token – are sliced into budget-sized segments.
+      // For Latin text the token count is ≥ maxChars/4, which still produces
+      // segments well within the char budget after weighting.
+      const splitStep = Math.max(1, chunking.tokens);
+      for (let start = 0; start < line.length; start += splitStep) {
+        segments.push(line.slice(start, start + splitStep));
+      }
     }
     for (const segment of segments) {
||||
@@ -81,7 +81,6 @@ describe("estimateStringChars", () => {
     // "你" counts as 4, emoji remains 2 => total 6
     expect(estimateStringChars("你😀")).toBe(6);
   });

   it("yields ~1 token per CJK char when divided by CHARS_PER_TOKEN_ESTIMATE", () => {
     // 10 CJK chars should estimate as ~10 tokens
     const cjk = "这是一个测试用的句子呢";
Reference in New Issue
Block a user