mirror of
https://github.com/openclaw/openclaw.git
synced 2026-05-05 03:50:20 +00:00
fix: address second-round review — Latin backward compat and emoji consistency
- Two-pass line splitting: first slice at maxChars (unchanged for Latin), then re-split only CJK-heavy segments at chunking.tokens. This preserves the original ~800-char segments for ASCII lines while keeping CJK chunks within the token budget.
- Narrow surrogate-pair adjustment to CJK Extension B+ range (D840–D87E) only, so emoji surrogate pairs are not affected. Mixed CJK+emoji text is now handled consistently regardless of composition.
- Add tests: emoji handling (2), Latin backward-compat long-line (1).

Addresses Codex P1 (oversized CJK segments) and P2s (Latin over-splitting, emoji surrogate inconsistency).
This commit is contained in:
committed by
Peter Steinberger
parent
a5147d4d88
commit
3b95aa8804
@@ -398,13 +398,20 @@ export function chunkMarkdown(
|
||||
if (line.length === 0) {
|
||||
segments.push("");
|
||||
} else {
|
||||
// Use token count (not maxChars) as the split step so that CJK lines
|
||||
// – where 1 char ≈ 1 token – are sliced into budget-sized segments.
|
||||
// For Latin text the token count is ≥ maxChars/4, which still produces
|
||||
// segments well within the char budget after weighting.
|
||||
const splitStep = Math.max(1, chunking.tokens);
|
||||
for (let start = 0; start < line.length; start += splitStep) {
|
||||
segments.push(line.slice(start, start + splitStep));
|
||||
// First pass: slice at maxChars (preserves original behaviour for Latin).
|
||||
// Second pass: if a segment's *weighted* size still exceeds the budget
|
||||
// (happens for CJK-heavy text where 1 char ≈ 1 token), re-split it at
|
||||
// chunking.tokens so the chunk stays within the token budget.
|
||||
for (let start = 0; start < line.length; start += maxChars) {
|
||||
const coarse = line.slice(start, start + maxChars);
|
||||
if (estimateStringChars(coarse) > maxChars) {
|
||||
const fineStep = Math.max(1, chunking.tokens);
|
||||
for (let j = 0; j < coarse.length; j += fineStep) {
|
||||
segments.push(coarse.slice(j, j + fineStep));
|
||||
}
|
||||
} else {
|
||||
segments.push(coarse);
|
||||
}
|
||||
}
|
||||
}
|
||||
for (const segment of segments) {
|
||||
|
||||
Reference in New Issue
Block a user