mirror of
https://github.com/openclaw/openclaw.git
synced 2026-05-06 05:20:43 +00:00
test: trim surrogate chunk fixtures
This commit is contained in:
@@ -368,11 +368,10 @@ describe("chunkMarkdown", () => {
|
||||
});
|
||||
it("does not break surrogate pairs when splitting long CJK lines", () => {
|
||||
// "𠀀" (U+20000) is a surrogate pair: 2 UTF-16 code units per character.
|
||||
// A line of 500 such characters = 1000 UTF-16 code units.
|
||||
// With tokens=99 (odd), the fine-split must not cut inside a pair.
|
||||
// With an odd token budget, the fine-split must not cut inside a pair.
|
||||
const surrogateChar = "\u{20000}"; // 𠀀
|
||||
const longLine = surrogateChar.repeat(500);
|
||||
const chunks = chunkMarkdown(longLine, { tokens: 99, overlap: 0 });
|
||||
const longLine = surrogateChar.repeat(120);
|
||||
const chunks = chunkMarkdown(longLine, { tokens: 31, overlap: 0 });
|
||||
for (const chunk of chunks) {
|
||||
// No chunk should contain the Unicode replacement character U+FFFD,
|
||||
// which would indicate a broken surrogate pair.
|
||||
|
||||
@@ -360,11 +360,10 @@ describe("chunkMarkdown", () => {
|
||||
});
|
||||
it("does not break surrogate pairs when splitting long CJK lines", () => {
|
||||
// "𠀀" (U+20000) is a surrogate pair: 2 UTF-16 code units per character.
|
||||
// A line of 500 such characters = 1000 UTF-16 code units.
|
||||
// With tokens=99 (odd), the fine-split must not cut inside a pair.
|
||||
// With an odd token budget, the fine-split must not cut inside a pair.
|
||||
const surrogateChar = "\u{20000}"; // 𠀀
|
||||
const longLine = surrogateChar.repeat(500);
|
||||
const chunks = chunkMarkdown(longLine, { tokens: 99, overlap: 0 });
|
||||
const longLine = surrogateChar.repeat(120);
|
||||
const chunks = chunkMarkdown(longLine, { tokens: 31, overlap: 0 });
|
||||
for (const chunk of chunks) {
|
||||
// No chunk should contain the Unicode replacement character U+FFFD,
|
||||
// which would indicate a broken surrogate pair.
|
||||
|
||||
Reference in New Issue
Block a user