From dadcfb574f5415e22c927a0608404c6b2750831c Mon Sep 17 00:00:00 2001 From: Peter Steinberger Date: Fri, 17 Apr 2026 19:38:53 +0100 Subject: [PATCH] test: trim surrogate chunk fixtures --- packages/memory-host-sdk/src/host/internal.test.ts | 7 +++---- src/memory-host-sdk/host/internal.test.ts | 7 +++---- 2 files changed, 6 insertions(+), 8 deletions(-) diff --git a/packages/memory-host-sdk/src/host/internal.test.ts b/packages/memory-host-sdk/src/host/internal.test.ts index 45adf405867..6aa17953741 100644 --- a/packages/memory-host-sdk/src/host/internal.test.ts +++ b/packages/memory-host-sdk/src/host/internal.test.ts @@ -368,11 +368,10 @@ describe("chunkMarkdown", () => { }); it("does not break surrogate pairs when splitting long CJK lines", () => { // "𠀀" (U+20000) is a surrogate pair: 2 UTF-16 code units per character. - // A line of 500 such characters = 1000 UTF-16 code units. - // With tokens=99 (odd), the fine-split must not cut inside a pair. + // With an odd token budget, the fine-split must not cut inside a pair. const surrogateChar = "\u{20000}"; // 𠀀 - const longLine = surrogateChar.repeat(500); - const chunks = chunkMarkdown(longLine, { tokens: 99, overlap: 0 }); + const longLine = surrogateChar.repeat(120); + const chunks = chunkMarkdown(longLine, { tokens: 31, overlap: 0 }); for (const chunk of chunks) { // No chunk should contain the Unicode replacement character U+FFFD, // which would indicate a broken surrogate pair. diff --git a/src/memory-host-sdk/host/internal.test.ts b/src/memory-host-sdk/host/internal.test.ts index a68d1a98137..298a3e78dd4 100644 --- a/src/memory-host-sdk/host/internal.test.ts +++ b/src/memory-host-sdk/host/internal.test.ts @@ -360,11 +360,10 @@ describe("chunkMarkdown", () => { }); it("does not break surrogate pairs when splitting long CJK lines", () => { // "𠀀" (U+20000) is a surrogate pair: 2 UTF-16 code units per character. - // A line of 500 such characters = 1000 UTF-16 code units. - // With tokens=99 (odd), the fine-split must not cut inside a pair. + // With an odd token budget, the fine-split must not cut inside a pair. const surrogateChar = "\u{20000}"; // 𠀀 - const longLine = surrogateChar.repeat(500); - const chunks = chunkMarkdown(longLine, { tokens: 99, overlap: 0 }); + const longLine = surrogateChar.repeat(120); + const chunks = chunkMarkdown(longLine, { tokens: 31, overlap: 0 }); for (const chunk of chunks) { // No chunk should contain the Unicode replacement character U+FFFD, // which would indicate a broken surrogate pair.