From df87b40bec5164de02dbbb0792fbd84eb5da54e8 Mon Sep 17 00:00:00 2001 From: nas <156536069+Nas01010101@users.noreply.github.com> Date: Wed, 17 Jun 2026 06:56:25 -0400 Subject: [PATCH] fix(telegram): guard UTF-16 surrogate pairs in outbound chunkers (#93938) Merged via squash. Prepared head SHA: 583b22354d49b0ffd29128deb359f16b663c8343 Co-authored-by: Nas01010101 <156536069+Nas01010101@users.noreply.github.com> Co-authored-by: vincentkoc <25068+vincentkoc@users.noreply.github.com> Reviewed-by: @vincentkoc --- extensions/telegram/src/format.test.ts | 43 ++++++++++++++ extensions/telegram/src/format.ts | 19 +++++++ extensions/telegram/src/send.chunks.test.ts | 57 +++++++++++++++++++ extensions/telegram/src/send.ts | 41 +++++++++++-- .../telegram/src/telegram-outbound.test.ts | 11 ++++ packages/markdown-core/src/chunk-text.ts | 23 +++++++- .../src/render-aware-chunking.test.ts | 22 +++++++ .../src/render-aware-chunking.ts | 8 ++- src/auto-reply/chunk.test.ts | 4 ++ src/shared/text-chunking.ts | 2 +- 10 files changed, 221 insertions(+), 9 deletions(-) create mode 100644 extensions/telegram/src/send.chunks.test.ts diff --git a/extensions/telegram/src/format.test.ts b/extensions/telegram/src/format.test.ts index 9bc78890edb..8d4e5dc189e 100644 --- a/extensions/telegram/src/format.test.ts +++ b/extensions/telegram/src/format.test.ts @@ -424,4 +424,47 @@ describe("markdownToTelegramHtml", () => { it("fails loudly when tag overhead leaves no room for text", () => { expect(() => splitTelegramHtmlChunks("x", 10)).toThrow(/tag overhead/i); }); + + it("does not split an astral char across the chunk boundary", () => { + // Emoji surrogate pair straddles index 10 (limit): high at 9, low at 10. + const input = `${"A".repeat(9)}😀${"B".repeat(20)}`; + const chunks = splitTelegramHtmlChunks(input, 10); + expect(chunks.length).toBeGreaterThan(1); + expect(chunks.join("")).toBe(input); + for (const chunk of chunks) { + expect(containsLoneSurrogate(chunk)).toBe(false); + } + }); + + it("keeps an astral char whole when a positive limit starts on its pair", () => { + expect(splitTelegramHtmlChunks("A😀B", 1)).toEqual(["A", "😀", "B"]); + }); + + it("keeps astral chars whole in rendered Markdown chunks", () => { + const chunks = markdownToTelegramChunks("A😀B", 1); + + expect(chunks.map((chunk) => chunk.text)).toEqual(["A", "😀", "B"]); + for (const chunk of chunks) { + expect(containsLoneSurrogate(chunk.html)).toBe(false); + expect(containsLoneSurrogate(chunk.text)).toBe(false); + } + }); }); + +function containsLoneSurrogate(text: string): boolean { + for (let index = 0; index < text.length; index += 1) { + const code = text.charCodeAt(index); + const isHigh = code >= 0xd800 && code <= 0xdbff; + const isLow = code >= 0xdc00 && code <= 0xdfff; + if (isHigh) { + const next = text.charCodeAt(index + 1); + if (!(next >= 0xdc00 && next <= 0xdfff)) { + return true; + } + index += 1; + } else if (isLow) { + return true; + } + } + return false; +} diff --git a/extensions/telegram/src/format.ts b/extensions/telegram/src/format.ts index 715181efbf0..8de2530ca9d 100644 --- a/extensions/telegram/src/format.ts +++ b/extensions/telegram/src/format.ts @@ -1070,11 +1070,30 @@ function findTelegramHtmlEntityEnd(text: string, start: number): number { return text[index] === ";" ? index : -1; } +// Never return a split index that lands between a UTF-16 surrogate pair, or +// both chunks would carry a lone surrogate that re-encodes to U+FFFD. If the +// pair starts the segment, keep it whole so chunking still advances. +function clampToSurrogateBoundary(text: string, index: number): number { + const high = text.charCodeAt(index - 1); + const low = text.charCodeAt(index); + const splitsPair = + index > 0 && high >= 0xd800 && high <= 0xdbff && low >= 0xdc00 && low <= 0xdfff; + if (!splitsPair) { + return index; + } + return index > 1 ? index - 1 : index + 1; +} + function findTelegramHtmlSafeSplitIndex(text: string, maxLength: number): number { if (text.length <= maxLength) { return text.length; } const normalizedMaxLength = Math.max(1, Math.floor(maxLength)); + const splitIndex = findTelegramHtmlEntitySafeSplitIndex(text, normalizedMaxLength); + return clampToSurrogateBoundary(text, splitIndex); +} + +function findTelegramHtmlEntitySafeSplitIndex(text: string, normalizedMaxLength: number): number { const lastAmpersand = text.lastIndexOf("&", normalizedMaxLength - 1); if (lastAmpersand === -1) { return normalizedMaxLength; diff --git a/extensions/telegram/src/send.chunks.test.ts b/extensions/telegram/src/send.chunks.test.ts new file mode 100644 index 00000000000..c7c8673ba8e --- /dev/null +++ b/extensions/telegram/src/send.chunks.test.ts @@ -0,0 +1,57 @@ +// Telegram tests cover plain-text chunk-splitting behavior. +import { describe, expect, it } from "vitest"; +import { splitTelegramPlainTextChunksForTests } from "./send.js"; + +function containsLoneSurrogate(text: string): boolean { + for (let index = 0; index < text.length; index += 1) { + const code = text.charCodeAt(index); + const isHigh = code >= 0xd800 && code <= 0xdbff; + const isLow = code >= 0xdc00 && code <= 0xdfff; + if (isHigh) { + const next = text.charCodeAt(index + 1); + if (!(next >= 0xdc00 && next <= 0xdfff)) { + return true; + } + index += 1; + } else if (isLow) { + return true; + } + } + return false; +} + +describe("splitTelegramPlainTextChunks", () => { + it("does not split an astral char across the chunk boundary", () => { + // Emoji surrogate pair straddles index 10 (limit): high at 9, low at 10. + const input = `${"A".repeat(9)}😀${"B".repeat(20)}`; + const chunks = splitTelegramPlainTextChunksForTests(input, 10); + expect(chunks.length).toBeGreaterThan(1); + expect(chunks.join("")).toBe(input); + for (const chunk of chunks) { + expect(containsLoneSurrogate(chunk)).toBe(false); + } + }); + + it("does not hang when limit=1 and text starts with an astral char", () => { + // Regression: with limit=1 the clamp would return start (no advance), + // causing the while-loop to spin forever. The surrogate pair must be + // emitted as a unit (2 code units) so the loop always advances. + const input = "😀X"; + const chunks = splitTelegramPlainTextChunksForTests(input, 1); + expect(chunks.join("")).toBe(input); + for (const chunk of chunks) { + expect(containsLoneSurrogate(chunk)).toBe(false); + } + }); + + it("does not hang when limit=1 and an astral char appears mid-string at a chunk boundary", () => { + // 'A' + emoji: with limit=1, second iteration starts at index 1 (high + // surrogate) — same stall condition as above, now mid-string. + const input = "A😀B"; + const chunks = splitTelegramPlainTextChunksForTests(input, 1); + expect(chunks.join("")).toBe(input); + for (const chunk of chunks) { + expect(containsLoneSurrogate(chunk)).toBe(false); + } + }); +}); diff --git a/extensions/telegram/src/send.ts b/extensions/telegram/src/send.ts index 363c1340dd6..33adeeffdb8 100644 --- a/extensions/telegram/src/send.ts +++ b/extensions/telegram/src/send.ts @@ -179,14 +179,40 @@ function resolveTelegramMessageIdOrThrow( throw new Error(`Telegram ${context} returned no message_id`); } +// Pull a chunk end back off a UTF-16 surrogate pair so neither chunk carries a +// lone surrogate that re-encodes to U+FFFD. Mirrors the guard in +// bot/native-quote.ts `truncateUtf16Safe`; shared by both plain-text splitters. +// +// `start` is the beginning of the current chunk — the return value is +// guaranteed to be > start, so callers that loop on `start = end` always +// advance. When clamping would land on `start` (i.e. the surrogate pair begins +// exactly at `start`), we emit both surrogates together (end = start + 2) +// rather than emitting a lone surrogate or stalling. +function surrogateSafeChunkEnd(text: string, end: number, start: number): number { + const high = text.charCodeAt(end - 1); + const low = text.charCodeAt(end); + const splitsPair = end > 0 && high >= 0xd800 && high <= 0xdbff && low >= 0xdc00 && low <= 0xdfff; + if (!splitsPair) { + return end; + } + const clamped = end - 1; + // Guard: never return an index that would stall the loop. If clamped equals + // start the surrogate pair's high unit is the very first char of this chunk; + // emit both surrogates together instead of splitting or stalling. + return clamped > start ? clamped : start + 2; +} + function splitTelegramPlainTextChunks(text: string, limit: number): string[] { if (!text) { return []; } const normalizedLimit = Math.max(1, Math.floor(limit)); const chunks: string[] = []; - for (let start = 0; start < text.length; start += normalizedLimit) { - chunks.push(text.slice(start, start + normalizedLimit)); + let start = 0; + while (start < text.length) { + const end = surrogateSafeChunkEnd(text, start + normalizedLimit, start); + chunks.push(text.slice(start, end)); + start = end; } return chunks; } @@ -209,12 +235,19 @@ function splitTelegramPlainTextFallback(text: string, chunkCount: number, limit: remainingChunks === 1 ? remainingChars : Math.min(normalizedLimit, Math.ceil(remainingChars / remainingChunks)); - chunks.push(text.slice(offset, offset + nextChunkLength)); - offset += nextChunkLength; + const end = surrogateSafeChunkEnd(text, offset + nextChunkLength, offset); + chunks.push(text.slice(offset, end)); + offset = end; } return chunks; } +// Test-only handle: the plain-text splitter is internal, but its surrogate-safe +// chunk boundary needs direct behavior coverage. +export function splitTelegramPlainTextChunksForTests(text: string, limit: number): string[] { + return splitTelegramPlainTextChunks(text, limit); +} + function logTelegramOutboundSendOk(params: TelegramOutboundSuccessLogParams): void { const parts = [ "telegram outbound send ok", diff --git a/extensions/telegram/src/telegram-outbound.test.ts b/extensions/telegram/src/telegram-outbound.test.ts index df0c7360378..e002c0bbd6e 100644 --- a/extensions/telegram/src/telegram-outbound.test.ts +++ b/extensions/telegram/src/telegram-outbound.test.ts @@ -43,6 +43,17 @@ describe("telegramPlugin outbound", () => { expect(telegramOutbound.chunker?.(text, 4000)).toEqual([text]); }); + it("keeps astral characters whole at positive configured chunk limits", () => { + clearTelegramRuntime(); + + expect(telegramOutbound.chunker?.("A😀B", 1)).toEqual(["A", "😀", "B"]); + expect(telegramOutbound.chunker?.("A😀B", 1, { formatting: { parseMode: "HTML" } })).toEqual([ + "A", + "😀", + "B", + ]); + }); + it("preserves markdown tables for the configured delivery renderer", () => { clearTelegramRuntime(); const text = ["| Name | Value |", "|------|-------|", "| A | 1 |"].join("\n"); diff --git a/packages/markdown-core/src/chunk-text.ts b/packages/markdown-core/src/chunk-text.ts index 2c9331a9a04..c0051b0ee87 100644 --- a/packages/markdown-core/src/chunk-text.ts +++ b/packages/markdown-core/src/chunk-text.ts @@ -42,6 +42,23 @@ function scanParenAwareBreakpoints(text: string): { lastNewline: number; lastWhi return { lastNewline, lastWhitespace }; } +/** + * Keeps UTF-16 chunk boundaries from separating a supplementary-plane character. + * A one-unit positive limit still needs to emit an entire surrogate pair. + */ +export function avoidTrailingHighSurrogateBreak(text: string, start: number, end: number): number { + if ( + end >= text.length || + text.charCodeAt(end - 1) < 0xd800 || + text.charCodeAt(end - 1) > 0xdbff || + text.charCodeAt(end) < 0xdc00 || + text.charCodeAt(end) > 0xdfff + ) { + return end; + } + return end - 1 > start ? end - 1 : end + 1; +} + /** * Splits plain text into size-bounded chunks at readable boundaries. * @@ -66,7 +83,11 @@ export function chunkText(text: string, limit: number): string[] { // Prefer block boundaries, then spaces, then a hard size cut when no // readable breakpoint exists inside this window. const breakOffset = lastNewline > 0 ? lastNewline : lastWhitespace; - const end = breakOffset > 0 ? cursor + breakOffset : windowEnd; + const end = avoidTrailingHighSurrogateBreak( + text, + cursor, + breakOffset > 0 ? cursor + breakOffset : windowEnd, + ); chunks.push(text.slice(cursor, end)); cursor = end; while (cursor < text.length && /\s/.test(text[cursor] ?? "")) { diff --git a/packages/markdown-core/src/render-aware-chunking.test.ts b/packages/markdown-core/src/render-aware-chunking.test.ts index 7bd4213e277..e59be20bfe1 100644 --- a/packages/markdown-core/src/render-aware-chunking.test.ts +++ b/packages/markdown-core/src/render-aware-chunking.test.ts @@ -85,6 +85,28 @@ describe("renderMarkdownIRChunksWithinLimit", () => { expect(chunks.every((chunk) => chunk.rendered.length <= 1)).toBe(true); }); + it("keeps astral characters whole when a positive limit reaches their pair", () => { + const chunks = renderMarkdownIRChunksWithinLimit({ + ir: markdownToIR("A😀B"), + limit: 1, + renderChunk: (chunk) => chunk.text, + measureRendered: (rendered) => rendered.length, + }); + + expect(chunks.map((chunk) => chunk.source.text)).toEqual(["A", "😀", "B"]); + }); + + it("keeps astral characters whole when rendered size requires a retry split", () => { + const chunks = renderMarkdownIRChunksWithinLimit({ + ir: markdownToIR("A😀"), + limit: 3, + renderChunk: (chunk) => (chunk.text === "A😀" ? "too long" : chunk.text), + measureRendered: (rendered) => rendered.length, + }); + + expect(chunks.map((chunk) => chunk.source.text)).toEqual(["A", "😀"]); + }); + it("treats Infinity as no size cap and returns a single chunk", () => { const text = "one two three four five six seven eight nine ten"; const ir = markdownToIR(text); diff --git a/packages/markdown-core/src/render-aware-chunking.ts b/packages/markdown-core/src/render-aware-chunking.ts index de045152e3c..37a20da48b2 100644 --- a/packages/markdown-core/src/render-aware-chunking.ts +++ b/packages/markdown-core/src/render-aware-chunking.ts @@ -1,3 +1,4 @@ +import { avoidTrailingHighSurrogateBreak } from "./chunk-text.js"; // Markdown Core module implements render aware chunking behavior. import { chunkMarkdownIR, @@ -127,10 +128,11 @@ function findLargestChunkTextLengthWithinRenderedLimit( // Rendered length is not guaranteed to be monotonic after escaping/link or // file-reference rewriting, so test exact candidates from longest to shortest. for (let candidateLength = currentTextLength - 1; candidateLength >= 1; candidateLength -= 1) { - const candidate = sliceMarkdownIR(chunk, 0, candidateLength); + const safeCandidateLength = avoidTrailingHighSurrogateBreak(chunk.text, 0, candidateLength); + const candidate = sliceMarkdownIR(chunk, 0, safeCandidateLength); const rendered = options.renderChunk(candidate); if (options.measureRendered(rendered) <= renderedLimit) { - return candidateLength; + return safeCandidateLength; } } return 0; @@ -215,7 +217,7 @@ function findMarkdownIRPreservedSplitIndex(text: string, start: number, limit: n if (lastAnyWhitespaceBreak > start) { return resolveWhitespaceBreak(lastAnyWhitespaceBreak, lastAnyWhitespaceRunStart); } - return maxEnd; + return avoidTrailingHighSurrogateBreak(text, start, maxEnd); } function splitMarkdownIRPreserveWhitespace(ir: MarkdownIR, limit: number): MarkdownIR[] { diff --git a/src/auto-reply/chunk.test.ts b/src/auto-reply/chunk.test.ts index 4e102596d05..0f3ddf0ae86 100644 --- a/src/auto-reply/chunk.test.ts +++ b/src/auto-reply/chunk.test.ts @@ -604,6 +604,10 @@ describe("chunkMarkdownTextWithMode", () => { expect(chunks.every((chunk) => !/[\uD800-\uDBFF]$/u.test(chunk))).toBe(true); expect(chunks.every((chunk) => !/^[\uDC00-\uDFFF]/u.test(chunk))).toBe(true); }); + + it("keeps an astral character whole when a positive hard limit starts on its pair", () => { + expect(chunkMarkdownTextWithMode("A😀B", 1, "length")).toEqual(["A", "😀", "B"]); + }); }); describe("resolveChunkMode", () => { diff --git a/src/shared/text-chunking.ts b/src/shared/text-chunking.ts index 95e2d4b8bba..0f7800e520a 100644 --- a/src/shared/text-chunking.ts +++ b/src/shared/text-chunking.ts @@ -16,7 +16,7 @@ export function avoidTrailingHighSurrogateBreak(text: string, start: number, end return end; } const adjusted = end - 1; - return adjusted > start ? adjusted : end; + return adjusted > start ? adjusted : end + 1; } export function chunkTextByBreakResolver(