fix(telegram): guard UTF-16 surrogate pairs in outbound chunkers (#93938)

Merged via squash.

Prepared head SHA: 583b22354d
Co-authored-by: Nas01010101 <156536069+Nas01010101@users.noreply.github.com>
Co-authored-by: vincentkoc <25068+vincentkoc@users.noreply.github.com>
Reviewed-by: @vincentkoc
This commit is contained in:
nas
2026-06-17 06:56:25 -04:00
committed by GitHub
parent 5d9c010628
commit df87b40bec
10 changed files with 221 additions and 9 deletions

View File

@@ -424,4 +424,47 @@ describe("markdownToTelegramHtml", () => {
it("fails loudly when tag overhead leaves no room for text", () => {
  // Three nested tags around a single character, split with a limit of 10.
  const splitNestedTags = (): unknown => splitTelegramHtmlChunks("<b><i><u>x</u></i></b>", 10);
  expect(splitNestedTags).toThrow(/tag overhead/i);
});
it("does not split an astral char across the chunk boundary", () => {
  // With a limit of 10 the emoji's high surrogate sits at index 9 and its low
  // surrogate at index 10, so a cut at the limit would land inside the pair.
  const prefix = "A".repeat(9);
  const suffix = "B".repeat(20);
  const input = `${prefix}😀${suffix}`;
  const chunks = splitTelegramHtmlChunks(input, 10);
  expect(chunks.length).toBeGreaterThan(1);
  // Chunking must be lossless: the pieces reassemble to the original text.
  expect(chunks.join("")).toBe(input);
  chunks.forEach((chunk) => {
    expect(containsLoneSurrogate(chunk)).toBe(false);
  });
});
it("keeps an astral char whole when a positive limit starts on its pair", () => {
  // A limit of 1 is smaller than the two-unit emoji; it must still come out whole.
  const chunks = splitTelegramHtmlChunks("A😀B", 1);
  expect(chunks).toEqual(["A", "😀", "B"]);
});
it("keeps astral chars whole in rendered Markdown chunks", () => {
  const chunks = markdownToTelegramChunks("A😀B", 1);
  const texts = chunks.map(({ text }) => text);
  expect(texts).toEqual(["A", "😀", "B"]);
  // Both the rendered HTML and the plain text of every chunk must be well-formed UTF-16.
  for (const { html, text } of chunks) {
    expect(containsLoneSurrogate(html)).toBe(false);
    expect(containsLoneSurrogate(text)).toBe(false);
  }
});
});
/**
 * Reports whether `text` contains a UTF-16 surrogate code unit that is not
 * part of a well-formed high/low pair.
 *
 * @param text - String to scan, code unit by code unit.
 * @returns `true` when an unpaired high or low surrogate is found.
 */
function containsLoneSurrogate(text: string): boolean {
  const isHighSurrogate = (unit: number): boolean => unit >= 0xd800 && unit <= 0xdbff;
  const isLowSurrogate = (unit: number): boolean => unit >= 0xdc00 && unit <= 0xdfff;
  let position = 0;
  while (position < text.length) {
    const unit = text.charCodeAt(position);
    if (isHighSurrogate(unit)) {
      // Past the end of the string charCodeAt yields NaN, which fails the
      // range check, so a trailing high surrogate is reported as lone.
      if (!isLowSurrogate(text.charCodeAt(position + 1))) {
        return true;
      }
      // Well-formed pair: skip both halves.
      position += 2;
      continue;
    }
    if (isLowSurrogate(unit)) {
      // A low surrogate reached here was not consumed by a preceding high one.
      return true;
    }
    position += 1;
  }
  return false;
}

View File

@@ -1070,11 +1070,30 @@ function findTelegramHtmlEntityEnd(text: string, start: number): number {
return text[index] === ";" ? index : -1;
}
// A split index must never fall between the two halves of a UTF-16 surrogate
// pair; otherwise each side keeps a lone surrogate that re-encodes to U+FFFD.
// A pair sitting at the very start of the segment is kept whole instead, so
// chunking still advances.
function clampToSurrogateBoundary(text: string, index: number): number {
  const isHighSurrogate = (unit: number): boolean => unit >= 0xd800 && unit <= 0xdbff;
  const isLowSurrogate = (unit: number): boolean => unit >= 0xdc00 && unit <= 0xdfff;
  // Index 0 (or anything not above it) has no preceding unit to split from.
  if (!(index > 0)) {
    return index;
  }
  if (!isHighSurrogate(text.charCodeAt(index - 1))) {
    return index;
  }
  if (!isLowSurrogate(text.charCodeAt(index))) {
    return index;
  }
  if (index > 1) {
    // Step back so the whole pair moves into the following chunk.
    return index - 1;
  }
  // The pair opens the segment: step past it rather than back to zero.
  return index + 1;
}
/**
 * Picks the index at which `text` should be cut so the leading piece respects
 * `maxLength`.
 *
 * Text that already fits is returned whole. Otherwise the limit is floored and
 * raised to at least 1, an entity-aware split index is computed, and that
 * index is finally adjusted so it does not fall inside a surrogate pair.
 *
 * @param text - Remaining text to split.
 * @param maxLength - Requested maximum length, in UTF-16 code units.
 * @returns The index where the current piece should end.
 */
function findTelegramHtmlSafeSplitIndex(text: string, maxLength: number): number {
  const fitsWhole = text.length <= maxLength;
  if (fitsWhole) {
    return text.length;
  }
  const budget = Math.max(1, Math.floor(maxLength));
  return clampToSurrogateBoundary(text, findTelegramHtmlEntitySafeSplitIndex(text, budget));
}
function findTelegramHtmlEntitySafeSplitIndex(text: string, normalizedMaxLength: number): number {
const lastAmpersand = text.lastIndexOf("&", normalizedMaxLength - 1);
if (lastAmpersand === -1) {
return normalizedMaxLength;

View File

@@ -0,0 +1,57 @@
// Telegram tests cover plain-text chunk-splitting behavior.
import { describe, expect, it } from "vitest";
import { splitTelegramPlainTextChunksForTests } from "./send.js";
/**
 * Reports whether `text` contains a UTF-16 surrogate code unit that is not
 * part of a well-formed high/low pair.
 *
 * @param text - String to scan.
 * @returns `true` when an unpaired high or low surrogate is found.
 */
function containsLoneSurrogate(text: string): boolean {
  // String iteration walks code points: a well-formed pair arrives as one
  // two-unit string, while an unpaired surrogate arrives on its own.
  for (const symbol of text) {
    if (symbol.length !== 1) {
      continue;
    }
    const unit = symbol.charCodeAt(0);
    if (unit >= 0xd800 && unit <= 0xdfff) {
      return true;
    }
  }
  return false;
}
describe("splitTelegramPlainTextChunks", () => {
  // Shared expectations: the chunks reassemble to the input, and no chunk
  // carries half of a surrogate pair.
  const expectLosslessWellFormed = (chunks: string[], input: string): void => {
    expect(chunks.join("")).toBe(input);
    chunks.forEach((chunk) => {
      expect(containsLoneSurrogate(chunk)).toBe(false);
    });
  };

  it("does not split an astral char across the chunk boundary", () => {
    // With a limit of 10 the emoji's high surrogate sits at index 9 and its
    // low surrogate at index 10.
    const prefix = "A".repeat(9);
    const suffix = "B".repeat(20);
    const input = `${prefix}😀${suffix}`;
    const chunks = splitTelegramPlainTextChunksForTests(input, 10);
    expect(chunks.length).toBeGreaterThan(1);
    expectLosslessWellFormed(chunks, input);
  });

  it("does not hang when limit=1 and text starts with an astral char", () => {
    // Regression: with limit=1 the clamp would return start (no advance),
    // spinning the while-loop forever. The pair has to be emitted as a unit
    // (2 code units) so the loop always advances.
    const input = "😀X";
    expectLosslessWellFormed(splitTelegramPlainTextChunksForTests(input, 1), input);
  });

  it("does not hang when limit=1 and an astral char appears mid-string at a chunk boundary", () => {
    // 'A' + emoji: with limit=1 the second iteration starts at index 1, on
    // the high surrogate — the same stall condition, now mid-string.
    const input = "A😀B";
    expectLosslessWellFormed(splitTelegramPlainTextChunksForTests(input, 1), input);
  });
});

View File

@@ -179,14 +179,40 @@ function resolveTelegramMessageIdOrThrow(
throw new Error(`Telegram ${context} returned no message_id`);
}
// Moves a chunk end off the middle of a UTF-16 surrogate pair so that neither
// side of the cut carries a lone surrogate (which re-encodes to U+FFFD).
// Mirrors the guard in bot/native-quote.ts `truncateUtf16Safe`; shared by both
// plain-text splitters.
//
// `start` is where the current chunk begins. Normally the end is pulled back
// by one unit, in front of the pair. When that would land on `start` — the
// pair is the first thing in the chunk — both surrogates are emitted together
// (start + 2) so callers looping on `start = end` never stall and never emit
// half a pair.
function surrogateSafeChunkEnd(text: string, end: number, start: number): number {
  const isHighSurrogate = (unit: number): boolean => unit >= 0xd800 && unit <= 0xdbff;
  const isLowSurrogate = (unit: number): boolean => unit >= 0xdc00 && unit <= 0xdfff;
  const cutsThroughPair =
    end > 0 && isHighSurrogate(text.charCodeAt(end - 1)) && isLowSurrogate(text.charCodeAt(end));
  if (cutsThroughPair) {
    const pulledBack = end - 1;
    if (pulledBack > start) {
      return pulledBack;
    }
    // Pulling back would produce an empty chunk: take the whole pair instead.
    return start + 2;
  }
  return end;
}
/**
 * Splits plain text into consecutive chunks of roughly `limit` UTF-16 code
 * units without ever cutting through a surrogate pair.
 *
 * Each chunk end is passed through `surrogateSafeChunkEnd`, so a chunk may be
 * one unit shorter than the limit (the pair moves to the next chunk) or, when
 * the pair opens the chunk, one unit longer than a limit of 1.
 *
 * @param text - Text to split; an empty string yields no chunks.
 * @param limit - Requested chunk size; floored and raised to at least 1.
 * @returns The chunks in order; joined together they reproduce `text`.
 */
function splitTelegramPlainTextChunks(text: string, limit: number): string[] {
  if (!text) {
    return [];
  }
  const normalizedLimit = Math.max(1, Math.floor(limit));
  const chunks: string[] = [];
  // A single surrogate-aware loop: the fixed-stride slice loop that used to
  // live here cut blindly at `start + normalizedLimit` and must not run
  // alongside it, or the first slice is emitted unguarded and duplicated.
  let start = 0;
  while (start < text.length) {
    const end = surrogateSafeChunkEnd(text, start + normalizedLimit, start);
    chunks.push(text.slice(start, end));
    start = end;
  }
  return chunks;
}
@@ -209,12 +235,19 @@ function splitTelegramPlainTextFallback(text: string, chunkCount: number, limit:
remainingChunks === 1
? remainingChars
: Math.min(normalizedLimit, Math.ceil(remainingChars / remainingChunks));
chunks.push(text.slice(offset, offset + nextChunkLength));
offset += nextChunkLength;
const end = surrogateSafeChunkEnd(text, offset + nextChunkLength, offset);
chunks.push(text.slice(offset, end));
offset = end;
}
return chunks;
}
// Test-only handle: the plain-text splitter is internal, but its surrogate-safe
// chunk boundary needs direct behavior coverage.
/**
 * Forwards to the internal `splitTelegramPlainTextChunks` unchanged.
 *
 * @param text - Text to split.
 * @param limit - Chunk size limit, passed through as-is.
 * @returns Whatever `splitTelegramPlainTextChunks` returns for the same arguments.
 */
export function splitTelegramPlainTextChunksForTests(text: string, limit: number): string[] {
return splitTelegramPlainTextChunks(text, limit);
}
function logTelegramOutboundSendOk(params: TelegramOutboundSuccessLogParams): void {
const parts = [
"telegram outbound send ok",

View File

@@ -43,6 +43,17 @@ describe("telegramPlugin outbound", () => {
expect(telegramOutbound.chunker?.(text, 4000)).toEqual([text]);
});
it("keeps astral characters whole at positive configured chunk limits", () => {
  clearTelegramRuntime();
  const wholeChunks = ["A", "😀", "B"];
  // Default chunker call, no formatting options.
  const plainChunks = telegramOutbound.chunker?.("A😀B", 1);
  expect(plainChunks).toEqual(wholeChunks);
  // Same input with HTML parse mode requested.
  const htmlChunks = telegramOutbound.chunker?.("A😀B", 1, { formatting: { parseMode: "HTML" } });
  expect(htmlChunks).toEqual(wholeChunks);
});
it("preserves markdown tables for the configured delivery renderer", () => {
clearTelegramRuntime();
const text = ["| Name | Value |", "|------|-------|", "| A | 1 |"].join("\n");

View File

@@ -42,6 +42,23 @@ function scanParenAwareBreakpoints(text: string): { lastNewline: number; lastWhi
return { lastNewline, lastWhitespace };
}
/**
 * Adjusts a UTF-16 chunk boundary so it does not separate the two halves of a
 * supplementary-plane character.
 *
 * The boundary is normally pulled back in front of the pair. When that would
 * leave nothing after `start`, it is pushed past the pair instead, so even a
 * one-unit limit emits the whole character.
 *
 * @param text - Text being chunked.
 * @param start - Index where the current chunk begins.
 * @param end - Proposed exclusive end index of the current chunk.
 * @returns `end` itself, or an adjusted index that keeps the pair together.
 */
export function avoidTrailingHighSurrogateBreak(text: string, start: number, end: number): number {
  // Nothing follows the boundary, so there is no pair to cut through.
  if (end >= text.length) {
    return end;
  }
  const unitBeforeBreak = text.charCodeAt(end - 1);
  if (unitBeforeBreak < 0xd800 || unitBeforeBreak > 0xdbff) {
    return end;
  }
  const unitAfterBreak = text.charCodeAt(end);
  if (unitAfterBreak < 0xdc00 || unitAfterBreak > 0xdfff) {
    return end;
  }
  const pulledBack = end - 1;
  if (pulledBack > start) {
    return pulledBack;
  }
  return end + 1;
}
/**
* Splits plain text into size-bounded chunks at readable boundaries.
*
@@ -66,7 +83,11 @@ export function chunkText(text: string, limit: number): string[] {
// Prefer block boundaries, then spaces, then a hard size cut when no
// readable breakpoint exists inside this window.
const breakOffset = lastNewline > 0 ? lastNewline : lastWhitespace;
const end = breakOffset > 0 ? cursor + breakOffset : windowEnd;
const end = avoidTrailingHighSurrogateBreak(
text,
cursor,
breakOffset > 0 ? cursor + breakOffset : windowEnd,
);
chunks.push(text.slice(cursor, end));
cursor = end;
while (cursor < text.length && /\s/.test(text[cursor] ?? "")) {

View File

@@ -85,6 +85,28 @@ describe("renderMarkdownIRChunksWithinLimit", () => {
expect(chunks.every((chunk) => chunk.rendered.length <= 1)).toBe(true);
});
it("keeps astral characters whole when a positive limit reaches their pair", () => {
  const ir = markdownToIR("A😀B");
  // Identity rendering measured in UTF-16 code units, with a limit smaller
  // than the two-unit emoji.
  const chunks = renderMarkdownIRChunksWithinLimit({
    ir,
    limit: 1,
    renderChunk: (chunk) => chunk.text,
    measureRendered: (rendered) => rendered.length,
  });
  const sourceTexts = chunks.map((chunk) => chunk.source.text);
  expect(sourceTexts).toEqual(["A", "😀", "B"]);
});
it("keeps astral characters whole when rendered size requires a retry split", () => {
  const ir = markdownToIR("A😀");
  const chunks = renderMarkdownIRChunksWithinLimit({
    ir,
    limit: 3,
    // The full text fits the limit as source but renders over it, forcing a
    // second, smaller split attempt.
    renderChunk: (chunk) => {
      if (chunk.text === "A😀") {
        return "too long";
      }
      return chunk.text;
    },
    measureRendered: (rendered) => rendered.length,
  });
  const sourceTexts = chunks.map((chunk) => chunk.source.text);
  expect(sourceTexts).toEqual(["A", "😀"]);
});
it("treats Infinity as no size cap and returns a single chunk", () => {
const text = "one two three four five six seven eight nine ten";
const ir = markdownToIR(text);

View File

@@ -1,3 +1,4 @@
import { avoidTrailingHighSurrogateBreak } from "./chunk-text.js";
// Markdown Core module implements render aware chunking behavior.
import {
chunkMarkdownIR,
@@ -127,10 +128,11 @@ function findLargestChunkTextLengthWithinRenderedLimit<TRendered>(
// Rendered length is not guaranteed to be monotonic after escaping/link or
// file-reference rewriting, so test exact candidates from longest to shortest.
for (let candidateLength = currentTextLength - 1; candidateLength >= 1; candidateLength -= 1) {
const candidate = sliceMarkdownIR(chunk, 0, candidateLength);
const safeCandidateLength = avoidTrailingHighSurrogateBreak(chunk.text, 0, candidateLength);
const candidate = sliceMarkdownIR(chunk, 0, safeCandidateLength);
const rendered = options.renderChunk(candidate);
if (options.measureRendered(rendered) <= renderedLimit) {
return candidateLength;
return safeCandidateLength;
}
}
return 0;
@@ -215,7 +217,7 @@ function findMarkdownIRPreservedSplitIndex(text: string, start: number, limit: n
if (lastAnyWhitespaceBreak > start) {
return resolveWhitespaceBreak(lastAnyWhitespaceBreak, lastAnyWhitespaceRunStart);
}
return maxEnd;
return avoidTrailingHighSurrogateBreak(text, start, maxEnd);
}
function splitMarkdownIRPreserveWhitespace(ir: MarkdownIR, limit: number): MarkdownIR[] {

View File

@@ -604,6 +604,10 @@ describe("chunkMarkdownTextWithMode", () => {
expect(chunks.every((chunk) => !/[\uD800-\uDBFF]$/u.test(chunk))).toBe(true);
expect(chunks.every((chunk) => !/^[\uDC00-\uDFFF]/u.test(chunk))).toBe(true);
});
it("keeps an astral character whole when a positive hard limit starts on its pair", () => {
  // "length" mode with a limit of 1: the two-unit emoji must still come out whole.
  const chunks = chunkMarkdownTextWithMode("A😀B", 1, "length");
  expect(chunks).toEqual(["A", "😀", "B"]);
});
});
describe("resolveChunkMode", () => {

View File

@@ -16,7 +16,7 @@ export function avoidTrailingHighSurrogateBreak(text: string, start: number, end
return end;
}
const adjusted = end - 1;
return adjusted > start ? adjusted : end;
return adjusted > start ? adjusted : end + 1;
}
export function chunkTextByBreakResolver(