fix(telegram): guard UTF-16 surrogate pairs in outbound chunkers (#93938)

Merged via squash. Prepared head SHA: 583b22354d Co-authored-by: Nas01010101 <156536069+Nas01010101@users.noreply.github.com> Co-authored-by: vincentkoc <25068+vincentkoc@users.noreply.github.com> Reviewed-by: @vincentkoc
2026-06-22 08:38:13 +00:00 · 2026-06-17 06:56:25 -04:00
parent 5d9c010628
commit df87b40bec
10 changed files with 221 additions and 9 deletions
--- a/extensions/telegram/src/format.test.ts
+++ b/extensions/telegram/src/format.test.ts
@@ -424,4 +424,47 @@ describe("markdownToTelegramHtml", () => {
  it("fails loudly when tag overhead leaves no room for text", () => {
    expect(() => splitTelegramHtmlChunks("<b><i><u>x</u></i></b>", 10)).toThrow(/tag overhead/i);
  });
+
+  it("does not split an astral char across the chunk boundary", () => {
+    // Emoji surrogate pair straddles index 10 (limit): high at 9, low at 10.
+    const input = `${"A".repeat(9)}😀${"B".repeat(20)}`;
+    const chunks = splitTelegramHtmlChunks(input, 10);
+    expect(chunks.length).toBeGreaterThan(1);
+    expect(chunks.join("")).toBe(input);
+    for (const chunk of chunks) {
+      expect(containsLoneSurrogate(chunk)).toBe(false);
+    }
+  });
+
+  it("keeps an astral char whole when a positive limit starts on its pair", () => {
+    expect(splitTelegramHtmlChunks("A😀B", 1)).toEqual(["A", "😀", "B"]);
+  });
+
+  it("keeps astral chars whole in rendered Markdown chunks", () => {
+    const chunks = markdownToTelegramChunks("A😀B", 1);
+
+    expect(chunks.map((chunk) => chunk.text)).toEqual(["A", "😀", "B"]);
+    for (const chunk of chunks) {
+      expect(containsLoneSurrogate(chunk.html)).toBe(false);
+      expect(containsLoneSurrogate(chunk.text)).toBe(false);
+    }
+  });
 });
+
+function containsLoneSurrogate(text: string): boolean { // true when `text` has an unpaired surrogate
+  for (let index = 0; index < text.length; index += 1) {
+    const code = text.charCodeAt(index);
+    const isHigh = code >= 0xd800 && code <= 0xdbff; // high (leading) surrogate range
+    const isLow = code >= 0xdc00 && code <= 0xdfff; // low (trailing) surrogate range
+    if (isHigh) {
+      const next = text.charCodeAt(index + 1); // NaN past the end, which fails the range check
+      if (!(next >= 0xdc00 && next <= 0xdfff)) {
+        return true; // high surrogate not followed by a low surrogate
+      }
+      index += 1; // skip the low half of the valid pair
+    } else if (isLow) {
+      return true; // low surrogate with no high surrogate before it
+    }
+  }
+  return false;
+}
--- a/extensions/telegram/src/format.ts
+++ b/extensions/telegram/src/format.ts
@@ -1070,11 +1070,30 @@ function findTelegramHtmlEntityEnd(text: string, start: number): number {
  return text[index] === ";" ? index : -1;
 }

+// Moves a split index off the middle of a UTF-16 surrogate pair; otherwise each
+// side would keep a lone surrogate that re-encodes to U+FFFD. Normally backs up
+// before the pair; when the pair sits at indices 0-1, returns 2 to keep advancing.
+function clampToSurrogateBoundary(text: string, index: number): number {
+  const high = text.charCodeAt(index - 1);
+  const low = text.charCodeAt(index);
+  const splitsPair =
+    index > 0 && high >= 0xd800 && high <= 0xdbff && low >= 0xdc00 && low <= 0xdfff;
+  if (!splitsPair) {
+    return index;
+  }
+  return index > 1 ? index - 1 : index + 1;
+}
+
 function findTelegramHtmlSafeSplitIndex(text: string, maxLength: number): number {
  if (text.length <= maxLength) {
    return text.length;
  }
  const normalizedMaxLength = Math.max(1, Math.floor(maxLength));
+  const splitIndex = findTelegramHtmlEntitySafeSplitIndex(text, normalizedMaxLength);
+  return clampToSurrogateBoundary(text, splitIndex);
+}
+
+function findTelegramHtmlEntitySafeSplitIndex(text: string, normalizedMaxLength: number): number {
  const lastAmpersand = text.lastIndexOf("&", normalizedMaxLength - 1);
  if (lastAmpersand === -1) {
    return normalizedMaxLength;
--- a/extensions/telegram/src/send.chunks.test.ts
+++ b/extensions/telegram/src/send.chunks.test.ts
@@ -0,0 +1,57 @@
+// Telegram tests cover plain-text chunk-splitting behavior.
+import { describe, expect, it } from "vitest";
+import { splitTelegramPlainTextChunksForTests } from "./send.js";
+
+function containsLoneSurrogate(text: string): boolean { // true when `text` has an unpaired surrogate
+  for (let index = 0; index < text.length; index += 1) {
+    const code = text.charCodeAt(index);
+    const isHigh = code >= 0xd800 && code <= 0xdbff; // high (leading) surrogate range
+    const isLow = code >= 0xdc00 && code <= 0xdfff; // low (trailing) surrogate range
+    if (isHigh) {
+      const next = text.charCodeAt(index + 1); // NaN past the end, which fails the range check
+      if (!(next >= 0xdc00 && next <= 0xdfff)) {
+        return true; // high surrogate not followed by a low surrogate
+      }
+      index += 1; // skip the low half of the valid pair
+    } else if (isLow) {
+      return true; // low surrogate with no high surrogate before it
+    }
+  }
+  return false;
+}
+
+describe("splitTelegramPlainTextChunks", () => {
+  it("does not split an astral char across the chunk boundary", () => {
+    // Nine "A"s put the emoji's surrogates at indices 9 and 10, so a cut at 10 would split it.
+    const input = `${"A".repeat(9)}😀${"B".repeat(20)}`;
+    const chunks = splitTelegramPlainTextChunksForTests(input, 10);
+    expect(chunks.length).toBeGreaterThan(1);
+    expect(chunks.join("")).toBe(input); // chunks concatenate back to the exact input
+    for (const chunk of chunks) {
+      expect(containsLoneSurrogate(chunk)).toBe(false);
+    }
+  });
+
+  it("does not hang when limit=1 and text starts with an astral char", () => {
+    // Regression: with limit=1 the clamp would return start (no advance),
+    // causing the while-loop to spin forever. The surrogate pair must be
+    // emitted as a unit (2 code units) so the loop always advances.
+    const input = "😀X";
+    const chunks = splitTelegramPlainTextChunksForTests(input, 1);
+    expect(chunks.join("")).toBe(input); // chunks concatenate back to the exact input
+    for (const chunk of chunks) {
+      expect(containsLoneSurrogate(chunk)).toBe(false);
+    }
+  });
+
+  it("does not hang when limit=1 and an astral char appears mid-string at a chunk boundary", () => {
+    // 'A' + emoji: with limit=1, second iteration starts at index 1 (high
+    // surrogate) — same stall condition as above, now mid-string.
+    const input = "A😀B";
+    const chunks = splitTelegramPlainTextChunksForTests(input, 1);
+    expect(chunks.join("")).toBe(input); // chunks concatenate back to the exact input
+    for (const chunk of chunks) {
+      expect(containsLoneSurrogate(chunk)).toBe(false);
+    }
+  });
+});
--- a/extensions/telegram/src/send.ts
+++ b/extensions/telegram/src/send.ts
@@ -179,14 +179,40 @@ function resolveTelegramMessageIdOrThrow(
  throw new Error(`Telegram ${context} returned no message_id`);
 }

+// Adjusts a chunk end that falls between the two halves of a UTF-16 surrogate
+// pair so neither chunk carries a lone surrogate that re-encodes to U+FFFD.
+// Mirrors the guard in bot/native-quote.ts `truncateUtf16Safe`; shared by both
+// plain-text splitters.
+//
+// `start` is the beginning of the current chunk and `end` the proposed cut.
+// Given `end > start`, the result is also > start, so callers that loop on
+// `start = end` always advance: the cut normally moves back by one code unit,
+// but a pair beginning exactly at `start` is emitted whole (start + 2).
+function surrogateSafeChunkEnd(text: string, end: number, start: number): number {
+  const high = text.charCodeAt(end - 1);
+  const low = text.charCodeAt(end);
+  const splitsPair = end > 0 && high >= 0xd800 && high <= 0xdbff && low >= 0xdc00 && low <= 0xdfff;
+  if (!splitsPair) {
+    return end;
+  }
+  const clamped = end - 1;
+  // Guard: never return an index that would stall the loop. If clamped equals
+  // start the surrogate pair's high unit is the very first char of this chunk;
+  // emit both surrogates together instead of splitting or stalling.
+  return clamped > start ? clamped : start + 2;
+}
+
 function splitTelegramPlainTextChunks(text: string, limit: number): string[] {
  if (!text) {
    return [];
  }
  const normalizedLimit = Math.max(1, Math.floor(limit));
  const chunks: string[] = [];
-  for (let start = 0; start < text.length; start += normalizedLimit) {
-    chunks.push(text.slice(start, start + normalizedLimit));
+  let start = 0;
+  while (start < text.length) {
+    const end = surrogateSafeChunkEnd(text, start + normalizedLimit, start);
+    chunks.push(text.slice(start, end));
+    start = end;
  }
  return chunks;
 }
@@ -209,12 +235,19 @@ function splitTelegramPlainTextFallback(text: string, chunkCount: number, limit:
      remainingChunks === 1
        ? remainingChars
        : Math.min(normalizedLimit, Math.ceil(remainingChars / remainingChunks));
-    chunks.push(text.slice(offset, offset + nextChunkLength));
-    offset += nextChunkLength;
+    const end = surrogateSafeChunkEnd(text, offset + nextChunkLength, offset);
+    chunks.push(text.slice(offset, end));
+    offset = end;
  }
  return chunks;
 }

+// Test-only export: passes its arguments straight through to the non-exported
+// splitTelegramPlainTextChunks so tests can assert its surrogate-safe boundaries.
+export function splitTelegramPlainTextChunksForTests(text: string, limit: number): string[] {
+  return splitTelegramPlainTextChunks(text, limit);
+}
+
 function logTelegramOutboundSendOk(params: TelegramOutboundSuccessLogParams): void {
  const parts = [
    "telegram outbound send ok",
--- a/extensions/telegram/src/telegram-outbound.test.ts
+++ b/extensions/telegram/src/telegram-outbound.test.ts
@@ -43,6 +43,17 @@ describe("telegramPlugin outbound", () => {
    expect(telegramOutbound.chunker?.(text, 4000)).toEqual([text]);
  });

+  it("keeps astral characters whole at positive configured chunk limits", () => {
+    clearTelegramRuntime();
+
+    expect(telegramOutbound.chunker?.("A😀B", 1)).toEqual(["A", "😀", "B"]);
+    expect(telegramOutbound.chunker?.("A😀B", 1, { formatting: { parseMode: "HTML" } })).toEqual([
+      "A",
+      "😀",
+      "B",
+    ]);
+  });
+
  it("preserves markdown tables for the configured delivery renderer", () => {
    clearTelegramRuntime();
    const text = ["| Name | Value |", "|------|-------|", "| A | 1 |"].join("\n");
--- a/packages/markdown-core/src/chunk-text.ts
+++ b/packages/markdown-core/src/chunk-text.ts
@@ -42,6 +42,23 @@ function scanParenAwareBreakpoints(text: string): { lastNewline: number; lastWhi
  return { lastNewline, lastWhitespace };
 }

+/**
+ * Keeps a chunk boundary from splitting a UTF-16 surrogate pair: returns `end - 1` when
+ * that is still past `start`, else `end + 1` (whole pair); other `end` values pass through.
+ */
+export function avoidTrailingHighSurrogateBreak(text: string, start: number, end: number): number {
+  if (
+    end >= text.length ||
+    text.charCodeAt(end - 1) < 0xd800 ||
+    text.charCodeAt(end - 1) > 0xdbff ||
+    text.charCodeAt(end) < 0xdc00 ||
+    text.charCodeAt(end) > 0xdfff
+  ) {
+    return end;
+  }
+  return end - 1 > start ? end - 1 : end + 1;
+}
+
 /**
 * Splits plain text into size-bounded chunks at readable boundaries.
 *
@@ -66,7 +83,11 @@ export function chunkText(text: string, limit: number): string[] {
    // Prefer block boundaries, then spaces, then a hard size cut when no
    // readable breakpoint exists inside this window.
    const breakOffset = lastNewline > 0 ? lastNewline : lastWhitespace;
-    const end = breakOffset > 0 ? cursor + breakOffset : windowEnd;
+    const end = avoidTrailingHighSurrogateBreak(
+      text,
+      cursor,
+      breakOffset > 0 ? cursor + breakOffset : windowEnd,
+    );
    chunks.push(text.slice(cursor, end));
    cursor = end;
    while (cursor < text.length && /\s/.test(text[cursor] ?? "")) {
--- a/packages/markdown-core/src/render-aware-chunking.test.ts
+++ b/packages/markdown-core/src/render-aware-chunking.test.ts
@@ -85,6 +85,28 @@ describe("renderMarkdownIRChunksWithinLimit", () => {
    expect(chunks.every((chunk) => chunk.rendered.length <= 1)).toBe(true);
  });

+  it("keeps astral characters whole when a positive limit reaches their pair", () => {
+    const chunks = renderMarkdownIRChunksWithinLimit({
+      ir: markdownToIR("A😀B"),
+      limit: 1,
+      renderChunk: (chunk) => chunk.text,
+      measureRendered: (rendered) => rendered.length,
+    });
+
+    expect(chunks.map((chunk) => chunk.source.text)).toEqual(["A", "😀", "B"]);
+  });
+
+  it("keeps astral characters whole when rendered size requires a retry split", () => {
+    const chunks = renderMarkdownIRChunksWithinLimit({
+      ir: markdownToIR("A😀"),
+      limit: 3,
+      renderChunk: (chunk) => (chunk.text === "A😀" ? "too long" : chunk.text),
+      measureRendered: (rendered) => rendered.length,
+    });
+
+    expect(chunks.map((chunk) => chunk.source.text)).toEqual(["A", "😀"]);
+  });
+
  it("treats Infinity as no size cap and returns a single chunk", () => {
    const text = "one two three four five six seven eight nine ten";
    const ir = markdownToIR(text);
--- a/packages/markdown-core/src/render-aware-chunking.ts
+++ b/packages/markdown-core/src/render-aware-chunking.ts
@@ -1,3 +1,4 @@
+import { avoidTrailingHighSurrogateBreak } from "./chunk-text.js";
 // Markdown Core module implements render aware chunking behavior.
 import {
  chunkMarkdownIR,
@@ -127,10 +128,11 @@ function findLargestChunkTextLengthWithinRenderedLimit<TRendered>(
  // Rendered length is not guaranteed to be monotonic after escaping/link or
  // file-reference rewriting, so test exact candidates from longest to shortest.
  for (let candidateLength = currentTextLength - 1; candidateLength >= 1; candidateLength -= 1) {
-    const candidate = sliceMarkdownIR(chunk, 0, candidateLength);
+    const safeCandidateLength = avoidTrailingHighSurrogateBreak(chunk.text, 0, candidateLength);
+    const candidate = sliceMarkdownIR(chunk, 0, safeCandidateLength);
    const rendered = options.renderChunk(candidate);
    if (options.measureRendered(rendered) <= renderedLimit) {
-      return candidateLength;
+      return safeCandidateLength;
    }
  }
  return 0;
@@ -215,7 +217,7 @@ function findMarkdownIRPreservedSplitIndex(text: string, start: number, limit: n
  if (lastAnyWhitespaceBreak > start) {
    return resolveWhitespaceBreak(lastAnyWhitespaceBreak, lastAnyWhitespaceRunStart);
  }
-  return maxEnd;
+  return avoidTrailingHighSurrogateBreak(text, start, maxEnd);
 }

 function splitMarkdownIRPreserveWhitespace(ir: MarkdownIR, limit: number): MarkdownIR[] {
--- a/src/auto-reply/chunk.test.ts
+++ b/src/auto-reply/chunk.test.ts
@@ -604,6 +604,10 @@ describe("chunkMarkdownTextWithMode", () => {
    expect(chunks.every((chunk) => !/[\uD800-\uDBFF]$/u.test(chunk))).toBe(true);
    expect(chunks.every((chunk) => !/^[\uDC00-\uDFFF]/u.test(chunk))).toBe(true);
  });
+
+  it("keeps an astral character whole when a positive hard limit starts on its pair", () => {
+    expect(chunkMarkdownTextWithMode("A😀B", 1, "length")).toEqual(["A", "😀", "B"]);
+  });
 });

 describe("resolveChunkMode", () => {
--- a/src/shared/text-chunking.ts
+++ b/src/shared/text-chunking.ts
@@ -16,7 +16,7 @@ export function avoidTrailingHighSurrogateBreak(text: string, start: number, end
    return end;
  }
  const adjusted = end - 1;
-  return adjusted > start ? adjusted : end;
+  return adjusted > start ? adjusted : end + 1;
 }

 export function chunkTextByBreakResolver(