From df87b40bec5164de02dbbb0792fbd84eb5da54e8 Mon Sep 17 00:00:00 2001
From: nas <156536069+Nas01010101@users.noreply.github.com>
Date: Wed, 17 Jun 2026 06:56:25 -0400
Subject: [PATCH] fix(telegram): guard UTF-16 surrogate pairs in outbound
 chunkers (#93938)

Merged via squash.

Prepared head SHA: 583b22354d49b0ffd29128deb359f16b663c8343
Co-authored-by: Nas01010101 <156536069+Nas01010101@users.noreply.github.com>
Co-authored-by: vincentkoc <25068+vincentkoc@users.noreply.github.com>
Reviewed-by: @vincentkoc
---
 extensions/telegram/src/format.test.ts        | 43 ++++++++++++++
 extensions/telegram/src/format.ts             | 19 +++++++
 extensions/telegram/src/send.chunks.test.ts   | 57 +++++++++++++++++++
 extensions/telegram/src/send.ts               | 41 +++++++++++--
 .../telegram/src/telegram-outbound.test.ts    | 11 ++++
 packages/markdown-core/src/chunk-text.ts      | 23 +++++++-
 .../src/render-aware-chunking.test.ts         | 22 +++++++
 .../src/render-aware-chunking.ts              |  8 ++-
 src/auto-reply/chunk.test.ts                  |  4 ++
 src/shared/text-chunking.ts                   |  2 +-
 10 files changed, 221 insertions(+), 9 deletions(-)
 create mode 100644 extensions/telegram/src/send.chunks.test.ts
diff --git a/extensions/telegram/src/format.test.ts b/extensions/telegram/src/format.test.ts
index 9bc78890edb..8d4e5dc189e 100644
--- a/extensions/telegram/src/format.test.ts
+++ b/extensions/telegram/src/format.test.ts
@@ -424,4 +424,47 @@ describe("markdownToTelegramHtml", () => {
   it("fails loudly when tag overhead leaves no room for text", () => {
     expect(() => splitTelegramHtmlChunks("<b><i><u>x</u></i></b>", 10)).toThrow(/tag overhead/i);
   });
+
+  it("does not split an astral char across the chunk boundary", () => {
+    // Emoji surrogate pair straddles index 10 (limit): high at 9, low at 10.
+    const input = `${"A".repeat(9)}😀${"B".repeat(20)}`;
+    const chunks = splitTelegramHtmlChunks(input, 10);
+    expect(chunks.length).toBeGreaterThan(1);
+    expect(chunks.join("")).toBe(input);
+    for (const chunk of chunks) {
+      expect(containsLoneSurrogate(chunk)).toBe(false);
+    }
+  });
+
+  it("keeps an astral char whole when a positive limit starts on its pair", () => {
+    expect(splitTelegramHtmlChunks("A😀B", 1)).toEqual(["A", "😀", "B"]);
+  });
+
+  it("keeps astral chars whole in rendered Markdown chunks", () => {
+    const chunks = markdownToTelegramChunks("A😀B", 1);
+
+    expect(chunks.map((chunk) => chunk.text)).toEqual(["A", "😀", "B"]);
+    for (const chunk of chunks) {
+      expect(containsLoneSurrogate(chunk.html)).toBe(false);
+      expect(containsLoneSurrogate(chunk.text)).toBe(false);
+    }
+  });
 });
+
+function containsLoneSurrogate(text: string): boolean {
+  for (let index = 0; index < text.length; index += 1) {
+    const code = text.charCodeAt(index);
+    const isHigh = code >= 0xd800 && code <= 0xdbff;
+    const isLow = code >= 0xdc00 && code <= 0xdfff;
+    if (isHigh) {
+      const next = text.charCodeAt(index + 1);
+      if (!(next >= 0xdc00 && next <= 0xdfff)) {
+        return true;
+      }
+      index += 1;
+    } else if (isLow) {
+      return true;
+    }
+  }
+  return false;
+}
diff --git a/extensions/telegram/src/format.ts b/extensions/telegram/src/format.ts
index 715181efbf0..8de2530ca9d 100644
--- a/extensions/telegram/src/format.ts
+++ b/extensions/telegram/src/format.ts
@@ -1070,11 +1070,30 @@ function findTelegramHtmlEntityEnd(text: string, start: number): number {
   return text[index] === ";" ? index : -1;
 }
 
+// Never return a split index that lands between a UTF-16 surrogate pair, or
+// both chunks would carry a lone surrogate that re-encodes to U+FFFD. If the
+// pair starts the segment, keep it whole so chunking still advances.
+function clampToSurrogateBoundary(text: string, index: number): number {
+  const high = text.charCodeAt(index - 1);
+  const low = text.charCodeAt(index);
+  const splitsPair =
+    index > 0 && high >= 0xd800 && high <= 0xdbff && low >= 0xdc00 && low <= 0xdfff;
+  if (!splitsPair) {
+    return index;
+  }
+  return index > 1 ? index - 1 : index + 1;
+}
+
 function findTelegramHtmlSafeSplitIndex(text: string, maxLength: number): number {
   if (text.length <= maxLength) {
     return text.length;
   }
   const normalizedMaxLength = Math.max(1, Math.floor(maxLength));
+  const splitIndex = findTelegramHtmlEntitySafeSplitIndex(text, normalizedMaxLength);
+  return clampToSurrogateBoundary(text, splitIndex);
+}
+
+function findTelegramHtmlEntitySafeSplitIndex(text: string, normalizedMaxLength: number): number {
   const lastAmpersand = text.lastIndexOf("&", normalizedMaxLength - 1);
   if (lastAmpersand === -1) {
     return normalizedMaxLength;
diff --git a/extensions/telegram/src/send.chunks.test.ts b/extensions/telegram/src/send.chunks.test.ts
new file mode 100644
index 00000000000..c7c8673ba8e
--- /dev/null
+++ b/extensions/telegram/src/send.chunks.test.ts
@@ -0,0 +1,57 @@
+// Telegram tests cover plain-text chunk-splitting behavior.
+import { describe, expect, it } from "vitest";
+import { splitTelegramPlainTextChunksForTests } from "./send.js";
+
+function containsLoneSurrogate(text: string): boolean {
+  for (let index = 0; index < text.length; index += 1) {
+    const code = text.charCodeAt(index);
+    const isHigh = code >= 0xd800 && code <= 0xdbff;
+    const isLow = code >= 0xdc00 && code <= 0xdfff;
+    if (isHigh) {
+      const next = text.charCodeAt(index + 1);
+      if (!(next >= 0xdc00 && next <= 0xdfff)) {
+        return true;
+      }
+      index += 1;
+    } else if (isLow) {
+      return true;
+    }
+  }
+  return false;
+}
+
+describe("splitTelegramPlainTextChunks", () => {
+  it("does not split an astral char across the chunk boundary", () => {
+    // Emoji surrogate pair straddles index 10 (limit): high at 9, low at 10.
+    const input = `${"A".repeat(9)}😀${"B".repeat(20)}`;
+    const chunks = splitTelegramPlainTextChunksForTests(input, 10);
+    expect(chunks.length).toBeGreaterThan(1);
+    expect(chunks.join("")).toBe(input);
+    for (const chunk of chunks) {
+      expect(containsLoneSurrogate(chunk)).toBe(false);
+    }
+  });
+
+  it("does not hang when limit=1 and text starts with an astral char", () => {
+    // Regression: with limit=1 the clamp would return start (no advance),
+    // causing the while-loop to spin forever. The surrogate pair must be
+    // emitted as a unit (2 code units) so the loop always advances.
+    const input = "😀X";
+    const chunks = splitTelegramPlainTextChunksForTests(input, 1);
+    expect(chunks.join("")).toBe(input);
+    for (const chunk of chunks) {
+      expect(containsLoneSurrogate(chunk)).toBe(false);
+    }
+  });
+
+  it("does not hang when limit=1 and an astral char appears mid-string at a chunk boundary", () => {
+    // 'A' + emoji: with limit=1, second iteration starts at index 1 (high
+    // surrogate) — same stall condition as above, now mid-string.
+    const input = "A😀B";
+    const chunks = splitTelegramPlainTextChunksForTests(input, 1);
+    expect(chunks.join("")).toBe(input);
+    for (const chunk of chunks) {
+      expect(containsLoneSurrogate(chunk)).toBe(false);
+    }
+  });
+});
diff --git a/extensions/telegram/src/send.ts b/extensions/telegram/src/send.ts
index 363c1340dd6..33adeeffdb8 100644
--- a/extensions/telegram/src/send.ts
+++ b/extensions/telegram/src/send.ts
@@ -179,14 +179,40 @@ function resolveTelegramMessageIdOrThrow(
   throw new Error(`Telegram ${context} returned no message_id`);
 }
 
+// Pull a chunk end back off a UTF-16 surrogate pair so neither chunk carries a
+// lone surrogate that re-encodes to U+FFFD. Mirrors the guard in
+// bot/native-quote.ts `truncateUtf16Safe`; shared by both plain-text splitters.
+//
+// `start` is the beginning of the current chunk — the return value is
+// guaranteed to be > start, so callers that loop on `start = end` always
+// advance. When clamping would land on `start` (i.e. the surrogate pair begins
+// exactly at `start`), we emit both surrogates together (end = start + 2)
+// rather than emitting a lone surrogate or stalling.
+function surrogateSafeChunkEnd(text: string, end: number, start: number): number {
+  const high = text.charCodeAt(end - 1);
+  const low = text.charCodeAt(end);
+  const splitsPair = end > 0 && high >= 0xd800 && high <= 0xdbff && low >= 0xdc00 && low <= 0xdfff;
+  if (!splitsPair) {
+    return end;
+  }
+  const clamped = end - 1;
+  // Guard: never return an index that would stall the loop. If clamped equals
+  // start the surrogate pair's high unit is the very first char of this chunk;
+  // emit both surrogates together instead of splitting or stalling.
+  return clamped > start ? clamped : start + 2;
+}
+
 function splitTelegramPlainTextChunks(text: string, limit: number): string[] {
   if (!text) {
     return [];
   }
   const normalizedLimit = Math.max(1, Math.floor(limit));
   const chunks: string[] = [];
-  for (let start = 0; start < text.length; start += normalizedLimit) {
-    chunks.push(text.slice(start, start + normalizedLimit));
+  let start = 0;
+  while (start < text.length) {
+    const end = surrogateSafeChunkEnd(text, start + normalizedLimit, start);
+    chunks.push(text.slice(start, end));
+    start = end;
   }
   return chunks;
 }
@@ -209,12 +235,19 @@ function splitTelegramPlainTextFallback(text: string, chunkCount: number, limit:
       remainingChunks === 1
         ? remainingChars
         : Math.min(normalizedLimit, Math.ceil(remainingChars / remainingChunks));
-    chunks.push(text.slice(offset, offset + nextChunkLength));
-    offset += nextChunkLength;
+    const end = surrogateSafeChunkEnd(text, offset + nextChunkLength, offset);
+    chunks.push(text.slice(offset, end));
+    offset = end;
   }
   return chunks;
 }
 
+// Test-only handle: the plain-text splitter is internal, but its surrogate-safe
+// chunk boundary needs direct behavior coverage.
+export function splitTelegramPlainTextChunksForTests(text: string, limit: number): string[] {
+  return splitTelegramPlainTextChunks(text, limit);
+}
+
 function logTelegramOutboundSendOk(params: TelegramOutboundSuccessLogParams): void {
   const parts = [
     "telegram outbound send ok",
diff --git a/extensions/telegram/src/telegram-outbound.test.ts b/extensions/telegram/src/telegram-outbound.test.ts
index df0c7360378..e002c0bbd6e 100644
--- a/extensions/telegram/src/telegram-outbound.test.ts
+++ b/extensions/telegram/src/telegram-outbound.test.ts
@@ -43,6 +43,17 @@ describe("telegramPlugin outbound", () => {
     expect(telegramOutbound.chunker?.(text, 4000)).toEqual([text]);
   });
 
+  it("keeps astral characters whole at positive configured chunk limits", () => {
+    clearTelegramRuntime();
+
+    expect(telegramOutbound.chunker?.("A😀B", 1)).toEqual(["A", "😀", "B"]);
+    expect(telegramOutbound.chunker?.("A😀B", 1, { formatting: { parseMode: "HTML" } })).toEqual([
+      "A",
+      "😀",
+      "B",
+    ]);
+  });
+
   it("preserves markdown tables for the configured delivery renderer", () => {
     clearTelegramRuntime();
     const text = ["| Name | Value |", "|------|-------|", "| A | 1 |"].join("\n");
diff --git a/packages/markdown-core/src/chunk-text.ts b/packages/markdown-core/src/chunk-text.ts
index 2c9331a9a04..c0051b0ee87 100644
--- a/packages/markdown-core/src/chunk-text.ts
+++ b/packages/markdown-core/src/chunk-text.ts
@@ -42,6 +42,23 @@ function scanParenAwareBreakpoints(text: string): { lastNewline: number; lastWhi
   return { lastNewline, lastWhitespace };
 }
 
+/**
+ * Keeps UTF-16 chunk boundaries from separating a supplementary-plane character.
+ * A one-unit positive limit still needs to emit an entire surrogate pair.
+ */
+export function avoidTrailingHighSurrogateBreak(text: string, start: number, end: number): number {
+  if (
+    end >= text.length ||
+    text.charCodeAt(end - 1) < 0xd800 ||
+    text.charCodeAt(end - 1) > 0xdbff ||
+    text.charCodeAt(end) < 0xdc00 ||
+    text.charCodeAt(end) > 0xdfff
+  ) {
+    return end;
+  }
+  return end - 1 > start ? end - 1 : end + 1;
+}
+
 /**
  * Splits plain text into size-bounded chunks at readable boundaries.
  *
@@ -66,7 +83,11 @@ export function chunkText(text: string, limit: number): string[] {
     // Prefer block boundaries, then spaces, then a hard size cut when no
     // readable breakpoint exists inside this window.
     const breakOffset = lastNewline > 0 ? lastNewline : lastWhitespace;
-    const end = breakOffset > 0 ? cursor + breakOffset : windowEnd;
+    const end = avoidTrailingHighSurrogateBreak(
+      text,
+      cursor,
+      breakOffset > 0 ? cursor + breakOffset : windowEnd,
+    );
     chunks.push(text.slice(cursor, end));
     cursor = end;
     while (cursor < text.length && /\s/.test(text[cursor] ?? "")) {
diff --git a/packages/markdown-core/src/render-aware-chunking.test.ts b/packages/markdown-core/src/render-aware-chunking.test.ts
index 7bd4213e277..e59be20bfe1 100644
--- a/packages/markdown-core/src/render-aware-chunking.test.ts
+++ b/packages/markdown-core/src/render-aware-chunking.test.ts
@@ -85,6 +85,28 @@ describe("renderMarkdownIRChunksWithinLimit", () => {
     expect(chunks.every((chunk) => chunk.rendered.length <= 1)).toBe(true);
   });
 
+  it("keeps astral characters whole when a positive limit reaches their pair", () => {
+    const chunks = renderMarkdownIRChunksWithinLimit({
+      ir: markdownToIR("A😀B"),
+      limit: 1,
+      renderChunk: (chunk) => chunk.text,
+      measureRendered: (rendered) => rendered.length,
+    });
+
+    expect(chunks.map((chunk) => chunk.source.text)).toEqual(["A", "😀", "B"]);
+  });
+
+  it("keeps astral characters whole when rendered size requires a retry split", () => {
+    const chunks = renderMarkdownIRChunksWithinLimit({
+      ir: markdownToIR("A😀"),
+      limit: 3,
+      renderChunk: (chunk) => (chunk.text === "A😀" ? "too long" : chunk.text),
+      measureRendered: (rendered) => rendered.length,
+    });
+
+    expect(chunks.map((chunk) => chunk.source.text)).toEqual(["A", "😀"]);
+  });
+
   it("treats Infinity as no size cap and returns a single chunk", () => {
     const text = "one two three four five six seven eight nine ten";
     const ir = markdownToIR(text);
diff --git a/packages/markdown-core/src/render-aware-chunking.ts b/packages/markdown-core/src/render-aware-chunking.ts
index de045152e3c..37a20da48b2 100644
--- a/packages/markdown-core/src/render-aware-chunking.ts
+++ b/packages/markdown-core/src/render-aware-chunking.ts
@@ -1,3 +1,4 @@
+import { avoidTrailingHighSurrogateBreak } from "./chunk-text.js";
 // Markdown Core module implements render aware chunking behavior.
 import {
   chunkMarkdownIR,
@@ -127,10 +128,11 @@ function findLargestChunkTextLengthWithinRenderedLimit<TRendered>(
   // Rendered length is not guaranteed to be monotonic after escaping/link or
   // file-reference rewriting, so test exact candidates from longest to shortest.
   for (let candidateLength = currentTextLength - 1; candidateLength >= 1; candidateLength -= 1) {
-    const candidate = sliceMarkdownIR(chunk, 0, candidateLength);
+    const safeCandidateLength = avoidTrailingHighSurrogateBreak(chunk.text, 0, candidateLength);
+    const candidate = sliceMarkdownIR(chunk, 0, safeCandidateLength);
     const rendered = options.renderChunk(candidate);
     if (options.measureRendered(rendered) <= renderedLimit) {
-      return candidateLength;
+      return safeCandidateLength;
     }
   }
   return 0;
@@ -215,7 +217,7 @@ function findMarkdownIRPreservedSplitIndex(text: string, start: number, limit: n
   if (lastAnyWhitespaceBreak > start) {
     return resolveWhitespaceBreak(lastAnyWhitespaceBreak, lastAnyWhitespaceRunStart);
   }
-  return maxEnd;
+  return avoidTrailingHighSurrogateBreak(text, start, maxEnd);
 }
 
 function splitMarkdownIRPreserveWhitespace(ir: MarkdownIR, limit: number): MarkdownIR[] {
diff --git a/src/auto-reply/chunk.test.ts b/src/auto-reply/chunk.test.ts
index 4e102596d05..0f3ddf0ae86 100644
--- a/src/auto-reply/chunk.test.ts
+++ b/src/auto-reply/chunk.test.ts
@@ -604,6 +604,10 @@ describe("chunkMarkdownTextWithMode", () => {
     expect(chunks.every((chunk) => !/[\uD800-\uDBFF]$/u.test(chunk))).toBe(true);
     expect(chunks.every((chunk) => !/^[\uDC00-\uDFFF]/u.test(chunk))).toBe(true);
   });
+
+  it("keeps an astral character whole when a positive hard limit starts on its pair", () => {
+    expect(chunkMarkdownTextWithMode("A😀B", 1, "length")).toEqual(["A", "😀", "B"]);
+  });
 });
 
 describe("resolveChunkMode", () => {
diff --git a/src/shared/text-chunking.ts b/src/shared/text-chunking.ts
index 95e2d4b8bba..0f7800e520a 100644
--- a/src/shared/text-chunking.ts
+++ b/src/shared/text-chunking.ts
@@ -16,7 +16,7 @@ export function avoidTrailingHighSurrogateBreak(text: string, start: number, end
     return end;
   }
   const adjusted = end - 1;
-  return adjusted > start ? adjusted : end;
+  return adjusted > start ? adjusted : end + 1;
 }
 
 export function chunkTextByBreakResolver(