diff --git a/extensions/telegram/src/format.test.ts b/extensions/telegram/src/format.test.ts
index 9bc78890edb..8d4e5dc189e 100644
--- a/extensions/telegram/src/format.test.ts
+++ b/extensions/telegram/src/format.test.ts
@@ -424,4 +424,47 @@ describe("markdownToTelegramHtml", () => {
it("fails loudly when tag overhead leaves no room for text", () => {
expect(() => splitTelegramHtmlChunks("x", 10)).toThrow(/tag overhead/i);
});
+
+  it("does not split an astral char across the chunk boundary", () => {
+    // Nine ASCII units place the emoji's high surrogate at index 9 and its low
+    // surrogate at index 10, so a cut at the limit (10) would land inside the pair.
+    const source = `${"A".repeat(9)}😀${"B".repeat(20)}`;
+    const pieces = splitTelegramHtmlChunks(source, 10);
+
+    expect(pieces.length).toBeGreaterThan(1);
+    expect(pieces.join("")).toBe(source);
+    expect(pieces.every((piece) => !containsLoneSurrogate(piece))).toBe(true);
+  });
+
+  it("keeps an astral char whole when a positive limit starts on its pair", () => {
+    expect(splitTelegramHtmlChunks("A😀B", 1)).toEqual(["A", "😀", "B"]);
+  });
+
+  it("keeps astral chars whole in rendered Markdown chunks", () => {
+    const rendered = markdownToTelegramChunks("A😀B", 1);
+    const texts = rendered.map(({ text }) => text);
+    const htmls = rendered.map(({ html }) => html);
+
+    expect(texts).toEqual(["A", "😀", "B"]);
+    expect(texts.every((text) => !containsLoneSurrogate(text))).toBe(true);
+    expect(htmls.every((html) => !containsLoneSurrogate(html))).toBe(true);
+  });
});
+
+// Reports whether `text` contains a UTF-16 surrogate outside a well-formed high+low pair.
+function containsLoneSurrogate(text: string): boolean {
+  const inRange = (unit: number, min: number, max: number): boolean => unit >= min && unit <= max;
+  let cursor = 0;
+  while (cursor < text.length) {
+    const unit = text.charCodeAt(cursor);
+    if (inRange(unit, 0xdc00, 0xdfff)) {
+      return true; // low surrogate with no high surrogate in front of it
+    }
+    const startsPair = inRange(unit, 0xd800, 0xdbff);
+    if (startsPair && !inRange(text.charCodeAt(cursor + 1), 0xdc00, 0xdfff)) {
+      return true; // unpaired high surrogate (charCodeAt past the end is NaN)
+    }
+    cursor += startsPair ? 2 : 1;
+  }
+  return false;
+}
diff --git a/extensions/telegram/src/format.ts b/extensions/telegram/src/format.ts
index 715181efbf0..8de2530ca9d 100644
--- a/extensions/telegram/src/format.ts
+++ b/extensions/telegram/src/format.ts
@@ -1070,11 +1070,30 @@ function findTelegramHtmlEntityEnd(text: string, start: number): number {
return text[index] === ";" ? index : -1;
}
+// A split index must never fall between the two halves of a UTF-16 surrogate
+// pair: each chunk would then carry a lone surrogate that re-encodes to U+FFFD.
+// When the pair opens the segment the index moves past it instead of before it,
+// so the chunk is not empty and chunking still advances.
+function clampToSurrogateBoundary(text: string, index: number): number {
+  const isHigh = (unit: number): boolean => unit >= 0xd800 && unit <= 0xdbff;
+  const isLow = (unit: number): boolean => unit >= 0xdc00 && unit <= 0xdfff;
+  if (index <= 0 || !isHigh(text.charCodeAt(index - 1)) || !isLow(text.charCodeAt(index))) {
+    return index;
+  }
+  // Prefer backing off by one unit; index 1 has no room for that, so step forward.
+  return index > 1 ? index - 1 : index + 1;
+}
+
function findTelegramHtmlSafeSplitIndex(text: string, maxLength: number): number {
if (text.length <= maxLength) {
return text.length;
}
const normalizedMaxLength = Math.max(1, Math.floor(maxLength));
+ const splitIndex = findTelegramHtmlEntitySafeSplitIndex(text, normalizedMaxLength); // first pick the cut from the "&"-aware helper (presumably avoids splitting an HTML entity; confirm against its full body)
+ return clampToSurrogateBoundary(text, splitIndex); // then move that cut off a UTF-16 surrogate pair if it landed inside one
+}
+
+function findTelegramHtmlEntitySafeSplitIndex(text: string, normalizedMaxLength: number): number {
const lastAmpersand = text.lastIndexOf("&", normalizedMaxLength - 1);
if (lastAmpersand === -1) {
return normalizedMaxLength;
diff --git a/extensions/telegram/src/send.chunks.test.ts b/extensions/telegram/src/send.chunks.test.ts
new file mode 100644
index 00000000000..c7c8673ba8e
--- /dev/null
+++ b/extensions/telegram/src/send.chunks.test.ts
@@ -0,0 +1,57 @@
+// Telegram tests cover plain-text chunk-splitting behavior.
+import { describe, expect, it } from "vitest";
+import { splitTelegramPlainTextChunksForTests } from "./send.js";
+
+// Reports whether `text` contains a UTF-16 surrogate outside a well-formed high+low pair.
+function containsLoneSurrogate(text: string): boolean {
+  const inRange = (unit: number, min: number, max: number): boolean => unit >= min && unit <= max;
+  let cursor = 0;
+  while (cursor < text.length) {
+    const unit = text.charCodeAt(cursor);
+    if (inRange(unit, 0xdc00, 0xdfff)) {
+      return true; // low surrogate with no high surrogate in front of it
+    }
+    const startsPair = inRange(unit, 0xd800, 0xdbff);
+    if (startsPair && !inRange(text.charCodeAt(cursor + 1), 0xdc00, 0xdfff)) {
+      return true; // unpaired high surrogate (charCodeAt past the end is NaN)
+    }
+    cursor += startsPair ? 2 : 1;
+  }
+  return false;
+}
+
+describe("splitTelegramPlainTextChunks", () => {
+  it("does not split an astral char across the chunk boundary", () => {
+    // Nine ASCII units place the emoji's high surrogate at index 9 and its low
+    // surrogate at index 10, so a cut at the limit (10) would land inside the pair.
+    const source = `${"A".repeat(9)}😀${"B".repeat(20)}`;
+    const pieces = splitTelegramPlainTextChunksForTests(source, 10);
+
+    expect(pieces.length).toBeGreaterThan(1);
+    expect(pieces.join("")).toBe(source);
+    expect(pieces.every((piece) => !containsLoneSurrogate(piece))).toBe(true);
+  });
+
+  it("does not hang when limit=1 and text starts with an astral char", () => {
+    // Regression guard: a clamp that only ever backs off returns `start` when the
+    // surrogate pair opens the chunk, which leaves the splitter loop spinning on
+    // the same offset. The pair has to come out as a single two-unit chunk even
+    // though the limit is 1, so the loop always advances.
+    const source = "😀X";
+    const pieces = splitTelegramPlainTextChunksForTests(source, 1);
+
+    expect(pieces.join("")).toBe(source);
+    expect(pieces.every((piece) => !containsLoneSurrogate(piece))).toBe(true);
+  });
+
+  it("does not hang when limit=1 and an astral char appears mid-string at a chunk boundary", () => {
+    // "A" fills the first chunk, so the second chunk starts at index 1, which is
+    // the emoji's high surrogate: the same stall condition as above, this time in
+    // the middle of the text rather than at its start.
+    const source = "A😀B";
+    const pieces = splitTelegramPlainTextChunksForTests(source, 1);
+
+    expect(pieces.join("")).toBe(source);
+    expect(pieces.every((piece) => !containsLoneSurrogate(piece))).toBe(true);
+  });
+});
diff --git a/extensions/telegram/src/send.ts b/extensions/telegram/src/send.ts
index 363c1340dd6..33adeeffdb8 100644
--- a/extensions/telegram/src/send.ts
+++ b/extensions/telegram/src/send.ts
@@ -179,14 +179,40 @@ function resolveTelegramMessageIdOrThrow(
throw new Error(`Telegram ${context} returned no message_id`);
}
+// Adjusts a proposed chunk end so it never falls between the halves of a UTF-16
+// surrogate pair; a lone surrogate on either side would re-encode to U+FFFD.
+// Mirrors the guard in bot/native-quote.ts `truncateUtf16Safe`; shared by both
+// plain-text splitters.
+//
+// `start` is where the current chunk begins. Callers loop on `start = end`, so
+// when backing off one unit would land on `start` (the pair opens the chunk),
+// both surrogates are emitted together (end = start + 2) to keep advancing.
+function surrogateSafeChunkEnd(text: string, end: number, start: number): number {
+  if (end <= 0) {
+    return end;
+  }
+  const before = text.charCodeAt(end - 1);
+  const after = text.charCodeAt(end);
+  const beforeIsHigh = before >= 0xd800 && before <= 0xdbff;
+  const afterIsLow = after >= 0xdc00 && after <= 0xdfff;
+  if (!(beforeIsHigh && afterIsLow)) {
+    return end;
+  }
+  const backedOff = end - 1;
+  return backedOff > start ? backedOff : start + 2;
+}
+
function splitTelegramPlainTextChunks(text: string, limit: number): string[] {
if (!text) {
return [];
}
const normalizedLimit = Math.max(1, Math.floor(limit));
const chunks: string[] = [];
- for (let start = 0; start < text.length; start += normalizedLimit) {
- chunks.push(text.slice(start, start + normalizedLimit));
+ let start = 0;
+ while (start < text.length) {
+ const end = surrogateSafeChunkEnd(text, start + normalizedLimit, start); // may differ from start + normalizedLimit so a surrogate pair stays in one chunk
+ chunks.push(text.slice(start, end));
+ start = end; // resume at the adjusted end, so consecutive chunks stay contiguous
}
return chunks;
}
@@ -209,12 +235,19 @@ function splitTelegramPlainTextFallback(text: string, chunkCount: number, limit:
remainingChunks === 1
? remainingChars
: Math.min(normalizedLimit, Math.ceil(remainingChars / remainingChunks));
- chunks.push(text.slice(offset, offset + nextChunkLength));
- offset += nextChunkLength;
+ const end = surrogateSafeChunkEnd(text, offset + nextChunkLength, offset);
+ chunks.push(text.slice(offset, end));
+ offset = end;
}
return chunks;
}
+// Test-only export: `splitTelegramPlainTextChunks` stays module-private, and this
+// wrapper lets specs exercise its surrogate-safe chunk boundaries directly.
+export function splitTelegramPlainTextChunksForTests(text: string, limit: number): string[] {
+ return splitTelegramPlainTextChunks(text, limit);
+}
+
function logTelegramOutboundSendOk(params: TelegramOutboundSuccessLogParams): void {
const parts = [
"telegram outbound send ok",
diff --git a/extensions/telegram/src/telegram-outbound.test.ts b/extensions/telegram/src/telegram-outbound.test.ts
index df0c7360378..e002c0bbd6e 100644
--- a/extensions/telegram/src/telegram-outbound.test.ts
+++ b/extensions/telegram/src/telegram-outbound.test.ts
@@ -43,6 +43,17 @@ describe("telegramPlugin outbound", () => {
expect(telegramOutbound.chunker?.(text, 4000)).toEqual([text]);
});
+  it("keeps astral characters whole at positive configured chunk limits", () => {
+    clearTelegramRuntime();
+    const wholeChars = ["A", "😀", "B"];
+
+    // A limit of 1 is smaller than the emoji's two UTF-16 code units.
+    expect(telegramOutbound.chunker?.("A😀B", 1)).toEqual(wholeChars);
+    // Same input and limit, with HTML parse mode requested through the options.
+    const htmlChunks = telegramOutbound.chunker?.("A😀B", 1, { formatting: { parseMode: "HTML" } });
+    expect(htmlChunks).toEqual(wholeChars);
+  });
+
it("preserves markdown tables for the configured delivery renderer", () => {
clearTelegramRuntime();
const text = ["| Name | Value |", "|------|-------|", "| A | 1 |"].join("\n");
diff --git a/packages/markdown-core/src/chunk-text.ts b/packages/markdown-core/src/chunk-text.ts
index 2c9331a9a04..c0051b0ee87 100644
--- a/packages/markdown-core/src/chunk-text.ts
+++ b/packages/markdown-core/src/chunk-text.ts
@@ -42,6 +42,23 @@ function scanParenAwareBreakpoints(text: string): { lastNewline: number; lastWhi
return { lastNewline, lastWhitespace };
}
+/**
+ * Moves a proposed chunk `end` off the middle of a UTF-16 surrogate pair: back to `end - 1`, or
+ * forward to `end + 1` when backing off would empty the chunk that begins at `start`.
+ */
+export function avoidTrailingHighSurrogateBreak(text: string, start: number, end: number): number {
+  // Out-of-range charCodeAt yields NaN, which slips past `<`/`>` checks, so bound `end` first.
+  if (end <= 0 || end >= text.length) {
+    return end;
+  }
+  const high = text.charCodeAt(end - 1);
+  const low = text.charCodeAt(end);
+  if (high < 0xd800 || high > 0xdbff || low < 0xdc00 || low > 0xdfff) {
+    return end;
+  }
+  return end - 1 > start ? end - 1 : end + 1;
+}
+
/**
* Splits plain text into size-bounded chunks at readable boundaries.
*
@@ -66,7 +83,11 @@ export function chunkText(text: string, limit: number): string[] {
// Prefer block boundaries, then spaces, then a hard size cut when no
// readable breakpoint exists inside this window.
const breakOffset = lastNewline > 0 ? lastNewline : lastWhitespace;
- const end = breakOffset > 0 ? cursor + breakOffset : windowEnd;
+ const end = avoidTrailingHighSurrogateBreak(
+ text,
+ cursor,
+ breakOffset > 0 ? cursor + breakOffset : windowEnd,
+ );
chunks.push(text.slice(cursor, end));
cursor = end;
while (cursor < text.length && /\s/.test(text[cursor] ?? "")) {
diff --git a/packages/markdown-core/src/render-aware-chunking.test.ts b/packages/markdown-core/src/render-aware-chunking.test.ts
index 7bd4213e277..e59be20bfe1 100644
--- a/packages/markdown-core/src/render-aware-chunking.test.ts
+++ b/packages/markdown-core/src/render-aware-chunking.test.ts
@@ -85,6 +85,28 @@ describe("renderMarkdownIRChunksWithinLimit", () => {
expect(chunks.every((chunk) => chunk.rendered.length <= 1)).toBe(true);
});
+  it("keeps astral characters whole when a positive limit reaches their pair", () => {
+    // Rendering is the identity here, so the limit applies to raw UTF-16 length.
+    const result = renderMarkdownIRChunksWithinLimit({
+      ir: markdownToIR("A😀B"),
+      limit: 1,
+      renderChunk: ({ text }) => text,
+      measureRendered: (output) => output.length,
+    });
+    expect(result.map(({ source }) => source.text)).toEqual(["A", "😀", "B"]);
+  });
+
+  it("keeps astral characters whole when rendered size requires a retry split", () => {
+    // Only the unsplit text renders over the limit, so the chunk has to be cut shorter.
+    const result = renderMarkdownIRChunksWithinLimit({
+      ir: markdownToIR("A😀"),
+      limit: 3,
+      renderChunk: ({ text }) => (text === "A😀" ? "too long" : text),
+      measureRendered: (output) => output.length,
+    });
+    expect(result.map(({ source }) => source.text)).toEqual(["A", "😀"]);
+  });
+
it("treats Infinity as no size cap and returns a single chunk", () => {
const text = "one two three four five six seven eight nine ten";
const ir = markdownToIR(text);
diff --git a/packages/markdown-core/src/render-aware-chunking.ts b/packages/markdown-core/src/render-aware-chunking.ts
index de045152e3c..37a20da48b2 100644
--- a/packages/markdown-core/src/render-aware-chunking.ts
+++ b/packages/markdown-core/src/render-aware-chunking.ts
@@ -1,3 +1,4 @@
+import { avoidTrailingHighSurrogateBreak } from "./chunk-text.js";
// Markdown Core module implements render aware chunking behavior.
import {
chunkMarkdownIR,
@@ -127,10 +128,11 @@ function findLargestChunkTextLengthWithinRenderedLimit(
// Rendered length is not guaranteed to be monotonic after escaping/link or
// file-reference rewriting, so test exact candidates from longest to shortest.
for (let candidateLength = currentTextLength - 1; candidateLength >= 1; candidateLength -= 1) {
- const candidate = sliceMarkdownIR(chunk, 0, candidateLength);
+ const safeCandidateLength = avoidTrailingHighSurrogateBreak(chunk.text, 0, candidateLength);
+ const candidate = sliceMarkdownIR(chunk, 0, safeCandidateLength);
const rendered = options.renderChunk(candidate);
if (options.measureRendered(rendered) <= renderedLimit) {
- return candidateLength;
+ return safeCandidateLength;
}
}
return 0;
@@ -215,7 +217,7 @@ function findMarkdownIRPreservedSplitIndex(text: string, start: number, limit: n
if (lastAnyWhitespaceBreak > start) {
return resolveWhitespaceBreak(lastAnyWhitespaceBreak, lastAnyWhitespaceRunStart);
}
- return maxEnd;
+ return avoidTrailingHighSurrogateBreak(text, start, maxEnd);
}
function splitMarkdownIRPreserveWhitespace(ir: MarkdownIR, limit: number): MarkdownIR[] {
diff --git a/src/auto-reply/chunk.test.ts b/src/auto-reply/chunk.test.ts
index 4e102596d05..0f3ddf0ae86 100644
--- a/src/auto-reply/chunk.test.ts
+++ b/src/auto-reply/chunk.test.ts
@@ -604,6 +604,10 @@ describe("chunkMarkdownTextWithMode", () => {
expect(chunks.every((chunk) => !/[\uD800-\uDBFF]$/u.test(chunk))).toBe(true);
expect(chunks.every((chunk) => !/^[\uDC00-\uDFFF]/u.test(chunk))).toBe(true);
});
+
+ it("keeps an astral character whole when a positive hard limit starts on its pair", () => {
+ expect(chunkMarkdownTextWithMode("A😀B", 1, "length")).toEqual(["A", "😀", "B"]); // "😀" is two UTF-16 code units, so its chunk is expected to exceed the limit of 1
+ });
});
describe("resolveChunkMode", () => {
diff --git a/src/shared/text-chunking.ts b/src/shared/text-chunking.ts
index 95e2d4b8bba..0f7800e520a 100644
--- a/src/shared/text-chunking.ts
+++ b/src/shared/text-chunking.ts
@@ -16,7 +16,7 @@ export function avoidTrailingHighSurrogateBreak(text: string, start: number, end
return end;
}
const adjusted = end - 1;
- return adjusted > start ? adjusted : end;
+ return adjusted > start ? adjusted : end + 1;
}
export function chunkTextByBreakResolver(