mirror of
https://github.com/openclaw/openclaw.git
synced 2026-06-22 08:38:13 +00:00
fix(telegram): guard UTF-16 surrogate pairs in outbound chunkers (#93938)
Merged via squash.
Prepared head SHA: 583b22354d
Co-authored-by: Nas01010101 <156536069+Nas01010101@users.noreply.github.com>
Co-authored-by: vincentkoc <25068+vincentkoc@users.noreply.github.com>
Reviewed-by: @vincentkoc
This commit is contained in:
@@ -424,4 +424,47 @@ describe("markdownToTelegramHtml", () => {
|
||||
it("fails loudly when tag overhead leaves no room for text", () => {
|
||||
expect(() => splitTelegramHtmlChunks("<b><i><u>x</u></i></b>", 10)).toThrow(/tag overhead/i);
|
||||
});
|
||||
|
||||
it("does not split an astral char across the chunk boundary", () => {
|
||||
// Emoji surrogate pair straddles index 10 (limit): high at 9, low at 10.
|
||||
const input = `${"A".repeat(9)}😀${"B".repeat(20)}`;
|
||||
const chunks = splitTelegramHtmlChunks(input, 10);
|
||||
expect(chunks.length).toBeGreaterThan(1);
|
||||
expect(chunks.join("")).toBe(input);
|
||||
for (const chunk of chunks) {
|
||||
expect(containsLoneSurrogate(chunk)).toBe(false);
|
||||
}
|
||||
});
|
||||
|
||||
it("keeps an astral char whole when a positive limit starts on its pair", () => {
|
||||
expect(splitTelegramHtmlChunks("A😀B", 1)).toEqual(["A", "😀", "B"]);
|
||||
});
|
||||
|
||||
it("keeps astral chars whole in rendered Markdown chunks", () => {
|
||||
const chunks = markdownToTelegramChunks("A😀B", 1);
|
||||
|
||||
expect(chunks.map((chunk) => chunk.text)).toEqual(["A", "😀", "B"]);
|
||||
for (const chunk of chunks) {
|
||||
expect(containsLoneSurrogate(chunk.html)).toBe(false);
|
||||
expect(containsLoneSurrogate(chunk.text)).toBe(false);
|
||||
}
|
||||
});
|
||||
});
|
||||
|
||||
/**
 * Test helper: reports whether `text` contains a UTF-16 surrogate code unit
 * that is not part of a well-formed pair (high surrogate immediately followed
 * by a low surrogate).
 *
 * @param text String to scan, one UTF-16 code unit at a time.
 * @returns `true` as soon as an unpaired surrogate is found, otherwise `false`.
 */
function containsLoneSurrogate(text: string): boolean {
  let position = 0;
  while (position < text.length) {
    const unit = text.charCodeAt(position);
    if (unit >= 0xdc00 && unit <= 0xdfff) {
      // A low surrogate reached here was not consumed by a preceding high one.
      return true;
    }
    if (unit < 0xd800 || unit > 0xdbff) {
      // Ordinary BMP code unit.
      position += 1;
      continue;
    }
    // High surrogate: the very next unit must be a low surrogate. Past the end
    // of the string charCodeAt yields NaN, which fails the range test.
    const follower = text.charCodeAt(position + 1);
    if (!(follower >= 0xdc00 && follower <= 0xdfff)) {
      return true;
    }
    position += 2;
  }
  return false;
}
|
||||
|
||||
@@ -1070,11 +1070,30 @@ function findTelegramHtmlEntityEnd(text: string, start: number): number {
|
||||
return text[index] === ";" ? index : -1;
|
||||
}
|
||||
|
||||
// Adjusts a candidate split index so it never falls between the two halves of
// a UTF-16 surrogate pair; a split there would leave a lone surrogate on both
// sides, and each re-encodes to U+FFFD. Normally the index is pulled back in
// front of the pair. When the pair opens the segment (index 1) there is nothing
// in front of it, so the index is pushed past the pair instead, which keeps the
// character whole and still lets the caller make progress.
function clampToSurrogateBoundary(text: string, index: number): number {
  if (!(index > 0)) {
    // Nothing precedes the index, so no pair can be split here.
    return index;
  }
  const before = text.charCodeAt(index - 1);
  const after = text.charCodeAt(index);
  const beforeIsHigh = before >= 0xd800 && before <= 0xdbff;
  const afterIsLow = after >= 0xdc00 && after <= 0xdfff;
  if (!(beforeIsHigh && afterIsLow)) {
    return index;
  }
  if (index > 1) {
    return index - 1;
  }
  return index + 1;
}
|
||||
|
||||
// Picks the index at which `text` should be cut so the leading piece respects
// `maxLength`. Text that already fits is returned whole (index === length).
// Otherwise the limit is floored and raised to at least 1, the entity-aware
// helper proposes a cut, and that cut is then moved off any UTF-16 surrogate
// pair it would otherwise split.
function findTelegramHtmlSafeSplitIndex(text: string, maxLength: number): number {
  const fitsWhole = text.length <= maxLength;
  return fitsWhole
    ? text.length
    : clampToSurrogateBoundary(
        text,
        findTelegramHtmlEntitySafeSplitIndex(text, Math.max(1, Math.floor(maxLength))),
      );
}
|
||||
|
||||
function findTelegramHtmlEntitySafeSplitIndex(text: string, normalizedMaxLength: number): number {
|
||||
const lastAmpersand = text.lastIndexOf("&", normalizedMaxLength - 1);
|
||||
if (lastAmpersand === -1) {
|
||||
return normalizedMaxLength;
|
||||
|
||||
57
extensions/telegram/src/send.chunks.test.ts
Normal file
57
extensions/telegram/src/send.chunks.test.ts
Normal file
@@ -0,0 +1,57 @@
|
||||
// Telegram tests cover plain-text chunk-splitting behavior.
|
||||
import { describe, expect, it } from "vitest";
|
||||
import { splitTelegramPlainTextChunksForTests } from "./send.js";
|
||||
|
||||
/**
 * Test helper: detects an unpaired UTF-16 surrogate anywhere in `text`.
 *
 * A high surrogate (0xD800-0xDBFF) is acceptable only when a low surrogate
 * (0xDC00-0xDFFF) follows it directly; a low surrogate is acceptable only as
 * the second half of such a pair.
 *
 * @param text String to inspect.
 * @returns `true` when at least one lone surrogate is present.
 */
function containsLoneSurrogate(text: string): boolean {
  const isHighSurrogate = (unit: number): boolean => unit >= 0xd800 && unit <= 0xdbff;
  const isLowSurrogate = (unit: number): boolean => unit >= 0xdc00 && unit <= 0xdfff;

  let cursor = 0;
  while (cursor < text.length) {
    const unit = text.charCodeAt(cursor);
    if (isHighSurrogate(unit)) {
      // Reading one past the end gives NaN, which is never a low surrogate.
      if (!isLowSurrogate(text.charCodeAt(cursor + 1))) {
        return true;
      }
      // Step over both halves of the well-formed pair.
      cursor += 2;
    } else if (isLowSurrogate(unit)) {
      return true;
    } else {
      cursor += 1;
    }
  }
  return false;
}
|
||||
|
||||
describe("splitTelegramPlainTextChunks", () => {
  // Shared expectations: the chunks reassemble to the input byte-for-byte and
  // no chunk carries half of a surrogate pair.
  const expectLosslessAndWellFormed = (chunks: string[], input: string): void => {
    expect(chunks.join("")).toBe(input);
    chunks.forEach((chunk) => {
      expect(containsLoneSurrogate(chunk)).toBe(false);
    });
  };

  it("does not split an astral char across the chunk boundary", () => {
    // Nine "A"s occupy indices 0-8, so the emoji's high surrogate sits at
    // index 9 and its low surrogate at index 10, exactly on the limit.
    const input = "A".repeat(9) + "😀" + "B".repeat(20);
    const chunks = splitTelegramPlainTextChunksForTests(input, 10);
    expect(chunks.length).toBeGreaterThan(1);
    expectLosslessAndWellFormed(chunks, input);
  });

  it("does not hang when limit=1 and text starts with an astral char", () => {
    // Regression guard: at limit=1 a clamp that returned `start` would never
    // advance and the splitter's loop would spin forever. The pair has to be
    // emitted as one two-unit chunk so every iteration makes progress.
    const input = "😀X";
    const chunks = splitTelegramPlainTextChunksForTests(input, 1);
    expectLosslessAndWellFormed(chunks, input);
  });

  it("does not hang when limit=1 and an astral char appears mid-string at a chunk boundary", () => {
    // After "A" is emitted the next chunk starts at index 1, on the high
    // surrogate: the same stall condition as above, reached mid-string.
    const input = "A😀B";
    const chunks = splitTelegramPlainTextChunksForTests(input, 1);
    expectLosslessAndWellFormed(chunks, input);
  });
});
|
||||
@@ -179,14 +179,40 @@ function resolveTelegramMessageIdOrThrow(
|
||||
throw new Error(`Telegram ${context} returned no message_id`);
|
||||
}
|
||||
|
||||
// Moves a proposed chunk end off a UTF-16 surrogate pair so that neither the
// chunk being cut nor the one after it carries a lone surrogate (which would
// re-encode to U+FFFD). Mirrors the guard in bot/native-quote.ts
// `truncateUtf16Safe`; both plain-text splitters in this file go through it.
//
// `start` is where the current chunk begins. Callers loop with `start = end`,
// so the result has to move forward: the usual fix is to step the end back by
// one unit, but when that would land on `start` (the pair is the first thing
// in the chunk) both surrogates are emitted together via `start + 2` instead
// of splitting the pair or stalling the loop.
function surrogateSafeChunkEnd(text: string, end: number, start: number): number {
  if (!(end > 0)) {
    // No code unit precedes `end`, so there is no pair to protect.
    return end;
  }
  const lead = text.charCodeAt(end - 1);
  const trail = text.charCodeAt(end);
  const leadIsHigh = lead >= 0xd800 && lead <= 0xdbff;
  const trailIsLow = trail >= 0xdc00 && trail <= 0xdfff;
  if (!leadIsHigh || !trailIsLow) {
    return end;
  }
  const pulledBack = end - 1;
  if (pulledBack > start) {
    return pulledBack;
  }
  // The high surrogate is the first unit of this chunk: take the whole pair.
  return start + 2;
}
|
||||
|
||||
// Splits plain text into consecutive chunks of at most `limit` UTF-16 code
// units, never cutting between the halves of a surrogate pair.
//
// - Empty text yields no chunks.
// - `limit` is floored and raised to at least 1.
// - Each chunk end goes through `surrogateSafeChunkEnd`, which may shorten a
//   chunk by one unit to keep a pair together, or lengthen it to two units
//   when the pair is the first thing in the chunk (so a limit of 1 still
//   emits the whole character).
//
// The fixed-stride `for` loop that sliced `start .. start + normalizedLimit`
// directly is intentionally gone: it cut pairs in half, and leaving it beside
// the loop below would redeclare `start` and push every chunk twice.
function splitTelegramPlainTextChunks(text: string, limit: number): string[] {
  if (!text) {
    return [];
  }
  const normalizedLimit = Math.max(1, Math.floor(limit));
  const chunks: string[] = [];
  let start = 0;
  while (start < text.length) {
    // The helper's result is past `start`, so this loop always advances.
    const end = surrogateSafeChunkEnd(text, start + normalizedLimit, start);
    chunks.push(text.slice(start, end));
    start = end;
  }
  return chunks;
}
|
||||
@@ -209,12 +235,19 @@ function splitTelegramPlainTextFallback(text: string, chunkCount: number, limit:
|
||||
remainingChunks === 1
|
||||
? remainingChars
|
||||
: Math.min(normalizedLimit, Math.ceil(remainingChars / remainingChunks));
|
||||
chunks.push(text.slice(offset, offset + nextChunkLength));
|
||||
offset += nextChunkLength;
|
||||
const end = surrogateSafeChunkEnd(text, offset + nextChunkLength, offset);
|
||||
chunks.push(text.slice(offset, end));
|
||||
offset = end;
|
||||
}
|
||||
return chunks;
|
||||
}
|
||||
|
||||
/**
 * Test-only entry point for the internal plain-text splitter, exported so its
 * surrogate-safe chunk boundaries can be covered directly.
 *
 * @param text Text to split.
 * @param limit Maximum chunk size, forwarded unchanged to the splitter.
 * @returns The chunks produced by `splitTelegramPlainTextChunks`.
 */
export function splitTelegramPlainTextChunksForTests(text: string, limit: number): string[] {
  const chunks = splitTelegramPlainTextChunks(text, limit);
  return chunks;
}
|
||||
|
||||
function logTelegramOutboundSendOk(params: TelegramOutboundSuccessLogParams): void {
|
||||
const parts = [
|
||||
"telegram outbound send ok",
|
||||
|
||||
@@ -43,6 +43,17 @@ describe("telegramPlugin outbound", () => {
|
||||
expect(telegramOutbound.chunker?.(text, 4000)).toEqual([text]);
|
||||
});
|
||||
|
||||
it("keeps astral characters whole at positive configured chunk limits", () => {
|
||||
clearTelegramRuntime();
|
||||
|
||||
expect(telegramOutbound.chunker?.("A😀B", 1)).toEqual(["A", "😀", "B"]);
|
||||
expect(telegramOutbound.chunker?.("A😀B", 1, { formatting: { parseMode: "HTML" } })).toEqual([
|
||||
"A",
|
||||
"😀",
|
||||
"B",
|
||||
]);
|
||||
});
|
||||
|
||||
it("preserves markdown tables for the configured delivery renderer", () => {
|
||||
clearTelegramRuntime();
|
||||
const text = ["| Name | Value |", "|------|-------|", "| A | 1 |"].join("\n");
|
||||
|
||||
@@ -42,6 +42,23 @@ function scanParenAwareBreakpoints(text: string): { lastNewline: number; lastWhi
|
||||
return { lastNewline, lastWhitespace };
|
||||
}
|
||||
|
||||
/**
 * Keeps UTF-16 chunk boundaries from separating a supplementary-plane character.
 * A one-unit positive limit still needs to emit an entire surrogate pair.
 *
 * @param text Full text being chunked.
 * @param start Index where the current chunk begins.
 * @param end Proposed exclusive end index of the current chunk.
 * @returns `end` unchanged when it does not fall between a high and a low
 *   surrogate. Otherwise `end - 1` (cut before the pair) when that still lies
 *   past `start`, or `end + 1` (cut after the pair) when the pair opens the
 *   chunk.
 */
export function avoidTrailingHighSurrogateBreak(text: string, start: number, end: number): number {
  // Only an interior index can sit between two code units. The bounds are
  // tested in positive form on purpose: with `end <= 0`, charCodeAt(end - 1)
  // is NaN, and NaN fails every "outside the range" comparison, which would
  // let a boundary with no preceding unit be treated as a split pair.
  if (!(end > 0 && end < text.length)) {
    return end;
  }
  const lead = text.charCodeAt(end - 1);
  const trail = text.charCodeAt(end);
  const splitsPair = lead >= 0xd800 && lead <= 0xdbff && trail >= 0xdc00 && trail <= 0xdfff;
  if (!splitsPair) {
    return end;
  }
  const pulledBack = end - 1;
  // Stepping back onto `start` would yield an empty chunk, so take the whole
  // pair instead.
  return pulledBack > start ? pulledBack : end + 1;
}
|
||||
|
||||
/**
|
||||
* Splits plain text into size-bounded chunks at readable boundaries.
|
||||
*
|
||||
@@ -66,7 +83,11 @@ export function chunkText(text: string, limit: number): string[] {
|
||||
// Prefer block boundaries, then spaces, then a hard size cut when no
|
||||
// readable breakpoint exists inside this window.
|
||||
const breakOffset = lastNewline > 0 ? lastNewline : lastWhitespace;
|
||||
const end = breakOffset > 0 ? cursor + breakOffset : windowEnd;
|
||||
const end = avoidTrailingHighSurrogateBreak(
|
||||
text,
|
||||
cursor,
|
||||
breakOffset > 0 ? cursor + breakOffset : windowEnd,
|
||||
);
|
||||
chunks.push(text.slice(cursor, end));
|
||||
cursor = end;
|
||||
while (cursor < text.length && /\s/.test(text[cursor] ?? "")) {
|
||||
|
||||
@@ -85,6 +85,28 @@ describe("renderMarkdownIRChunksWithinLimit", () => {
|
||||
expect(chunks.every((chunk) => chunk.rendered.length <= 1)).toBe(true);
|
||||
});
|
||||
|
||||
it("keeps astral characters whole when a positive limit reaches their pair", () => {
|
||||
const chunks = renderMarkdownIRChunksWithinLimit({
|
||||
ir: markdownToIR("A😀B"),
|
||||
limit: 1,
|
||||
renderChunk: (chunk) => chunk.text,
|
||||
measureRendered: (rendered) => rendered.length,
|
||||
});
|
||||
|
||||
expect(chunks.map((chunk) => chunk.source.text)).toEqual(["A", "😀", "B"]);
|
||||
});
|
||||
|
||||
it("keeps astral characters whole when rendered size requires a retry split", () => {
|
||||
const chunks = renderMarkdownIRChunksWithinLimit({
|
||||
ir: markdownToIR("A😀"),
|
||||
limit: 3,
|
||||
renderChunk: (chunk) => (chunk.text === "A😀" ? "too long" : chunk.text),
|
||||
measureRendered: (rendered) => rendered.length,
|
||||
});
|
||||
|
||||
expect(chunks.map((chunk) => chunk.source.text)).toEqual(["A", "😀"]);
|
||||
});
|
||||
|
||||
it("treats Infinity as no size cap and returns a single chunk", () => {
|
||||
const text = "one two three four five six seven eight nine ten";
|
||||
const ir = markdownToIR(text);
|
||||
|
||||
@@ -1,3 +1,4 @@
|
||||
import { avoidTrailingHighSurrogateBreak } from "./chunk-text.js";
|
||||
// Markdown Core module implements render aware chunking behavior.
|
||||
import {
|
||||
chunkMarkdownIR,
|
||||
@@ -127,10 +128,11 @@ function findLargestChunkTextLengthWithinRenderedLimit<TRendered>(
|
||||
// Rendered length is not guaranteed to be monotonic after escaping/link or
|
||||
// file-reference rewriting, so test exact candidates from longest to shortest.
|
||||
for (let candidateLength = currentTextLength - 1; candidateLength >= 1; candidateLength -= 1) {
|
||||
const candidate = sliceMarkdownIR(chunk, 0, candidateLength);
|
||||
const safeCandidateLength = avoidTrailingHighSurrogateBreak(chunk.text, 0, candidateLength);
|
||||
const candidate = sliceMarkdownIR(chunk, 0, safeCandidateLength);
|
||||
const rendered = options.renderChunk(candidate);
|
||||
if (options.measureRendered(rendered) <= renderedLimit) {
|
||||
return candidateLength;
|
||||
return safeCandidateLength;
|
||||
}
|
||||
}
|
||||
return 0;
|
||||
@@ -215,7 +217,7 @@ function findMarkdownIRPreservedSplitIndex(text: string, start: number, limit: n
|
||||
if (lastAnyWhitespaceBreak > start) {
|
||||
return resolveWhitespaceBreak(lastAnyWhitespaceBreak, lastAnyWhitespaceRunStart);
|
||||
}
|
||||
return maxEnd;
|
||||
return avoidTrailingHighSurrogateBreak(text, start, maxEnd);
|
||||
}
|
||||
|
||||
function splitMarkdownIRPreserveWhitespace(ir: MarkdownIR, limit: number): MarkdownIR[] {
|
||||
|
||||
@@ -604,6 +604,10 @@ describe("chunkMarkdownTextWithMode", () => {
|
||||
expect(chunks.every((chunk) => !/[\uD800-\uDBFF]$/u.test(chunk))).toBe(true);
|
||||
expect(chunks.every((chunk) => !/^[\uDC00-\uDFFF]/u.test(chunk))).toBe(true);
|
||||
});
|
||||
|
||||
it("keeps an astral character whole when a positive hard limit starts on its pair", () => {
|
||||
expect(chunkMarkdownTextWithMode("A😀B", 1, "length")).toEqual(["A", "😀", "B"]);
|
||||
});
|
||||
});
|
||||
|
||||
describe("resolveChunkMode", () => {
|
||||
|
||||
@@ -16,7 +16,7 @@ export function avoidTrailingHighSurrogateBreak(text: string, start: number, end
|
||||
return end;
|
||||
}
|
||||
const adjusted = end - 1;
|
||||
return adjusted > start ? adjusted : end;
|
||||
return adjusted > start ? adjusted : end + 1;
|
||||
}
|
||||
|
||||
export function chunkTextByBreakResolver(
|
||||
|
||||
Reference in New Issue
Block a user