fix: tighten telegram html chunking

This commit is contained in:
Ayaan Zaidi
2026-03-10 20:42:19 +05:30
parent cce7953d8d
commit 1381705a14
4 changed files with 106 additions and 24 deletions

View File

@@ -113,11 +113,19 @@ describe("markdownToTelegramHtml", () => {
expect(res).toContain("trailing ||");
});
it("splits long html text without breaking balanced tags", () => {
const chunks = splitTelegramHtmlChunks(`<b>${"A".repeat(5000)}</b>`, 4000);
it("splits long multiline html text without breaking balanced tags", () => {
const chunks = splitTelegramHtmlChunks(`<b>${"A\n".repeat(2500)}</b>`, 4000);
expect(chunks.length).toBeGreaterThan(1);
expect(chunks.every((chunk) => chunk.length <= 4000)).toBe(true);
expect(chunks[0]).toMatch(/^<b>.*<\/b>$/);
expect(chunks[1]).toMatch(/^<b>.*<\/b>$/);
expect(chunks[0]).toMatch(/^<b>[\s\S]*<\/b>$/);
expect(chunks[1]).toMatch(/^<b>[\s\S]*<\/b>$/);
});
it("fails loudly when a leading entity cannot fit inside a chunk", () => {
expect(() => splitTelegramHtmlChunks(`A&amp;${"B".repeat(20)}`, 4)).toThrow(/leading entity/i);
});
it("fails loudly when tag overhead leaves no room for text", () => {
expect(() => splitTelegramHtmlChunks("<b><i><u>x</u></i></b>", 10)).toThrow(/tag overhead/i);
});
});

View File

@@ -270,13 +270,15 @@ function findTelegramHtmlSafeSplitIndex(text: string, maxLength: number): number
return text.length;
}
const normalizedMaxLength = Math.max(1, Math.floor(maxLength));
let splitAt = normalizedMaxLength;
const lastAmpersand = text.lastIndexOf("&", normalizedMaxLength - 1);
const lastSemicolon = text.lastIndexOf(";", normalizedMaxLength - 1);
if (lastAmpersand > lastSemicolon) {
splitAt = lastAmpersand;
if (lastAmpersand === -1) {
return normalizedMaxLength;
}
return splitAt > 0 ? splitAt : normalizedMaxLength;
const lastSemicolon = text.lastIndexOf(";", normalizedMaxLength - 1);
if (lastAmpersand < lastSemicolon) {
return normalizedMaxLength;
}
return lastAmpersand;
}
function popTelegramHtmlTag(tags: TelegramHtmlTag[], name: string): void {
@@ -300,15 +302,15 @@ export function splitTelegramHtmlChunks(html: string, limit: number): string[] {
const chunks: string[] = [];
const openTags: TelegramHtmlTag[] = [];
let current = "";
let chunkHasContent = false;
let chunkHasPayload = false;
const resetCurrent = () => {
current = buildTelegramHtmlOpenPrefix(openTags);
chunkHasContent = false;
chunkHasPayload = false;
};
const flushCurrent = () => {
if (!chunkHasContent) {
if (!chunkHasPayload) {
return;
}
chunks.push(`${current}${buildTelegramHtmlCloseSuffix(openTags)}`);
@@ -321,24 +323,31 @@ export function splitTelegramHtmlChunks(html: string, limit: number): string[] {
const available =
normalizedLimit - current.length - buildTelegramHtmlCloseSuffixLength(openTags);
if (available <= 0) {
const prefix = buildTelegramHtmlOpenPrefix(openTags);
if (!chunkHasContent && current === prefix) {
current += remaining;
chunkHasContent = true;
remaining = "";
break;
if (!chunkHasPayload) {
throw new Error(
`Telegram HTML chunk limit exceeded by tag overhead (limit=${normalizedLimit})`,
);
}
flushCurrent();
continue;
}
if (remaining.length <= available) {
current += remaining;
chunkHasContent = true;
chunkHasPayload = true;
break;
}
const splitAt = findTelegramHtmlSafeSplitIndex(remaining, available);
if (splitAt <= 0) {
if (!chunkHasPayload) {
throw new Error(
`Telegram HTML chunk limit exceeded by leading entity (limit=${normalizedLimit})`,
);
}
flushCurrent();
continue;
}
current += remaining.slice(0, splitAt);
chunkHasContent = true;
chunkHasPayload = true;
remaining = remaining.slice(splitAt);
flushCurrent();
}
@@ -363,7 +372,7 @@ export function splitTelegramHtmlChunks(html: string, limit: number): string[] {
if (!isClosing) {
const nextCloseLength = isSelfClosing ? 0 : `</${tagName}>`.length;
if (
chunkHasContent &&
chunkHasPayload &&
current.length +
rawTag.length +
buildTelegramHtmlCloseSuffixLength(openTags) +
@@ -375,7 +384,9 @@ export function splitTelegramHtmlChunks(html: string, limit: number): string[] {
}
current += rawTag;
chunkHasContent = true;
if (isSelfClosing) {
chunkHasPayload = true;
}
if (isClosing) {
popTelegramHtmlTag(openTags, tagName);
} else if (!isSelfClosing) {

View File

@@ -1288,6 +1288,35 @@ describe("sendMessageTelegram", () => {
});
expect(res.messageId).toBe("91");
});
it("preserves caller plain-text fallback across chunked html parse retries", async () => {
  // Arrange: an HTML body longer than one 4000-char chunk, plus a caller-provided
  // plain-text rendering whose two halves are easy to tell apart ("P…" then "Q…").
  const chatId = "123";
  const boldBody = "A".repeat(5000);
  const htmlText = `<b>${boldBody}</b>`;
  const plainText = "P".repeat(2500) + "Q".repeat(2500);
  const parseErr = new Error(
    "400: Bad Request: can't parse entities: Can't find end of the entity starting at byte offset 9",
  );

  // Mock sequence: reject, resolve(90), reject, resolve(91). The assertions below
  // treat calls 1 and 3 (the resolved ones) as the plain-text fallback sends.
  const sendMessage = vi.fn();
  for (const messageId of [90, 91]) {
    sendMessage.mockRejectedValueOnce(parseErr);
    sendMessage.mockResolvedValueOnce({ message_id: messageId, chat: { id: chatId } });
  }
  const api = { sendMessage } as unknown as { sendMessage: typeof sendMessage };

  // Act
  const res = await sendMessageTelegram(chatId, htmlText, {
    token: "tok",
    api,
    textMode: "html",
    plainText,
  });

  // Assert: four sends in total; the fallback sends carry the caller's plain text,
  // in order, with no markup in them.
  expect(sendMessage).toHaveBeenCalledTimes(4);
  const fallbackTexts = [1, 3].map((callIndex) =>
    String(sendMessage.mock.calls[callIndex]?.[1] ?? ""),
  );
  expect(fallbackTexts.join("")).toBe(plainText);
  const noneContainMarkup = !fallbackTexts.some((sentText) => sentText.includes("<"));
  expect(noneContainMarkup).toBe(true);
  expect(res.messageId).toBe("91");
});
});
describe("reactMessageTelegram", () => {

View File

@@ -108,6 +108,36 @@ function resolveTelegramMessageIdOrThrow(
throw new Error(`Telegram ${context} returned no message_id`);
}
/**
 * Splits a plain-text fallback into pieces intended to line up one-to-one with
 * the HTML chunks of the same message.
 *
 * Behaviour:
 * - Empty `text` yields `[]`.
 * - If `chunkCount <= 1` or the text already fits in `limit`, the text is
 *   returned as a single chunk (unchanged, even if longer than `limit` when
 *   `chunkCount <= 1`).
 * - If the text fits into `chunkCount` chunks of `limit` UTF-16 code units, it
 *   is spread as evenly as possible across those chunks.
 * - Otherwise it is cut into consecutive chunks of at most `limit` code units,
 *   which produces more than `chunkCount` chunks.
 *
 * Cuts never land between the two halves of a UTF-16 surrogate pair (for
 * example inside an emoji): the cut is moved forward by one unit when that
 * still respects `limit`, otherwise back by one unit. The only exception is
 * `limit === 1`, where a pair cannot fit in any chunk. Backing off at a full
 * `limit`-sized chunk can, in the tight case where the text fills every
 * requested chunk exactly, yield one chunk more than `chunkCount`.
 *
 * Concatenating the returned chunks always reproduces `text`.
 *
 * @param text - Plain text to split.
 * @param chunkCount - Number of chunks the caller would like to receive.
 * @param limit - Maximum chunk length in UTF-16 code units; floored and clamped to >= 1.
 * @returns The chunks in order.
 */
function splitTelegramPlainTextFallback(text: string, chunkCount: number, limit: number): string[] {
  if (!text) {
    return [];
  }
  const normalizedLimit = Math.max(1, Math.floor(limit));
  if (chunkCount <= 1 || text.length <= normalizedLimit) {
    return [text];
  }

  const isHighSurrogate = (code: number): boolean => code >= 0xd800 && code <= 0xdbff;
  const isLowSurrogate = (code: number): boolean => code >= 0xdc00 && code <= 0xdfff;
  // True when cutting at `index` would separate a high surrogate from its low surrogate.
  const splitsSurrogatePair = (index: number): boolean =>
    index > 0 &&
    index < text.length &&
    isHighSurrogate(text.charCodeAt(index - 1)) &&
    isLowSurrogate(text.charCodeAt(index));

  // When the text cannot fit into the requested number of chunks, balancing is
  // pointless: fall back to consecutive limit-sized chunks.
  const fitsRequestedCount = text.length <= chunkCount * normalizedLimit;

  const chunks: string[] = [];
  let offset = 0;
  while (offset < text.length) {
    const remainingChars = text.length - offset;
    const remainingChunks = chunkCount - chunks.length;
    // Even share of what is left; once the requested chunks are used up (or
    // balancing is off) simply take as much as the limit allows.
    const balancedLength =
      fitsRequestedCount && remainingChunks > 0
        ? Math.ceil(remainingChars / remainingChunks)
        : remainingChars;
    let nextChunkLength = Math.min(normalizedLimit, balancedLength);

    if (splitsSurrogatePair(offset + nextChunkLength)) {
      if (nextChunkLength < normalizedLimit) {
        // Room to spare: pull the low surrogate into this chunk.
        nextChunkLength += 1;
      } else if (nextChunkLength > 1) {
        // Chunk is already at the limit: leave the whole pair for the next chunk.
        nextChunkLength -= 1;
      }
      // With a limit of 1 the pair cannot be kept together; cut as-is.
    }

    chunks.push(text.slice(offset, offset + nextChunkLength));
    offset += nextChunkLength;
  }
  return chunks;
}
const PARSE_ERR_RE = /can't parse entities|parse entities|find end of the entity/i;
const THREAD_NOT_FOUND_RE = /400:\s*Bad Request:\s*message thread not found/i;
const MESSAGE_NOT_MODIFIED_RE =
@@ -660,10 +690,14 @@ export async function sendMessageTelegram(
rawText: string,
context: string,
): Promise<{ messageId: string; chatId: string }> => {
const chunks = splitTelegramHtmlChunks(rawText, 4000).map((chunk) => ({
const htmlChunks = splitTelegramHtmlChunks(rawText, 4000);
const plainTextChunks = opts.plainText
? splitTelegramPlainTextFallback(opts.plainText, htmlChunks.length, 4000)
: [];
const chunks = htmlChunks.map((chunk, index) => ({
rawText: chunk,
htmlText: chunk,
plainText: chunk,
plainText: plainTextChunks[index],
}));
let lastMessageId = "";