diff --git a/CHANGELOG.md b/CHANGELOG.md
index 873e82c1a5e..65db02a80f8 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -18,6 +18,7 @@ Docs: https://docs.openclaw.ai
 ### Fixes
 
 - Channels/Discord: fail startup closed when Discord cannot resolve the bot's own identity and keep mention gating active when only configured mention patterns can detect mentions, so the provider no longer continues with a missing bot id. Fixes #42219; carries forward #46856 and #49218. Thanks @education-01 and @BenediktSchackenberg.
+- Channels/Discord: split long CJK replies at punctuation and code-point-safe fallback boundaries so Discord chunking stays readable without corrupting astral characters. Fixes #38597; repairs #71384. Thanks @p3nchan.
 - Browser/gateway: ignore Playwright dialog-close races from `Page.handleJavaScriptDialog` so browser automation no longer crashes the Gateway when a dialog disappears before Playwright accepts it. (#40067) Thanks @randyjtw.
 - Cron/Gateway: defer missed isolated agent-turn catch-up out of the channel startup window, so overdue cron work cannot starve Discord or Telegram while providers connect after a restart. Thanks @vincentkoc.
 - Plugins/runtime-deps: prune stale `openclaw-unknown-*` bundled runtime dependency roots during Gateway startup while keeping recent or locked roots, so old staging debris cannot keep growing across restarts. Thanks @vincentkoc.
diff --git a/extensions/discord/src/chunk.test.ts b/extensions/discord/src/chunk.test.ts
index 18de332e2ee..44b0ad1ef84 100644
--- a/extensions/discord/src/chunk.test.ts
+++ b/extensions/discord/src/chunk.test.ts
@@ -73,6 +73,38 @@ describe("chunkDiscordText", () => {
     expect(chunks.join("")).toBe(text);
   });
 
+  it("uses CJK punctuation as a safe long-line split point", () => {
+    const text = "一二三四五。六七八九十。甲乙丙丁戊。";
+    const chunks = chunkDiscordText(text, { maxChars: 10, maxLines: 50 });
+
+    expect(chunks).toEqual(["一二三四五。", "六七八九十。", "甲乙丙丁戊。"]);
+    expect(chunks.join("")).toBe(text);
+  });
+
+  it("splits after fullwidth punctuation and closing brackets", () => {
+    const text = "一二三四五\uFF0C六七八九十]甲乙丙丁戊";
+    const chunks = chunkDiscordText(text, { maxChars: 10, maxLines: 50 });
+
+    expect(chunks).toEqual(["一二三四五\uFF0C", "六七八九十]", "甲乙丙丁戊"]);
+    expect(chunks.join("")).toBe(text);
+  });
+
+  it("still prefers whitespace before CJK punctuation", () => {
+    const text = "alpha beta。gamma delta";
+    const chunks = chunkDiscordText(text, { maxChars: 13, maxLines: 50 });
+
+    expect(chunks[0]).toBe("alpha");
+    expect(chunks.join("")).toBe(text);
+  });
+
+  it("does not split surrogate pairs at hard fallback boundaries", () => {
+    const text = "ab😀cd😀ef";
+    const chunks = chunkDiscordText(text, { maxChars: 3, maxLines: 50 });
+
+    expect(chunks).toEqual(["ab", "😀c", "d😀", "ef"]);
+    expect(chunks.join("")).toBe(text);
+  });
+
   it("keeps reasoning italics balanced across chunks", () => {
     const body = Array.from({ length: 25 }, (_, i) => `${i + 1}. line`).join("\n");
     const text = `Reasoning:\n_${body}_`;
diff --git a/extensions/discord/src/chunk.ts b/extensions/discord/src/chunk.ts
index 43eb616f6f0..c6e726ddeb9 100644
--- a/extensions/discord/src/chunk.ts
+++ b/extensions/discord/src/chunk.ts
@@ -22,6 +22,10 @@ type OpenFence = {
 const DEFAULT_MAX_CHARS = 2000;
 const DEFAULT_MAX_LINES = 17;
 const FENCE_RE = /^( {0,3})(`{3,}|~{3,})(.*)$/;
+// Punctuation after which a long line may be split when it has no whitespace.
+// Covers halfwidth and fullwidth (U+FFxx) forms plus CJK closing brackets;
+// "]" is escaped so it stays a class member instead of closing the class.
+const CJK_PUNCTUATION_BREAK_AFTER_RE = /[、。,.!?;:)\]}\uFF0C\uFF0E\uFF01\uFF1F\uFF1B\uFF1A\uFF09\uFF3D\uFF5D〉》」』】〕〗〙]/u;
 
 function countLines(text: string) {
   if (!text) {
@@ -63,6 +67,67 @@ function closeFenceIfNeeded(text: string, openFence: OpenFence | null) {
   return `${text}${closeLine}`;
 }
 
+/** Returns true when `code` is a UTF-16 high (leading) surrogate code unit. */
+function isHighSurrogate(code: number) {
+  return code >= 0xd800 && code <= 0xdbff;
+}
+
+/** Returns true when `code` is a UTF-16 low (trailing) surrogate code unit. */
+function isLowSurrogate(code: number) {
+  return code >= 0xdc00 && code <= 0xdfff;
+}
+
+/**
+ * Clamps `index` into `[0, text.length]` and, if it would fall between the two
+ * halves of a surrogate pair, moves it off the pair: one unit back, or forward
+ * past the pair when backing up would return 0.
+ */
+function clampToCodePointBoundary(text: string, index: number) {
+  const boundary = Math.min(Math.max(0, index), text.length);
+  if (boundary <= 0 || boundary >= text.length) {
+    return boundary;
+  }
+  const previous = text.charCodeAt(boundary - 1);
+  const next = text.charCodeAt(boundary);
+  if (isHighSurrogate(previous) && isLowSurrogate(next)) {
+    return boundary > 1 ? boundary - 1 : boundary + 1;
+  }
+  return boundary;
+}
+
+/**
+ * Returns the index of the last whitespace character in `window`, or -1 when
+ * there is none.
+ */
+function findWhitespaceBreak(window: string) {
+  for (let i = window.length - 1; i >= 0; i--) {
+    if (/\s/.test(window[i])) {
+      // Return the separator index so whitespace stays with the next segment.
+      return i;
+    }
+  }
+  return -1;
+}
+
+/**
+ * Scans `window` backwards, stepping over a trailing low surrogate together
+ * with the unit before it, and returns the exclusive end index of the last
+ * break-after punctuation mark that is not at index 0, or -1 when none exists.
+ */
+function findCjkPunctuationBreak(window: string) {
+  for (let end = window.length; end > 0; ) {
+    const code = window.charCodeAt(end - 1);
+    const start = isLowSurrogate(code) && end > 1 ? end - 2 : end - 1;
+    const char = window.slice(start, end);
+    if (start > 0 && CJK_PUNCTUATION_BREAK_AFTER_RE.test(char)) {
+      // Return the exclusive end so CJK punctuation stays with the current segment.
+      return end;
+    }
+    end = start;
+  }
+  return -1;
+}
+
 function splitLongLine(
   line: string,
   maxChars: number,
@@ -76,20 +141,18 @@ function splitLongLine(
   let remaining = line;
   while (remaining.length > limit) {
     if (opts.preserveWhitespace) {
-      out.push(remaining.slice(0, limit));
-      remaining = remaining.slice(limit);
+      const breakIdx = clampToCodePointBoundary(remaining, limit);
+      out.push(remaining.slice(0, breakIdx));
+      remaining = remaining.slice(breakIdx);
       continue;
     }
     const window = remaining.slice(0, limit);
-    let breakIdx = -1;
-    for (let i = window.length - 1; i >= 0; i--) {
-      if (/\s/.test(window[i])) {
-        breakIdx = i;
-        break;
-      }
+    let breakIdx = findWhitespaceBreak(window);
+    if (breakIdx <= 0) {
+      breakIdx = findCjkPunctuationBreak(window);
     }
     if (breakIdx <= 0) {
-      breakIdx = limit;
+      breakIdx = clampToCodePointBoundary(remaining, limit);
     }
     out.push(remaining.slice(0, breakIdx));
     // Keep the separator for the next segment so words don't get glued together.