fix(discord): split CJK text at safe break points (#73960)

Co-authored-by: openclaw-clownfish[bot] <280122609+openclaw-clownfish[bot]@users.noreply.github.com>
Co-authored-by: Penchan <5032148+p3nchan@users.noreply.github.com>
This commit is contained in:
openclaw-clownfish[bot]
2026-04-29 02:07:26 -07:00
committed by GitHub
parent 0f078f2ea2
commit c33968e10c
3 changed files with 78 additions and 9 deletions

View File

@@ -18,6 +18,7 @@ Docs: https://docs.openclaw.ai
### Fixes
- Channels/Discord: fail startup closed when Discord cannot resolve the bot's own identity and keep mention gating active when only configured mention patterns can detect mentions, so the provider no longer continues with a missing bot id. Fixes #42219; carries forward #46856 and #49218. Thanks @education-01 and @BenediktSchackenberg.
- Channels/Discord: split long CJK replies at punctuation and code-point-safe fallback boundaries so Discord chunking stays readable without corrupting astral characters. Fixes #38597; repairs #71384. Thanks @p3nchan.
- Browser/gateway: ignore Playwright dialog-close races from `Page.handleJavaScriptDialog` so browser automation no longer crashes the Gateway when a dialog disappears before Playwright accepts it. (#40067) Thanks @randyjtw.
- Cron/Gateway: defer missed isolated agent-turn catch-up out of the channel startup window, so overdue cron work cannot starve Discord or Telegram while providers connect after a restart. Thanks @vincentkoc.
- Plugins/runtime-deps: prune stale `openclaw-unknown-*` bundled runtime dependency roots during Gateway startup while keeping recent or locked roots, so old staging debris cannot keep growing across restarts. Thanks @vincentkoc.

View File

@@ -73,6 +73,30 @@ describe("chunkDiscordText", () => {
expect(chunks.join("")).toBe(text);
});
it("uses CJK punctuation as a safe long-line split point", () => {
  // Three sentences of five ideographs plus an ideographic full stop (6 UTF-16 units each).
  // With maxChars 10 and no whitespace anywhere, every cut has to land right after a full stop.
  const input = "一二三四五。六七八九十。甲乙丙丁戊。";
  const expected = ["一二三四五。", "六七八九十。", "甲乙丙丁戊。"];
  const pieces = chunkDiscordText(input, { maxChars: 10, maxLines: 50 });
  expect(pieces).toEqual(expected);
  // Chunking must be lossless: gluing the pieces back together restores the input.
  expect(pieces.join("")).toBe(input);
});
it("still prefers whitespace before CJK punctuation", () => {
  // The first 13 UTF-16 units ("alpha beta。ga") contain both a space and a full stop;
  // the expected first chunk shows that the whitespace candidate is the one that gets used.
  const input = "alpha beta。gamma delta";
  const pieces = chunkDiscordText(input, { maxChars: 13, maxLines: 50 });
  const firstPiece = pieces[0];
  expect(firstPiece).toBe("alpha");
  // Chunking must be lossless: gluing the pieces back together restores the input.
  expect(pieces.join("")).toBe(input);
});
it("does not split surrogate pairs at hard fallback boundaries", () => {
  // Each emoji is two UTF-16 units, so a naive cut at index 3 of "ab😀cd😀ef" would fall
  // between the halves of the first emoji. The expected chunks keep every emoji intact.
  const input = "ab😀cd😀ef";
  const expected = ["ab", "😀c", "d😀", "ef"];
  const pieces = chunkDiscordText(input, { maxChars: 3, maxLines: 50 });
  expect(pieces).toEqual(expected);
  // Chunking must be lossless: gluing the pieces back together restores the input.
  expect(pieces.join("")).toBe(input);
});
it("keeps reasoning italics balanced across chunks", () => {
const body = Array.from({ length: 25 }, (_, i) => `${i + 1}. line`).join("\n");
const text = `Reasoning:\n_${body}_`;

View File

@@ -22,6 +22,7 @@ type OpenFence = {
// Default chunk limits (characters and lines).
// NOTE(review): presumably applied when callers omit maxChars / maxLines — usage is outside this view.
const DEFAULT_MAX_CHARS = 2000;
const DEFAULT_MAX_LINES = 17;
// A Markdown code-fence line: up to three leading spaces, a run of three or more
// backticks or tildes, then whatever follows on the line.
const FENCE_RE = /^( {0,3})(`{3,}|~{3,})(.*)$/;
// CJK punctuation after which a long line may be broken. Only closing / terminal marks
// are listed, so a break never separates an opening bracket from the text it introduces.
// Written with \u escapes so the set survives tools that drop or mangle non-ASCII text:
//   、 。 ， ． ！ ？ ； ：   then closing brackets   ） ］ ｝ 〉 》 」 』 】 〕 〗 〙 〛
// NOTE(review): an empty class (`/[]/u`) never matches, which silently disables CJK
// punctuation breaks; confirm this set against the intended list.
const CJK_PUNCTUATION_BREAK_AFTER_RE =
  /[\u3001\u3002\uFF0C\uFF0E\uFF01\uFF1F\uFF1B\uFF1A\uFF09\uFF3D\uFF5D\u3009\u300B\u300D\u300F\u3011\u3015\u3017\u3019\u301B]/u;
function countLines(text: string) {
if (!text) {
@@ -63,6 +64,51 @@ function closeFenceIfNeeded(text: string, openFence: OpenFence | null) {
return `${text}${closeLine}`;
}
/** Reports whether a UTF-16 code unit is a high (leading) surrogate, i.e. within 0xD800..0xDBFF. */
function isHighSurrogate(code: number): boolean {
  const rangeStart = 0xd800;
  const rangeEnd = 0xdbff;
  return rangeStart <= code && code <= rangeEnd;
}
/** Reports whether a UTF-16 code unit is a low (trailing) surrogate, i.e. within 0xDC00..0xDFFF. */
function isLowSurrogate(code: number): boolean {
  const rangeStart = 0xdc00;
  const rangeEnd = 0xdfff;
  return rangeStart <= code && code <= rangeEnd;
}
/**
 * Adjusts a UTF-16 cut position so it does not fall between the two halves of a
 * surrogate pair.
 *
 * @param text - The string that is about to be cut.
 * @param index - Desired cut position in UTF-16 code units; clamped into `[0, text.length]`.
 * @returns The clamped position, moved off the middle of a surrogate pair when necessary.
 */
function clampToCodePointBoundary(text: string, index: number): number {
  const cut = Math.min(Math.max(0, index), text.length);
  // Positions at either end of the string cannot be inside a pair.
  const isInterior = cut > 0 && cut < text.length;
  if (!isInterior) {
    return cut;
  }
  const unitBefore = text.charCodeAt(cut - 1);
  const unitAfter = text.charCodeAt(cut);
  const splitsPair = isHighSurrogate(unitBefore) && isLowSurrogate(unitAfter);
  if (!splitsPair) {
    return cut;
  }
  // Normally back up to just before the pair. When the pair sits at the very start
  // (cut === 1), step past it instead — presumably so a caller slicing at the result
  // never receives an empty prefix.
  return cut > 1 ? cut - 1 : cut + 1;
}
/**
 * Locates the last whitespace code unit in `window`.
 *
 * @param window - The candidate prefix to search.
 * @returns The index of the last whitespace code unit, or -1 when there is none.
 */
function findWhitespaceBreak(window: string): number {
  const whitespace = /\s/;
  let idx = window.length;
  while (idx-- > 0) {
    if (whitespace.test(window.charAt(idx))) {
      // Report the separator's own index so the whitespace stays with the next segment.
      return idx;
    }
  }
  return -1;
}
/**
 * Scans `window` from the end, one character at a time, for the last character
 * matched by `CJK_PUNCTUATION_BREAK_AFTER_RE` that is not at the very start.
 *
 * @param window - The candidate prefix to search.
 * @returns The exclusive end index of the matching character, or -1 when none qualifies.
 */
function findCjkPunctuationBreak(window: string): number {
  let cursor = window.length;
  while (cursor > 0) {
    const lastUnit = window.charCodeAt(cursor - 1);
    // A trailing low surrogate is read together with the unit before it, so the
    // scan steps over a whole two-unit character instead of half of one.
    const width = isLowSurrogate(lastUnit) && cursor > 1 ? 2 : 1;
    const charStart = cursor - width;
    const candidate = window.slice(charStart, cursor);
    // A match at index 0 is skipped: it is not accepted as a break point.
    if (charStart > 0 && CJK_PUNCTUATION_BREAK_AFTER_RE.test(candidate)) {
      // Report the exclusive end so the punctuation stays with the current segment.
      return cursor;
    }
    cursor = charStart;
  }
  return -1;
}
function splitLongLine(
line: string,
maxChars: number,
@@ -76,20 +122,18 @@ function splitLongLine(
let remaining = line;
while (remaining.length > limit) {
if (opts.preserveWhitespace) {
out.push(remaining.slice(0, limit));
remaining = remaining.slice(limit);
const breakIdx = clampToCodePointBoundary(remaining, limit);
out.push(remaining.slice(0, breakIdx));
remaining = remaining.slice(breakIdx);
continue;
}
const window = remaining.slice(0, limit);
let breakIdx = -1;
for (let i = window.length - 1; i >= 0; i--) {
if (/\s/.test(window[i])) {
breakIdx = i;
break;
}
let breakIdx = findWhitespaceBreak(window);
if (breakIdx <= 0) {
breakIdx = findCjkPunctuationBreak(window);
}
if (breakIdx <= 0) {
breakIdx = limit;
breakIdx = clampToCodePointBoundary(remaining, limit);
}
out.push(remaining.slice(0, breakIdx));
// Keep the separator for the next segment so words don't get glued together.