From fc1f1f4fdfce779441acf6c6dd841aedf5d16a99 Mon Sep 17 00:00:00 2001 From: Dallin Romney Date: Mon, 4 May 2026 21:50:40 +0800 Subject: [PATCH] fix(tui): preserve code spans, code blocks, and dotted/hyphenated identifiers from long-token sanitizer (#77335) The display sanitizer's long-token chunker (`\S{33,}` -> 32-char chunks joined by spaces) was injecting literal spaces inside inline code spans, fenced code blocks, and bare identifiers it didn't recognize. Tokens like `requireConfirmationForMutatingActions`, `ubuntu-budgie-desktop-environment`, and `binary_sensor.sense_energy_monitor_power` rendered with mid-word spaces, contaminating copy/paste of package names, entity IDs, and shell line-continuations. Fix: - Make sanitizer code-aware: split text into fenced/inline-code segments and prose, and only run the chunker on prose segments. Code regions pass through verbatim. - Widen `isCopySensitiveToken` to use the punctuation-stripped candidate for all classification, and accept any `FILE_LIKE_RE` token that contains `_`, `-`, or `.` (covers package names, dotted IDs, kebab flags). Picks up the goals of #69340 and #39565. - Skip chunking for symbol-only runs (box-drawing rows, dashes, equals) so table borders aren't corrupted. - Preserve the original goal of narrow-terminal protection: long unidentifiable prose tokens (e.g. accidental base64 dumps) are still chunked so they don't blow out terminal layout. Security ordering preserved: ANSI strip / control-char strip / binary redaction still run on the whole string before segmentation, so code regions cannot smuggle escapes, control characters, or binary garbage past the sanitizer. 
15 new regression tests cover: camelCase config keys in inline code, hyphenated package names (bare and in code), dotted entity IDs (bare and in code), backtick and tilde fenced blocks, base64-like blobs in code, prose-token chunking unchanged, prose-around-code mixed content, box-drawing horizontal rules, multi-line shell `\\` continuations, plus three explicit security-ordering tests asserting ANSI/control/binary stripping still runs inside code segments. Fixes #48432, #39505. Supersedes #69340, #39565 (carries forward both ideas in a more general fix). Carries forward the code-fence-aware approach from the closed #48445. --- CHANGELOG.md | 1 + src/tui/tui-formatters.test.ts | 129 +++++++++++++++++++++++++++++++++ src/tui/tui-formatters.ts | 74 ++++++++++++++++--- 3 files changed, 195 insertions(+), 9 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 479a43cb86e..3bc9b039775 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -44,6 +44,7 @@ Docs: https://docs.openclaw.ai ### Fixes - TUI/escape abort: track the in-flight runId after `chat.send` resolves so pressing Esc during the gap before the first gateway event aborts the run instead of repeatedly printing `no active run`. Fixes #1296. Thanks @Lukavyi and @romneyda. +- TUI/render: stop the long-token sanitizer from injecting literal spaces inside inline code spans, fenced code blocks, table borders, and bare hyphenated/dotted identifiers, so copied package names, entity IDs, and shell line-continuations stay byte-for-byte intact while narrow-terminal protection still chunks unidentifiable long prose tokens. Fixes #48432, #39505. Thanks @DocOellerson, @xeusoc, @CCcassiusdjs, @akramcodez, @brokemac79, @romneyda. - Gateway/status: label Linux managed gateway services as `systemd user`, making status output explicit about the user-service scope instead of implying a system-level unit. Thanks @vincentkoc. 
- Plugins/install: remove the previous managed plugin directory when a reinstall switches sources, so stale ClawHub and npm copies no longer keep duplicate plugin ids in discovery after the new install wins. Thanks @vincentkoc. - Plugins/install: let official plugin reinstall recovery repair source-only installed runtime shadows, so `openclaw plugins install npm:@openclaw/discord --force` can replace the bad package instead of stopping at stale config validation. Thanks @vincentkoc. diff --git a/src/tui/tui-formatters.test.ts b/src/tui/tui-formatters.test.ts index 5d759a4a9eb..34c6c67879e 100644 --- a/src/tui/tui-formatters.test.ts +++ b/src/tui/tui-formatters.test.ts @@ -378,4 +378,133 @@ describe("sanitizeRenderableText", () => { expect(sanitized).toBe(input); }); + + it("preserves long camelCase identifiers wrapped in inline code spans (#48432)", () => { + const input = "- `requireConfirmationForMutatingActions: false`"; + const sanitized = sanitizeRenderableText(input); + + expect(sanitized).toBe(input); + }); + + it("preserves long hyphenated package names in inline code spans (#48432)", () => { + const input = "Install `ubuntu-budgie-desktop-environment` to fix it."; + const sanitized = sanitizeRenderableText(input); + + expect(sanitized).toBe(input); + }); + + it("preserves dotted entity IDs in inline code spans (#39505)", () => { + const input = "See `binary_sensor.sense_energy_monitor_power` for the live reading."; + const sanitized = sanitizeRenderableText(input); + + expect(sanitized).toBe(input); + }); + + it("preserves bare hyphenated package names in prose", () => { + const input = "Run apt install ubuntu-budgie-desktop-environment after enabling the PPA."; + const sanitized = sanitizeRenderableText(input); + + expect(sanitized).toBe(input); + }); + + it("preserves bare dotted entity IDs in prose", () => { + const input = "Watch binary_sensor.sense_energy_monitor_power.daily_energy after midnight."; + const sanitized = sanitizeRenderableText(input); + 
+ expect(sanitized).toBe(input); + }); + + it("preserves backtick-fenced code blocks verbatim", () => { + const input = [ + "Run this:", + "```bash", + "sudo cp -a /var/lib/machines/fc41/etc/systemd/network/. \\", + " /var/lib/machines/fc43/etc/systemd/network/", + "```", + "Done.", + ].join("\n"); + const sanitized = sanitizeRenderableText(input); + + expect(sanitized).toBe(input); + }); + + it("preserves tilde-fenced code blocks verbatim", () => { + const input = [ + "Example:", + "~~~typescript", + "const requireConfirmationForMutatingActions = false;", + "~~~", + ].join("\n"); + const sanitized = sanitizeRenderableText(input); + + expect(sanitized).toBe(input); + }); + + it("preserves long base64-like blobs inside inline code spans", () => { + const input = "token: `e3b19c3b87bcf364b23eebb2c276e96ec478956ba1d84c93deadbeef`"; // pragma: allowlist secret + const sanitized = sanitizeRenderableText(input); + + expect(sanitized).toBe(input); + }); + + it("still chunks long unbroken prose tokens outside code spans", () => { + const input = `prefix ${"x".repeat(120)} suffix`; + const sanitized = sanitizeRenderableText(input); + + const longestSegment = Math.max(...sanitized.split(/\s+/).map((s) => s.length)); + expect(longestSegment).toBeLessThanOrEqual(32); + }); + + it("preserves prose around code blocks while chunking long prose tokens", () => { + const input = [ + `before ${"x".repeat(120)}`, + "```", + "code line preserved verbatim", + "```", + `after ${"y".repeat(80)}`, + ].join("\n"); + const sanitized = sanitizeRenderableText(input); + + expect(sanitized).toContain("code line preserved verbatim"); + expect(sanitized).not.toContain("x".repeat(33)); + expect(sanitized).not.toContain("y".repeat(33)); + }); + + it("does not chunk box-drawing horizontal rules used in tables", () => { + const input = "─".repeat(60); + const sanitized = sanitizeRenderableText(input); + + expect(sanitized).toBe(input); + }); + + it("does not insert spaces before backslash 
line-continuations in fenced code", () => { + const longContinuation = `cmd ${"a".repeat(40)} \\`; + const input = ["```bash", longContinuation, " next", "```"].join("\n"); + const sanitized = sanitizeRenderableText(input); + + expect(sanitized).toContain(longContinuation); + expect(sanitized).not.toContain("\\ "); + }); + + it("strips ANSI escapes inside fenced code blocks (sanitization runs before segmentation)", () => { + const input = "Hello\n```\nlet x = 1; injected\n```\nbye"; + const sanitized = sanitizeRenderableText(input); + + expect(sanitized).not.toContain(""); + expect(sanitized).toContain("let x = 1;"); + }); + + it("strips control chars inside inline code spans (sanitization runs before segmentation)", () => { + const input = "Hello `safe\x00content` world"; + const sanitized = sanitizeRenderableText(input); + + expect(sanitized).toBe("Hello `safecontent` world"); + }); + + it("redacts heavily corrupted lines even inside fenced code blocks", () => { + const input = `Header\n\`\`\`\n${"�".repeat(40)}\n\`\`\`\nFooter`; + const sanitized = sanitizeRenderableText(input); + + expect(sanitized).toContain("[binary data omitted]"); + }); }); diff --git a/src/tui/tui-formatters.ts b/src/tui/tui-formatters.ts index 1502c69ae5c..5b6a871780b 100644 --- a/src/tui/tui-formatters.ts +++ b/src/tui/tui-formatters.ts @@ -13,11 +13,17 @@ const URL_PREFIX_RE = /^(https?:\/\/|file:\/\/)/i; const WINDOWS_DRIVE_RE = /^[a-zA-Z]:[\\/]/; const FILE_LIKE_RE = /^[a-zA-Z0-9._-]+$/; const EDGE_PUNCTUATION_RE = /^[`"'([{<]+|[`"')\]}>.,:;!?]+$/g; +const ALPHANUMERIC_RE = /[A-Za-z0-9]/; const TOKENISH_MIN_LENGTH = 24; const RTL_SCRIPT_RE = /[\u0590-\u08ff\ufb1d-\ufdff\ufe70-\ufefc]/; const BIDI_CONTROL_RE = /[\u202a-\u202e\u2066-\u2069]/; const RTL_ISOLATE_START = "\u2067"; const RTL_ISOLATE_END = "\u2069"; +// Fenced code blocks (``` or ~~~). Lazy on content; tolerates info string after +// the opening fence. Closing fence must sit on its own line. 
+const FENCED_CODE_RE = /(```|~~~)[^\n]*\n[\s\S]*?\n\1[^\n]*/g; +// Inline code spans with balanced backtick run (`code`, ``co`de``, ...). +const INLINE_CODE_RE = /(`+)(?:(?!\1).)+?\1/g; function hasControlChars(text: string): boolean { for (const char of text) { @@ -62,24 +68,29 @@ function isCopySensitiveToken(token: string): boolean { const coreToken = token.replace(EDGE_PUNCTUATION_RE, ""); const candidate = coreToken || token; - if (URL_PREFIX_RE.test(token)) { + if (URL_PREFIX_RE.test(candidate)) { return true; } if ( - token.startsWith("/") || - token.startsWith("~/") || - token.startsWith("./") || - token.startsWith("../") + candidate.startsWith("/") || + candidate.startsWith("~/") || + candidate.startsWith("./") || + candidate.startsWith("../") ) { return true; } - if (WINDOWS_DRIVE_RE.test(token) || token.startsWith("\\\\")) { + if (WINDOWS_DRIVE_RE.test(candidate) || candidate.startsWith("\\\\")) { return true; } - if (token.includes("/") || token.includes("\\")) { + if (candidate.includes("/") || candidate.includes("\\")) { return true; } - if (token.includes("_") && FILE_LIKE_RE.test(token)) { + // Identifiers that look file-like, dotted, or hyphen/underscore-separated: + // package names, entity IDs, kebab/snake CLI flags, dotted module paths. + if ( + FILE_LIKE_RE.test(candidate) && + (candidate.includes("_") || candidate.includes("-") || candidate.includes(".")) + ) { return true; } @@ -96,9 +107,50 @@ function normalizeLongTokenForDisplay(token: string): string { if (isCopySensitiveToken(token)) { return token; } + // Pure symbol/punctuation runs (table borders made of `─`, `=`, `-`) carry + // no copyable identifier; chunking would corrupt the visible structure. 
+ if (!ALPHANUMERIC_RE.test(token)) { + return token; + } return chunkToken(token, MAX_TOKEN_CHARS).join(" "); } +type Segment = { kind: "prose" | "code"; text: string }; + +function partitionByRegex(text: string, re: RegExp): Segment[] { + const parts: Segment[] = []; + let lastIndex = 0; + for (const match of text.matchAll(re)) { + const start = match.index ?? 0; + if (start > lastIndex) { + parts.push({ kind: "prose", text: text.slice(lastIndex, start) }); + } + parts.push({ kind: "code", text: match[0] }); + lastIndex = start + match[0].length; + } + if (lastIndex < text.length) { + parts.push({ kind: "prose", text: text.slice(lastIndex) }); + } + return parts; +} + +// Apply `transform` only to spans of `text` that are not inside fenced code +// blocks or inline code spans. Code regions pass through verbatim so long +// identifiers, dotted IDs, package names, and shell line-continuations the +// user may copy stay byte-for-byte intact. +function transformOutsideCode(text: string, transform: (segment: string) => string): string { + const fenced = partitionByRegex(text, FENCED_CODE_RE); + return fenced + .map((seg) => { + if (seg.kind === "code") { + return seg.text; + } + const inline = partitionByRegex(seg.text, INLINE_CODE_RE); + return inline.map((s) => (s.kind === "code" ? s.text : transform(s.text))).join(""); + }) + .join(""); +} + function redactBinaryLikeLine(line: string): string { const replacementCount = (line.match(REPLACEMENT_CHAR_RE) || []).length; if ( @@ -149,7 +201,11 @@ export function sanitizeRenderableText(text: string): string { .join("\n") : withoutControlChars; const tokenSafe = LONG_TOKEN_TEST_RE.test(redacted) - ? redacted.replace(LONG_TOKEN_RE, normalizeLongTokenForDisplay) + ? transformOutsideCode(redacted, (segment) => + LONG_TOKEN_TEST_RE.test(segment) + ? segment.replace(LONG_TOKEN_RE, normalizeLongTokenForDisplay) + : segment, + ) : redacted; return applyRtlIsolation(tokenSafe); }