fix(tui): preserve code spans, code blocks, and dotted/hyphenated identifiers from long-token sanitizer (#77335)

The display sanitizer's long-token chunker (`\S{33,}` -> 32-char chunks
joined by spaces) was injecting literal spaces inside inline code spans,
fenced code blocks, and bare identifiers it didn't recognize. Tokens like
`requireConfirmationForMutatingActions`, `ubuntu-budgie-desktop-environment`,
and `binary_sensor.sense_energy_monitor_power` rendered with mid-word
spaces, contaminating copy/paste of package names, entity IDs, and shell
line-continuations.

Fix:

- Make sanitizer code-aware: split text into fenced/inline-code segments
  and prose, and only run the chunker on prose segments. Code regions
  pass through verbatim.
- Widen `isCopySensitiveToken` to use the punctuation-stripped candidate
  for all classification, and accept any `FILE_LIKE_RE` token that
  contains `_`, `-`, or `.` (covers package names, dotted IDs, kebab
  flags). Picks up the goals of #69340 and #39565.
- Skip chunking for symbol-only runs (box-drawing rows, dashes, equals)
  so table borders aren't corrupted.
- Preserve the original goal of narrow-terminal protection: long
  unidentifiable prose tokens (e.g. accidental base64 dumps) are still
  chunked so they don't blow out terminal layout.

Security ordering preserved: ANSI strip / control-char strip / binary
redaction still run on the whole string before segmentation, so code
regions cannot smuggle escapes, control characters, or binary garbage
past the sanitizer.

15 new regression tests cover: camelCase config keys in inline code,
hyphenated package names (bare and in code), dotted entity IDs (bare
and in code), backtick and tilde fenced blocks, base64-like blobs in
code, prose-token chunking unchanged, prose-around-code mixed content,
box-drawing horizontal rules, multi-line shell `\\` continuations,
plus three explicit security-ordering tests asserting ANSI/control/
binary stripping still runs inside code segments.

Fixes #48432, #39505.
Supersedes #69340, #39565 (carries forward both ideas in a more
general fix). Carries forward the code-fence-aware approach from the
closed #48445.
This commit is contained in:
Dallin Romney
2026-05-04 21:50:40 +08:00
committed by GitHub
parent 0b3a86cab0
commit fc1f1f4fdf
3 changed files with 195 additions and 9 deletions

View File

@@ -44,6 +44,7 @@ Docs: https://docs.openclaw.ai
### Fixes
- TUI/escape abort: track the in-flight runId after `chat.send` resolves so pressing Esc during the gap before the first gateway event aborts the run instead of repeatedly printing `no active run`. Fixes #1296. Thanks @Lukavyi and @romneyda.
- TUI/render: stop the long-token sanitizer from injecting literal spaces inside inline code spans, fenced code blocks, table borders, and bare hyphenated/dotted identifiers, so copied package names, entity IDs, and shell line-continuations stay byte-for-byte intact while narrow-terminal protection still chunks unidentifiable long prose tokens. Fixes #48432, #39505. Thanks @DocOellerson, @xeusoc, @CCcassiusdjs, @akramcodez, @brokemac79, @romneyda.
- Gateway/status: label Linux managed gateway services as `systemd user`, making status output explicit about the user-service scope instead of implying a system-level unit. Thanks @vincentkoc.
- Plugins/install: remove the previous managed plugin directory when a reinstall switches sources, so stale ClawHub and npm copies no longer keep duplicate plugin ids in discovery after the new install wins. Thanks @vincentkoc.
- Plugins/install: let official plugin reinstall recovery repair source-only installed runtime shadows, so `openclaw plugins install npm:@openclaw/discord --force` can replace the bad package instead of stopping at stale config validation. Thanks @vincentkoc.

View File

@@ -378,4 +378,133 @@ describe("sanitizeRenderableText", () => {
expect(sanitized).toBe(input);
});
it("preserves long camelCase identifiers wrapped in inline code spans (#48432)", () => {
const input = "- `requireConfirmationForMutatingActions: false`";
const sanitized = sanitizeRenderableText(input);
expect(sanitized).toBe(input);
});
it("preserves long hyphenated package names in inline code spans (#48432)", () => {
const input = "Install `ubuntu-budgie-desktop-environment` to fix it.";
const sanitized = sanitizeRenderableText(input);
expect(sanitized).toBe(input);
});
it("preserves dotted entity IDs in inline code spans (#39505)", () => {
const input = "See `binary_sensor.sense_energy_monitor_power` for the live reading.";
const sanitized = sanitizeRenderableText(input);
expect(sanitized).toBe(input);
});
it("preserves bare hyphenated package names in prose", () => {
const input = "Run apt install ubuntu-budgie-desktop-environment after enabling the PPA.";
const sanitized = sanitizeRenderableText(input);
expect(sanitized).toBe(input);
});
it("preserves bare dotted entity IDs in prose", () => {
const input = "Watch binary_sensor.sense_energy_monitor_power.daily_energy after midnight.";
const sanitized = sanitizeRenderableText(input);
expect(sanitized).toBe(input);
});
it("preserves backtick-fenced code blocks verbatim", () => {
const input = [
"Run this:",
"```bash",
"sudo cp -a /var/lib/machines/fc41/etc/systemd/network/. \\",
" /var/lib/machines/fc43/etc/systemd/network/",
"```",
"Done.",
].join("\n");
const sanitized = sanitizeRenderableText(input);
expect(sanitized).toBe(input);
});
it("preserves tilde-fenced code blocks verbatim", () => {
const input = [
"Example:",
"~~~typescript",
"const requireConfirmationForMutatingActions = false;",
"~~~",
].join("\n");
const sanitized = sanitizeRenderableText(input);
expect(sanitized).toBe(input);
});
it("preserves long base64-like blobs inside inline code spans", () => {
const input = "token: `e3b19c3b87bcf364b23eebb2c276e96ec478956ba1d84c93deadbeef`"; // pragma: allowlist secret
const sanitized = sanitizeRenderableText(input);
expect(sanitized).toBe(input);
});
it("still chunks long unbroken prose tokens outside code spans", () => {
const input = `prefix ${"x".repeat(120)} suffix`;
const sanitized = sanitizeRenderableText(input);
const longestSegment = Math.max(...sanitized.split(/\s+/).map((s) => s.length));
expect(longestSegment).toBeLessThanOrEqual(32);
});
it("preserves prose around code blocks while chunking long prose tokens", () => {
const input = [
`before ${"x".repeat(120)}`,
"```",
"code line preserved verbatim",
"```",
`after ${"y".repeat(80)}`,
].join("\n");
const sanitized = sanitizeRenderableText(input);
expect(sanitized).toContain("code line preserved verbatim");
expect(sanitized).not.toContain("x".repeat(33));
expect(sanitized).not.toContain("y".repeat(33));
});
it("does not chunk box-drawing horizontal rules used in tables", () => {
const input = "─".repeat(60);
const sanitized = sanitizeRenderableText(input);
expect(sanitized).toBe(input);
});
it("does not insert spaces before backslash line-continuations in fenced code", () => {
const longContinuation = `cmd ${"a".repeat(40)} \\`;
const input = ["```bash", longContinuation, " next", "```"].join("\n");
const sanitized = sanitizeRenderableText(input);
expect(sanitized).toContain(longContinuation);
expect(sanitized).not.toContain("\\ ");
});
it("strips ANSI escapes inside fenced code blocks (sanitization runs before segmentation)", () => {
const input = "Hello\n```\nlet x = 1; injected\n```\nbye";
const sanitized = sanitizeRenderableText(input);
expect(sanitized).not.toContain("");
expect(sanitized).toContain("let x = 1;");
});
it("strips control chars inside inline code spans (sanitization runs before segmentation)", () => {
const input = "Hello `safe\x00content` world";
const sanitized = sanitizeRenderableText(input);
expect(sanitized).toBe("Hello `safecontent` world");
});
it("redacts heavily corrupted lines even inside fenced code blocks", () => {
const input = `Header\n\`\`\`\n${"<22>".repeat(40)}\n\`\`\`\nFooter`;
const sanitized = sanitizeRenderableText(input);
expect(sanitized).toContain("[binary data omitted]");
});
});

View File

@@ -13,11 +13,17 @@ const URL_PREFIX_RE = /^(https?:\/\/|file:\/\/)/i;
const WINDOWS_DRIVE_RE = /^[a-zA-Z]:[\\/]/;
const FILE_LIKE_RE = /^[a-zA-Z0-9._-]+$/;
const EDGE_PUNCTUATION_RE = /^[`"'([{<]+|[`"')\]}>.,:;!?]+$/g;
const ALPHANUMERIC_RE = /[A-Za-z0-9]/;
const TOKENISH_MIN_LENGTH = 24;
const RTL_SCRIPT_RE = /[\u0590-\u08ff\ufb1d-\ufdff\ufe70-\ufefc]/;
const BIDI_CONTROL_RE = /[\u202a-\u202e\u2066-\u2069]/;
const RTL_ISOLATE_START = "\u2067";
const RTL_ISOLATE_END = "\u2069";
// Fenced code blocks (``` or ~~~). Lazy on content; tolerates an info string
// after the opening fence. The closing fence must start a line and reuse the
// opening marker; anything after it on that line is kept with the block.
// A fence that is never closed extends to the end of the input, matching
// CommonMark, so code in a block whose closing fence is missing (for example,
// partially received output) is left intact instead of being chunked as prose.
const FENCED_CODE_RE = /(```|~~~)[^\n]*\n(?:[\s\S]*?\n\1[^\n]*|[\s\S]*$)/g;
// Inline code spans with balanced backtick run (`code`, ``co`de``, ...).
// `.` does not match newlines here, so a span must open and close on one line.
const INLINE_CODE_RE = /(`+)(?:(?!\1).)+?\1/g;
function hasControlChars(text: string): boolean {
for (const char of text) {
@@ -62,24 +68,29 @@ function isCopySensitiveToken(token: string): boolean {
const coreToken = token.replace(EDGE_PUNCTUATION_RE, "");
const candidate = coreToken || token;
if (URL_PREFIX_RE.test(token)) {
if (URL_PREFIX_RE.test(candidate)) {
return true;
}
if (
token.startsWith("/") ||
token.startsWith("~/") ||
token.startsWith("./") ||
token.startsWith("../")
candidate.startsWith("/") ||
candidate.startsWith("~/") ||
candidate.startsWith("./") ||
candidate.startsWith("../")
) {
return true;
}
if (WINDOWS_DRIVE_RE.test(token) || token.startsWith("\\\\")) {
if (WINDOWS_DRIVE_RE.test(candidate) || candidate.startsWith("\\\\")) {
return true;
}
if (token.includes("/") || token.includes("\\")) {
if (candidate.includes("/") || candidate.includes("\\")) {
return true;
}
if (token.includes("_") && FILE_LIKE_RE.test(token)) {
// Identifiers that look file-like, dotted, or hyphen/underscore-separated:
// package names, entity IDs, kebab/snake CLI flags, dotted module paths.
if (
FILE_LIKE_RE.test(candidate) &&
(candidate.includes("_") || candidate.includes("-") || candidate.includes("."))
) {
return true;
}
@@ -96,9 +107,50 @@ function normalizeLongTokenForDisplay(token: string): string {
if (isCopySensitiveToken(token)) {
return token;
}
// Pure symbol/punctuation runs (table borders made of `─`, `=`, `-`) carry
// no copyable identifier; chunking would corrupt the visible structure.
if (!ALPHANUMERIC_RE.test(token)) {
return token;
}
return chunkToken(token, MAX_TOKEN_CHARS).join(" ");
}
// A slice of the input tagged as either ordinary prose or a code region.
type Segment = { kind: "prose" | "code"; text: string };
// Split `text` into consecutive segments: every match of `re` becomes a
// "code" segment and each non-empty gap between matches (including the
// leading and trailing remainder) becomes a "prose" segment. Concatenating
// the segment texts in order reproduces `text`. `re` must carry the `g`
// flag because it is handed to `String.prototype.matchAll`.
function partitionByRegex(text: string, re: RegExp): Segment[] {
  const segments: Segment[] = [];
  let cursor = 0;
  // Emit the prose between the current cursor and `end`, if there is any.
  const flushProseUpTo = (end: number): void => {
    if (end > cursor) {
      segments.push({ kind: "prose", text: text.slice(cursor, end) });
    }
  };
  for (const hit of text.matchAll(re)) {
    const hitStart = hit.index ?? 0;
    const hitText = hit[0];
    flushProseUpTo(hitStart);
    segments.push({ kind: "code", text: hitText });
    cursor = hitStart + hitText.length;
  }
  flushProseUpTo(text.length);
  return segments;
}
// Run `transform` over the parts of `text` that sit outside code and stitch
// the result back together in the original order. Fenced blocks are carved
// out first and copied through untouched; what remains is then split on
// inline code spans, which are likewise copied through untouched. Only the
// leftover prose pieces are handed to `transform`, so code regions keep
// their exact bytes.
function transformOutsideCode(text: string, transform: (segment: string) => string): string {
  let output = "";
  for (const outer of partitionByRegex(text, FENCED_CODE_RE)) {
    if (outer.kind === "code") {
      output += outer.text;
      continue;
    }
    // Prose between fenced blocks may still contain inline code spans.
    for (const inner of partitionByRegex(outer.text, INLINE_CODE_RE)) {
      output += inner.kind === "code" ? inner.text : transform(inner.text);
    }
  }
  return output;
}
function redactBinaryLikeLine(line: string): string {
const replacementCount = (line.match(REPLACEMENT_CHAR_RE) || []).length;
if (
@@ -149,7 +201,11 @@ export function sanitizeRenderableText(text: string): string {
.join("\n")
: withoutControlChars;
const tokenSafe = LONG_TOKEN_TEST_RE.test(redacted)
? redacted.replace(LONG_TOKEN_RE, normalizeLongTokenForDisplay)
? transformOutsideCode(redacted, (segment) =>
LONG_TOKEN_TEST_RE.test(segment)
? segment.replace(LONG_TOKEN_RE, normalizeLongTokenForDisplay)
: segment,
)
: redacted;
return applyRtlIsolation(tokenSafe);
}