mirror of
https://github.com/openclaw/openclaw.git
synced 2026-05-06 05:30:42 +00:00
fix(tui): preserve code spans, code blocks, and dotted/hyphenated identifiers from long-token sanitizer (#77335)
The display sanitizer's long-token chunker (`\S{33,}` -> 32-char chunks
joined by spaces) was injecting literal spaces inside inline code spans,
fenced code blocks, and bare identifiers it didn't recognize. Tokens like
`requireConfirmationForMutatingActions`, `ubuntu-budgie-desktop-environment`,
and `binary_sensor.sense_energy_monitor_power` rendered with mid-word
spaces, contaminating copy/paste of package names, entity IDs, and shell
line-continuations.
Fix:
- Make sanitizer code-aware: split text into fenced/inline-code segments
and prose, and only run the chunker on prose segments. Code regions
pass through verbatim.
- Widen `isCopySensitiveToken` to use the punctuation-stripped candidate
for all classification, and accept any `FILE_LIKE_RE` token that
contains `_`, `-`, or `.` (covers package names, dotted IDs, kebab
flags). Picks up the goals of #69340 and #39565.
- Skip chunking for symbol-only runs (box-drawing rows, dashes, equals)
so table borders aren't corrupted.
- Preserve the original goal of narrow-terminal protection: long
unidentifiable prose tokens (e.g. accidental base64 dumps) are still
chunked so they don't blow out terminal layout.
Security ordering preserved: ANSI strip / control-char strip / binary
redaction still run on the whole string before segmentation, so code
regions cannot smuggle escapes, control characters, or binary garbage
past the sanitizer.
16 new regression tests cover: camelCase config keys in inline code,
hyphenated package names (bare and in code), dotted entity IDs (bare
and in code), backtick and tilde fenced blocks, base64-like blobs in
code, prose-token chunking unchanged, prose-around-code mixed content,
box-drawing horizontal rules, multi-line shell `\\` continuations,
plus three explicit security-ordering tests asserting ANSI/control/
binary stripping still runs inside code segments.
Fixes #48432, #39505.
Supersedes #69340, #39565 (carries forward both ideas in a more
general fix). Carries forward the code-fence-aware approach from the
closed #48445.
This commit is contained in:
@@ -44,6 +44,7 @@ Docs: https://docs.openclaw.ai
|
||||
### Fixes
|
||||
|
||||
- TUI/escape abort: track the in-flight runId after `chat.send` resolves so pressing Esc during the gap before the first gateway event aborts the run instead of repeatedly printing `no active run`. Fixes #1296. Thanks @Lukavyi and @romneyda.
|
||||
- TUI/render: stop the long-token sanitizer from injecting literal spaces inside inline code spans, fenced code blocks, table borders, and bare hyphenated/dotted identifiers, so copied package names, entity IDs, and shell line-continuations stay byte-for-byte intact while narrow-terminal protection still chunks unidentifiable long prose tokens. Fixes #48432, #39505. Thanks @DocOellerson, @xeusoc, @CCcassiusdjs, @akramcodez, @brokemac79, @romneyda.
|
||||
- Gateway/status: label Linux managed gateway services as `systemd user`, making status output explicit about the user-service scope instead of implying a system-level unit. Thanks @vincentkoc.
|
||||
- Plugins/install: remove the previous managed plugin directory when a reinstall switches sources, so stale ClawHub and npm copies no longer keep duplicate plugin ids in discovery after the new install wins. Thanks @vincentkoc.
|
||||
- Plugins/install: let official plugin reinstall recovery repair source-only installed runtime shadows, so `openclaw plugins install npm:@openclaw/discord --force` can replace the bad package instead of stopping at stale config validation. Thanks @vincentkoc.
|
||||
|
||||
@@ -378,4 +378,133 @@ describe("sanitizeRenderableText", () => {
|
||||
|
||||
expect(sanitized).toBe(input);
|
||||
});
|
||||
|
||||
it("preserves long camelCase identifiers wrapped in inline code spans (#48432)", () => {
|
||||
const input = "- `requireConfirmationForMutatingActions: false`";
|
||||
const sanitized = sanitizeRenderableText(input);
|
||||
|
||||
expect(sanitized).toBe(input);
|
||||
});
|
||||
|
||||
it("preserves long hyphenated package names in inline code spans (#48432)", () => {
|
||||
const input = "Install `ubuntu-budgie-desktop-environment` to fix it.";
|
||||
const sanitized = sanitizeRenderableText(input);
|
||||
|
||||
expect(sanitized).toBe(input);
|
||||
});
|
||||
|
||||
it("preserves dotted entity IDs in inline code spans (#39505)", () => {
|
||||
const input = "See `binary_sensor.sense_energy_monitor_power` for the live reading.";
|
||||
const sanitized = sanitizeRenderableText(input);
|
||||
|
||||
expect(sanitized).toBe(input);
|
||||
});
|
||||
|
||||
it("preserves bare hyphenated package names in prose", () => {
|
||||
const input = "Run apt install ubuntu-budgie-desktop-environment after enabling the PPA.";
|
||||
const sanitized = sanitizeRenderableText(input);
|
||||
|
||||
expect(sanitized).toBe(input);
|
||||
});
|
||||
|
||||
it("preserves bare dotted entity IDs in prose", () => {
|
||||
const input = "Watch binary_sensor.sense_energy_monitor_power.daily_energy after midnight.";
|
||||
const sanitized = sanitizeRenderableText(input);
|
||||
|
||||
expect(sanitized).toBe(input);
|
||||
});
|
||||
|
||||
it("preserves backtick-fenced code blocks verbatim", () => {
|
||||
const input = [
|
||||
"Run this:",
|
||||
"```bash",
|
||||
"sudo cp -a /var/lib/machines/fc41/etc/systemd/network/. \\",
|
||||
" /var/lib/machines/fc43/etc/systemd/network/",
|
||||
"```",
|
||||
"Done.",
|
||||
].join("\n");
|
||||
const sanitized = sanitizeRenderableText(input);
|
||||
|
||||
expect(sanitized).toBe(input);
|
||||
});
|
||||
|
||||
it("preserves tilde-fenced code blocks verbatim", () => {
|
||||
const input = [
|
||||
"Example:",
|
||||
"~~~typescript",
|
||||
"const requireConfirmationForMutatingActions = false;",
|
||||
"~~~",
|
||||
].join("\n");
|
||||
const sanitized = sanitizeRenderableText(input);
|
||||
|
||||
expect(sanitized).toBe(input);
|
||||
});
|
||||
|
||||
it("preserves long base64-like blobs inside inline code spans", () => {
|
||||
const input = "token: `e3b19c3b87bcf364b23eebb2c276e96ec478956ba1d84c93deadbeef`"; // pragma: allowlist secret
|
||||
const sanitized = sanitizeRenderableText(input);
|
||||
|
||||
expect(sanitized).toBe(input);
|
||||
});
|
||||
|
||||
it("still chunks long unbroken prose tokens outside code spans", () => {
|
||||
const input = `prefix ${"x".repeat(120)} suffix`;
|
||||
const sanitized = sanitizeRenderableText(input);
|
||||
|
||||
const longestSegment = Math.max(...sanitized.split(/\s+/).map((s) => s.length));
|
||||
expect(longestSegment).toBeLessThanOrEqual(32);
|
||||
});
|
||||
|
||||
it("preserves prose around code blocks while chunking long prose tokens", () => {
|
||||
const input = [
|
||||
`before ${"x".repeat(120)}`,
|
||||
"```",
|
||||
"code line preserved verbatim",
|
||||
"```",
|
||||
`after ${"y".repeat(80)}`,
|
||||
].join("\n");
|
||||
const sanitized = sanitizeRenderableText(input);
|
||||
|
||||
expect(sanitized).toContain("code line preserved verbatim");
|
||||
expect(sanitized).not.toContain("x".repeat(33));
|
||||
expect(sanitized).not.toContain("y".repeat(33));
|
||||
});
|
||||
|
||||
it("does not chunk box-drawing horizontal rules used in tables", () => {
|
||||
const input = "─".repeat(60);
|
||||
const sanitized = sanitizeRenderableText(input);
|
||||
|
||||
expect(sanitized).toBe(input);
|
||||
});
|
||||
|
||||
it("does not insert spaces before backslash line-continuations in fenced code", () => {
|
||||
const longContinuation = `cmd ${"a".repeat(40)} \\`;
|
||||
const input = ["```bash", longContinuation, " next", "```"].join("\n");
|
||||
const sanitized = sanitizeRenderableText(input);
|
||||
|
||||
expect(sanitized).toContain(longContinuation);
|
||||
expect(sanitized).not.toContain("\\ ");
|
||||
});
|
||||
|
||||
it("strips ANSI escapes inside fenced code blocks (sanitization runs before segmentation)", () => {
|
||||
const input = "Hello\n```\nlet x = 1;[31m injected[0m\n```\nbye";
|
||||
const sanitized = sanitizeRenderableText(input);
|
||||
|
||||
expect(sanitized).not.toContain("");
|
||||
expect(sanitized).toContain("let x = 1;");
|
||||
});
|
||||
|
||||
it("strips control chars inside inline code spans (sanitization runs before segmentation)", () => {
|
||||
const input = "Hello `safe\x00content` world";
|
||||
const sanitized = sanitizeRenderableText(input);
|
||||
|
||||
expect(sanitized).toBe("Hello `safecontent` world");
|
||||
});
|
||||
|
||||
it("redacts heavily corrupted lines even inside fenced code blocks", () => {
|
||||
const input = `Header\n\`\`\`\n${"<22>".repeat(40)}\n\`\`\`\nFooter`;
|
||||
const sanitized = sanitizeRenderableText(input);
|
||||
|
||||
expect(sanitized).toContain("[binary data omitted]");
|
||||
});
|
||||
});
|
||||
|
||||
@@ -13,11 +13,17 @@ const URL_PREFIX_RE = /^(https?:\/\/|file:\/\/)/i;
|
||||
// Windows absolute paths starting with a drive letter, e.g. `C:\` or `D:/`.
const WINDOWS_DRIVE_RE = /^[a-zA-Z]:[\\/]/;
// Tokens built solely from letters, digits, dots, underscores, and hyphens —
// the character set of package names, file names, entity IDs, and CLI flags.
const FILE_LIKE_RE = /^[a-zA-Z0-9._-]+$/;
// Leading opening punctuation (backtick/quote/bracket) or trailing closing
// punctuation; stripped to recover the "core" token for classification.
const EDGE_PUNCTUATION_RE = /^[`"'([{<]+|[`"')\]}>.,:;!?]+$/g;
// Any ASCII letter or digit — used to tell identifier-bearing tokens from
// pure symbol runs (table borders, horizontal rules).
const ALPHANUMERIC_RE = /[A-Za-z0-9]/;
// Minimum length before a token is considered "token-ish".
// NOTE(review): consumer of this threshold is outside this view — confirm usage.
const TOKENISH_MIN_LENGTH = 24;
// Characters from right-to-left script blocks (Hebrew, Arabic, and the
// Arabic/Hebrew presentation-form ranges).
const RTL_SCRIPT_RE = /[\u0590-\u08ff\ufb1d-\ufdff\ufe70-\ufefc]/;
// Unicode bidi embedding/override (U+202A–U+202E) and isolate (U+2066–U+2069)
// control characters.
const BIDI_CONTROL_RE = /[\u202a-\u202e\u2066-\u2069]/;
// U+2067 RIGHT-TO-LEFT ISOLATE — opens an RTL-isolated run.
const RTL_ISOLATE_START = "\u2067";
// U+2069 POP DIRECTIONAL ISOLATE — closes the isolated run.
const RTL_ISOLATE_END = "\u2069";
// Fenced code blocks (``` or ~~~). Lazy on content; tolerates info string after
// the opening fence. Closing fence must sit on its own line.
const FENCED_CODE_RE = /(```|~~~)[^\n]*\n[\s\S]*?\n\1[^\n]*/g;
// Inline code spans with balanced backtick run (`code`, ``co`de``, ...).
const INLINE_CODE_RE = /(`+)(?:(?!\1).)+?\1/g;
|
||||
|
||||
function hasControlChars(text: string): boolean {
|
||||
for (const char of text) {
|
||||
@@ -62,24 +68,29 @@ function isCopySensitiveToken(token: string): boolean {
|
||||
const coreToken = token.replace(EDGE_PUNCTUATION_RE, "");
|
||||
const candidate = coreToken || token;
|
||||
|
||||
if (URL_PREFIX_RE.test(token)) {
|
||||
if (URL_PREFIX_RE.test(candidate)) {
|
||||
return true;
|
||||
}
|
||||
if (
|
||||
token.startsWith("/") ||
|
||||
token.startsWith("~/") ||
|
||||
token.startsWith("./") ||
|
||||
token.startsWith("../")
|
||||
candidate.startsWith("/") ||
|
||||
candidate.startsWith("~/") ||
|
||||
candidate.startsWith("./") ||
|
||||
candidate.startsWith("../")
|
||||
) {
|
||||
return true;
|
||||
}
|
||||
if (WINDOWS_DRIVE_RE.test(token) || token.startsWith("\\\\")) {
|
||||
if (WINDOWS_DRIVE_RE.test(candidate) || candidate.startsWith("\\\\")) {
|
||||
return true;
|
||||
}
|
||||
if (token.includes("/") || token.includes("\\")) {
|
||||
if (candidate.includes("/") || candidate.includes("\\")) {
|
||||
return true;
|
||||
}
|
||||
if (token.includes("_") && FILE_LIKE_RE.test(token)) {
|
||||
// Identifiers that look file-like, dotted, or hyphen/underscore-separated:
|
||||
// package names, entity IDs, kebab/snake CLI flags, dotted module paths.
|
||||
if (
|
||||
FILE_LIKE_RE.test(candidate) &&
|
||||
(candidate.includes("_") || candidate.includes("-") || candidate.includes("."))
|
||||
) {
|
||||
return true;
|
||||
}
|
||||
|
||||
@@ -96,9 +107,50 @@ function normalizeLongTokenForDisplay(token: string): string {
|
||||
if (isCopySensitiveToken(token)) {
|
||||
return token;
|
||||
}
|
||||
// Pure symbol/punctuation runs (table borders made of `─`, `=`, `-`) carry
|
||||
// no copyable identifier; chunking would corrupt the visible structure.
|
||||
if (!ALPHANUMERIC_RE.test(token)) {
|
||||
return token;
|
||||
}
|
||||
return chunkToken(token, MAX_TOKEN_CHARS).join(" ");
|
||||
}
|
||||
|
||||
type Segment = { kind: "prose" | "code"; text: string };
|
||||
|
||||
function partitionByRegex(text: string, re: RegExp): Segment[] {
|
||||
const parts: Segment[] = [];
|
||||
let lastIndex = 0;
|
||||
for (const match of text.matchAll(re)) {
|
||||
const start = match.index ?? 0;
|
||||
if (start > lastIndex) {
|
||||
parts.push({ kind: "prose", text: text.slice(lastIndex, start) });
|
||||
}
|
||||
parts.push({ kind: "code", text: match[0] });
|
||||
lastIndex = start + match[0].length;
|
||||
}
|
||||
if (lastIndex < text.length) {
|
||||
parts.push({ kind: "prose", text: text.slice(lastIndex) });
|
||||
}
|
||||
return parts;
|
||||
}
|
||||
|
||||
// Apply `transform` only to spans of `text` that are not inside fenced code
|
||||
// blocks or inline code spans. Code regions pass through verbatim so long
|
||||
// identifiers, dotted IDs, package names, and shell line-continuations the
|
||||
// user may copy stay byte-for-byte intact.
|
||||
function transformOutsideCode(text: string, transform: (segment: string) => string): string {
|
||||
const fenced = partitionByRegex(text, FENCED_CODE_RE);
|
||||
return fenced
|
||||
.map((seg) => {
|
||||
if (seg.kind === "code") {
|
||||
return seg.text;
|
||||
}
|
||||
const inline = partitionByRegex(seg.text, INLINE_CODE_RE);
|
||||
return inline.map((s) => (s.kind === "code" ? s.text : transform(s.text))).join("");
|
||||
})
|
||||
.join("");
|
||||
}
|
||||
|
||||
function redactBinaryLikeLine(line: string): string {
|
||||
const replacementCount = (line.match(REPLACEMENT_CHAR_RE) || []).length;
|
||||
if (
|
||||
@@ -149,7 +201,11 @@ export function sanitizeRenderableText(text: string): string {
|
||||
.join("\n")
|
||||
: withoutControlChars;
|
||||
const tokenSafe = LONG_TOKEN_TEST_RE.test(redacted)
|
||||
? redacted.replace(LONG_TOKEN_RE, normalizeLongTokenForDisplay)
|
||||
? transformOutsideCode(redacted, (segment) =>
|
||||
LONG_TOKEN_TEST_RE.test(segment)
|
||||
? segment.replace(LONG_TOKEN_RE, normalizeLongTokenForDisplay)
|
||||
: segment,
|
||||
)
|
||||
: redacted;
|
||||
return applyRtlIsolation(tokenSafe);
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user