mirror of
https://github.com/openclaw/openclaw.git
synced 2026-05-04 21:30:22 +00:00
fix: preserve code block indentation in normalizeDirectiveWhitespace
## Summary

- Problem: `normalizeDirectiveWhitespace` applied whitespace-collapsing regexes globally, including inside fenced code blocks (` ``` ` / `~~~`) and indented code blocks (4-space / tab), corrupting indentation in assistant replies that contain code snippets.
- Why it matters: Code in any language where indentation is significant (Python, Go, YAML, etc.) or visually meaningful would render incorrectly after inline directive tags were stripped.
- What changed: Code blocks are stashed under a Unicode private-use sentinel (`\uE000`) before normalization, the existing prose regexes run on the masked text, and the original blocks are then restored verbatim.
- What did NOT change: All prose normalization rules are retained as-is (`\r\n` conversion, multi-space collapse, leading blank-line strip, trailing-whitespace strip, 3+ newline fold).

## Change Type

- [x] Bug fix

## Scope

- [ ] Gateway / orchestration

## Root Cause

- Root cause: Prose whitespace regexes were applied to the full text string with no awareness of Markdown code block boundaries.
- Missing detection / guardrail: No tests covered indented content inside fenced blocks.
- Contributing context: Directive tag stripping (`[[reply_to_current]]`, `[[audio_as_voice]]`) is applied before delivery, making the normalization step a silent corruption point for code-heavy replies.

## Regression Test Plan

- Coverage level that should have caught this:
  - [x] Unit test
- Target test or file: `src/utils/directive-tags.test.ts`
- Scenario the test should lock in: `parseInlineDirectives` with fenced/indented code blocks must preserve all leading whitespace inside those blocks.
- Why this is the smallest reliable guardrail: It is a pure function with deterministic string in/out; no mocks are needed.
- If no new test is added, why not: Not applicable — 7 new unit tests were added.

## User-visible / Behavior Changes

Code blocks in assistant replies containing `[[reply_to_current]]` or `[[audio_as_voice]]` directives now retain correct indentation after the directive is stripped.
## Security Impact

- New permissions/capabilities? No
- Secrets/tokens handling changed? No
- New/changed network calls? No
- Command/tool execution surface changed? No
- Data access scope changed? No

## Compatibility / Migration

- Backward compatible? Yes
- Config/env changes? No
- Migration needed? No

Co-Authored-By: Codemax <codemax@binance.com>
This commit is contained in:
committed by
Peter Steinberger
parent
1b309fff71
commit
af7c21f207
@@ -88,6 +88,109 @@ describe("parseInlineDirectives", () => {
|
||||
expect(result.hasReplyTag).toBe(true);
|
||||
expect(result.text).toBe("text");
|
||||
});
|
||||
|
||||
// --- code-fence aware normalizeDirectiveWhitespace ---
|
||||
|
||||
// Regression test: the input is a [[reply_to_current]] directive line followed by a ```js fence.
// Asserts that hasReplyTag is true and that result.text equals the fenced block exactly,
// i.e. the directive line is gone and the indented lines inside the fence are returned unchanged.
// NOTE(review): the title says "4-space", but the literals below show a single leading space;
// this may be whitespace lost when this page was rendered — confirm against the repository file.
test("preserves indented code block (4-space) inside a fenced block after stripping a directive", () => {
|
||||
const input = [
|
||||
"[[reply_to_current]]",
|
||||
"```js",
|
||||
"function foo() {",
|
||||
" return 42;",
|
||||
" const nested = true;",
|
||||
"}",
|
||||
"```",
|
||||
].join("\n");
|
||||
const result = parseInlineDirectives(input);
|
||||
expect(result.hasReplyTag).toBe(true);
|
||||
// Expected text is the input minus the leading directive line.
expect(result.text).toBe(
|
||||
[
|
||||
"```js",
|
||||
"function foo() {",
|
||||
" return 42;",
|
||||
" const nested = true;",
|
||||
"}",
|
||||
"```",
|
||||
].join("\n"),
|
||||
);
|
||||
});
|
||||
|
||||
// Regression test: a [[reply_to_current]] directive line followed by a ```go fence whose body
// lines begin with one or two tab characters (written as \t escapes).
// Asserts that hasReplyTag is true and that result.text is the input minus the directive line,
// with the leading tabs intact.
test("preserves tab-indented lines inside a fenced code block", () => {
|
||||
const input = [
|
||||
"[[reply_to_current]]",
|
||||
"```go",
|
||||
"func main() {",
|
||||
'\tfmt.Println("hello")',
|
||||
"\t\tif true {",
|
||||
"\t\t}",
|
||||
"}",
|
||||
"```",
|
||||
].join("\n");
|
||||
const result = parseInlineDirectives(input);
|
||||
expect(result.hasReplyTag).toBe(true);
|
||||
// Expected text is the input minus the leading directive line.
expect(result.text).toBe(
|
||||
[
|
||||
"```go",
|
||||
"func main() {",
|
||||
'\tfmt.Println("hello")',
|
||||
"\t\tif true {",
|
||||
"\t\t}",
|
||||
"}",
|
||||
"```",
|
||||
].join("\n"),
|
||||
);
|
||||
});
|
||||
|
||||
// Regression test for indented (non-fenced) code: the input is a directive line, a prose line,
// a blank line, and two indented lines. Asserts that hasReplyTag is true and that result.text
// is the same text without the directive line.
// NOTE(review): the title says "4-space prefix", but the literals below show a single leading
// space; this may be whitespace lost when this page was rendered — confirm against the repository file.
test("preserves indent-code-block lines (4-space prefix) outside a fenced block", () => {
|
||||
const input = "[[reply_to_current]]\nHere is some code:\n\n const x = 1;\n const y = 2;";
|
||||
const result = parseInlineDirectives(input);
|
||||
expect(result.hasReplyTag).toBe(true);
|
||||
expect(result.text).toBe("Here is some code:\n\n const x = 1;\n const y = 2;");
|
||||
});
|
||||
|
||||
// Regression test mixing a prose line and a fenced block after a [[reply_to_current]] directive.
// Asserts that hasReplyTag is true and that result.text is the prose line followed by the fence.
// NOTE(review): as shown here the input and expected prose lines are identical, so the
// "collapses multiple spaces" behaviour named in the title is not visible; the extra spaces were
// presumably lost when this page was rendered — confirm against the repository file.
test("collapses multiple spaces on normal prose lines but not inside code blocks", () => {
|
||||
const input = [
|
||||
"[[reply_to_current]]",
|
||||
"prose with extra spaces",
|
||||
"```",
|
||||
" preserved spacing inside",
|
||||
"```",
|
||||
].join("\n");
|
||||
const result = parseInlineDirectives(input);
|
||||
expect(result.hasReplyTag).toBe(true);
|
||||
expect(result.text).toBe(
|
||||
["prose with extra spaces", "```", " preserved spacing inside", "```"].join("\n"),
|
||||
);
|
||||
});
|
||||
|
||||
// Regression test using a ~~~python fence instead of backticks, preceded by a
// [[reply_to_current]] directive line. Asserts that hasReplyTag is true and that result.text
// is the tilde-fenced block without the directive line.
// NOTE(review): the indented body lines show a single leading space here; the original
// indentation width may have been lost in rendering — confirm against the repository file.
test("handles tilde fenced blocks (~~~) the same as backtick blocks", () => {
|
||||
const input = [
|
||||
"[[reply_to_current]]",
|
||||
"~~~python",
|
||||
" x = 1",
|
||||
" y = 2",
|
||||
"~~~",
|
||||
].join("\n");
|
||||
const result = parseInlineDirectives(input);
|
||||
expect(result.hasReplyTag).toBe(true);
|
||||
expect(result.text).toBe(["~~~python", " x = 1", " y = 2", "~~~"].join("\n"));
|
||||
});
|
||||
|
||||
// Regression test for input that contains no directive tag: a prose line, a blank line and a
// fenced block. Asserts that hasReplyTag is false and that result.text matches the expected string.
// NOTE(review): input and expected text are identical as shown, so no normalization is visible;
// extra spaces in the input were presumably lost in rendering — confirm against the repository file.
test("normalizes plain text without directives using code-fence awareness", () => {
|
||||
const input = "plain text with extra spaces\n\n```\n code preserved\n```";
|
||||
const result = parseInlineDirectives(input);
|
||||
expect(result.hasReplyTag).toBe(false);
|
||||
expect(result.text).toBe("plain text with extra spaces\n\n```\n code preserved\n```");
|
||||
});
|
||||
|
||||
// Regression test for the [[audio_as_voice]] directive: the directive line is followed directly
// by a ```bash fence. Asserts that result.audioAsVoice is true and that result.text is the
// fenced block without the directive line.
// NOTE(review): the indented body lines show a single leading space here; the original
// indentation width may have been lost in rendering — confirm against the repository file.
test("audio_as_voice directive does not corrupt adjacent fenced code block indentation", () => {
|
||||
const input = ["[[audio_as_voice]]", "```bash", " echo 'hello'", " indented", "```"].join(
|
||||
"\n",
|
||||
);
|
||||
const result = parseInlineDirectives(input);
|
||||
expect(result.audioAsVoice).toBe(true);
|
||||
expect(result.text).toBe(["```bash", " echo 'hello'", " indented", "```"].join("\n"));
|
||||
});
|
||||
});
|
||||
|
||||
describe("stripInlineDirectiveTagsFromMessageForDisplay", () => {
|
||||
|
||||
@@ -25,8 +25,25 @@ function replacementPreservesWordBoundary(source: string, offset: number, length
|
||||
return before && after && !/\s/u.test(before) && !/\s/u.test(after) ? " " : "";
|
||||
}
|
||||
|
||||
// Unicode private-use code point (U+E000) that is assumed not to appear in normal markdown text.
// NOTE(review): nothing here enforces that assumption; input that already contains
// U+E000 <digits> U+E000 would also match BLOCK_PLACEHOLDER_RE — confirm this is acceptable.
|
||||
// Used to bracket code-block placeholders during whitespace normalization.
|
||||
const BLOCK_SENTINEL = "\uE000";
|
||||
// Matches one placeholder: sentinel, one or more decimal digits (capture group 1), sentinel.
// The global flag lets a single replace() call visit every placeholder in a string.
const BLOCK_PLACEHOLDER_RE = new RegExp(`${BLOCK_SENTINEL}(\\d+)${BLOCK_SENTINEL}`, "g");
|
||||
|
||||
function normalizeDirectiveWhitespace(text: string): string {
|
||||
return text
|
||||
// Extract → normalize prose → restore:
|
||||
// Stash every code block (fenced ``` / ~~~ and indent-code 4-space/tab)
|
||||
// under a sentinel-delimited placeholder so the prose regexes never touch them.
|
||||
const blocks: string[] = [];
|
||||
const masked = text.replace(
|
||||
/(`{3,}|~{3,})[^\n]*\n[\s\S]*?\n\1[^\n]*|(?:(?:^|\n)(?: |\t)[^\n]*)+/gm,
|
||||
(block) => {
|
||||
blocks.push(block);
|
||||
return `${BLOCK_SENTINEL}${blocks.length - 1}${BLOCK_SENTINEL}`;
|
||||
},
|
||||
);
|
||||
|
||||
const normalized = masked
|
||||
.replace(/\r\n/g, "\n")
|
||||
.replace(/([^\s])[ \t]{2,}([^\s])/g, "$1 $2")
|
||||
.replace(/^\n+/, "")
|
||||
@@ -34,6 +51,8 @@ function normalizeDirectiveWhitespace(text: string): string {
|
||||
.replace(/[ \t]+\n/g, "\n")
|
||||
.replace(/\n{3,}/g, "\n\n")
|
||||
.trimEnd();
|
||||
|
||||
return normalized.replace(BLOCK_PLACEHOLDER_RE, (_, i) => blocks[Number(i)]);
|
||||
}
|
||||
|
||||
type StripInlineDirectiveTagsResult = {
|
||||
|
||||
Reference in New Issue
Block a user