fix: strip leaked outbound tool-call scaffolding (#60619)

Co-authored-by: Frank Yang <frank.ekn@gmail.com>
2026-04-10 08:41:13 +00:00 · 2026-04-05 02:02:36 +08:00
parent 0cf9c6ec95
commit 7ff90c516a
4 changed files with 558 additions and 23 deletions
--- a/src/shared/text/assistant-visible-text.test.ts
+++ b/src/shared/text/assistant-visible-text.test.ts
@@ -1,5 +1,6 @@
 import { describe, expect, it } from "vitest";
 import { stripAssistantInternalScaffolding } from "./assistant-visible-text.js";
+import { stripModelSpecialTokens } from "./model-special-tokens.js";

 describe("stripAssistantInternalScaffolding", () => {
  function expectVisibleText(input: string, expected: string) {
@@ -99,4 +100,271 @@ describe("stripAssistantInternalScaffolding", () => {
    }
    expectVisibleText(input, expected);
  });
+  /*
+   * Tool-call scaffolding cases. Blocks whose payload starts with `{` or `[`
+   * are hidden (through the matching closing tag, or to end-of-string when
+   * never closed); prose mentions, XML-style examples and anything inside a
+   * code region must come through unchanged.
+   */
+  describe("tool-call XML stripping", () => {
+    it("strips closed <tool_call> blocks", () => {
+      expectVisibleText(
+        'Let me check.\n\n<tool_call> {"name": "read", "arguments": {"file_path": "test.md"}} </tool_call> after',
+        "Let me check.\n\n after", // text on both sides of the block is kept verbatim
+      );
+    });
+
+    it("strips closed <function_calls> blocks", () => {
+      expectVisibleText(
+        'Checking now. <function_calls>{"name": "exec", "args": {"cmd": "ls"}}</function_calls> Done.',
+        "Checking now.  Done.",
+      );
+    });
+
+    it("hides dangling <tool_call> content to end-of-string", () => {
+      expectVisibleText(
+        'Let me run.\n<tool_call>\n{"name": "find", "arguments": {}}\n',
+        "Let me run.\n", // never closed, so everything from the opening tag on is dropped
+      );
+    });
+
+    it("does not close early on </tool_call> text inside JSON strings", () => {
+      expectVisibleText(
+        [
+          "prefix",
+          "<tool_call>",
+          '{"name":"x","arguments":{"html":"<div></tool_call><span>leak</span>"}}',
+          "</tool_call>",
+          "suffix",
+        ].join("\n"),
+        "prefix\n\nsuffix",
+      );
+    });
+
+    it("does not close early on </tool_call> text inside single-quoted payload strings", () => {
+      expectVisibleText(
+        [
+          "prefix",
+          "<tool_call>",
+          "{'html':'</tool_call> leak','tail':'still hidden'}",
+          "</tool_call>",
+          "suffix",
+        ].join("\n"),
+        "prefix\n\nsuffix",
+      );
+    });
+
+    it("does not close early on mismatched closing tool tags", () => {
+      expectVisibleText(
+        [
+          "prefix",
+          "<tool_call>",
+          '{"name":"read",',
+          "</function_calls>",
+          "still-hidden",
+          "</tool_call>",
+          "suffix",
+        ].join("\n"),
+        "prefix\n\nsuffix",
+      );
+    });
+
+    it("hides truncated <tool_call openings that never reach >", () => {
+      expectVisibleText('prefix\n<tool_call\n{"name":"find","arguments":{}}', "prefix\n");
+    });
+
+    it("hides truncated <tool_call openings with attributes before JSON payload", () => {
+      expectVisibleText('prefix\n<tool_call name="find"\n{"arguments":{}}', "prefix\n");
+    });
+
+    it("preserves lone <tool_call> mentions in normal prose", () => {
+      expectVisibleText("Use <tool_call> to invoke tools.", "Use <tool_call> to invoke tools.");
+    });
+
+    it("strips self-closing <tool_call/> tags", () => {
+      expectVisibleText("prefix <tool_call/> suffix", "prefix  suffix");
+    });
+
+    it("strips self-closing <function_calls .../> tags", () => {
+      expectVisibleText('prefix <function_calls name="x"/> suffix', "prefix  suffix");
+    });
+
+    it("strips lone closing tool-call tags", () => {
+      expectVisibleText("prefix </tool_call> suffix", "prefix  suffix");
+      expectVisibleText("prefix </function_calls> suffix", "prefix  suffix");
+    });
+
+    it("preserves XML-style explanations after lone <tool_call> tags", () => {
+      expectVisibleText("Use <tool_call><arg> literally.", "Use <tool_call><arg> literally.");
+    });
+
+    it("preserves literal XML-style paired tool_call examples in prose", () => {
+      expectVisibleText(
+        "prefix <tool_call><arg>secret</arg></tool_call> suffix",
+        "prefix <tool_call><arg>secret</arg></tool_call> suffix", // XML payload, not JSON: kept
+      );
+    });
+
+    it("preserves machine-style XML payload examples in prose", () => {
+      expectVisibleText(
+        'prefix <function_calls><invoke name="find">secret</invoke></function_calls> suffix',
+        'prefix <function_calls><invoke name="find">secret</invoke></function_calls> suffix',
+      );
+    });
+
+    it("preserves non-tool tag names that share the tool_call prefix", () => {
+      expectVisibleText(
+        'prefix <tool_call-example>{"name":"read"}</tool_call-example> suffix',
+        'prefix <tool_call-example>{"name":"read"}</tool_call-example> suffix',
+      );
+    });
+
+    it("preserves truncated <tool_call mentions in prose", () => {
+      expectVisibleText("Use <tool_call to invoke tools.", "Use <tool_call to invoke tools.");
+    });
+
+    it("preserves truncated <tool_call mentions with prose attributes", () => {
+      expectVisibleText(
+        'Use <tool_call name="find" to invoke tools.',
+        'Use <tool_call name="find" to invoke tools.',
+      );
+    });
+
+    it("still strips later JSON payloads after a truncated prose mention", () => {
+      expectVisibleText(
+        'Use <tool_call to invoke tools.\n<tool_call>{"name":"find"}</tool_call>',
+        "Use <tool_call to invoke tools.\n", // only the later JSON-carrying block is removed
+      );
+    });
+
+    it("still strips later JSON payloads after a truncated closing-tag mention", () => {
+      expectVisibleText(
+        'Use </tool_call to explain tags.\n<tool_call>{"name":"find"}</tool_call>',
+        "Use </tool_call to explain tags.\n",
+      );
+    });
+
+    it("still closes a tool-call block when malformed payload opens a fenced code region", () => {
+      expectVisibleText(
+        [
+          "prefix",
+          "<tool_call>",
+          '{"name":"read",',
+          "```xml",
+          "<note>hi</note>",
+          "</tool_call>",
+          "suffix",
+        ].join("\n"),
+        "prefix\n\nsuffix", // a fence opened inside the hidden block does not shield the closing tag
+      );
+    });
+
+    it("preserves truncated XML payload openings in prose", () => {
+      expectVisibleText(
+        'prefix\n<function_calls\n<invoke name="find">',
+        'prefix\n<function_calls\n<invoke name="find">',
+      );
+    });
+
+    it("hides truncated <function_calls openings with attributes before array payload", () => {
+      expectVisibleText('prefix\n<function_calls id="x"\n[{"name":"find"}]', "prefix\n");
+    });
+
+    it("preserves tool-call tags inside fenced code blocks", () => {
+      const input = [
+        "```xml",
+        '<tool_call> {"name": "find"} </tool_call>',
+        "```",
+        "",
+        "Visible text",
+      ].join("\n");
+      expectVisibleText(input, input);
+    });
+
+    it("preserves inline code references to tool_call tags", () => {
+      expectVisibleText("Use `<tool_call>` to invoke tools.", "Use `<tool_call>` to invoke tools.");
+    });
+  });
+  /*
+   * Control-token cases. `<|...|>` tokens (ASCII or full-width pipes) are each
+   * replaced by one space when they sit outside code; a match that touches a
+   * fenced block or inline code span is left exactly as written.
+   */
+  describe("model special token stripping", () => {
+    it("strips Kimi/GLM special tokens in isolation", () => {
+      expectVisibleText("<|assistant|>Here is the answer<|end|>", "Here is the answer ");
+    });
+
+    it("strips full-width pipe DeepSeek tokens", () => {
+      expectVisibleText("<｜begin▁of▁sentence｜>Hello world", "Hello world");
+    });
+
+    it("strips special tokens mixed with normal text", () => {
+      expectVisibleText(
+        "Start <|tool_call_result_begin|>middle<|tool_call_result_end|> end",
+        "Start  middle  end", // each token becomes one space next to the existing ones
+      );
+    });
+
+    it("preserves ordinary HTML tags that are not special tokens", () => {
+      expectVisibleText("Use <div>hello</div> in HTML", "Use <div>hello</div> in HTML");
+    });
+
+    it("strips special tokens combined with reasoning tags", () => {
+      const input = [
+        "<thinking>",
+        "internal reasoning",
+        "</thinking>",
+        "<|assistant|>Visible response",
+      ].join("\n");
+      expectVisibleText(input, "Visible response"); // leading whitespace left behind is trimmed
+    });
+
+    it("preserves indentation in code blocks", () => {
+      const input = [
+        "<|assistant|>Here is the code:",
+        "",
+        "```python",
+        "def foo():",
+        "    if True:",
+        "        return 42",
+        "```",
+      ].join("\n");
+      const expected = [
+        "Here is the code:",
+        "",
+        "```python",
+        "def foo():",
+        "    if True:",
+        "        return 42",
+        "```",
+      ].join("\n");
+      expectVisibleText(input, expected);
+    });
+
+    it("preserves special tokens inside fenced code blocks", () => {
+      const input = [
+        "Here are the model tokens:",
+        "",
+        "```",
+        "<|assistant|>Hello<|end|>",
+        "```",
+        "",
+        "As you can see above.",
+      ].join("\n");
+      expectVisibleText(input, input);
+    });
+
+    it("preserves special tokens inside inline code spans", () => {
+      expectVisibleText(
+        "The token `<|assistant|>` marks the start.",
+        "The token `<|assistant|>` marks the start.",
+      );
+    });
+
+    it("preserves malformed tokens that end inside inline code spans", () => {
+      expectVisibleText("Before <|token `code|>` after", "Before <|token `code|>` after");
+    });
+
+    it("preserves malformed tokens that end inside fenced code blocks", () => {
+      const input = ["Before <|token", "```js", "const x = 1;|>", "```", "after"].join("\n");
+      expectVisibleText(input, input);
+    });
+
+    it("resets special-token regex state between calls", () => {
+      expect(stripModelSpecialTokens("prefix <|assistant|>")).toBe("prefix  ");
+      expect(stripModelSpecialTokens("<|assistant|>short")).toBe(" short"); // direct call: no trimStart
+    });
+  });
 });
--- a/src/shared/text/assistant-visible-text.ts
+++ b/src/shared/text/assistant-visible-text.ts
@@ -1,9 +1,247 @@
 import { findCodeRegions, isInsideCode } from "./code-regions.js";
+import { stripModelSpecialTokens } from "./model-special-tokens.js";
 import { stripReasoningTagsFromText } from "./reasoning-tags.js";

 const MEMORY_TAG_RE = /<\s*(\/?)\s*relevant[-_]memories\b[^<>]*>/gi;
 const MEMORY_TAG_QUICK_RE = /<\s*\/?\s*relevant[-_]memories\b/i;

+/**
+ * Strip XML-style tool call tags that models sometimes emit as plain text.
+ * This stateful pass hides content from an opening tag through the matching
+ * closing tag, or to end-of-string if the block is never closed or the opening
+ * tag itself is truncated. Only blocks whose payload starts with `{` or `[`
+ * are hidden; self-closing tags and unmatched closing tags are removed, while
+ * other mentions of the tags and anything inside a code region are preserved.
+ *
+ * Constants used by the pass:
+ * - TOOL_CALL_QUICK_RE: cheap, case-insensitive pre-check for `<` or `</`
+ *   followed by one of the recognised tag names.
+ * - TOOL_CALL_TAG_NAMES: the exact (lower-cased) tag names that are treated as
+ *   tool-call scaffolding.
+ * - TOOL_CALL_JSON_PAYLOAD_START_RE: anchored at the start of its input; allows
+ *   zero or more `name=value` attributes (quoted or bare values) and optional
+ *   whitespace or a line break, then requires `[` or `{`.
+ */
+const TOOL_CALL_QUICK_RE = /<\s*\/?\s*(?:tool_call|function_calls?|tool_calls)\b/i;
+const TOOL_CALL_TAG_NAMES = new Set(["tool_call", "function_call", "function_calls", "tool_calls"]);
+const TOOL_CALL_JSON_PAYLOAD_START_RE =
+  /^(?:\s+[A-Za-z_:][-A-Za-z0-9_:.]*\s*=\s*(?:"[^"]*"|'[^']*'|[^\s"'=<>`]+))*\s*(?:\r?\n\s*)?[[{]/;
+/**
+ * Reports whether a scan of `text` from `start` up to (not including) `end`
+ * finishes while a quoted string is still open.
+ *
+ * Either quote character opens a string; only the same character closes it,
+ * and a backslash inside a string escapes the character that follows.
+ *
+ * @param text - Text being scanned.
+ * @param start - Index of the first character examined (inclusive).
+ * @param end - Index at which the scan stops (exclusive).
+ * @returns `true` when a string opened inside the range is still open at `end`.
+ */
+function endsInsideQuotedString(text: string, start: number, end: number): boolean {
+  let quoteChar: "'" | '"' | null = null; // delimiter of the open string, null when outside one
+  let isEscaped = false; // previous in-string character was a backslash
+
+  for (let idx = start; idx < end; idx += 1) {
+    const char = text[idx];
+    if (quoteChar === null) {
+      if (char === '"' || char === "'") {
+        quoteChar = char;
+      }
+      continue; // outside a string nothing else (including backslashes) matters
+    }
+
+    if (isEscaped) {
+      isEscaped = false;
+      continue; // an escaped character can never close the string
+    }
+
+    if (char === "\\") {
+      isEscaped = true;
+      continue;
+    }
+
+    if (char === quoteChar) {
+      quoteChar = null;
+    }
+  }
+
+  return quoteChar !== null;
+}
+/**
+ * Description of one tool-call tag as produced by `parseToolCallTagAt`.
+ * All numeric fields are offsets into the text that was parsed.
+ */
+interface ParsedToolCallTag {
+  contentStart: number; // offset just past the tag name
+  end: number; // offset just past the closing `>`, or the text length when truncated
+  isClose: boolean; // a `/` follows the `<`, i.e. this is a closing tag
+  isSelfClosing: boolean; // opening tag whose body ends with `/` right before `>`
+  tagName: string; // lower-cased tag name
+  isTruncated: boolean; // no closing `>` was found for this tag
+}
+/**
+ * Tells whether `char` may directly follow a tool-call tag name: end of input
+ * (`undefined` or empty), any whitespace, `/`, or `>`. Anything else means the
+ * name is only a prefix of a longer, unrelated tag name.
+ */
+function isToolCallBoundary(char: string | undefined): boolean {
+  return !char || /\s/.test(char) || char === "/" || char === ">";
+}
+/**
+ * Finds the `>` that closes a tag, scanning forward from `start`.
+ *
+ * Characters inside single- or double-quoted sections are skipped (with
+ * backslash escapes honoured), so a `>` or `<` inside a quoted attribute value
+ * is ignored.
+ *
+ * @param text - Text being scanned.
+ * @param start - Index at which scanning begins.
+ * @returns Index of the closing `>`, or `-1` when an unquoted `<` is met first
+ *   or the end of the text is reached.
+ */
+function findTagCloseIndex(text: string, start: number): number {
+  let quoteChar: "'" | '"' | null = null;
+  let isEscaped = false;
+
+  for (let idx = start; idx < text.length; idx += 1) {
+    const char = text[idx];
+    if (quoteChar !== null) {
+      if (isEscaped) {
+        isEscaped = false;
+        continue;
+      }
+      if (char === "\\") {
+        isEscaped = true;
+        continue;
+      }
+      if (char === quoteChar) {
+        quoteChar = null;
+      }
+      continue; // still (or just finished being) inside a quoted section
+    }
+
+    if (char === '"' || char === "'") {
+      quoteChar = char;
+      continue;
+    }
+    if (char === "<") {
+      return -1; // another tag starts before this one was closed
+    }
+    if (char === ">") {
+      return idx;
+    }
+  }
+
+  return -1; // ran off the end, possibly inside an unterminated quote
+}
+/**
+ * Tests whether the text beginning at `start` matches
+ * `TOOL_CALL_JSON_PAYLOAD_START_RE`, i.e. optional attributes and whitespace
+ * followed by `[` or `{`.
+ */
+function looksLikeToolCallPayloadStart(text: string, start: number): boolean {
+  return TOOL_CALL_JSON_PAYLOAD_START_RE.test(text.slice(start));
+}
+/**
+ * Tries to parse a tool-call tag whose `<` sits at `start`.
+ *
+ * Whitespace is tolerated after `<` and after the `/` of a closing tag. The
+ * name is read as a run of ASCII letters and underscores, lower-cased, and
+ * must be one of `TOOL_CALL_TAG_NAMES` followed by a boundary character.
+ *
+ * @param text - Text being scanned.
+ * @param start - Index expected to hold the `<`.
+ * @returns The parsed tag, or `null` when `start` is not a `<` or the name is
+ *   not a recognised tool-call tag. When no closing `>` is found the tag is
+ *   reported as truncated with `end` set to the text length.
+ */
+function parseToolCallTagAt(text: string, start: number): ParsedToolCallTag | null {
+  if (text[start] !== "<") {
+    return null;
+  }
+
+  let cursor = start + 1;
+  while (cursor < text.length && /\s/.test(text[cursor])) {
+    cursor += 1;
+  }
+
+  let isClose = false;
+  if (text[cursor] === "/") {
+    isClose = true;
+    cursor += 1;
+    while (cursor < text.length && /\s/.test(text[cursor])) {
+      cursor += 1;
+    }
+  }
+
+  const nameStart = cursor;
+  while (cursor < text.length && /[A-Za-z_]/.test(text[cursor])) {
+    cursor += 1;
+  }
+
+  const tagName = text.slice(nameStart, cursor).toLowerCase();
+  if (!TOOL_CALL_TAG_NAMES.has(tagName) || !isToolCallBoundary(text[cursor])) {
+    return null; // unknown name, or a longer name that merely starts with a known one
+  }
+  const contentStart = cursor;
+
+  const closeIndex = findTagCloseIndex(text, cursor);
+  if (closeIndex === -1) {
+    return {
+      contentStart,
+      end: text.length,
+      isClose,
+      isSelfClosing: false,
+      tagName,
+      isTruncated: true,
+    };
+  }
+
+  return {
+    contentStart,
+    end: closeIndex + 1,
+    isClose,
+    isSelfClosing: !isClose && /\/\s*$/.test(text.slice(cursor, closeIndex)), // `/` just before `>`
+    tagName,
+    isTruncated: false,
+  };
+}
+/**
+ * Removes leaked tool-call scaffolding from `text` and returns what remains.
+ *
+ * The text is walked once, looking at every `<`. Outside a hidden block:
+ * - tags inside a code region are ignored;
+ * - a truncated closing tag is kept as written;
+ * - a complete closing tag is kept only if it balances a previously preserved
+ *   opening tag of the same name, otherwise it is dropped;
+ * - a self-closing tag is dropped;
+ * - an opening tag followed by a JSON-looking payload starts a hidden block;
+ * - any other opening tag is kept as written.
+ *
+ * Inside a hidden block only a closing tag with the same name, not located
+ * inside a quoted string of the payload, ends the block. A block that is never
+ * closed hides everything up to the end of the text.
+ *
+ * @param text - Assistant text that may contain tool-call tags.
+ * @returns The text with hidden blocks and stray tags removed; the input is
+ *   returned unchanged when it is empty or contains no candidate tag.
+ */
+function stripToolCallXmlTags(text: string): string {
+  if (!text || !TOOL_CALL_QUICK_RE.test(text)) {
+    return text; // fast path: nothing resembling a tool-call tag
+  }
+
+  const codeRegions = findCodeRegions(text);
+  let result = "";
+  let lastIndex = 0; // start of the text not yet copied into `result`
+  let inToolCallBlock = false;
+  let toolCallContentStart = 0; // where the payload of the current hidden block begins
+  let toolCallBlockTagName: string | null = null;
+  const visibleTagBalance = new Map<string, number>(); // preserved opening tags, per name
+
+  for (let idx = 0; idx < text.length; idx += 1) {
+    if (text[idx] !== "<") {
+      continue;
+    }
+    if (!inToolCallBlock && isInsideCode(idx, codeRegions)) {
+      continue; // code regions only protect text that is not already being hidden
+    }
+
+    const tag = parseToolCallTagAt(text, idx);
+    if (!tag) {
+      continue;
+    }
+
+    if (!inToolCallBlock) {
+      result += text.slice(lastIndex, idx);
+      if (tag.isClose) {
+        if (tag.isTruncated) {
+          const preserveEnd = tag.contentStart; // keep only `</name`, rescan what follows
+          result += text.slice(idx, preserveEnd);
+          lastIndex = preserveEnd;
+          idx = Math.max(idx, preserveEnd - 1);
+          continue;
+        }
+        const balance = visibleTagBalance.get(tag.tagName) ?? 0;
+        if (balance > 0) {
+          result += text.slice(idx, tag.end); // closes an opening tag that was kept visible
+          visibleTagBalance.set(tag.tagName, balance - 1);
+        }
+        lastIndex = tag.end;
+        idx = Math.max(idx, tag.end - 1);
+        continue;
+      }
+      if (tag.isSelfClosing) {
+        lastIndex = tag.end; // dropped: nothing is appended for a self-closing tag
+        idx = Math.max(idx, tag.end - 1);
+        continue;
+      }
+      if (
+        !tag.isClose &&
+        looksLikeToolCallPayloadStart(text, tag.isTruncated ? tag.contentStart : tag.end)
+      ) {
+        inToolCallBlock = true;
+        toolCallContentStart = tag.end;
+        toolCallBlockTagName = tag.tagName;
+        if (tag.isTruncated) {
+          lastIndex = text.length; // opening tag never closed: hide the rest of the text
+          break;
+        }
+      } else {
+        const preserveEnd = tag.isTruncated ? tag.contentStart : tag.end;
+        result += text.slice(idx, preserveEnd);
+        if (!tag.isTruncated) {
+          visibleTagBalance.set(tag.tagName, (visibleTagBalance.get(tag.tagName) ?? 0) + 1);
+        }
+        lastIndex = preserveEnd;
+        idx = Math.max(idx, preserveEnd - 1);
+        continue;
+      }
+    } else if (
+      tag.isClose &&
+      tag.tagName === toolCallBlockTagName &&
+      !endsInsideQuotedString(text, toolCallContentStart, idx)
+    ) {
+      inToolCallBlock = false;
+      toolCallBlockTagName = null;
+    }
+
+    lastIndex = tag.end; // reached when a block was just opened, or for any tag seen inside one
+    idx = Math.max(idx, tag.end - 1);
+  }
+
+  if (!inToolCallBlock) {
+    result += text.slice(lastIndex); // an unclosed block swallows the tail instead
+  }
+
+  return result;
+}
+
 function stripRelevantMemoriesTags(text: string): string {
  if (!text || !MEMORY_TAG_QUICK_RE.test(text)) {
    return text;
@@ -43,5 +281,8 @@ function stripRelevantMemoriesTags(text: string): string {

 export function stripAssistantInternalScaffolding(text: string): string {
   const withoutReasoning = stripReasoningTagsFromText(text, { mode: "preserve", trim: "start" });
-  return stripRelevantMemoriesTags(withoutReasoning).trimStart();
+  const withoutMemories = stripRelevantMemoriesTags(withoutReasoning); // memory-tag pass first
+  const withoutToolCalls = stripToolCallXmlTags(withoutMemories); // then leaked tool-call blocks
+  const withoutSpecialTokens = stripModelSpecialTokens(withoutToolCalls); // then <|...|> tokens
+  return withoutSpecialTokens.trimStart(); // only leading whitespace is trimmed
 }
--- a/src/shared/text/model-special-tokens.ts
+++ b/src/shared/text/model-special-tokens.ts
@@ -0,0 +1,47 @@
+/**
+ * Strip model control tokens leaked into assistant text output.
+ *
+ * Models like GLM-5 and DeepSeek sometimes emit internal delimiter tokens
+ * (e.g. `<|assistant|>`, `<|tool_call_result_begin|>`, `<｜begin▁of▁sentence｜>`)
+ * in their responses. These use the universal `<|...|>` convention (ASCII or
+ * full-width pipe variants) and should never reach end users.
+ *
+ * Matches that start inside, or overlap, a fenced code block or inline code
+ * span are preserved so that documentation / examples that reference these
+ * tokens are not corrupted. Every other match is replaced with a single space.
+ *
+ * This is a provider bug — no upstream fix tracked yet.
+ * Remove this function when upstream providers stop leaking tokens.
+ * @see https://github.com/openclaw/openclaw/issues/40020
+ */
+import { findCodeRegions, isInsideCode } from "./code-regions.js";
+
+/*
+ * Match both ASCII pipe <|...|> and full-width pipe <｜...｜> (U+FF5C) variants.
+ * The text between the pipes may not itself contain a pipe of either kind.
+ * The `g` flag makes the regex stateful: `test()` advances `lastIndex`.
+ */
+const MODEL_SPECIAL_TOKEN_RE = /<[|｜][^|｜]*[|｜]>/g;
+/**
+ * Tells whether the span from `start` to `end` shares at least one position
+ * with any of the given code regions (both treated as start-inclusive,
+ * end-exclusive ranges).
+ *
+ * @param start - First offset of the span.
+ * @param end - Offset just past the span.
+ * @param codeRegions - Regions to test against.
+ * @returns `true` when the span intersects at least one region.
+ */
+function overlapsCodeRegion(
+  start: number,
+  end: number,
+  codeRegions: { start: number; end: number }[],
+): boolean {
+  return codeRegions.some((region) => start < region.end && end > region.start);
+}
+/**
+ * Replaces each model control token in `text` with a single space, leaving
+ * alone any token that starts inside or overlaps a code region.
+ *
+ * @param text - Assistant text that may contain leaked control tokens.
+ * @returns The cleaned text; the input itself is returned when it is empty or
+ *   contains no token. The result is not trimmed.
+ */
+export function stripModelSpecialTokens(text: string): string {
+  if (!text) {
+    return text;
+  }
+  MODEL_SPECIAL_TOKEN_RE.lastIndex = 0; // shared global regex: clear state before test()
+  if (!MODEL_SPECIAL_TOKEN_RE.test(text)) {
+    return text;
+  }
+  MODEL_SPECIAL_TOKEN_RE.lastIndex = 0; // test() moved lastIndex past the first match
+
+  const codeRegions = findCodeRegions(text);
+  return text.replace(MODEL_SPECIAL_TOKEN_RE, (match, offset) => {
+    const start = offset;
+    const end = start + match.length;
+    return isInsideCode(start, codeRegions) || overlapsCodeRegion(start, end, codeRegions)
+      ? match
+      : " ";
+  });
+}