From 309162f9a26a4516233f2f68e7a51365965ffea6 Mon Sep 17 00:00:00 2001 From: George Zhang Date: Tue, 10 Mar 2026 06:27:59 -0700 Subject: [PATCH] fix: strip leaked model control tokens from user-facing text (#42173) Models like GLM-5 and DeepSeek sometimes emit internal delimiter tokens in their responses. Uses generic pattern in the text extraction pipeline, following the same architecture as stripMinimaxToolCallXml. Closes #40020 Supersedes #40573 Co-authored-by: imwyvern <100903837+imwyvern@users.noreply.github.com> --- ...d-utils.strip-model-special-tokens.test.ts | 25 +++++++++++++++++ src/agents/pi-embedded-utils.ts | 28 ++++++++++++++++++- src/agents/tools/sessions-helpers.ts | 5 +++- 3 files changed, 56 insertions(+), 2 deletions(-) create mode 100644 src/agents/pi-embedded-utils.strip-model-special-tokens.test.ts diff --git a/src/agents/pi-embedded-utils.strip-model-special-tokens.test.ts b/src/agents/pi-embedded-utils.strip-model-special-tokens.test.ts new file mode 100644 index 00000000000..ef0e2b32dec --- /dev/null +++ b/src/agents/pi-embedded-utils.strip-model-special-tokens.test.ts @@ -0,0 +1,25 @@ +import { describe, expect, it } from "vitest"; +import { stripModelSpecialTokens } from "./pi-embedded-utils.js"; + +/** + * @see https://github.com/openclaw/openclaw/issues/40020 + */ +describe("stripModelSpecialTokens", () => { + it("strips tokens and inserts space between adjacent words", () => { + expect(stripModelSpecialTokens("<|user|>Question<|assistant|>Answer")).toBe("Question Answer"); + }); + + it("strips full-width pipe variants (DeepSeek U+FF5C)", () => { + expect(stripModelSpecialTokens("<｜begin▁of▁sentence｜>Hello there")).toBe("Hello there"); + }); + + it("does not strip normal angle brackets or HTML", () => { + expect(stripModelSpecialTokens("a < b && c > d")).toBe("a < b && c > d"); + expect(stripModelSpecialTokens("<div>hello</div>")).toBe("<div>hello</div>"); + }); + + it("passes through text without tokens unchanged", () => { + const text = "Just a normal response."; + expect(stripModelSpecialTokens(text)).toBe(text); + }); +}); diff --git a/src/agents/pi-embedded-utils.ts b/src/agents/pi-embedded-utils.ts index 21a4eb39fd5..da1dd7911b8 100644 --- a/src/agents/pi-embedded-utils.ts +++ b/src/agents/pi-embedded-utils.ts @@ -33,6 +33,32 @@ export function stripMinimaxToolCallXml(text: string): string { return cleaned; } +/** + * Strip model control tokens leaked into assistant text output. + * + * Models like GLM-5 and DeepSeek sometimes emit internal delimiter tokens + * (e.g. `<|assistant|>`, `<|tool_call_result_begin|>`, `<｜begin▁of▁sentence｜>`) + * in their responses. These use the universal `<|...|>` convention (ASCII or + * full-width pipe variants) and should never reach end users. + * + * This is a provider bug — no upstream fix tracked yet. + * Remove this function when upstream providers stop leaking tokens. + * @see https://github.com/openclaw/openclaw/issues/40020 + */ +// Match both ASCII pipe <|...|> and full-width pipe <｜...｜> (U+FF5C) variants. +const MODEL_SPECIAL_TOKEN_RE = /<[|｜][^|｜]*[|｜]>/g; + +export function stripModelSpecialTokens(text: string): string { + if (!text) { + return text; + } + if (!MODEL_SPECIAL_TOKEN_RE.test(text)) { + return text; + } + MODEL_SPECIAL_TOKEN_RE.lastIndex = 0; + return text.replace(MODEL_SPECIAL_TOKEN_RE, " ").replace(/ +/g, " ").trim(); +} + /** * Strip downgraded tool call text representations that leak into text content.
* When replaying history to Gemini, tool calls without `thought_signature` are @@ -212,7 +238,7 @@ export function extractAssistantText(msg: AssistantMessage): string { extractTextFromChatContent(msg.content, { sanitizeText: (text) => stripThinkingTagsFromText( - stripDowngradedToolCallText(stripMinimaxToolCallXml(text)), + stripDowngradedToolCallText(stripModelSpecialTokens(stripMinimaxToolCallXml(text))), ).trim(), joinWith: "\n", normalizeText: (text) => text.trim(), diff --git a/src/agents/tools/sessions-helpers.ts b/src/agents/tools/sessions-helpers.ts index 7a244e32de0..5b5f94699c6 100644 --- a/src/agents/tools/sessions-helpers.ts +++ b/src/agents/tools/sessions-helpers.ts @@ -32,6 +32,7 @@ import { sanitizeUserFacingText } from "../pi-embedded-helpers.js"; import { stripDowngradedToolCallText, stripMinimaxToolCallXml, + stripModelSpecialTokens, stripThinkingTagsFromText, } from "../pi-embedded-utils.js"; @@ -142,7 +143,9 @@ export function sanitizeTextContent(text: string): string { if (!text) { return text; } - return stripThinkingTagsFromText(stripDowngradedToolCallText(stripMinimaxToolCallXml(text))); + return stripThinkingTagsFromText( + stripDowngradedToolCallText(stripModelSpecialTokens(stripMinimaxToolCallXml(text))), + ); } export function extractAssistantText(message: unknown): string | undefined {