mirror of
https://github.com/openclaw/openclaw.git
synced 2026-03-12 07:20:45 +00:00
fix: strip leaked model control tokens from user-facing text (#42173)
Models like GLM-5 and DeepSeek sometimes emit internal delimiter tokens in their responses. This change strips them with a generic `<|...|>` pattern in the text extraction pipeline, following the same architecture as stripMinimaxToolCallXml. Closes #40020. Supersedes #40573. Co-authored-by: imwyvern <100903837+imwyvern@users.noreply.github.com>
This commit is contained in:
@@ -0,0 +1,25 @@
|
||||
import { describe, expect, it } from "vitest";
|
||||
import { stripModelSpecialTokens } from "./pi-embedded-utils.js";
|
||||
|
||||
/**
|
||||
* @see https://github.com/openclaw/openclaw/issues/40020
|
||||
*/
|
||||
// Behavioural checks for stripModelSpecialTokens: leaked `<|...|>` control
// tokens are removed, while ordinary angle brackets and HTML are left alone.
describe("stripModelSpecialTokens", () => {
  it("strips tokens and inserts space between adjacent words", () => {
    expect(stripModelSpecialTokens("<|user|>Question<|assistant|>Answer")).toBe("Question Answer");
  });

  it("strips full-width pipe variants (DeepSeek U+FF5C)", () => {
    // The delimiters are spelled as \uFF5C escapes so the input really contains
    // full-width pipes; look-alike ASCII pipes would make this case a duplicate
    // of the previous one and leave the U+FF5C path untested.
    expect(stripModelSpecialTokens("<\uFF5Cbegin▁of▁sentence\uFF5C>Hello there")).toBe(
      "Hello there",
    );
  });

  it("does not strip normal angle brackets or HTML", () => {
    expect(stripModelSpecialTokens("a < b && c > d")).toBe("a < b && c > d");
    expect(stripModelSpecialTokens("<div>hello</div>")).toBe("<div>hello</div>");
  });

  it("passes through text without tokens unchanged", () => {
    const text = "Just a normal response.";
    expect(stripModelSpecialTokens(text)).toBe(text);
  });
});
|
||||
@@ -33,6 +33,32 @@ export function stripMinimaxToolCallXml(text: string): string {
|
||||
return cleaned;
|
||||
}
|
||||
|
||||
/**
 * Matches a single leaked control token of the form `<|...|>`.
 *
 * Either delimiter may be the ASCII pipe (U+007C) or the full-width pipe
 * (U+FF5C). The full-width pipe is written as a `\uFF5C` escape so the two
 * look-alike characters cannot be confused or silently normalised into one.
 */
const MODEL_SPECIAL_TOKEN_RE = /<[|\uFF5C][^|\uFF5C]*[|\uFF5C]>/g;

/**
 * Strip model control tokens leaked into assistant text output.
 *
 * Models like GLM-5 and DeepSeek sometimes emit internal delimiter tokens
 * (e.g. `<|assistant|>`, `<|tool_call_result_begin|>`, and the full-width-pipe
 * form of `<|begin▁of▁sentence|>`) in their responses. These use the
 * universal `<|...|>` convention (ASCII or full-width pipe variants) and
 * should never reach end users.
 *
 * This is a provider bug — no upstream fix tracked yet.
 * Remove this function when upstream providers stop leaking tokens.
 * @see https://github.com/openclaw/openclaw/issues/40020
 *
 * @param text - Raw assistant text that may contain leaked tokens.
 * @returns `text` itself when it is empty or contains no token. Otherwise a
 *   copy in which every token is replaced by a space, runs of spaces are
 *   collapsed to a single space, and leading/trailing whitespace is trimmed.
 */
export function stripModelSpecialTokens(text: string): string {
  if (!text) {
    return text;
  }

  // `replace` always scans a global regex from index 0, so no `lastIndex`
  // bookkeeping is needed on the shared pattern. Each token becomes a space
  // so that words on either side of it do not fuse together.
  const withoutTokens = text.replace(MODEL_SPECIAL_TOKEN_RE, " ");

  // A token is at least four characters long, so any replacement changes the
  // string; an identical result means nothing matched and the input must be
  // returned untouched (no whitespace normalisation).
  if (withoutTokens === text) {
    return text;
  }

  return withoutTokens.replace(/ {2,}/g, " ").trim();
}
|
||||
|
||||
/**
|
||||
* Strip downgraded tool call text representations that leak into text content.
|
||||
* When replaying history to Gemini, tool calls without `thought_signature` are
|
||||
@@ -212,7 +238,7 @@ export function extractAssistantText(msg: AssistantMessage): string {
|
||||
extractTextFromChatContent(msg.content, {
|
||||
sanitizeText: (text) =>
|
||||
stripThinkingTagsFromText(
|
||||
stripDowngradedToolCallText(stripMinimaxToolCallXml(text)),
|
||||
stripDowngradedToolCallText(stripModelSpecialTokens(stripMinimaxToolCallXml(text))),
|
||||
).trim(),
|
||||
joinWith: "\n",
|
||||
normalizeText: (text) => text.trim(),
|
||||
|
||||
@@ -32,6 +32,7 @@ import { sanitizeUserFacingText } from "../pi-embedded-helpers.js";
|
||||
import {
|
||||
stripDowngradedToolCallText,
|
||||
stripMinimaxToolCallXml,
|
||||
stripModelSpecialTokens,
|
||||
stripThinkingTagsFromText,
|
||||
} from "../pi-embedded-utils.js";
|
||||
|
||||
@@ -142,7 +143,9 @@ export function sanitizeTextContent(text: string): string {
|
||||
if (!text) {
|
||||
return text;
|
||||
}
|
||||
return stripThinkingTagsFromText(stripDowngradedToolCallText(stripMinimaxToolCallXml(text)));
|
||||
return stripThinkingTagsFromText(
|
||||
stripDowngradedToolCallText(stripModelSpecialTokens(stripMinimaxToolCallXml(text))),
|
||||
);
|
||||
}
|
||||
|
||||
export function extractAssistantText(message: unknown): string | undefined {
|
||||
|
||||
Reference in New Issue
Block a user