From 62d0cfeee7af20c76091ddca4e40cf6958bbead6 Mon Sep 17 00:00:00 2001 From: AytuncYildizli Date: Mon, 2 Mar 2026 19:43:12 +0300 Subject: [PATCH] fix(delivery): strip HTML tags for plain-text messaging surfaces Models occasionally produce HTML tags in their output. While these render fine on web surfaces, they appear as literal text on WhatsApp, Signal, SMS, IRC, and Telegram. Add sanitizeForPlainText() utility that converts common inline HTML to lightweight-markup equivalents and strips remaining tags. Applied in the outbound delivery pipeline for non-HTML surfaces only. Closes #31884 See also: #18558 --- src/infra/outbound/deliver.ts | 25 ++++-- src/infra/outbound/sanitize-text.test.ts | 110 +++++++++++++++++++++++ src/infra/outbound/sanitize-text.ts | 62 +++++++++++++ 3 files changed, 190 insertions(+), 7 deletions(-) create mode 100644 src/infra/outbound/sanitize-text.test.ts create mode 100644 src/infra/outbound/sanitize-text.ts diff --git a/src/infra/outbound/deliver.ts b/src/infra/outbound/deliver.ts index a6acc956941..1fd5f3de7dc 100644 --- a/src/infra/outbound/deliver.ts +++ b/src/infra/outbound/deliver.ts @@ -33,6 +33,7 @@ import { ackDelivery, enqueueDelivery, failDelivery } from "./delivery-queue.js" import type { OutboundIdentity } from "./identity.js"; import type { NormalizedOutboundPayload } from "./payloads.js"; import { normalizeReplyPayloadsForDelivery } from "./payloads.js"; +import { isPlainTextSurface, sanitizeForPlainText } from "./sanitize-text.js"; import type { OutboundSessionContext } from "./session-context.js"; import type { OutboundChannel } from "./targets.js"; @@ -445,13 +446,23 @@ async function deliverOutboundPayloadsCore( text: normalizedText, }; }; - const normalizedPayloads = normalizeReplyPayloadsForDelivery(payloads).flatMap((payload) => { - if (channel !== "whatsapp") { - return [payload]; - } - const normalized = normalizeWhatsAppPayload(payload); - return normalized ? 
[normalized] : []; - }); + const normalizedPayloads = normalizeReplyPayloadsForDelivery(payloads) + .flatMap((payload) => { + if (channel !== "whatsapp") { + return [payload]; + } + const normalized = normalizeWhatsAppPayload(payload); + return normalized ? [normalized] : []; + }) + .map((payload) => { + // Strip HTML tags for plain-text surfaces (WhatsApp, Signal, etc.) + // Models occasionally produce
, , etc. that render as literal text. + // See https://github.com/openclaw/openclaw/issues/31884 + if (!isPlainTextSurface(channel) || !payload.text) { + return payload; + } + return { ...payload, text: sanitizeForPlainText(payload.text) }; + }); const hookRunner = getGlobalHookRunner(); const sessionKeyForInternalHooks = params.mirror?.sessionKey ?? params.session?.key; if ( diff --git a/src/infra/outbound/sanitize-text.test.ts b/src/infra/outbound/sanitize-text.test.ts new file mode 100644 index 00000000000..c7aebb6dba5 --- /dev/null +++ b/src/infra/outbound/sanitize-text.test.ts @@ -0,0 +1,110 @@ +import { describe, expect, it } from "vitest"; +import { isPlainTextSurface, sanitizeForPlainText } from "./sanitize-text.js"; + +// --------------------------------------------------------------------------- +// isPlainTextSurface +// --------------------------------------------------------------------------- + +describe("isPlainTextSurface", () => { + it.each(["whatsapp", "signal", "sms", "irc", "telegram", "imessage", "googlechat"])( + "returns true for %s", + (channel) => { + expect(isPlainTextSurface(channel)).toBe(true); + }, + ); + + it.each(["discord", "slack", "web", "matrix"])("returns false for %s", (channel) => { + expect(isPlainTextSurface(channel)).toBe(false); + }); + + it("is case-insensitive", () => { + expect(isPlainTextSurface("WhatsApp")).toBe(true); + expect(isPlainTextSurface("SIGNAL")).toBe(true); + }); +}); + +// --------------------------------------------------------------------------- +// sanitizeForPlainText +// --------------------------------------------------------------------------- + +describe("sanitizeForPlainText", () => { + // --- line breaks -------------------------------------------------------- + + it("converts
to newline", () => { + expect(sanitizeForPlainText("hello
world")).toBe("hello\nworld"); + }); + + it("converts self-closing
and
variants", () => { + expect(sanitizeForPlainText("a
b")).toBe("a\nb"); + expect(sanitizeForPlainText("a
b")).toBe("a\nb"); + }); + + // --- inline formatting -------------------------------------------------- + + it("converts and to WhatsApp bold", () => { + expect(sanitizeForPlainText("bold")).toBe("*bold*"); + expect(sanitizeForPlainText("bold")).toBe("*bold*"); + }); + + it("converts and to WhatsApp italic", () => { + expect(sanitizeForPlainText("italic")).toBe("_italic_"); + expect(sanitizeForPlainText("italic")).toBe("_italic_"); + }); + + it("converts , , and to WhatsApp strikethrough", () => { + expect(sanitizeForPlainText("deleted")).toBe("~deleted~"); + expect(sanitizeForPlainText("removed")).toBe("~removed~"); + expect(sanitizeForPlainText("old")).toBe("~old~"); + }); + + it("converts to backtick wrapping", () => { + expect(sanitizeForPlainText("foo()")).toBe("`foo()`"); + }); + + // --- block elements ----------------------------------------------------- + + it("converts

and

to newlines", () => { + expect(sanitizeForPlainText("

paragraph

")).toBe("\nparagraph\n"); + }); + + it("converts headings to bold text with newlines", () => { + expect(sanitizeForPlainText("

Title

")).toBe("\n*Title*\n"); + expect(sanitizeForPlainText("

Section

")).toBe("\n*Section*\n"); + }); + + it("converts
  • to bullet points", () => { + expect(sanitizeForPlainText("
  • item one
  • item two
  • ")).toBe( + "• item one\n• item two\n", + ); + }); + + // --- tag stripping ------------------------------------------------------ + + it("strips unknown/remaining tags", () => { + expect(sanitizeForPlainText('text')).toBe("text"); + expect(sanitizeForPlainText('link')).toBe("link"); + }); + + // --- passthrough -------------------------------------------------------- + + it("passes through clean text unchanged", () => { + expect(sanitizeForPlainText("hello world")).toBe("hello world"); + }); + + it("does not corrupt angle brackets in prose", () => { + // `a < b` does not match `` pattern because there is no closing `>` + // immediately after a tag-like sequence. + expect(sanitizeForPlainText("a < b && c > d")).toBe("a < b && c > d"); + }); + + // --- mixed content ------------------------------------------------------ + + it("handles mixed HTML content", () => { + const input = "Hello
    world this is nice"; + expect(sanitizeForPlainText(input)).toBe("Hello\n*world* this is _nice_"); + }); + + it("collapses excessive newlines", () => { + expect(sanitizeForPlainText("a



    b")).toBe("a\n\nb"); + }); +}); diff --git a/src/infra/outbound/sanitize-text.ts b/src/infra/outbound/sanitize-text.ts new file mode 100644 index 00000000000..fb6b6abbbc1 --- /dev/null +++ b/src/infra/outbound/sanitize-text.ts @@ -0,0 +1,62 @@ +/** + * Sanitize model output for plain-text messaging surfaces. + * + * LLMs occasionally produce HTML tags (`
    /* …`<br>`, `<b>`, `<i>`, etc.) that render
 * correctly on web but appear as literal text on WhatsApp, Signal, SMS, and IRC.
 *
 * Converts common inline HTML to lightweight-markup equivalents used by
 * WhatsApp/Signal/Telegram and strips any remaining known HTML tags.
 *
 * @see https://github.com/openclaw/openclaw/issues/31884
 * @see https://github.com/openclaw/openclaw/issues/18558
 */

/** Channels where HTML tags should be converted/stripped. */
const PLAIN_TEXT_SURFACES: ReadonlySet<string> = new Set([
  "whatsapp",
  "signal",
  "sms",
  "irc",
  "telegram",
  "imessage",
  "googlechat",
]);

/**
 * Element names the final "strip what is left" pass is allowed to remove.
 *
 * Restricting the pass to real HTML element names keeps angle-bracket text
 * that is not markup (generic types such as `Array<string>`, placeholders
 * such as `<your-token>`) intact instead of silently deleting it.
 */
const HTML_TAG_NAMES = [
  "a", "abbr", "address", "article", "aside", "b", "big", "blockquote", "body",
  "br", "button", "caption", "center", "cite", "code", "col", "colgroup", "dd",
  "del", "details", "div", "dl", "dt", "em", "figcaption", "figure", "font",
  "footer", "form", "h1", "h2", "h3", "h4", "h5", "h6", "head", "header", "hr",
  "html", "i", "iframe", "img", "input", "ins", "kbd", "label", "li", "link",
  "main", "mark", "meta", "nav", "ol", "p", "pre", "q", "s", "samp", "script",
  "section", "small", "span", "strike", "strong", "style", "sub", "summary",
  "sup", "table", "tbody", "td", "tfoot", "th", "thead", "title", "tr", "tt",
  "u", "ul", "var",
] as const;

/**
 * Matches an opening, closing, or self-closing tag whose name is in
 * {@link HTML_TAG_NAMES}. The lookahead requires the name to be followed by
 * whitespace, `/`, or `>` so that e.g. `<a-b>` is not read as an `<a>` tag.
 */
const REMAINING_HTML_TAG_RE = new RegExp(
  `<\\/?(?:${HTML_TAG_NAMES.join("|")})(?=[\\s/>])[^>]*>`,
  "gi",
);

/**
 * Returns `true` when the channel cannot render raw HTML.
 *
 * @param channelId - Channel identifier; compared case-insensitively.
 */
export function isPlainTextSurface(channelId: string): boolean {
  return PLAIN_TEXT_SURFACES.has(channelId.toLowerCase());
}

/**
 * Convert common HTML tags to their plain-text/lightweight-markup equivalents
 * and strip any other known HTML tags that remain.
 *
 * The function is intentionally conservative: it only targets real HTML
 * element names and leaves other angle-bracket text alone (`a < b`,
 * `Array<string>`). Paired inline tags are only converted when the opening
 * and closing tag sit on the same line; otherwise the tags are just removed.
 *
 * @param text - Outbound message text that may contain HTML.
 * @returns The text with HTML converted/stripped and runs of three or more
 *   newlines collapsed to two.
 */
export function sanitizeForPlainText(text: string): string {
  return (
    text
      // Line breaks (<br>, <br/>, <br />, <br clear="all">)
      .replace(/<br(?=[\s/>])[^>]*>/gi, "\n")
      // Block elements → newlines (attributes tolerated)
      .replace(/<\/?(?:p|div)(?=[\s/>])[^>]*>/gi, "\n")
      // Bold → WhatsApp/Signal bold; \1 forces the closing tag to match
      .replace(/<(b|strong)(?=[\s>])[^>]*>(.*?)<\/\1\s*>/gi, "*$2*")
      // Italic → WhatsApp/Signal italic
      .replace(/<(i|em)(?=[\s>])[^>]*>(.*?)<\/\1\s*>/gi, "_$2_")
      // Strikethrough → WhatsApp/Signal strikethrough
      .replace(/<(s|strike|del)(?=[\s>])[^>]*>(.*?)<\/\1\s*>/gi, "~$2~")
      // Inline code
      .replace(/<code(?=[\s>])[^>]*>(.*?)<\/code\s*>/gi, "`$1`")
      // Headings → bold text on its own line; levels must match
      .replace(/<(h[1-6])(?=[\s>])[^>]*>(.*?)<\/\1\s*>/gi, "\n*$2*\n")
      // List items → bullet points (the lookahead keeps <link> out)
      .replace(/<li(?=[\s>])[^>]*>(.*?)<\/li\s*>/gi, "• $1\n")
      // Strip whatever known HTML tags are still present
      .replace(REMAINING_HTML_TAG_RE, "")
      // Collapse 3+ consecutive newlines into 2
      .replace(/\n{3,}/g, "\n\n")
  );
}