fix(delivery): strip HTML tags for plain-text messaging surfaces

Models occasionally produce HTML tags in their output. While these render
fine on web surfaces, they appear as literal text on WhatsApp, Signal,
SMS, IRC, and Telegram.

Add sanitizeForPlainText() utility that converts common inline HTML to
lightweight-markup equivalents and strips remaining tags. Applied in the
outbound delivery pipeline for non-HTML surfaces only.

Closes #31884
See also: #18558
This commit is contained in:
AytuncYildizli
2026-03-02 19:43:12 +03:00
committed by Peter Steinberger
parent a19a7f5e6e
commit 62d0cfeee7
3 changed files with 190 additions and 7 deletions

View File

@@ -33,6 +33,7 @@ import { ackDelivery, enqueueDelivery, failDelivery } from "./delivery-queue.js"
import type { OutboundIdentity } from "./identity.js";
import type { NormalizedOutboundPayload } from "./payloads.js";
import { normalizeReplyPayloadsForDelivery } from "./payloads.js";
import { isPlainTextSurface, sanitizeForPlainText } from "./sanitize-text.js";
import type { OutboundSessionContext } from "./session-context.js";
import type { OutboundChannel } from "./targets.js";
@@ -445,13 +446,23 @@ async function deliverOutboundPayloadsCore(
text: normalizedText,
};
};
const normalizedPayloads = normalizeReplyPayloadsForDelivery(payloads).flatMap((payload) => {
if (channel !== "whatsapp") {
return [payload];
}
const normalized = normalizeWhatsAppPayload(payload);
return normalized ? [normalized] : [];
});
const normalizedPayloads = normalizeReplyPayloadsForDelivery(payloads)
.flatMap((payload) => {
if (channel !== "whatsapp") {
return [payload];
}
const normalized = normalizeWhatsAppPayload(payload);
return normalized ? [normalized] : [];
})
.map((payload) => {
// Strip HTML tags for plain-text surfaces (WhatsApp, Signal, etc.)
// Models occasionally produce <br>, <b>, etc. that render as literal text.
// See https://github.com/openclaw/openclaw/issues/31884
if (!isPlainTextSurface(channel) || !payload.text) {
return payload;
}
return { ...payload, text: sanitizeForPlainText(payload.text) };
});
const hookRunner = getGlobalHookRunner();
const sessionKeyForInternalHooks = params.mirror?.sessionKey ?? params.session?.key;
if (

View File

@@ -0,0 +1,110 @@
import { describe, expect, it } from "vitest";
import { isPlainTextSurface, sanitizeForPlainText } from "./sanitize-text.js";
// ---------------------------------------------------------------------------
// isPlainTextSurface
// ---------------------------------------------------------------------------
describe("isPlainTextSurface", () => {
  // Channel ids the helper is expected to classify as plain-text surfaces.
  const plainTextChannels = [
    "whatsapp",
    "signal",
    "sms",
    "irc",
    "telegram",
    "imessage",
    "googlechat",
  ];
  // Channel ids the helper is expected to leave alone.
  const otherChannels = ["discord", "slack", "web", "matrix"];

  it.each(plainTextChannels)("returns true for %s", (channel) => {
    const result = isPlainTextSurface(channel);
    expect(result).toBe(true);
  });

  it.each(otherChannels)("returns false for %s", (channel) => {
    const result = isPlainTextSurface(channel);
    expect(result).toBe(false);
  });

  it("is case-insensitive", () => {
    // Mixed-case and upper-case spellings must resolve to the same answer.
    for (const spelling of ["WhatsApp", "SIGNAL"]) {
      expect(isPlainTextSurface(spelling)).toBe(true);
    }
  });
});
// ---------------------------------------------------------------------------
// sanitizeForPlainText
// ---------------------------------------------------------------------------
describe("sanitizeForPlainText", () => {
  /** Assert that every `[input, expected]` pair sanitizes as stated, in order. */
  const expectAll = (cases: ReadonlyArray<readonly [string, string]>): void => {
    for (const [html, plain] of cases) {
      expect(sanitizeForPlainText(html)).toBe(plain);
    }
  };

  // --- line breaks --------------------------------------------------------
  it("converts <br> to newline", () => {
    expectAll([["hello<br>world", "hello\nworld"]]);
  });

  it("converts self-closing <br/> and <br /> variants", () => {
    expectAll([
      ["a<br/>b", "a\nb"],
      ["a<br />b", "a\nb"],
    ]);
  });

  // --- inline formatting --------------------------------------------------
  it("converts <b> and <strong> to WhatsApp bold", () => {
    expectAll([
      ["<b>bold</b>", "*bold*"],
      ["<strong>bold</strong>", "*bold*"],
    ]);
  });

  it("converts <i> and <em> to WhatsApp italic", () => {
    expectAll([
      ["<i>italic</i>", "_italic_"],
      ["<em>italic</em>", "_italic_"],
    ]);
  });

  it("converts <s>, <strike>, and <del> to WhatsApp strikethrough", () => {
    expectAll([
      ["<s>deleted</s>", "~deleted~"],
      ["<del>removed</del>", "~removed~"],
      ["<strike>old</strike>", "~old~"],
    ]);
  });

  it("converts <code> to backtick wrapping", () => {
    expectAll([["<code>foo()</code>", "`foo()`"]]);
  });

  // --- block elements -----------------------------------------------------
  it("converts <p> and <div> to newlines", () => {
    expectAll([["<p>paragraph</p>", "\nparagraph\n"]]);
  });

  it("converts headings to bold text with newlines", () => {
    expectAll([
      ["<h1>Title</h1>", "\n*Title*\n"],
      ["<h3>Section</h3>", "\n*Section*\n"],
    ]);
  });

  it("converts <li> to bullet points", () => {
    const listHtml = "<li>item one</li><li>item two</li>";
    expect(sanitizeForPlainText(listHtml)).toBe("• item one\n• item two\n");
  });

  // --- tag stripping ------------------------------------------------------
  it("strips unknown/remaining tags", () => {
    expectAll([
      ['<span class="x">text</span>', "text"],
      ['<a href="https://example.com">link</a>', "link"],
    ]);
  });

  // --- passthrough --------------------------------------------------------
  it("passes through clean text unchanged", () => {
    const plain = "hello world";
    expect(sanitizeForPlainText(plain)).toBe("hello world");
  });

  it("does not corrupt angle brackets in prose", () => {
    // The only `<` in this input is followed by a space, and the sanitizer's
    // patterns all require a tag name (or `/`) directly after `<`, so nothing
    // here is treated as a tag.
    const prose = "a < b && c > d";
    expect(sanitizeForPlainText(prose)).toBe("a < b && c > d");
  });

  // --- mixed content ------------------------------------------------------
  it("handles mixed HTML content", () => {
    const mixed = "Hello<br><b>world</b> this is <i>nice</i>";
    const sanitized = sanitizeForPlainText(mixed);
    expect(sanitized).toBe("Hello\n*world* this is _nice_");
  });

  it("collapses excessive newlines", () => {
    // Four consecutive breaks must come out as exactly two newlines.
    expectAll([["a<br><br><br><br>b", "a\n\nb"]]);
  });
});

View File

@@ -0,0 +1,62 @@
/**
* Sanitize model output for plain-text messaging surfaces.
*
* LLMs occasionally produce HTML tags (`<br>`, `<b>`, `<i>`, etc.) that render
* correctly on web but appear as literal text on plain-text surfaces such as
* WhatsApp, Signal, SMS, IRC, and Telegram.
*
* Converts common inline HTML to lightweight-markup equivalents used by
* WhatsApp/Signal/Telegram and strips any remaining tags.
*
* @see https://github.com/openclaw/openclaw/issues/31884
* @see https://github.com/openclaw/openclaw/issues/18558
*/
/**
 * Lower-case ids of the channels whose outgoing text should have HTML tags
 * converted or stripped.
 */
const PLAIN_TEXT_SURFACES = new Set<string>([
  "whatsapp",
  "signal",
  "sms",
  "irc",
  "telegram",
  "imessage",
  "googlechat",
]);

/**
 * Report whether a channel is one of the plain-text surfaces.
 *
 * The lookup is case-insensitive: the id is lower-cased before it is checked
 * against the set.
 *
 * @param channelId - Channel identifier, in any letter case.
 * @returns `true` when the lower-cased id is a known plain-text surface.
 */
export function isPlainTextSurface(channelId: string): boolean {
  const normalizedId = channelId.toLowerCase();
  return PLAIN_TEXT_SURFACES.has(normalizedId);
}
/**
 * Convert common HTML tags to their plain-text/lightweight-markup equivalents
 * and strip any tag-like sequence that remains.
 *
 * The replacements run in a fixed order:
 * 1. angle-bracket autolinks (`<https://…>`, `<mailto:…>`) are unwrapped to
 *    the bare URL so the final strip step cannot delete them;
 * 2. `<br>` variants and `<p>`/`<div>` tags (with or without attributes)
 *    become newlines;
 * 3. bold, italic, strikethrough and inline-code pairs become `*…*`, `_…_`,
 *    `~…~` and `` `…` ``;
 * 4. headings become a bold line, list items become `• ` bullets;
 * 5. every remaining `<word…>` / `</word…>` sequence is removed;
 * 6. runs of three or more newlines collapse to two.
 *
 * A `<` only starts a match when a tag name, `/`, or a URL scheme follows it
 * directly, so prose such as `a < b && c > d` passes through unchanged.
 * Paired-tag rules use `.` and therefore only convert content that sits on a
 * single line; pairs spanning a newline fall through to the strip step.
 *
 * NOTE: the strip step matches any `<word…>` sequence, so non-HTML text such
 * as `Array<string>` loses its bracketed part as well.
 *
 * @param text - Outbound message text that may contain HTML.
 * @returns The text with HTML converted or removed as described above.
 */
export function sanitizeForPlainText(text: string): string {
  return (
    text
      // Autolinks first: `<https://x>` looks like a tag to the strip rule
      // below and would otherwise vanish together with its URL.
      .replace(/<((?:https?:\/\/|mailto:)[^<>\s]+)>/gi, "$1")
      // Line breaks
      .replace(/<br\s*\/?>/gi, "\n")
      // Block elements → newlines. `\b` keeps `<pre>` etc. from matching;
      // `[^>]*` accepts attributes such as `<p class="x">`.
      .replace(/<\/?(?:p|div)\b[^>]*>/gi, "\n")
      // Bold → WhatsApp/Signal bold
      .replace(/<(b|strong)>(.*?)<\/\1>/gi, "*$2*")
      // Italic → WhatsApp/Signal italic
      .replace(/<(i|em)>(.*?)<\/\1>/gi, "_$2_")
      // Strikethrough → WhatsApp/Signal strikethrough
      .replace(/<(s|strike|del)>(.*?)<\/\1>/gi, "~$2~")
      // Inline code
      .replace(/<code>(.*?)<\/code>/gi, "`$1`")
      // Headings → bold text with newline
      .replace(/<h[1-6][^>]*>(.*?)<\/h[1-6]>/gi, "\n*$1*\n")
      // List items → bullet points. `\b` stops `<link …>` from being read
      // as an opening `<li>`.
      .replace(/<li\b[^>]*>(.*?)<\/li>/gi, "• $1\n")
      // Strip remaining HTML tags (require tag-like structure: <word...>)
      .replace(/<\/?[a-z][a-z0-9]*\b[^>]*>/gi, "")
      // Collapse 3+ consecutive newlines into 2
      .replace(/\n{3,}/g, "\n\n")
  );
}