mirror of
https://github.com/openclaw/openclaw.git
synced 2026-03-12 15:30:39 +00:00
fix(delivery): strip HTML tags for plain-text messaging surfaces
Models occasionally produce HTML tags in their output. While these render fine on web surfaces, they appear as literal text on WhatsApp, Signal, SMS, IRC, and Telegram. Add a sanitizeForPlainText() utility that converts common inline HTML to lightweight-markup equivalents and strips remaining tags. Applied in the outbound delivery pipeline for non-HTML surfaces only. Closes #31884. See also: #18558.
This commit is contained in:
committed by
Peter Steinberger
parent
a19a7f5e6e
commit
62d0cfeee7
@@ -33,6 +33,7 @@ import { ackDelivery, enqueueDelivery, failDelivery } from "./delivery-queue.js"
|
||||
import type { OutboundIdentity } from "./identity.js";
|
||||
import type { NormalizedOutboundPayload } from "./payloads.js";
|
||||
import { normalizeReplyPayloadsForDelivery } from "./payloads.js";
|
||||
import { isPlainTextSurface, sanitizeForPlainText } from "./sanitize-text.js";
|
||||
import type { OutboundSessionContext } from "./session-context.js";
|
||||
import type { OutboundChannel } from "./targets.js";
|
||||
|
||||
@@ -445,13 +446,23 @@ async function deliverOutboundPayloadsCore(
|
||||
text: normalizedText,
|
||||
};
|
||||
};
|
||||
const normalizedPayloads = normalizeReplyPayloadsForDelivery(payloads).flatMap((payload) => {
|
||||
if (channel !== "whatsapp") {
|
||||
return [payload];
|
||||
}
|
||||
const normalized = normalizeWhatsAppPayload(payload);
|
||||
return normalized ? [normalized] : [];
|
||||
});
|
||||
const normalizedPayloads = normalizeReplyPayloadsForDelivery(payloads)
|
||||
.flatMap((payload) => {
|
||||
if (channel !== "whatsapp") {
|
||||
return [payload];
|
||||
}
|
||||
const normalized = normalizeWhatsAppPayload(payload);
|
||||
return normalized ? [normalized] : [];
|
||||
})
|
||||
.map((payload) => {
|
||||
// Strip HTML tags for plain-text surfaces (WhatsApp, Signal, etc.)
|
||||
// Models occasionally produce <br>, <b>, etc. that render as literal text.
|
||||
// See https://github.com/openclaw/openclaw/issues/31884
|
||||
if (!isPlainTextSurface(channel) || !payload.text) {
|
||||
return payload;
|
||||
}
|
||||
return { ...payload, text: sanitizeForPlainText(payload.text) };
|
||||
});
|
||||
const hookRunner = getGlobalHookRunner();
|
||||
const sessionKeyForInternalHooks = params.mirror?.sessionKey ?? params.session?.key;
|
||||
if (
|
||||
|
||||
110
src/infra/outbound/sanitize-text.test.ts
Normal file
110
src/infra/outbound/sanitize-text.test.ts
Normal file
@@ -0,0 +1,110 @@
|
||||
import { describe, expect, it } from "vitest";
|
||||
import { isPlainTextSurface, sanitizeForPlainText } from "./sanitize-text.js";
|
||||
|
||||
// ---------------------------------------------------------------------------
|
||||
// isPlainTextSurface
|
||||
// ---------------------------------------------------------------------------
|
||||
|
||||
// Verifies channel classification: which ids count as plain-text surfaces,
// which do not, and that the lookup ignores letter case.
describe("isPlainTextSurface", () => {
  const plainTextChannels = [
    "whatsapp",
    "signal",
    "sms",
    "irc",
    "telegram",
    "imessage",
    "googlechat",
  ];
  const otherChannels = ["discord", "slack", "web", "matrix"];

  it.each(plainTextChannels)("returns true for %s", (channel) => {
    const result = isPlainTextSurface(channel);
    expect(result).toBe(true);
  });

  it.each(otherChannels)("returns false for %s", (channel) => {
    const result = isPlainTextSurface(channel);
    expect(result).toBe(false);
  });

  it("is case-insensitive", () => {
    // Mixed-case and upper-case spellings must resolve to the same entries.
    for (const channel of ["WhatsApp", "SIGNAL"]) {
      expect(isPlainTextSurface(channel)).toBe(true);
    }
  });
});
|
||||
|
||||
// ---------------------------------------------------------------------------
|
||||
// sanitizeForPlainText
|
||||
// ---------------------------------------------------------------------------
|
||||
|
||||
// Pins the HTML → lightweight-markup conversions performed by
// sanitizeForPlainText, plus the cases that must pass through untouched.
describe("sanitizeForPlainText", () => {
  // --- line breaks --------------------------------------------------------

  it("converts <br> to newline", () => {
    expect(sanitizeForPlainText("hello<br>world")).toBe("hello\nworld");
  });

  it("converts self-closing <br/> and <br /> variants", () => {
    expect(sanitizeForPlainText("a<br/>b")).toBe("a\nb");
    expect(sanitizeForPlainText("a<br />b")).toBe("a\nb");
  });

  // --- inline formatting --------------------------------------------------

  it("converts <b> and <strong> to WhatsApp bold", () => {
    expect(sanitizeForPlainText("<b>bold</b>")).toBe("*bold*");
    expect(sanitizeForPlainText("<strong>bold</strong>")).toBe("*bold*");
  });

  it("converts <i> and <em> to WhatsApp italic", () => {
    expect(sanitizeForPlainText("<i>italic</i>")).toBe("_italic_");
    expect(sanitizeForPlainText("<em>italic</em>")).toBe("_italic_");
  });

  it("converts <s>, <strike>, and <del> to WhatsApp strikethrough", () => {
    expect(sanitizeForPlainText("<s>deleted</s>")).toBe("~deleted~");
    expect(sanitizeForPlainText("<del>removed</del>")).toBe("~removed~");
    expect(sanitizeForPlainText("<strike>old</strike>")).toBe("~old~");
  });

  it("converts <code> to backtick wrapping", () => {
    expect(sanitizeForPlainText("<code>foo()</code>")).toBe("`foo()`");
  });

  // --- block elements -----------------------------------------------------

  it("converts <p> and <div> to newlines", () => {
    expect(sanitizeForPlainText("<p>paragraph</p>")).toBe("\nparagraph\n");
    // The title promises <div> coverage as well, so assert it explicitly.
    expect(sanitizeForPlainText("<div>block</div>")).toBe("\nblock\n");
  });

  it("converts headings to bold text with newlines", () => {
    expect(sanitizeForPlainText("<h1>Title</h1>")).toBe("\n*Title*\n");
    expect(sanitizeForPlainText("<h3>Section</h3>")).toBe("\n*Section*\n");
  });

  it("converts <li> to bullet points", () => {
    expect(sanitizeForPlainText("<li>item one</li><li>item two</li>")).toBe(
      "• item one\n• item two\n",
    );
  });

  // --- tag stripping ------------------------------------------------------

  it("strips unknown/remaining tags", () => {
    expect(sanitizeForPlainText('<span class="x">text</span>')).toBe("text");
    expect(sanitizeForPlainText('<a href="https://example.com">link</a>')).toBe("link");
  });

  // --- passthrough --------------------------------------------------------

  it("passes through clean text unchanged", () => {
    expect(sanitizeForPlainText("hello world")).toBe("hello world");
  });

  it("does not corrupt angle brackets in prose", () => {
    // `a < b` is left alone because the tag pattern requires a letter
    // immediately after `<`; here the `<` is followed by a space, so the
    // later `>` in `c > d` never closes a tag-like sequence.
    expect(sanitizeForPlainText("a < b && c > d")).toBe("a < b && c > d");
  });

  // --- mixed content ------------------------------------------------------

  it("handles mixed HTML content", () => {
    const input = "Hello<br><b>world</b> this is <i>nice</i>";
    expect(sanitizeForPlainText(input)).toBe("Hello\n*world* this is _nice_");
  });

  it("collapses excessive newlines", () => {
    expect(sanitizeForPlainText("a<br><br><br><br>b")).toBe("a\n\nb");
  });
});
|
||||
62
src/infra/outbound/sanitize-text.ts
Normal file
62
src/infra/outbound/sanitize-text.ts
Normal file
@@ -0,0 +1,62 @@
|
||||
/**
|
||||
* Sanitize model output for plain-text messaging surfaces.
|
||||
*
|
||||
* LLMs occasionally produce HTML tags (`<br>`, `<b>`, `<i>`, etc.) that render
|
||||
* correctly on web but appear as literal text on WhatsApp, Signal, SMS, and IRC.
|
||||
*
|
||||
* Converts common inline HTML to lightweight-markup equivalents used by
|
||||
* WhatsApp/Signal/Telegram and strips any remaining tags.
|
||||
*
|
||||
* @see https://github.com/openclaw/openclaw/issues/31884
|
||||
* @see https://github.com/openclaw/openclaw/issues/18558
|
||||
*/
|
||||
|
||||
/**
 * Channel identifiers whose outbound text has HTML tags converted or stripped.
 * Entries are stored lowercase; lookups lowercase the id before matching.
 */
const PLAIN_TEXT_SURFACES: ReadonlySet<string> = new Set<string>([
  "whatsapp",
  "signal",
  "sms",
  "irc",
  "telegram",
  "imessage",
  "googlechat",
]);

/**
 * Reports whether a channel is one of the plain-text surfaces.
 *
 * @param channelId - Channel identifier; letter case is ignored.
 * @returns `true` when the lowercased id is listed in `PLAIN_TEXT_SURFACES`.
 */
export function isPlainTextSurface(channelId: string): boolean {
  const normalizedId = channelId.toLowerCase();
  return PLAIN_TEXT_SURFACES.has(normalizedId);
}
|
||||
|
||||
/**
 * Convert common HTML tags to their plain-text/lightweight-markup equivalents
 * and strip any other HTML-looking tags that remain.
 *
 * Steps run in order:
 * 1. `<https://…>` / `<mailto:…>` autolinks are unwrapped to the bare URL so
 *    the later tag stripping cannot delete them.
 * 2. `<br>` becomes a newline; opening/closing `<p>` and `<div>` become
 *    newlines.
 * 3. `<b>/<strong>` → `*…*`, `<i>/<em>` → `_…_`, `<s>/<strike>/<del>` → `~…~`,
 *    `<code>` → backticks, headings → `*…*` on its own line, `<li>` → `• …`.
 *    These tags may carry attributes. Their content must sit on one line,
 *    because `.` does not match newlines.
 * 4. Any remaining tag is removed. A tag is `<` or `</`, a name made of a
 *    letter followed by letters/digits, then whitespace + attributes, `/`, or
 *    `>`. Sequences such as `<your-api-key>` or `<user@example.com>` do not fit
 *    that shape and are preserved, as is prose like `a < b`.
 * 5. Runs of three or more newlines are collapsed to two.
 *
 * Known limitation: a bare single-word placeholder such as `<name>` is
 * indistinguishable from a tag and is still removed.
 *
 * @param text - Outbound message text that may contain HTML tags.
 * @returns The text with HTML converted or stripped as described above.
 */
export function sanitizeForPlainText(text: string): string {
  return (
    text
      // Markdown autolinks → bare URL (must run before generic tag stripping)
      .replace(/<((?:https?:\/\/|mailto:)[^<>\s]+)>/gi, "$1")
      // Line breaks
      .replace(/<br\s*\/?>/gi, "\n")
      // Block elements → newlines (attributes tolerated)
      .replace(/<\/?(?:p|div)(?:\s[^>]*)?>/gi, "\n")
      // Bold → WhatsApp/Signal bold
      .replace(/<(b|strong)(?:\s[^>]*)?>(.*?)<\/\1>/gi, "*$2*")
      // Italic → WhatsApp/Signal italic
      .replace(/<(i|em)(?:\s[^>]*)?>(.*?)<\/\1>/gi, "_$2_")
      // Strikethrough → WhatsApp/Signal strikethrough
      .replace(/<(s|strike|del)(?:\s[^>]*)?>(.*?)<\/\1>/gi, "~$2~")
      // Inline code
      .replace(/<code(?:\s[^>]*)?>(.*?)<\/code>/gi, "`$1`")
      // Headings → bold text with newline
      .replace(/<h[1-6](?:\s[^>]*)?>(.*?)<\/h[1-6]>/gi, "\n*$1*\n")
      // List items → bullet points
      .replace(/<li(?:\s[^>]*)?>(.*?)<\/li>/gi, "• $1\n")
      // Strip remaining HTML tags. The name must be followed by whitespace,
      // "/" or ">", so "<your-api-key>" and "<user@example.com>" survive.
      .replace(/<\/?[a-z][a-z0-9]*(?:\s[^>]*)?\/?>/gi, "")
      // Collapse 3+ consecutive newlines into 2
      .replace(/\n{3,}/g, "\n\n")
  );
}
|
||||
Reference in New Issue
Block a user