fix(media): gate markdown image extraction by channel (#72718)

Closes #72642

Co-authored-by: Peter Steinberger <steipete@gmail.com>
This commit is contained in:
Bartok
2026-04-27 06:27:35 -04:00
committed by GitHub
parent 775ed36c16
commit f0b327cf68
15 changed files with 251 additions and 30 deletions

View File

@@ -97,6 +97,7 @@ Docs: https://docs.openclaw.ai
- Google Meet: route stateful `google_meet` tool actions through the gateway-owned runtime so created or joined realtime sessions remain visible to status, speak, and leave after the agent turn ends. Fixes #72440. (#72441) Thanks @BsnizND.
- Google Meet/Voice Call: send Gemini Live a non-blocking consult continuation before long OpenClaw agent consults finish, then deliver the final result when idle so calls and meetings do not sit silent during tool-backed answers. (#72189) Thanks @VACInc.
- Google Meet: preserve Gemini Live function names when replying to realtime tool calls so Google SDK validation accepts the `FunctionResponse` payload. Fixes #72425. (#72426) Thanks @BsnizND.
- Discord/media: keep incidental Markdown image badges in final replies as text unless a channel opts into Markdown-image media extraction, while preserving Telegram Markdown-image media replies and explicit `MEDIA:` attachments. Fixes #72642. Thanks @solavrc and @Bartok9.
- Matrix/E2EE: stabilize recovery and broken-device QA flows while avoiding Matrix device-cleanup sync races that could leave shutdown-time crypto work running. Thanks @gumadeiras.
- Cron: apply `cron.maxConcurrentRuns` to a dedicated `cron-nested` isolated agent-turn lane as well as cron dispatch, so parallel cron jobs no longer serialize on inner LLM execution while non-cron nested flows keep their existing lane behavior. Fixes #72707. Thanks @kagura-agent.
- Cron: report isolated runs as successful when verified cron delivery already delivered the reply, while keeping unresolved Message/Canvas tool failures fatal. Fixes #72732 and #50170; follow-up to #54188. Thanks @zNatix, @pixeldyn, and @ChickenEggRoll.

View File

@@ -17,6 +17,10 @@ Remote `MEDIA:` attachments must be public `https:` URLs. Plain `http:`,
loopback, link-local, private, and internal hostnames are ignored as attachment
directives; server-side media fetchers still enforce their own network guards.
Plain Markdown image syntax stays text by default. Channels that intentionally
map Markdown image replies to media attachments opt in at their outbound
adapter; Telegram does this so `![alt](url)` can still become a media reply.
These directives are separate. `MEDIA:` and reply/voice tags remain delivery metadata; `[embed ...]` is the web-only rich render path.
Trusted tool-result media uses the same `MEDIA:` / `[[audio_as_voice]]` parser before delivery, so text tool outputs can still mark an audio attachment as a voice note.

View File

@@ -121,6 +121,7 @@ export const telegramOutbound: ChannelOutboundAdapter = {
deliveryMode: "direct",
chunker: markdownToTelegramHtmlChunks,
chunkerMode: "markdown",
extractMarkdownImages: true,
textChunkLimit: TELEGRAM_TEXT_CHUNK_LIMIT,
sanitizeText: ({ text }) => sanitizeForPlainText(text),
shouldSkipPlainTextSanitization: ({ payload }) => Boolean(payload.channelData),

View File

@@ -4,6 +4,7 @@ export const telegramOutboundBaseAdapter = {
deliveryMode: "direct" as const,
chunker: chunkMarkdownText,
chunkerMode: "markdown" as const,
extractMarkdownImages: true,
textChunkLimit: 4000,
pollMaxOptions: 10,
};

View File

@@ -350,6 +350,7 @@ describe("buildReplyPayloads media filter integration", () => {
it("extracts markdown image replies into final payload media urls", async () => {
const { replyPayloads } = await buildReplyPayloads({
...baseParams,
extractMarkdownImages: true,
payloads: [{ text: "Here you go\n\n![chart](https://example.com/chart.png)" }],
});
@@ -364,6 +365,7 @@ describe("buildReplyPayloads media filter integration", () => {
it("preserves inline caption text when lifting markdown image replies into media", async () => {
const { replyPayloads } = await buildReplyPayloads({
...baseParams,
extractMarkdownImages: true,
payloads: [{ text: 'Look ![chart](https://example.com/chart.png "Quarterly chart") now' }],
});
@@ -379,6 +381,7 @@ describe("buildReplyPayloads media filter integration", () => {
const text = "Look ![chart](file:///etc/passwd) now";
const { replyPayloads } = await buildReplyPayloads({
...baseParams,
extractMarkdownImages: true,
payloads: [{ text }],
});

View File

@@ -107,6 +107,7 @@ export async function buildReplyPayloads(params: {
originatingChannel?: OriginatingChannelType;
originatingTo?: string;
accountId?: string;
extractMarkdownImages?: boolean;
normalizeMediaPaths?: (payload: ReplyPayload) => Promise<ReplyPayload>;
}): Promise<{ replyPayloads: ReplyPayload[]; didLogHeartbeatStrip: boolean }> {
let didLogHeartbeatStrip = params.didLogHeartbeatStrip;
@@ -148,6 +149,7 @@ export async function buildReplyPayloads(params: {
currentMessageId: params.currentMessageId,
silentToken: SILENT_REPLY_TOKEN,
parseMode: "always",
extractMarkdownImages: params.extractMarkdownImages,
});
const mediaNormalizedPayload = await normalizeReplyPayloadMedia({
payload: parsed.payload,

View File

@@ -17,6 +17,7 @@ export function normalizeReplyPayloadDirectives(params: {
silentToken?: string;
trimLeadingWhitespace?: boolean;
parseMode?: ReplyDirectiveParseMode;
extractMarkdownImages?: boolean;
}): { payload: ReplyPayload; isSilent: boolean } {
const parseMode = params.parseMode ?? "always";
const silentToken = params.silentToken ?? SILENT_REPLY_TOKEN;
@@ -27,12 +28,14 @@ export function normalizeReplyPayloadDirectives(params: {
(parseMode === "auto" &&
(sourceText.includes("[[") ||
/media:/i.test(sourceText) ||
(params.extractMarkdownImages === true && /!\[[^\]]*]\(/.test(sourceText)) ||
sourceText.includes(silentToken)));
const parsed = shouldParse
? parseReplyDirectives(sourceText, {
currentMessageId: params.currentMessageId,
silentToken,
extractMarkdownImages: params.extractMarkdownImages,
})
: undefined;

View File

@@ -13,11 +13,19 @@ export type ReplyDirectiveParseResult = {
isSilent: boolean;
};
export type ReplyDirectiveParseOptions = {
currentMessageId?: string;
silentToken?: string;
extractMarkdownImages?: boolean;
};
export function parseReplyDirectives(
raw: string,
options: { currentMessageId?: string; silentToken?: string } = {},
options: ReplyDirectiveParseOptions = {},
): ReplyDirectiveParseResult {
const split = splitMediaFromOutput(raw);
const split = splitMediaFromOutput(raw, {
extractMarkdownImages: options.extractMarkdownImages,
});
let text = split.text ?? "";
const replyParsed = parseInlineDirectives(text, {

View File

@@ -76,6 +76,8 @@ export type ChannelOutboundAdapter = {
deliveryMode: "direct" | "gateway" | "hybrid";
chunker?: ((text: string, limit: number, ctx?: ChannelOutboundChunkContext) => string[]) | null;
chunkerMode?: "text" | "markdown";
/** Lift remote Markdown image syntax in text into outbound media attachments. */
extractMarkdownImages?: boolean;
textChunkLimit?: number;
sanitizeText?: (params: { text: string; payload: ReplyPayload }) => string;
pollMaxOptions?: number;

View File

@@ -1235,6 +1235,54 @@ describe("deliverOutboundPayloads", () => {
);
});
it("keeps markdown images as text for channels that do not opt in", async () => {
const sendMatrix = vi.fn().mockResolvedValue({ messageId: "m-text", roomId: "!room" });
await deliverOutboundPayloads({
cfg: matrixChunkConfig,
channel: "matrix",
to: "!room:example",
payloads: [{ text: "Tech: ![Node.js](https://img.shields.io/badge/Node.js-339933)" }],
deps: { matrix: sendMatrix },
});
expect(sendMatrix).toHaveBeenCalledWith(
"!room:example",
"Tech: ![Node.js](https://img.shields.io/badge/Node.js-339933)",
expect.not.objectContaining({ mediaUrl: expect.any(String) }),
);
});
it("extracts markdown images for channels that opt in", async () => {
const sendMatrix = vi.fn().mockResolvedValue({ messageId: "m-media", roomId: "!room" });
setActivePluginRegistry(
createTestRegistry([
{
pluginId: "matrix",
source: "test",
plugin: createOutboundTestPlugin({
id: "matrix",
outbound: { ...matrixOutboundForTest, extractMarkdownImages: true },
}),
},
]),
);
await deliverOutboundPayloads({
cfg: matrixChunkConfig,
channel: "matrix",
to: "!room:example",
payloads: [{ text: "Chart ![chart](https://example.com/chart.png) now" }],
deps: { matrix: sendMatrix },
});
expect(sendMatrix).toHaveBeenCalledWith(
"!room:example",
"Chart now",
expect.objectContaining({ mediaUrl: "https://example.com/chart.png" }),
);
});
it("normalizes payloads and drops empty entries", () => {
const normalized = normalizeOutboundPayloads([
{ text: "hi" },

View File

@@ -147,6 +147,24 @@ type ChannelHandlerParams = {
};
// Channel docking: outbound delivery delegates to plugin.outbound adapters.
/**
 * Resolve directive-parsing options for an outbound channel from its outbound
 * adapter. If no adapter is registered yet, bootstraps the channel plugin and
 * retries the lookup once before answering.
 */
async function resolveChannelOutboundDirectiveOptions(params: {
  cfg: OpenClawConfig;
  channel: Exclude<OutboundChannel, "none">;
}): Promise<{ extractMarkdownImages?: boolean }> {
  const { cfg, channel } = params;
  let adapter = await loadChannelOutboundAdapter(channel);
  if (!adapter) {
    // Adapter not loaded yet — bootstrap the channel plugin, then look it up again.
    const runtime = await loadChannelBootstrapRuntime();
    runtime.bootstrapOutboundChannelPlugin({ channel, cfg });
    adapter = await loadChannelOutboundAdapter(channel);
  }
  // Surface only an explicit `true`; everything else collapses to undefined so
  // downstream option merging treats "not opted in" and "unknown" the same way.
  const optedIn = adapter?.extractMarkdownImages === true;
  return { extractMarkdownImages: optedIn ? true : undefined };
}
async function createChannelHandler(params: ChannelHandlerParams): Promise<ChannelHandler> {
let outbound = await loadChannelOutboundAdapter(params.channel);
if (!outbound) {
@@ -841,11 +859,13 @@ async function deliverOutboundPayloadsCore(
params: DeliverOutboundPayloadsCoreParams,
): Promise<OutboundDeliveryResult[]> {
const { cfg, channel, to, payloads } = params;
const directiveOptions = await resolveChannelOutboundDirectiveOptions({ cfg, channel });
const outboundPayloadPlan = createOutboundPayloadPlan(payloads, {
cfg,
sessionKey: params.session?.policyKey ?? params.session?.key,
surface: channel,
conversationType: params.session?.conversationType,
extractMarkdownImages: directiveOptions.extractMarkdownImages,
});
const accountId = params.accountId;
const deps = params.deps;

View File

@@ -642,6 +642,44 @@ describe("OutboundPayloadPlan projections", () => {
const plan = createOutboundPayloadPlan(matrix);
expect(projectOutboundPayloadPlanForMirror(plan)).toEqual(resolveMirrorProjection(matrix));
});
it("keeps markdown images as text unless extraction is enabled", () => {
const input = "Tech: ![Node.js](https://img.shields.io/badge/Node.js-339933)";
expect(
projectOutboundPayloadPlanForDelivery(createOutboundPayloadPlan([{ text: input }])),
).toEqual([
{
text: input,
mediaUrl: undefined,
mediaUrls: undefined,
replyToId: undefined,
replyToCurrent: undefined,
replyToTag: false,
audioAsVoice: false,
},
]);
});
it("extracts markdown images when the outbound channel opts in", () => {
const input = "Chart ![chart](https://example.com/chart.png) now";
expect(
projectOutboundPayloadPlanForDelivery(
createOutboundPayloadPlan([{ text: input }], { extractMarkdownImages: true }),
),
).toEqual([
{
text: "Chart now",
mediaUrl: "https://example.com/chart.png",
mediaUrls: ["https://example.com/chart.png"],
replyToId: undefined,
replyToCurrent: undefined,
replyToTag: false,
audioAsVoice: false,
},
]);
});
});
describe("formatOutboundPayloadLog", () => {

View File

@@ -67,6 +67,7 @@ type OutboundPayloadPlanContext = {
* (see `pending-spawn-query.ts`).
*/
hasPendingSpawnedChildren?: boolean;
extractMarkdownImages?: boolean;
};
export type OutboundPayloadMirror = {
@@ -131,11 +132,14 @@ type PreparedOutboundPayloadPlanEntry = {
function createOutboundPayloadPlanEntry(
payload: ReplyPayload,
context: Pick<OutboundPayloadPlanContext, "extractMarkdownImages"> = {},
): PreparedOutboundPayloadPlanEntry | null {
if (shouldSuppressReasoningPayload(payload)) {
return null;
}
const parsed = parseReplyDirectives(payload.text ?? "");
const parsed = parseReplyDirectives(payload.text ?? "", {
extractMarkdownImages: context.extractMarkdownImages,
});
const explicitMediaUrls = payload.mediaUrls ?? parsed.mediaUrls;
const explicitMediaUrl = payload.mediaUrl ?? parsed.mediaUrl;
const mergedMedia = mergeMediaUrls(
@@ -193,7 +197,9 @@ export function createOutboundPayloadPlan(
context.hasPendingSpawnedChildren ?? resolvePendingSpawnedChildren(context.sessionKey);
const prepared: PreparedOutboundPayloadPlanEntry[] = [];
for (const payload of payloads) {
const entry = createOutboundPayloadPlanEntry(payload);
const entry = createOutboundPayloadPlanEntry(payload, {
extractMarkdownImages: context.extractMarkdownImages,
});
if (!entry) {
continue;
}

View File

@@ -1,5 +1,5 @@
import { describe, expect, it } from "vitest";
import { splitMediaFromOutput } from "./parse.js";
import { splitMediaFromOutput, type SplitMediaFromOutputOptions } from "./parse.js";
describe("splitMediaFromOutput", () => {
function expectParsedMediaOutputCase(
@@ -9,8 +9,9 @@ describe("splitMediaFromOutput", () => {
text?: string;
audioAsVoice?: boolean;
},
options?: SplitMediaFromOutputOptions,
) {
const result = splitMediaFromOutput(input);
const result = splitMediaFromOutput(input, options);
expect(result.text).toBe(expected.text ?? "");
if ("audioAsVoice" in expected) {
expect(result.audioAsVoice).toBe(expected.audioAsVoice);
@@ -126,18 +127,36 @@ describe("splitMediaFromOutput", () => {
]);
});
it("extracts markdown image urls while keeping surrounding caption text", () => {
expectParsedMediaOutputCase("Caption\n\n![chart](https://example.com/chart.png)", {
text: "Caption",
mediaUrls: ["https://example.com/chart.png"],
const extractMarkdownImages = { extractMarkdownImages: true } as const;
it("keeps markdown image urls as text by default", () => {
const input = "Caption\n\n![chart](https://example.com/chart.png)";
expectParsedMediaOutputCase(input, {
text: input,
mediaUrls: undefined,
});
});
it("keeps inline caption text around markdown images", () => {
expectParsedMediaOutputCase("Look ![chart](https://example.com/chart.png) now", {
text: "Look now",
mediaUrls: ["https://example.com/chart.png"],
});
it("extracts markdown image urls while keeping surrounding caption text when enabled", () => {
expectParsedMediaOutputCase(
"Caption\n\n![chart](https://example.com/chart.png)",
{
text: "Caption",
mediaUrls: ["https://example.com/chart.png"],
},
extractMarkdownImages,
);
});
it("keeps inline caption text around markdown images when enabled", () => {
expectParsedMediaOutputCase(
"Look ![chart](https://example.com/chart.png) now",
{
text: "Look now",
mediaUrls: ["https://example.com/chart.png"],
},
extractMarkdownImages,
);
});
it("extracts multiple markdown image urls in order", () => {
@@ -147,6 +166,7 @@ describe("splitMediaFromOutput", () => {
text: "Before\nMiddle\nAfter",
mediaUrls: ["https://example.com/one.png", "https://example.com/two.png"],
},
extractMarkdownImages,
);
});
@@ -157,14 +177,19 @@ describe("splitMediaFromOutput", () => {
text: "Caption",
mediaUrls: ["https://example.com/chart.png"],
},
extractMarkdownImages,
);
});
it("keeps balanced parentheses inside markdown image urls", () => {
expectParsedMediaOutputCase("Chart ![img](https://example.com/a_(1).png) now", {
text: "Chart now",
mediaUrls: ["https://example.com/a_(1).png"],
});
expectParsedMediaOutputCase(
"Chart ![img](https://example.com/a_(1).png) now",
{
text: "Chart now",
mediaUrls: ["https://example.com/a_(1).png"],
},
extractMarkdownImages,
);
});
it.each([
@@ -174,27 +199,76 @@ describe("splitMediaFromOutput", () => {
"![x](http://example.com/a.png)",
"![x](https://127.0.0.1/a.png)",
] as const)("does not lift local markdown image target: %s", (input) => {
expectParsedMediaOutputCase(input, {
text: input,
mediaUrls: undefined,
});
expectParsedMediaOutputCase(
input,
{
text: input,
mediaUrls: undefined,
},
extractMarkdownImages,
);
});
it("does not lift markdown image urls that fail media validation", () => {
const longUrl = `![x](https://example.com/${"a".repeat(4097)}.png)`;
expectParsedMediaOutputCase(longUrl, {
text: longUrl,
mediaUrls: undefined,
});
expectParsedMediaOutputCase(
longUrl,
{
text: longUrl,
mediaUrls: undefined,
},
extractMarkdownImages,
);
});
it("leaves very long markdown-image candidate lines as text", () => {
const input = `${"prefix ".repeat(3000)}![x](https://example.com/image.png)`;
expectParsedMediaOutputCase(
input,
{
text: input,
mediaUrls: undefined,
},
extractMarkdownImages,
);
});
it.each([
"![Node.js](https://img.shields.io/badge/Node.js-339933?logo=node.js&logoColor=white)",
"![build](https://img.shields.io/github/actions/workflow/status/owner/repo/ci.yml)",
"![npm](https://badge.fury.io/js/some-package.svg)",
"![badgen](https://badgen.net/npm/v/some-package)",
"![CI](https://github.com/owner/repo/actions/workflows/ci.yml/badge.svg)",
"![flat-badge](https://flat.badgen.net/npm/v/some-package)",
] as const)("keeps markdown badge image as text by default: %s", (input) => {
expectParsedMediaOutputCase(input, {
text: input,
mediaUrls: undefined,
});
});
it("keeps surrounding text around inline badge images by default", () => {
expectParsedMediaOutputCase(
"tech: ![Node.js](https://img.shields.io/badge/Node.js-339933?logo=node.js&logoColor=white) stack",
{
text: "tech: ![Node.js](https://img.shields.io/badge/Node.js-339933?logo=node.js&logoColor=white) stack",
mediaUrls: undefined,
},
);
});
it("still extracts markdown images when explicitly enabled", () => {
expectParsedMediaOutputCase(
"![badge](https://img.shields.io/badge/status-passing-green)\n![photo](https://example.com/photo.png)",
{
mediaUrls: [
"https://img.shields.io/badge/status-passing-green",
"https://example.com/photo.png",
],
},
extractMarkdownImages,
);
});
});

View File

@@ -26,6 +26,10 @@ export type ParsedMediaOutputSegment =
url: string;
};
export type SplitMediaFromOutputOptions = {
extractMarkdownImages?: boolean;
};
export function normalizeMediaSource(src: string) {
return src.startsWith("file://") ? src.replace("file://", "") : src;
}
@@ -462,7 +466,10 @@ function isInsideFence(fenceSpans: Array<{ start: number; end: number }>, offset
return fenceSpans.some((span) => offset >= span.start && offset < span.end);
}
export function splitMediaFromOutput(raw: string): {
export function splitMediaFromOutput(
raw: string,
options: SplitMediaFromOutputOptions = {},
): {
text: string;
mediaUrls?: string[];
mediaUrl?: string; // legacy first item for backward compatibility
@@ -475,8 +482,9 @@ export function splitMediaFromOutput(raw: string): {
if (!trimmedRaw.trim()) {
return { text: "" };
}
const extractMarkdownImages = options.extractMarkdownImages === true;
const mayContainMediaToken = /media:/i.test(trimmedRaw);
const mayContainMarkdownImage = /!\[[^\]]*]\(/.test(trimmedRaw);
const mayContainMarkdownImage = extractMarkdownImages && /!\[[^\]]*]\(/.test(trimmedRaw);
const mayContainAudioTag = trimmedRaw.includes("[[");
if (!mayContainMediaToken && !mayContainMarkdownImage && !mayContainAudioTag) {
return { text: trimmedRaw };
@@ -518,7 +526,9 @@ export function splitMediaFromOutput(raw: string): {
const trimmedStart = line.trimStart();
if (!trimmedStart.toUpperCase().startsWith("MEDIA:")) {
const markdownImageResult = collectMarkdownImageSegments({ line, media });
const markdownImageResult = extractMarkdownImages
? collectMarkdownImageSegments({ line, media })
: { lineSegments: [], foundMedia: false };
if (!markdownImageResult.foundMedia) {
keptLines.push(line);
pushTextSegment(line);