fix(media): gate markdown image extraction by channel (#72718)

Closes #72642

Co-authored-by: Peter Steinberger <steipete@gmail.com>
This commit is contained in:
Bartok
2026-04-27 06:27:35 -04:00
committed by GitHub
parent 775ed36c16
commit f0b327cf68
15 changed files with 251 additions and 30 deletions

View File

@@ -97,6 +97,7 @@ Docs: https://docs.openclaw.ai
- Google Meet: route stateful `google_meet` tool actions through the gateway-owned runtime so created or joined realtime sessions remain visible to status, speak, and leave after the agent turn ends. Fixes #72440. (#72441) Thanks @BsnizND.
- Google Meet/Voice Call: send Gemini Live a non-blocking consult continuation before long OpenClaw agent consults finish, then deliver the final result when idle so calls and meetings do not sit silent during tool-backed answers. (#72189) Thanks @VACInc.
- Google Meet: preserve Gemini Live function names when replying to realtime tool calls so Google SDK validation accepts the `FunctionResponse` payload. Fixes #72425. (#72426) Thanks @BsnizND.
- Discord/media: keep incidental Markdown image badges in final replies as text unless a channel opts into Markdown-image media extraction, while preserving Telegram Markdown-image media replies and explicit `MEDIA:` attachments. Fixes #72642. Thanks @solavrc and @Bartok9.
- Matrix/E2EE: stabilize recovery and broken-device QA flows while avoiding Matrix device-cleanup sync races that could leave shutdown-time crypto work running. Thanks @gumadeiras.
- Cron: apply `cron.maxConcurrentRuns` to a dedicated `cron-nested` isolated agent-turn lane as well as cron dispatch, so parallel cron jobs no longer serialize on inner LLM execution while non-cron nested flows keep their existing lane behavior. Fixes #72707. Thanks @kagura-agent.
- Cron: report isolated runs as successful when verified cron delivery already delivered the reply, while keeping unresolved Message/Canvas tool failures fatal. Fixes #72732 and #50170; follow-up to #54188. Thanks @zNatix, @pixeldyn, and @ChickenEggRoll.

View File

@@ -17,6 +17,10 @@ Remote `MEDIA:` attachments must be public `https:` URLs. Plain `http:`,
loopback, link-local, private, and internal hostnames are ignored as attachment
directives; server-side media fetchers still enforce their own network guards.
Plain Markdown image syntax stays text by default. Channels that intentionally
map Markdown image replies to media attachments opt in at their outbound
adapter; Telegram does this so `![alt](url)` can still become a media reply.
These directives are separate. `MEDIA:` and reply/voice tags remain delivery metadata; `[embed ...]` is the web-only rich render path.
Trusted tool-result media uses the same `MEDIA:` / `[[audio_as_voice]]` parser before delivery, so text tool outputs can still mark an audio attachment as a voice note.

View File

@@ -121,6 +121,7 @@ export const telegramOutbound: ChannelOutboundAdapter = {
deliveryMode: "direct",
chunker: markdownToTelegramHtmlChunks,
chunkerMode: "markdown",
extractMarkdownImages: true,
textChunkLimit: TELEGRAM_TEXT_CHUNK_LIMIT,
sanitizeText: ({ text }) => sanitizeForPlainText(text),
shouldSkipPlainTextSanitization: ({ payload }) => Boolean(payload.channelData),

View File

@@ -4,6 +4,7 @@ export const telegramOutboundBaseAdapter = {
deliveryMode: "direct" as const,
chunker: chunkMarkdownText,
chunkerMode: "markdown" as const,
extractMarkdownImages: true,
textChunkLimit: 4000,
pollMaxOptions: 10,
};

View File

@@ -350,6 +350,7 @@ describe("buildReplyPayloads media filter integration", () => {
it("extracts markdown image replies into final payload media urls", async () => {
const { replyPayloads } = await buildReplyPayloads({
...baseParams,
extractMarkdownImages: true,
payloads: [{ text: "Here you go\n\n![chart](https://example.com/chart.png)" }],
});
@@ -364,6 +365,7 @@ describe("buildReplyPayloads media filter integration", () => {
it("preserves inline caption text when lifting markdown image replies into media", async () => {
const { replyPayloads } = await buildReplyPayloads({
...baseParams,
extractMarkdownImages: true,
payloads: [{ text: 'Look ![chart](https://example.com/chart.png "Quarterly chart") now' }],
});
@@ -379,6 +381,7 @@ describe("buildReplyPayloads media filter integration", () => {
const text = "Look ![chart](file:///etc/passwd) now";
const { replyPayloads } = await buildReplyPayloads({
...baseParams,
extractMarkdownImages: true,
payloads: [{ text }],
});

View File

@@ -107,6 +107,7 @@ export async function buildReplyPayloads(params: {
originatingChannel?: OriginatingChannelType;
originatingTo?: string;
accountId?: string;
extractMarkdownImages?: boolean;
normalizeMediaPaths?: (payload: ReplyPayload) => Promise<ReplyPayload>;
}): Promise<{ replyPayloads: ReplyPayload[]; didLogHeartbeatStrip: boolean }> {
let didLogHeartbeatStrip = params.didLogHeartbeatStrip;
@@ -148,6 +149,7 @@ export async function buildReplyPayloads(params: {
currentMessageId: params.currentMessageId,
silentToken: SILENT_REPLY_TOKEN,
parseMode: "always",
extractMarkdownImages: params.extractMarkdownImages,
});
const mediaNormalizedPayload = await normalizeReplyPayloadMedia({
payload: parsed.payload,

View File

@@ -17,6 +17,7 @@ export function normalizeReplyPayloadDirectives(params: {
silentToken?: string;
trimLeadingWhitespace?: boolean;
parseMode?: ReplyDirectiveParseMode;
extractMarkdownImages?: boolean;
}): { payload: ReplyPayload; isSilent: boolean } {
const parseMode = params.parseMode ?? "always";
const silentToken = params.silentToken ?? SILENT_REPLY_TOKEN;
@@ -27,12 +28,14 @@ export function normalizeReplyPayloadDirectives(params: {
(parseMode === "auto" &&
(sourceText.includes("[[") ||
/media:/i.test(sourceText) ||
(params.extractMarkdownImages === true && /!\[[^\]]*]\(/.test(sourceText)) ||
sourceText.includes(silentToken)));
const parsed = shouldParse
? parseReplyDirectives(sourceText, {
currentMessageId: params.currentMessageId,
silentToken,
extractMarkdownImages: params.extractMarkdownImages,
})
: undefined;

View File

@@ -13,11 +13,19 @@ export type ReplyDirectiveParseResult = {
isSilent: boolean;
};
export type ReplyDirectiveParseOptions = {
currentMessageId?: string;
silentToken?: string;
extractMarkdownImages?: boolean;
};
export function parseReplyDirectives(
raw: string,
options: { currentMessageId?: string; silentToken?: string } = {},
options: ReplyDirectiveParseOptions = {},
): ReplyDirectiveParseResult {
const split = splitMediaFromOutput(raw);
const split = splitMediaFromOutput(raw, {
extractMarkdownImages: options.extractMarkdownImages,
});
let text = split.text ?? "";
const replyParsed = parseInlineDirectives(text, {

View File

@@ -76,6 +76,8 @@ export type ChannelOutboundAdapter = {
deliveryMode: "direct" | "gateway" | "hybrid";
chunker?: ((text: string, limit: number, ctx?: ChannelOutboundChunkContext) => string[]) | null;
chunkerMode?: "text" | "markdown";
/** Lift remote Markdown image syntax in text into outbound media attachments. */
extractMarkdownImages?: boolean;
textChunkLimit?: number;
sanitizeText?: (params: { text: string; payload: ReplyPayload }) => string;
pollMaxOptions?: number;

View File

@@ -1235,6 +1235,54 @@ describe("deliverOutboundPayloads", () => {
);
});
it("keeps markdown images as text for channels that do not opt in", async () => {
const sendMatrix = vi.fn().mockResolvedValue({ messageId: "m-text", roomId: "!room" });
await deliverOutboundPayloads({
cfg: matrixChunkConfig,
channel: "matrix",
to: "!room:example",
payloads: [{ text: "Tech: ![Node.js](https://img.shields.io/badge/Node.js-339933)" }],
deps: { matrix: sendMatrix },
});
expect(sendMatrix).toHaveBeenCalledWith(
"!room:example",
"Tech: ![Node.js](https://img.shields.io/badge/Node.js-339933)",
expect.not.objectContaining({ mediaUrl: expect.any(String) }),
);
});
it("extracts markdown images for channels that opt in", async () => {
const sendMatrix = vi.fn().mockResolvedValue({ messageId: "m-media", roomId: "!room" });
setActivePluginRegistry(
createTestRegistry([
{
pluginId: "matrix",
source: "test",
plugin: createOutboundTestPlugin({
id: "matrix",
outbound: { ...matrixOutboundForTest, extractMarkdownImages: true },
}),
},
]),
);
await deliverOutboundPayloads({
cfg: matrixChunkConfig,
channel: "matrix",
to: "!room:example",
payloads: [{ text: "Chart ![chart](https://example.com/chart.png) now" }],
deps: { matrix: sendMatrix },
});
expect(sendMatrix).toHaveBeenCalledWith(
"!room:example",
"Chart now",
expect.objectContaining({ mediaUrl: "https://example.com/chart.png" }),
);
});
it("normalizes payloads and drops empty entries", () => {
const normalized = normalizeOutboundPayloads([
{ text: "hi" },

View File

@@ -147,6 +147,24 @@ type ChannelHandlerParams = {
};
// Channel docking: outbound delivery delegates to plugin.outbound adapters.
/**
 * Resolve directive-parsing options for an outbound channel from its outbound
 * adapter. If no adapter is registered yet, bootstraps the channel plugin and
 * retries the lookup once before answering.
 */
async function resolveChannelOutboundDirectiveOptions(params: {
  cfg: OpenClawConfig;
  channel: Exclude<OutboundChannel, "none">;
}): Promise<{ extractMarkdownImages?: boolean }> {
  const { cfg, channel } = params;
  let adapter = await loadChannelOutboundAdapter(channel);
  if (!adapter) {
    // Adapter not loaded yet — bootstrap the channel plugin, then look it up again.
    const runtime = await loadChannelBootstrapRuntime();
    runtime.bootstrapOutboundChannelPlugin({ channel, cfg });
    adapter = await loadChannelOutboundAdapter(channel);
  }
  // Surface only an explicit `true`; everything else collapses to undefined so
  // downstream option merging treats "not opted in" and "unknown" the same way.
  const optedIn = adapter?.extractMarkdownImages === true;
  return { extractMarkdownImages: optedIn ? true : undefined };
}
async function createChannelHandler(params: ChannelHandlerParams): Promise<ChannelHandler> {
let outbound = await loadChannelOutboundAdapter(params.channel);
if (!outbound) {
@@ -841,11 +859,13 @@ async function deliverOutboundPayloadsCore(
params: DeliverOutboundPayloadsCoreParams,
): Promise<OutboundDeliveryResult[]> {
const { cfg, channel, to, payloads } = params;
const directiveOptions = await resolveChannelOutboundDirectiveOptions({ cfg, channel });
const outboundPayloadPlan = createOutboundPayloadPlan(payloads, {
cfg,
sessionKey: params.session?.policyKey ?? params.session?.key,
surface: channel,
conversationType: params.session?.conversationType,
extractMarkdownImages: directiveOptions.extractMarkdownImages,
});
const accountId = params.accountId;
const deps = params.deps;

View File

@@ -642,6 +642,44 @@ describe("OutboundPayloadPlan projections", () => {
const plan = createOutboundPayloadPlan(matrix);
expect(projectOutboundPayloadPlanForMirror(plan)).toEqual(resolveMirrorProjection(matrix));
});
it("keeps markdown images as text unless extraction is enabled", () => {
const input = "Tech: ![Node.js](https://img.shields.io/badge/Node.js-339933)";
expect(
projectOutboundPayloadPlanForDelivery(createOutboundPayloadPlan([{ text: input }])),
).toEqual([
{
text: input,
mediaUrl: undefined,
mediaUrls: undefined,
replyToId: undefined,
replyToCurrent: undefined,
replyToTag: false,
audioAsVoice: false,
},
]);
});
it("extracts markdown images when the outbound channel opts in", () => {
const input = "Chart ![chart](https://example.com/chart.png) now";
expect(
projectOutboundPayloadPlanForDelivery(
createOutboundPayloadPlan([{ text: input }], { extractMarkdownImages: true }),
),
).toEqual([
{
text: "Chart now",
mediaUrl: "https://example.com/chart.png",
mediaUrls: ["https://example.com/chart.png"],
replyToId: undefined,
replyToCurrent: undefined,
replyToTag: false,
audioAsVoice: false,
},
]);
});
});
describe("formatOutboundPayloadLog", () => {

View File

@@ -67,6 +67,7 @@ type OutboundPayloadPlanContext = {
* (see `pending-spawn-query.ts`).
*/
hasPendingSpawnedChildren?: boolean;
extractMarkdownImages?: boolean;
};
export type OutboundPayloadMirror = {
@@ -131,11 +132,14 @@ type PreparedOutboundPayloadPlanEntry = {
function createOutboundPayloadPlanEntry(
payload: ReplyPayload,
context: Pick<OutboundPayloadPlanContext, "extractMarkdownImages"> = {},
): PreparedOutboundPayloadPlanEntry | null {
if (shouldSuppressReasoningPayload(payload)) {
return null;
}
const parsed = parseReplyDirectives(payload.text ?? "");
const parsed = parseReplyDirectives(payload.text ?? "", {
extractMarkdownImages: context.extractMarkdownImages,
});
const explicitMediaUrls = payload.mediaUrls ?? parsed.mediaUrls;
const explicitMediaUrl = payload.mediaUrl ?? parsed.mediaUrl;
const mergedMedia = mergeMediaUrls(
@@ -193,7 +197,9 @@ export function createOutboundPayloadPlan(
context.hasPendingSpawnedChildren ?? resolvePendingSpawnedChildren(context.sessionKey);
const prepared: PreparedOutboundPayloadPlanEntry[] = [];
for (const payload of payloads) {
const entry = createOutboundPayloadPlanEntry(payload);
const entry = createOutboundPayloadPlanEntry(payload, {
extractMarkdownImages: context.extractMarkdownImages,
});
if (!entry) {
continue;
}

View File

@@ -1,5 +1,5 @@
import { describe, expect, it } from "vitest";
import { splitMediaFromOutput } from "./parse.js";
import { splitMediaFromOutput, type SplitMediaFromOutputOptions } from "./parse.js";
describe("splitMediaFromOutput", () => {
function expectParsedMediaOutputCase(
@@ -9,8 +9,9 @@ describe("splitMediaFromOutput", () => {
text?: string;
audioAsVoice?: boolean;
},
options?: SplitMediaFromOutputOptions,
) {
const result = splitMediaFromOutput(input);
const result = splitMediaFromOutput(input, options);
expect(result.text).toBe(expected.text ?? "");
if ("audioAsVoice" in expected) {
expect(result.audioAsVoice).toBe(expected.audioAsVoice);
@@ -126,18 +127,36 @@ describe("splitMediaFromOutput", () => {
]);
});
it("extracts markdown image urls while keeping surrounding caption text", () => {
expectParsedMediaOutputCase("Caption\n\n![chart](https://example.com/chart.png)", {
text: "Caption",
mediaUrls: ["https://example.com/chart.png"],
const extractMarkdownImages = { extractMarkdownImages: true } as const;
it("keeps markdown image urls as text by default", () => {
const input = "Caption\n\n![chart](https://example.com/chart.png)";
expectParsedMediaOutputCase(input, {
text: input,
mediaUrls: undefined,
});
});
it("keeps inline caption text around markdown images", () => {
expectParsedMediaOutputCase("Look ![chart](https://example.com/chart.png) now", {
text: "Look now",
mediaUrls: ["https://example.com/chart.png"],
});
it("extracts markdown image urls while keeping surrounding caption text when enabled", () => {
expectParsedMediaOutputCase(
"Caption\n\n![chart](https://example.com/chart.png)",
{
text: "Caption",
mediaUrls: ["https://example.com/chart.png"],
},
extractMarkdownImages,
);
});
it("keeps inline caption text around markdown images when enabled", () => {
expectParsedMediaOutputCase(
"Look ![chart](https://example.com/chart.png) now",
{
text: "Look now",
mediaUrls: ["https://example.com/chart.png"],
},
extractMarkdownImages,
);
});
it("extracts multiple markdown image urls in order", () => {
@@ -147,6 +166,7 @@ describe("splitMediaFromOutput", () => {
text: "Before\nMiddle\nAfter",
mediaUrls: ["https://example.com/one.png", "https://example.com/two.png"],
},
extractMarkdownImages,
);
});
@@ -157,14 +177,19 @@ describe("splitMediaFromOutput", () => {
text: "Caption",
mediaUrls: ["https://example.com/chart.png"],
},
extractMarkdownImages,
);
});
it("keeps balanced parentheses inside markdown image urls", () => {
expectParsedMediaOutputCase("Chart ![img](https://example.com/a_(1).png) now", {
text: "Chart now",
mediaUrls: ["https://example.com/a_(1).png"],
});
expectParsedMediaOutputCase(
"Chart ![img](https://example.com/a_(1).png) now",
{
text: "Chart now",
mediaUrls: ["https://example.com/a_(1).png"],
},
extractMarkdownImages,
);
});
it.each([
@@ -174,27 +199,76 @@ describe("splitMediaFromOutput", () => {
"![x](http://example.com/a.png)",
"![x](https://127.0.0.1/a.png)",
] as const)("does not lift local markdown image target: %s", (input) => {
expectParsedMediaOutputCase(input, {
text: input,
mediaUrls: undefined,
});
expectParsedMediaOutputCase(
input,
{
text: input,
mediaUrls: undefined,
},
extractMarkdownImages,
);
});
it("does not lift markdown image urls that fail media validation", () => {
const longUrl = `![x](https://example.com/${"a".repeat(4097)}.png)`;
expectParsedMediaOutputCase(longUrl, {
text: longUrl,
mediaUrls: undefined,
});
expectParsedMediaOutputCase(
longUrl,
{
text: longUrl,
mediaUrls: undefined,
},
extractMarkdownImages,
);
});
it("leaves very long markdown-image candidate lines as text", () => {
const input = `${"prefix ".repeat(3000)}![x](https://example.com/image.png)`;
expectParsedMediaOutputCase(
input,
{
text: input,
mediaUrls: undefined,
},
extractMarkdownImages,
);
});
it.each([
"![Node.js](https://img.shields.io/badge/Node.js-339933?logo=node.js&logoColor=white)",
"![build](https://img.shields.io/github/actions/workflow/status/owner/repo/ci.yml)",
"![npm](https://badge.fury.io/js/some-package.svg)",
"![badgen](https://badgen.net/npm/v/some-package)",
"![CI](https://github.com/owner/repo/actions/workflows/ci.yml/badge.svg)",
"![flat-badge](https://flat.badgen.net/npm/v/some-package)",
] as const)("keeps markdown badge image as text by default: %s", (input) => {
expectParsedMediaOutputCase(input, {
text: input,
mediaUrls: undefined,
});
});
it("keeps surrounding text around inline badge images by default", () => {
expectParsedMediaOutputCase(
"tech: ![Node.js](https://img.shields.io/badge/Node.js-339933?logo=node.js&logoColor=white) stack",
{
text: "tech: ![Node.js](https://img.shields.io/badge/Node.js-339933?logo=node.js&logoColor=white) stack",
mediaUrls: undefined,
},
);
});
it("still extracts markdown images when explicitly enabled", () => {
expectParsedMediaOutputCase(
"![badge](https://img.shields.io/badge/status-passing-green)\n![photo](https://example.com/photo.png)",
{
mediaUrls: [
"https://img.shields.io/badge/status-passing-green",
"https://example.com/photo.png",
],
},
extractMarkdownImages,
);
});
});

View File

@@ -26,6 +26,10 @@ export type ParsedMediaOutputSegment =
url: string;
};
export type SplitMediaFromOutputOptions = {
extractMarkdownImages?: boolean;
};
export function normalizeMediaSource(src: string) {
return src.startsWith("file://") ? src.replace("file://", "") : src;
}
@@ -462,7 +466,10 @@ function isInsideFence(fenceSpans: Array<{ start: number; end: number }>, offset
return fenceSpans.some((span) => offset >= span.start && offset < span.end);
}
export function splitMediaFromOutput(raw: string): {
export function splitMediaFromOutput(
raw: string,
options: SplitMediaFromOutputOptions = {},
): {
text: string;
mediaUrls?: string[];
mediaUrl?: string; // legacy first item for backward compatibility
@@ -475,8 +482,9 @@ export function splitMediaFromOutput(raw: string): {
if (!trimmedRaw.trim()) {
return { text: "" };
}
const extractMarkdownImages = options.extractMarkdownImages === true;
const mayContainMediaToken = /media:/i.test(trimmedRaw);
const mayContainMarkdownImage = /!\[[^\]]*]\(/.test(trimmedRaw);
const mayContainMarkdownImage = extractMarkdownImages && /!\[[^\]]*]\(/.test(trimmedRaw);
const mayContainAudioTag = trimmedRaw.includes("[[");
if (!mayContainMediaToken && !mayContainMarkdownImage && !mayContainAudioTag) {
return { text: trimmedRaw };
@@ -518,7 +526,9 @@ export function splitMediaFromOutput(raw: string): {
const trimmedStart = line.trimStart();
if (!trimmedStart.toUpperCase().startsWith("MEDIA:")) {
const markdownImageResult = collectMarkdownImageSegments({ line, media });
const markdownImageResult = extractMarkdownImages
? collectMarkdownImageSegments({ line, media })
: { lineSegments: [], foundMedia: false };
if (!markdownImageResult.foundMedia) {
keptLines.push(line);
pushTextSegment(line);