fix(pair): render /pair qr as media (#70047)

* fix(pair): render pair qr as media

* fix(gateway): preserve media reply threading

* fix(gateway): harden webchat media replies

* fix(plugin-sdk): keep trustedLocalMedia internal

* docs(changelog): note pair qr media fix

* Update CHANGELOG with recent fixes and enhancements

Updated changelog to include recent fixes and enhancements.
This commit is contained in:
Val Alexander
2026-04-22 03:31:09 -05:00
committed by GitHub
parent 81ca7bc40b
commit 43a941b51c
43 changed files with 678 additions and 87 deletions

View File

@@ -18,6 +18,7 @@ Docs: https://docs.openclaw.ai
### Fixes
- Gateway/pairing webchat: render `/pair qr` replies as structured media instead of raw markdown text, preserve inline reply threading and silent-control handling on media replies, avoid persisting sensitive QR images into transcript history, and keep local webchat media embedding behind internal-only trust markers. (#70047) Thanks @BunsDev.
- OpenAI/Responses: keep embedded OpenAI Responses runs on HTTP when `models.providers.openai.baseUrl` points at a local mock or other non-public endpoint, so mocked/custom endpoints no longer drift onto the hardcoded public websocket transport. (#69815) Thanks @vincentkoc.
- Channels/config: require resolved runtime config on channel send/action/client helpers and block runtime helper `loadConfig()` calls, so SecretRefs are resolved at startup/boundaries instead of being re-read during sends.
- CLI/channels: preserve bundled setup promotion metadata when a loaded partial channel plugin omits it, so adding a non-default account still moves legacy single-account fields such as Telegram `streaming` into `accounts.default`.

View File

@@ -1,2 +1,2 @@
55b39075f07def786f5056b029921db64fcbdc5e2cab3d645215eccc857ba9a4 plugin-sdk-api-baseline.json
4a6b8f4afc9e6aa7c56b0cbab0886dacc4ead534c47761ab30eb76480d8fd673 plugin-sdk-api-baseline.jsonl
ba9b9d9b321b405fef89d4e95c1a3d629d1b956398a5b2a7f25b2a7654879783 plugin-sdk-api-baseline.json
8bbbee0ea2326148d4fd49f61fe74f83c5bb24c0742cfbf3609f43939fcd4c34 plugin-sdk-api-baseline.jsonl

View File

@@ -251,6 +251,7 @@ describe("device-pair /pair qr", () => {
gatewayClientScopes: ["operator.write", "operator.pairing"],
}),
);
const payload = result as { text?: string; mediaUrl?: string; sensitiveMedia?: boolean };
const text = requireText(result);
expect(pluginApiMocks.renderQrPngBase64).toHaveBeenCalledTimes(1);
@@ -261,11 +262,12 @@ describe("device-pair /pair qr", () => {
},
});
expect(text).toContain("Scan this QR code with the OpenClaw iOS app:");
expect(text).toContain("![OpenClaw pairing QR](data:image/png;base64,ZmFrZXBuZw==)");
expect(payload.mediaUrl).toBe("data:image/png;base64,ZmFrZXBuZw==");
expect(payload.sensitiveMedia).toBe(true);
expect(text).toContain("- Security: single-use bootstrap token");
expect(text).toContain("**Important:** Run `/pair cleanup` after pairing finishes.");
expect(text).toContain("If this QR code leaks, run `/pair cleanup` immediately.");
expect(text).not.toContain("```");
expect(text).not.toContain("![OpenClaw pairing QR]");
});
it("rejects qr setup for internal gateway callers without operator.pairing", async () => {

View File

@@ -732,9 +732,9 @@ export default definePluginEntry({
autoNotifyArmed,
expiresAtMs: payload.expiresAtMs,
}),
"",
`![OpenClaw pairing QR](${qrDataUrl})`,
].join("\n"),
mediaUrl: qrDataUrl,
sensitiveMedia: true,
};
}

View File

@@ -2,6 +2,8 @@ export type BlockReplyPayload = {
text?: string;
mediaUrls?: string[];
audioAsVoice?: boolean;
trustedLocalMedia?: boolean;
sensitiveMedia?: boolean;
isReasoning?: boolean;
replyToId?: string;
replyToTag?: boolean;

View File

@@ -251,6 +251,7 @@ describe("consumePendingToolMediaIntoReply", () => {
const state = {
pendingToolMediaUrls: ["/tmp/a.png", "/tmp/b.png"],
pendingToolAudioAsVoice: false,
pendingToolTrustedLocalMedia: false,
};
expect(
@@ -269,6 +270,7 @@ describe("consumePendingToolMediaIntoReply", () => {
const state = {
pendingToolMediaUrls: ["/tmp/a.png"],
pendingToolAudioAsVoice: true,
pendingToolTrustedLocalMedia: false,
};
expect(
@@ -290,6 +292,7 @@ describe("consumePendingToolMediaReply", () => {
const state = {
pendingToolMediaUrls: ["/tmp/reply.opus"],
pendingToolAudioAsVoice: true,
pendingToolTrustedLocalMedia: false,
};
expect(consumePendingToolMediaReply(state)).toEqual({

View File

@@ -178,20 +178,31 @@ export function resolveSilentReplyFallbackText(params: {
}
function clearPendingToolMedia(
state: Pick<EmbeddedPiSubscribeState, "pendingToolMediaUrls" | "pendingToolAudioAsVoice">,
state: Pick<
EmbeddedPiSubscribeState,
"pendingToolMediaUrls" | "pendingToolAudioAsVoice" | "pendingToolTrustedLocalMedia"
>,
) {
state.pendingToolMediaUrls = [];
state.pendingToolAudioAsVoice = false;
state.pendingToolTrustedLocalMedia = false;
}
export function consumePendingToolMediaIntoReply(
state: Pick<EmbeddedPiSubscribeState, "pendingToolMediaUrls" | "pendingToolAudioAsVoice">,
state: Pick<
EmbeddedPiSubscribeState,
"pendingToolMediaUrls" | "pendingToolAudioAsVoice" | "pendingToolTrustedLocalMedia"
>,
payload: BlockReplyPayload,
): BlockReplyPayload {
if (payload.isReasoning) {
return payload;
}
if (state.pendingToolMediaUrls.length === 0 && !state.pendingToolAudioAsVoice) {
if (
state.pendingToolMediaUrls.length === 0 &&
!state.pendingToolAudioAsVoice &&
!state.pendingToolTrustedLocalMedia
) {
return payload;
}
const mergedMediaUrls = Array.from(
@@ -201,15 +212,24 @@ export function consumePendingToolMediaIntoReply(
...payload,
mediaUrls: mergedMediaUrls.length ? mergedMediaUrls : undefined,
audioAsVoice: payload.audioAsVoice || state.pendingToolAudioAsVoice || undefined,
trustedLocalMedia:
payload.trustedLocalMedia || state.pendingToolTrustedLocalMedia || undefined,
};
clearPendingToolMedia(state);
return mergedPayload;
}
export function consumePendingToolMediaReply(
state: Pick<EmbeddedPiSubscribeState, "pendingToolMediaUrls" | "pendingToolAudioAsVoice">,
state: Pick<
EmbeddedPiSubscribeState,
"pendingToolMediaUrls" | "pendingToolAudioAsVoice" | "pendingToolTrustedLocalMedia"
>,
): BlockReplyPayload | null {
if (state.pendingToolMediaUrls.length === 0 && !state.pendingToolAudioAsVoice) {
if (
state.pendingToolMediaUrls.length === 0 &&
!state.pendingToolAudioAsVoice &&
!state.pendingToolTrustedLocalMedia
) {
return null;
}
const payload: BlockReplyPayload = {
@@ -217,6 +237,7 @@ export function consumePendingToolMediaReply(
? Array.from(new Set(state.pendingToolMediaUrls))
: undefined,
audioAsVoice: state.pendingToolAudioAsVoice || undefined,
trustedLocalMedia: state.pendingToolTrustedLocalMedia || undefined,
};
clearPendingToolMedia(state);
return payload;

View File

@@ -47,6 +47,7 @@ function createTestContext(): {
pendingMessagingMediaUrls: new Map<string, string[]>(),
pendingToolMediaUrls: [],
pendingToolAudioAsVoice: false,
pendingToolTrustedLocalMedia: false,
deterministicApprovalPromptPending: false,
replayState: { replayInvalid: false, hadPotentialSideEffects: false },
messagingToolSentTexts: [],

View File

@@ -293,7 +293,7 @@ function collectMessagingMediaUrlsFromToolResult(result: unknown): string[] {
function queuePendingToolMedia(
ctx: ToolHandlerContext,
mediaReply: { mediaUrls: string[]; audioAsVoice?: boolean },
mediaReply: { mediaUrls: string[]; audioAsVoice?: boolean; trustedLocalMedia?: boolean },
) {
const seen = new Set(ctx.state.pendingToolMediaUrls);
for (const mediaUrl of mediaReply.mediaUrls) {
@@ -306,6 +306,9 @@ function queuePendingToolMedia(
if (mediaReply.audioAsVoice) {
ctx.state.pendingToolAudioAsVoice = true;
}
if (mediaReply.trustedLocalMedia) {
ctx.state.pendingToolTrustedLocalMedia = true;
}
}
async function collectEmittedToolOutputMediaUrls(

View File

@@ -81,6 +81,7 @@ export type EmbeddedPiSubscribeState = {
pendingMessagingMediaUrls: Map<string, string[]>;
pendingToolMediaUrls: string[];
pendingToolAudioAsVoice: boolean;
pendingToolTrustedLocalMedia: boolean;
deterministicApprovalPromptPending: boolean;
deterministicApprovalPromptSent: boolean;
lastAssistant?: AgentMessage;
@@ -165,6 +166,7 @@ export type ToolHandlerState = Pick<
| "pendingMessagingMediaUrls"
| "pendingToolMediaUrls"
| "pendingToolAudioAsVoice"
| "pendingToolTrustedLocalMedia"
| "deterministicApprovalPromptPending"
| "replayState"
| "messagingToolSentTexts"

View File

@@ -51,6 +51,22 @@ describe("extractToolResultMediaPaths", () => {
});
});
it("extracts structured media trust markers", () => {
expect(
extractToolResultMediaArtifact({
details: {
media: {
mediaUrl: "/tmp/reply.opus",
trustedLocalMedia: true,
},
},
}),
).toEqual({
mediaUrls: ["/tmp/reply.opus"],
trustedLocalMedia: true,
});
});
it("extracts MEDIA: path from text content block", () => {
const result = {
content: [

View File

@@ -249,6 +249,7 @@ export function filterToolResultMediaUrls(
export type ToolResultMediaArtifact = {
mediaUrls: string[];
audioAsVoice?: boolean;
trustedLocalMedia?: boolean;
};
function readToolResultDetailsMedia(
@@ -292,6 +293,7 @@ export function extractToolResultMediaArtifact(
return {
mediaUrls,
...(detailsMedia.audioAsVoice === true ? { audioAsVoice: true } : {}),
...(detailsMedia.trustedLocalMedia === true ? { trustedLocalMedia: true } : {}),
};
}
}

View File

@@ -123,6 +123,7 @@ export function subscribeEmbeddedPiSession(params: SubscribeEmbeddedPiSessionPar
pendingMessagingMediaUrls: new Map(),
pendingToolMediaUrls: initialPendingToolMediaUrls,
pendingToolAudioAsVoice: false,
pendingToolTrustedLocalMedia: false,
deterministicApprovalPromptPending: false,
deterministicApprovalPromptSent: false,
};

View File

@@ -15,6 +15,7 @@ export function createBaseToolHandlerState() {
pendingMessagingMediaUrls: new Map<string, string[]>(),
pendingToolMediaUrls: [] as string[],
pendingToolAudioAsVoice: false,
pendingToolTrustedLocalMedia: false,
deterministicApprovalPromptPending: false,
messagingToolSentTexts: [] as string[],
messagingToolSentTextsNormalized: [] as string[],

View File

@@ -35,6 +35,7 @@ describe("createTtsTool", () => {
provider: "test",
media: {
mediaUrl: "/tmp/reply.opus",
trustedLocalMedia: true,
audioAsVoice: true,
},
},

View File

@@ -43,6 +43,7 @@ export function createTtsTool(opts?: {
provider: result.provider,
media: {
mediaUrl: result.audioPath,
trustedLocalMedia: true,
...(result.voiceCompatible ? { audioAsVoice: true } : {}),
},
},

View File

@@ -8,6 +8,10 @@ export type ReplyPayload = {
text?: string;
mediaUrl?: string;
mediaUrls?: string[];
/** Internal-only trust signal for gateway webchat local media embedding. */
trustedLocalMedia?: boolean;
/** Treat media as live-only content and avoid persisting the underlying media reference. */
sensitiveMedia?: boolean;
/** Channel-agnostic rich presentation. Core degrades or asks the channel renderer to map it. */
presentation?: MessagePresentation;
/** Channel-agnostic delivery preferences, e.g. pin the sent message when supported. */

View File

@@ -167,6 +167,7 @@ export const handleTtsCommands: CommandHandler = async (params, allowTextCommand
const payload: ReplyPayload = {
mediaUrl: result.audioPath,
audioAsVoice: result.voiceCompatible === true,
trustedLocalMedia: true,
};
return { shouldContinue: false, reply: payload };
}

View File

@@ -4,7 +4,10 @@ import path from "node:path";
import { pathToFileURL } from "node:url";
import { afterEach, describe, expect, it, vi } from "vitest";
import { getDefaultLocalRoots } from "../../media/local-media-access.js";
import { buildWebchatAudioContentBlocksFromReplyPayloads } from "./chat-webchat-media.js";
import {
buildWebchatAssistantMessageFromReplyPayloads,
buildWebchatAudioContentBlocksFromReplyPayloads,
} from "./chat-webchat-media.js";
describe("buildWebchatAudioContentBlocksFromReplyPayloads", () => {
let tmpDir: string | undefined;
@@ -22,7 +25,7 @@ describe("buildWebchatAudioContentBlocksFromReplyPayloads", () => {
fs.writeFileSync(audioPath, Buffer.from([0xff, 0xfb, 0x90, 0x00]));
const blocks = await buildWebchatAudioContentBlocksFromReplyPayloads(
[{ mediaUrl: audioPath }],
[{ mediaUrl: audioPath, trustedLocalMedia: true }],
{ localRoots: [tmpDir] },
);
@@ -42,7 +45,7 @@ describe("buildWebchatAudioContentBlocksFromReplyPayloads", () => {
it("skips remote URLs", async () => {
const blocks = await buildWebchatAudioContentBlocksFromReplyPayloads([
{ mediaUrl: "https://example.com/a.mp3" },
{ mediaUrl: "https://example.com/a.mp3", trustedLocalMedia: true },
]);
expect(blocks).toHaveLength(0);
});
@@ -53,7 +56,7 @@ describe("buildWebchatAudioContentBlocksFromReplyPayloads", () => {
fs.writeFileSync(imagePath, Buffer.from([0x89, 0x50, 0x4e, 0x47]));
const blocks = await buildWebchatAudioContentBlocksFromReplyPayloads(
[{ mediaUrl: imagePath }],
[{ mediaUrl: imagePath, trustedLocalMedia: true }],
{ localRoots: [tmpDir] },
);
@@ -66,7 +69,10 @@ describe("buildWebchatAudioContentBlocksFromReplyPayloads", () => {
fs.writeFileSync(audioPath, Buffer.from([0x00]));
const blocks = await buildWebchatAudioContentBlocksFromReplyPayloads(
[{ mediaUrl: audioPath }, { mediaUrl: audioPath }],
[
{ mediaUrl: audioPath, trustedLocalMedia: true },
{ mediaUrl: audioPath, trustedLocalMedia: true },
],
{ localRoots: [tmpDir] },
);
expect(blocks).toHaveLength(1);
@@ -78,9 +84,12 @@ describe("buildWebchatAudioContentBlocksFromReplyPayloads", () => {
fs.writeFileSync(audioPath, Buffer.from([0x01]));
const fileUrl = pathToFileURL(audioPath).href;
const blocks = await buildWebchatAudioContentBlocksFromReplyPayloads([{ mediaUrl: fileUrl }], {
localRoots: [tmpDir],
});
const blocks = await buildWebchatAudioContentBlocksFromReplyPayloads(
[{ mediaUrl: fileUrl, trustedLocalMedia: true }],
{
localRoots: [tmpDir],
},
);
expect(blocks).toHaveLength(1);
expect((blocks[0] as { type?: string }).type).toBe("audio");
@@ -94,6 +103,7 @@ describe("buildWebchatAudioContentBlocksFromReplyPayloads", () => {
{
text: "MEDIA:file://attacker/share/probe.mp3",
mediaUrl: "file://attacker/share/probe.mp3",
trustedLocalMedia: true,
},
]);
@@ -116,7 +126,7 @@ describe("buildWebchatAudioContentBlocksFromReplyPayloads", () => {
const onLocalAudioAccessDenied = vi.fn();
const blocks = await buildWebchatAudioContentBlocksFromReplyPayloads(
[{ mediaUrl: audioPath }],
[{ mediaUrl: audioPath, trustedLocalMedia: true }],
{
localRoots: [allowedRoot],
onLocalAudioAccessDenied,
@@ -136,7 +146,9 @@ describe("buildWebchatAudioContentBlocksFromReplyPayloads", () => {
const audioPath = path.join(tmpDir, "clip.mp3");
fs.writeFileSync(audioPath, Buffer.from([0x04]));
const blocks = await buildWebchatAudioContentBlocksFromReplyPayloads([{ mediaUrl: audioPath }]);
const blocks = await buildWebchatAudioContentBlocksFromReplyPayloads([
{ mediaUrl: audioPath, trustedLocalMedia: true },
]);
expect(blocks).toHaveLength(1);
expect((blocks[0] as { type?: string }).type).toBe("audio");
@@ -157,7 +169,7 @@ describe("buildWebchatAudioContentBlocksFromReplyPayloads", () => {
const readSpy = vi.spyOn(fs, "readFileSync");
const blocks = await buildWebchatAudioContentBlocksFromReplyPayloads(
[{ mediaUrl: audioPath }],
[{ mediaUrl: audioPath, trustedLocalMedia: true }],
{ localRoots: [tmpDir] },
);
@@ -167,4 +179,121 @@ describe("buildWebchatAudioContentBlocksFromReplyPayloads", () => {
statSpy.mockRestore();
readSpy.mockRestore();
});
it("rejects untrusted local audio paths", async () => {
tmpDir = fs.mkdtempSync(path.join(os.tmpdir(), "openclaw-webchat-audio-"));
const audioPath = path.join(tmpDir, "clip.mp3");
fs.writeFileSync(audioPath, Buffer.from([0xff, 0xfb, 0x90, 0x00]));
const blocks = await buildWebchatAudioContentBlocksFromReplyPayloads(
[{ mediaUrl: audioPath }],
{ localRoots: [tmpDir] },
);
expect(blocks).toHaveLength(0);
});
});
// Suite for the new assistant-message builder: verifies that image data URLs become
// structured webchat blocks, that control tokens / reply directives are handled, and
// that unsafe or oversized media is rejected (builder returns null in those cases).
describe("buildWebchatAssistantMessageFromReplyPayloads", () => {
  it("converts image data URLs into webchat image blocks", async () => {
    const message = await buildWebchatAssistantMessageFromReplyPayloads([
      {
        text: "Scan this QR code with the OpenClaw iOS app:",
        mediaUrl: "data:image/png;base64,cG5n",
      },
    ]);

    // Text block first, then the embedded image; transcript mirrors the visible text.
    expect(message).toEqual({
      transcriptText: "Scan this QR code with the OpenClaw iOS app:",
      content: [
        { type: "text", text: "Scan this QR code with the OpenClaw iOS app:" },
        { type: "input_image", image_url: "data:image/png;base64,cG5n" },
      ],
    });
  });

  it("suppresses control tokens and falls back to synthetic image text", async () => {
    // NO_REPLY is a silent-control token; the builder must not surface it as text.
    const message = await buildWebchatAssistantMessageFromReplyPayloads([
      {
        text: "NO_REPLY",
        mediaUrl: "data:image/png;base64,cG5n",
      },
    ]);

    expect(message).toEqual({
      transcriptText: "Image reply",
      content: [
        { type: "text", text: "Image reply" },
        { type: "input_image", image_url: "data:image/png;base64,cG5n" },
      ],
    });
  });

  it("preserves reply directives in transcript text for media replies", async () => {
    // Media-only payload with a reply directive: the directive must survive into the
    // transcript so threading is preserved.
    const message = await buildWebchatAssistantMessageFromReplyPayloads([
      {
        replyToCurrent: true,
        mediaUrl: "data:image/png;base64,cG5n",
      },
    ]);

    expect(message).toEqual({
      transcriptText: "[[reply_to_current]]Image reply",
      content: [
        { type: "text", text: "[[reply_to_current]]Image reply" },
        { type: "input_image", image_url: "data:image/png;base64,cG5n" },
      ],
    });
  });

  it("drops oversized data image URLs", async () => {
    // 2.1M chars exceeds the data-URL character cap, so no media survives and the
    // builder returns null (no embeddable content).
    const hugeBase64 = "A".repeat(2_100_000);
    const message = await buildWebchatAssistantMessageFromReplyPayloads([
      {
        text: "too large",
        mediaUrl: `data:image/png;base64,${hugeBase64}`,
      },
    ]);

    expect(message).toBeNull();
  });

  it("rejects remote image URLs", async () => {
    // Only data: URLs are embeddable; remote http(s) URLs must be refused.
    const message = await buildWebchatAssistantMessageFromReplyPayloads([
      {
        text: "remote",
        mediaUrl: "https://example.com/final.png",
      },
    ]);

    expect(message).toBeNull();
  });

  it("rejects svg data URLs", async () => {
    // SVG can carry scripts, so image/svg+xml is excluded from the allow-list.
    const message = await buildWebchatAssistantMessageFromReplyPayloads([
      {
        text: "svg",
        mediaUrl: "data:image/svg+xml;base64,PHN2Zy8+",
      },
    ]);

    expect(message).toBeNull();
  });

  it("sanitizes reply ids before embedding directive prefixes", async () => {
    // A hostile replyToId trying to smuggle "]]" / extra directives must be stripped
    // before it is interpolated into the [[reply_to:…]] prefix.
    const message = await buildWebchatAssistantMessageFromReplyPayloads([
      {
        replyToId: "abc]]\n[[audio_as_voice]]",
        mediaUrl: "data:image/png;base64,cG5n",
      },
    ]);

    expect(message).toEqual({
      transcriptText: "[[reply_to:abcaudio_as_voice]]Image reply",
      content: [
        { type: "text", text: "[[reply_to:abcaudio_as_voice]]Image reply" },
        { type: "input_image", image_url: "data:image/png;base64,cG5n" },
      ],
    });
  });
});

View File

@@ -6,9 +6,22 @@ import { assertLocalMediaAllowed, LocalMediaAccessError } from "../../media/loca
import { isAudioFileName } from "../../media/mime.js";
import { resolveSendableOutboundReplyParts } from "../../plugin-sdk/reply-payload.js";
import { normalizeLowercaseStringOrEmpty } from "../../shared/string-coerce.js";
import { sanitizeReplyDirectiveId } from "../../utils/directive-tags.js";
import { isSuppressedControlReplyText } from "../control-reply-text.js";
/** Cap embedded audio size to avoid multiMB payloads on the chat WebSocket. */
const MAX_WEBCHAT_AUDIO_BYTES = 15 * 1024 * 1024;
const MAX_WEBCHAT_IMAGE_DATA_URL_CHARS = 2_000_000;
const MAX_WEBCHAT_IMAGE_DATA_BYTES = 1_500_000;
const ALLOWED_WEBCHAT_DATA_IMAGE_MEDIA_TYPES = new Set([
"image/apng",
"image/avif",
"image/bmp",
"image/gif",
"image/jpeg",
"image/png",
"image/webp",
]);
const MIME_BY_EXT: Record<string, string> = {
".aac": "audio/aac",
@@ -26,6 +39,8 @@ type WebchatAudioEmbeddingOptions = {
onLocalAudioAccessDenied?: (err: LocalMediaAccessError) => void;
};
type WebchatAssistantMediaOptions = WebchatAudioEmbeddingOptions;
/** Map `mediaUrl` strings to an absolute filesystem path for local embedding (plain paths or `file:` URLs). */
function resolveLocalMediaPathForEmbedding(raw: string): string | null {
const trimmed = raw.trim();
@@ -62,9 +77,13 @@ function resolveLocalMediaPathForEmbedding(raw: string): string | null {
/** Returns a readable local file path when it is a regular file and within the size cap (single stat before read). */
async function resolveLocalAudioFileForEmbedding(
payload: ReplyPayload,
raw: string,
options: WebchatAudioEmbeddingOptions | undefined,
): Promise<string | null> {
if (payload.trustedLocalMedia !== true) {
return null;
}
const resolved = resolveLocalMediaPathForEmbedding(raw);
if (!resolved) {
return null;
@@ -92,6 +111,47 @@ function mimeTypeForPath(filePath: string): string {
return MIME_BY_EXT[ext] ?? "audio/mpeg";
}
/**
 * Estimate the decoded byte length of a base64 string without decoding it.
 * Whitespace is ignored (the embeddable-image regex permits it inside the
 * payload) and trailing `=` padding is subtracted from the 4→3 expansion.
 */
function estimateBase64DecodedBytes(base64: string): number {
  const compact = base64.replace(/\s+/g, "");
  let padding = 0;
  if (compact.endsWith("==")) {
    padding = 2;
  } else if (compact.endsWith("=")) {
    padding = 1;
  }
  return Math.floor((compact.length * 3) / 4) - padding;
}
/**
 * Validate a candidate media URL for inline webchat embedding.
 * Accepts only base64 `data:image/*` URLs whose media type is on the
 * allow-list and whose size fits both the character and decoded-byte caps;
 * returns the trimmed URL on success, otherwise null.
 */
function resolveEmbeddableImageUrl(url: string): string | null {
  const candidate = url.trim();
  if (!candidate) {
    return null;
  }
  // Cheap length gate before running the regex over a potentially huge string.
  if (candidate.length > MAX_WEBCHAT_IMAGE_DATA_URL_CHARS) {
    return null;
  }
  const parsed = /^data:(image\/[a-z0-9.+-]+);base64,([a-z0-9+/=\s]+)$/i.exec(candidate);
  if (!parsed) {
    return null;
  }
  const mediaType = normalizeLowercaseStringOrEmpty(parsed[1]);
  const base64Payload = parsed[2];
  // Allow-list excludes scriptable formats such as SVG.
  if (!ALLOWED_WEBCHAT_DATA_IMAGE_MEDIA_TYPES.has(mediaType)) {
    return null;
  }
  if (estimateBase64DecodedBytes(base64Payload) > MAX_WEBCHAT_IMAGE_DATA_BYTES) {
    return null;
  }
  return candidate;
}
/**
 * Build the inline reply-threading directive for a payload.
 * A sanitized explicit reply id wins over the "reply to current" flag;
 * returns the empty string when the payload carries neither.
 */
function resolveReplyDirectivePrefix(payload: ReplyPayload): string {
  // Sanitization strips characters that could break out of the directive markup.
  const sanitizedId = sanitizeReplyDirectiveId(payload.replyToId);
  if (sanitizedId) {
    return `[[reply_to:${sanitizedId}]]`;
  }
  return payload.replyToCurrent ? "[[reply_to_current]]" : "";
}
/**
* Build Control UI / transcript `content` blocks for local TTS (or other) audio files
* referenced by slash-command / agent replies when the webchat path only had text aggregation.
@@ -109,7 +169,7 @@ export async function buildWebchatAudioContentBlocksFromReplyPayloads(
if (!url) {
continue;
}
const resolved = await resolveLocalAudioFileForEmbedding(url, options);
const resolved = await resolveLocalAudioFileForEmbedding(payload, url, options);
if (!resolved || seen.has(resolved)) {
continue;
}
@@ -123,6 +183,87 @@ export async function buildWebchatAudioContentBlocksFromReplyPayloads(
return blocks;
}
/**
 * Aggregate reply payloads into a single webchat assistant message of
 * structured content blocks (text + embedded audio/image) plus a transcript
 * string, instead of the legacy MEDIA:-prefixed markdown text.
 *
 * Per payload: suppressed control text (e.g. NO_REPLY) is dropped, local audio
 * is embedded only for trusted payloads, and images are accepted only as
 * allow-listed base64 data URLs. Duplicate media across payloads is emitted
 * once. Returns null when no payload yields any embeddable media, so callers
 * can fall back to the plain-text path.
 *
 * @param payloads reply payloads produced for one assistant turn.
 * @param options  local-media embedding options (roots, denial callback).
 * @returns content blocks + transcript text, or null when nothing embeddable.
 */
export async function buildWebchatAssistantMessageFromReplyPayloads(
  payloads: ReplyPayload[],
  options?: WebchatAssistantMediaOptions,
): Promise<{ content: Array<Record<string, unknown>>; transcriptText: string } | null> {
  const content: Array<Record<string, unknown>> = [];
  const transcriptTextParts: string[] = [];
  // Dedupe sets span ALL payloads so the same file/data URL embeds only once.
  const seenAudio = new Set<string>();
  const seenImages = new Set<string>();
  let hasAudio = false;
  let hasImage = false;
  for (const payload of payloads) {
    const visibleText = payload.text?.trim();
    // Control tokens (silent replies) must never surface as visible text.
    const text =
      visibleText && !isSuppressedControlReplyText(visibleText) ? visibleText : undefined;
    const replyDirectivePrefix = resolveReplyDirectivePrefix(payload);
    let payloadHasAudio = false;
    let payloadHasImage = false;
    const payloadMediaBlocks: Array<Record<string, unknown>> = [];
    const parts = resolveSendableOutboundReplyParts(payload);
    for (const raw of parts.mediaUrls) {
      const url = raw.trim();
      if (!url) {
        continue;
      }
      // Audio path first: only trusted local payloads resolve here (the helper
      // enforces trustedLocalMedia + local-root checks).
      const resolvedAudioPath = await resolveLocalAudioFileForEmbedding(payload, url, options);
      if (resolvedAudioPath) {
        if (seenAudio.has(resolvedAudioPath)) {
          continue;
        }
        seenAudio.add(resolvedAudioPath);
        const block = tryReadLocalAudioContentBlock(resolvedAudioPath);
        if (block) {
          payloadMediaBlocks.push(block);
          hasAudio = true;
          payloadHasAudio = true;
        }
        // An audio-shaped URL never falls through to the image path.
        continue;
      }
      // Image path: only allow-listed, size-capped data URLs are embeddable.
      const imageUrl = resolveEmbeddableImageUrl(url);
      if (!imageUrl || seenImages.has(imageUrl)) {
        continue;
      }
      seenImages.add(imageUrl);
      payloadMediaBlocks.push({ type: "input_image", image_url: imageUrl });
      hasImage = true;
      payloadHasImage = true;
    }
    // Synthesize placeholder text only for the first text-bearing entry, when the
    // payload has media but either no visible text or only a reply directive.
    const needsSyntheticText =
      payloadMediaBlocks.length > 0 && (!text || replyDirectivePrefix) && transcriptTextParts.length === 0;
    const syntheticText = needsSyntheticText
      ? payloadHasAudio && payloadHasImage
        ? "Media reply"
        : payloadHasAudio
          ? "Audio reply"
          : "Image reply"
      : undefined;
    // Real text wins over the synthetic placeholder.
    const blockText = text ?? syntheticText;
    if (blockText) {
      const fullText = replyDirectivePrefix ? `${replyDirectivePrefix}${blockText}` : blockText;
      transcriptTextParts.push(fullText);
      content.push({ type: "text", text: fullText });
    } else if (replyDirectivePrefix) {
      // Directive-only payload: keep the threading marker even without text.
      transcriptTextParts.push(replyDirectivePrefix);
      content.push({ type: "text", text: replyDirectivePrefix });
    }
    // Media blocks always follow the payload's text block.
    content.push(...payloadMediaBlocks);
  }
  if (!hasAudio && !hasImage) {
    // Nothing embeddable: signal the caller to use the plain-text path.
    return null;
  }
  const transcriptText =
    transcriptTextParts.join("\n\n").trim() ||
    (hasAudio && hasImage ? "Media reply" : hasAudio ? "Audio reply" : "Image reply");
  if (transcriptTextParts.length === 0) {
    // Media-only message: prepend the fallback label so content has a text block.
    content.unshift({ type: "text", text: transcriptText });
  }
  return { content, transcriptText };
}
function tryReadLocalAudioContentBlock(filePath: string): Record<string, unknown> | null {
try {
const buf = fs.readFileSync(filePath);

View File

@@ -20,10 +20,23 @@ const mockState = vi.hoisted(() => ({
sessionId: "sess-1",
mainSessionKey: "main",
finalText: "[[reply_to_current]]",
finalPayload: null as { text?: string; mediaUrl?: string } | null,
finalPayload: null as {
text?: string;
mediaUrl?: string;
sensitiveMedia?: boolean;
replyToId?: string;
replyToCurrent?: boolean;
} | null,
dispatchedReplies: [] as Array<{
kind: "tool" | "block" | "final";
payload: { text?: string; mediaUrl?: string; mediaUrls?: string[] };
payload: {
text?: string;
mediaUrl?: string;
mediaUrls?: string[];
trustedLocalMedia?: boolean;
replyToId?: string;
replyToCurrent?: boolean;
};
}>,
dispatchError: null as Error | null,
triggerAgentRunStart: false,
@@ -91,16 +104,28 @@ vi.mock("../../auto-reply/dispatch.js", () => ({
async (params: {
ctx: MsgContext;
dispatcher: {
sendFinalReply: (payload: { text?: string; mediaUrl?: string }) => boolean;
sendFinalReply: (payload: {
text?: string;
mediaUrl?: string;
sensitiveMedia?: boolean;
replyToId?: string;
replyToCurrent?: boolean;
}) => boolean;
sendBlockReply: (payload: {
text?: string;
mediaUrl?: string;
mediaUrls?: string[];
trustedLocalMedia?: boolean;
replyToId?: string;
replyToCurrent?: boolean;
}) => boolean;
sendToolResult: (payload: {
text?: string;
mediaUrl?: string;
mediaUrls?: string[];
trustedLocalMedia?: boolean;
replyToId?: string;
replyToCurrent?: boolean;
}) => boolean;
markComplete: () => void;
waitForIdle: () => Promise<void>;
@@ -130,9 +155,7 @@ vi.mock("../../auto-reply/dispatch.js", () => ({
params.dispatcher.sendBlockReply(reply.payload);
continue;
}
params.dispatcher.sendFinalReply({
text: reply.payload.text ?? "",
});
params.dispatcher.sendFinalReply(reply.payload);
}
} else {
params.dispatcher.sendFinalReply(mockState.finalPayload ?? { text: mockState.finalText });
@@ -500,6 +523,7 @@ describe("chat directive tag stripping for non-streaming final payloads", () =>
payload: {
mediaUrl: audioPath,
mediaUrls: [audioPath],
trustedLocalMedia: true,
},
},
];
@@ -528,7 +552,7 @@ describe("chat directive tag stripping for non-streaming final payloads", () =>
expect(assistantUpdate).toMatchObject({
message: {
role: "assistant",
idempotencyKey: "idem-agent-audio:assistant-audio",
idempotencyKey: "idem-agent-audio:assistant-media",
content: [
{ type: "text", text: "Audio reply" },
{
@@ -544,6 +568,31 @@ describe("chat directive tag stripping for non-streaming final payloads", () =>
});
});
it("renders image reply payloads as assistant image content instead of MEDIA text", async () => {
createTranscriptFixture("openclaw-chat-send-agent-image-");
mockState.finalPayload = {
text: "Scan this QR code with the OpenClaw iOS app:",
mediaUrl: "data:image/png;base64,cG5n",
};
const respond = vi.fn();
const context = createChatContext();
const payload = await runNonStreamingChatSend({
context,
respond,
idempotencyKey: "idem-agent-image",
});
expect(payload?.message).toMatchObject({
role: "assistant",
content: [
{ type: "text", text: "Scan this QR code with the OpenClaw iOS app:" },
{ type: "input_image", image_url: "data:image/png;base64,cG5n" },
],
});
expect(JSON.stringify(payload?.message)).not.toContain("MEDIA:data:image/png;base64,cG5n");
});
it("chat.inject keeps message defined when directive tag is the only content", async () => {
createTranscriptFixture("openclaw-chat-inject-directive-only-");
const respond = vi.fn();
@@ -693,7 +742,7 @@ describe("chat directive tag stripping for non-streaming final payloads", () =>
respond,
idempotencyKey: "idem-untrusted-context",
});
expect(extractFirstTextBlock(payload)).toBe("hello");
expect(extractFirstTextBlock(payload)?.trim()).toBe("hello");
});
it("chat.send non-streaming final broadcasts and routes on the canonical session key", async () => {
@@ -1867,7 +1916,7 @@ describe("chat directive tag stripping for non-streaming final payloads", () =>
it("preserves media-only final replies in the final broadcast message", async () => {
createTranscriptFixture("openclaw-chat-send-media-only-final-");
mockState.finalPayload = { mediaUrl: "https://example.com/final.png" };
mockState.finalPayload = { mediaUrl: "data:image/png;base64,cG5n" };
const respond = vi.fn();
const context = createChatContext();
@@ -1877,14 +1926,20 @@ describe("chat directive tag stripping for non-streaming final payloads", () =>
idempotencyKey: "idem-media-only-final",
});
expect(extractFirstTextBlock(payload)).toBe("MEDIA:https://example.com/final.png");
expect(payload?.message).toMatchObject({
role: "assistant",
content: [
{ type: "text", text: "Image reply" },
{ type: "input_image", image_url: "data:image/png;base64,cG5n" },
],
});
});
it("strips NO_REPLY from transcript text when final replies only carry media", async () => {
createTranscriptFixture("openclaw-chat-send-media-only-silent-final-");
mockState.finalPayload = {
text: "NO_REPLY",
mediaUrl: "https://example.com/final.png",
mediaUrl: "data:image/png;base64,cG5n",
};
const respond = vi.fn();
const context = createChatContext();
@@ -1895,7 +1950,122 @@ describe("chat directive tag stripping for non-streaming final payloads", () =>
idempotencyKey: "idem-media-only-silent-final",
});
expect(extractFirstTextBlock(payload)).toBe("MEDIA:https://example.com/final.png");
expect(payload?.message).toMatchObject({
role: "assistant",
content: [
{ type: "text", text: "Image reply" },
{ type: "input_image", image_url: "data:image/png;base64,cG5n" },
],
});
});
it("preserves reply tags in transcript updates for media replies while stripping them from the broadcast", async () => {
createTranscriptFixture("openclaw-chat-send-media-reply-tags-");
mockState.finalPayload = {
replyToCurrent: true,
mediaUrl: "data:image/png;base64,cG5n",
};
const respond = vi.fn();
const context = createChatContext();
const payload = await runNonStreamingChatSend({
context,
respond,
idempotencyKey: "idem-media-reply-tags",
});
expect(payload?.message).toMatchObject({
role: "assistant",
content: [
{ type: "text", text: "Image reply" },
{ type: "input_image", image_url: "data:image/png;base64,cG5n" },
],
});
const transcriptUpdate = mockState.emittedTranscriptUpdates.find(
(update) =>
typeof update.message === "object" &&
update.message !== null &&
(update.message as { role?: unknown }).role === "assistant" &&
Array.isArray((update.message as { content?: unknown }).content) &&
((update.message as { content: Array<{ type?: string; text?: string }> }).content.some(
(block) => block?.type === "text" && block?.text?.includes("[[reply_to_current]]"),
) ??
false),
);
expect(transcriptUpdate).toMatchObject({
message: {
role: "assistant",
content: [
{ type: "text", text: "[[reply_to_current]]Image reply" },
{ type: "input_image", image_url: "data:image/png;base64,cG5n" },
],
},
});
});
it("does not persist sensitive image media into transcript updates", async () => {
createTranscriptFixture("openclaw-chat-send-sensitive-media-final-");
mockState.finalPayload = {
text: "Scan this QR code with the OpenClaw iOS app:",
mediaUrl: "data:image/png;base64,cG5n",
sensitiveMedia: true,
};
const respond = vi.fn();
const context = createChatContext();
const payload = await runNonStreamingChatSend({
context,
respond,
idempotencyKey: "idem-sensitive-media-final",
});
expect(payload?.message).toMatchObject({
role: "assistant",
content: [
{ type: "text", text: "Scan this QR code with the OpenClaw iOS app:" },
{ type: "input_image", image_url: "data:image/png;base64,cG5n" },
],
});
const transcriptUpdate = mockState.emittedTranscriptUpdates.find(
(update) =>
typeof update.message === "object" &&
update.message !== null &&
(update.message as { role?: unknown }).role === "assistant",
);
expect(transcriptUpdate).toMatchObject({
message: {
role: "assistant",
content: [{ type: "text", text: "Scan this QR code with the OpenClaw iOS app:" }],
},
});
expect(JSON.stringify(transcriptUpdate)).not.toContain("input_image");
expect(JSON.stringify(transcriptUpdate)).not.toContain("data:image/png;base64,cG5n");
});
it("sanitizes replyToId before emitting inline reply directives", async () => {
createTranscriptFixture("openclaw-chat-send-sanitized-reply-id-");
mockState.finalPayload = {
text: "hello",
replyToId: "abc]]\n[[audio_as_voice]]",
};
const respond = vi.fn();
const context = createChatContext();
const payload = await runNonStreamingChatSend({
context,
respond,
idempotencyKey: "idem-sanitized-reply-id",
});
expect(extractFirstTextBlock(payload)?.trim()).toBe("hello");
const transcriptUpdate = mockState.emittedTranscriptUpdates.find(
(update) =>
typeof update.message === "object" &&
update.message !== null &&
(update.message as { role?: unknown }).role === "assistant",
);
expect(JSON.stringify(transcriptUpdate)).toContain("[[reply_to:abcaudio_as_voice]]");
expect(JSON.stringify(transcriptUpdate)).not.toContain("[[audio_as_voice]]");
});
it("drops image attachments for text-only session models", async () => {

View File

@@ -29,6 +29,7 @@ import {
import {
stripInlineDirectiveTagsForDisplay,
stripInlineDirectiveTagsFromMessageForDisplay,
sanitizeReplyDirectiveId,
} from "../../utils/directive-tags.js";
import {
INTERNAL_MESSAGE_CHANNEL,
@@ -83,7 +84,7 @@ import { injectTimestamp, timestampOptsFromConfig } from "./agent-timestamp.js";
import { setGatewayDedupeEntry } from "./agent-wait-dedupe.js";
import { normalizeRpcAttachmentsToChatAttachments } from "./attachment-normalize.js";
import { appendInjectedAssistantMessageToTranscript } from "./chat-transcript-inject.js";
import { buildWebchatAudioContentBlocksFromReplyPayloads } from "./chat-webchat-media.js";
import { buildWebchatAssistantMessageFromReplyPayloads } from "./chat-webchat-media.js";
import type {
GatewayRequestContext,
GatewayRequestHandlerOptions,
@@ -123,26 +124,19 @@ function isMediaBearingPayload(payload: ReplyPayload): boolean {
return false;
}
async function buildWebchatAudioOnlyAssistantMessage(
async function buildWebchatAssistantMediaMessage(
payloads: ReplyPayload[],
options?: {
localRoots?: readonly string[];
onLocalAudioAccessDenied?: (message: string) => void;
},
): Promise<{ content: Array<Record<string, unknown>>; transcriptText: string } | null> {
const audioBlocks = await buildWebchatAudioContentBlocksFromReplyPayloads(payloads, {
return buildWebchatAssistantMessageFromReplyPayloads(payloads, {
localRoots: options?.localRoots,
onLocalAudioAccessDenied: (err) => {
options?.onLocalAudioAccessDenied?.(formatForLog(err));
},
});
if (audioBlocks.length === 0) {
return null;
}
return {
transcriptText: "Audio reply",
content: [{ type: "text", text: "Audio reply" }, ...audioBlocks],
};
}
export const DEFAULT_CHAT_HISTORY_TEXT_MAX_CHARS = 8_000;
@@ -225,8 +219,9 @@ function buildTranscriptReplyText(payloads: ReplyPayload[]): string {
.map((payload) => {
const parts = resolveSendableOutboundReplyParts(payload);
const lines: string[] = [];
if (typeof payload.replyToId === "string" && payload.replyToId.trim()) {
lines.push(`[[reply_to:${payload.replyToId.trim()}]]`);
const replyToId = sanitizeReplyDirectiveId(payload.replyToId);
if (replyToId) {
lines.push(`[[reply_to:${replyToId}]]`);
} else if (payload.replyToCurrent) {
lines.push("[[reply_to_current]]");
}
@@ -235,6 +230,9 @@ function buildTranscriptReplyText(payloads: ReplyPayload[]): string {
lines.push(text);
}
for (const mediaUrl of parts.mediaUrls) {
if (payload.sensitiveMedia === true) {
continue;
}
const trimmed = mediaUrl.trim();
if (trimmed) {
lines.push(`MEDIA:${trimmed}`);
@@ -249,6 +247,10 @@ function buildTranscriptReplyText(payloads: ReplyPayload[]): string {
return chunks.join("\n\n").trim();
}
function hasSensitiveMediaPayload(payloads: ReplyPayload[]): boolean {
return payloads.some((payload) => payload.sensitiveMedia === true && isMediaBearingPayload(payload));
}
function resolveChatSendOriginatingRoute(params: {
client?: { mode?: string | null; id?: string | null } | null;
deliver?: boolean;
@@ -2036,7 +2038,7 @@ export const chatHandlers: GatewayRequestHandlers = {
channel: INTERNAL_MESSAGE_CHANNEL,
});
const deliveredReplies: Array<{ payload: ReplyPayload; kind: "block" | "final" }> = [];
let appendedWebchatAgentAudio = false;
let appendedWebchatAgentMedia = false;
let userTranscriptUpdatePromise: Promise<void> | null = null;
const emitUserTranscriptUpdate = async () => {
if (userTranscriptUpdatePromise) {
@@ -2098,37 +2100,37 @@ export const chatHandlers: GatewayRequestHandlers = {
savedImages: await persistedImagesPromise,
});
};
const appendWebchatAgentAudioTranscriptIfNeeded = async (payload: ReplyPayload) => {
if (!agentRunStarted || appendedWebchatAgentAudio || !isMediaBearingPayload(payload)) {
const appendWebchatAgentMediaTranscriptIfNeeded = async (payload: ReplyPayload) => {
if (!agentRunStarted || appendedWebchatAgentMedia || !isMediaBearingPayload(payload)) {
return;
}
const audioMessage = await buildWebchatAudioOnlyAssistantMessage([payload], {
const mediaMessage = await buildWebchatAssistantMediaMessage([payload], {
localRoots: getAgentScopedMediaLocalRoots(cfg, agentId),
onLocalAudioAccessDenied: (message) => {
context.logGateway.warn(`webchat audio embedding denied local path: ${message}`);
},
});
if (!audioMessage) {
if (!mediaMessage) {
return;
}
const { storePath: latestStorePath, entry: latestEntry } = loadSessionEntry(sessionKey);
const sessionId = latestEntry?.sessionId ?? entry?.sessionId ?? clientRunId;
const appended = appendAssistantTranscriptMessage({
message: audioMessage.transcriptText,
content: audioMessage.content,
message: mediaMessage.transcriptText,
...(payload.sensitiveMedia === true ? {} : { content: mediaMessage.content }),
sessionId,
storePath: latestStorePath,
sessionFile: latestEntry?.sessionFile,
agentId,
createIfMissing: true,
idempotencyKey: `${clientRunId}:assistant-audio`,
idempotencyKey: `${clientRunId}:assistant-media`,
});
if (appended.ok) {
appendedWebchatAgentAudio = true;
appendedWebchatAgentMedia = true;
return;
}
context.logGateway.warn(
`webchat transcript append failed for audio reply: ${appended.error ?? "unknown error"}`,
`webchat transcript append failed for media reply: ${appended.error ?? "unknown error"}`,
);
};
const dispatcher = createReplyDispatcher({
@@ -2141,7 +2143,7 @@ export const chatHandlers: GatewayRequestHandlers = {
case "block":
case "final":
deliveredReplies.push({ payload, kind: info.kind });
await appendWebchatAgentAudioTranscriptIfNeeded(payload);
await appendWebchatAgentMediaTranscriptIfNeeded(payload);
break;
case "tool":
// Tool results that carry audio (e.g. the TTS tool) must be promoted
@@ -2231,18 +2233,25 @@ export const chatHandlers: GatewayRequestHandlers = {
sessionKey,
});
} else {
const combinedReply = buildTranscriptReplyText(
deliveredReplies
.filter((entry) => entry.kind === "final")
.map((entry) => entry.payload),
);
const finalPayloads = deliveredReplies
.filter((entry) => entry.kind === "final")
.map((entry) => entry.payload);
const combinedReply = buildTranscriptReplyText(finalPayloads);
const mediaMessage = await buildWebchatAssistantMediaMessage(finalPayloads, {
localRoots: getAgentScopedMediaLocalRoots(cfg, agentId),
onLocalAudioAccessDenied: (message) => {
context.logGateway.warn(`webchat audio embedding denied local path: ${message}`);
},
});
const hasSensitiveMedia = hasSensitiveMediaPayload(finalPayloads);
let message: Record<string, unknown> | undefined;
if (combinedReply) {
if (mediaMessage || combinedReply) {
const { storePath: latestStorePath, entry: latestEntry } =
loadSessionEntry(sessionKey);
const sessionId = latestEntry?.sessionId ?? entry?.sessionId ?? clientRunId;
const appended = appendAssistantTranscriptMessage({
message: combinedReply,
message: mediaMessage?.transcriptText ?? combinedReply,
...(mediaMessage && !hasSensitiveMedia ? { content: mediaMessage.content } : {}),
sessionId,
storePath: latestStorePath,
sessionFile: latestEntry?.sessionFile,
@@ -2250,7 +2259,14 @@ export const chatHandlers: GatewayRequestHandlers = {
createIfMissing: true,
});
if (appended.ok) {
message = appended.message;
if (hasSensitiveMedia && mediaMessage) {
message = {
...appended.message,
content: mediaMessage.content,
};
} else {
message = appended.message;
}
} else {
context.logGateway.warn(
`webchat transcript append failed: ${appended.error ?? "unknown error"}`,
@@ -2258,7 +2274,7 @@ export const chatHandlers: GatewayRequestHandlers = {
const now = Date.now();
message = {
role: "assistant",
content: [{ type: "text", text: combinedReply }],
content: mediaMessage?.content ?? [{ type: "text", text: combinedReply }],
timestamp: now,
// Keep this compatible with Pi stopReason enums even though this message isn't
// persisted to the transcript due to the append failure.

View File

@@ -1,4 +1,3 @@
import type { ReplyPayload } from "../auto-reply/reply-payload.js";
import type { ExecApprovalForwardTarget } from "../config/types.approvals.js";
import { matchesApprovalRequestFilters } from "../infra/approval-request-filters.js";
import { getExecApprovalReplyMetadata } from "../infra/exec-approval-reply.js";
@@ -9,6 +8,7 @@ import {
normalizeOptionalString,
} from "../shared/string-coerce.js";
import type { OpenClawConfig } from "./config-runtime.js";
import type { ReplyPayload } from "./reply-payload.js";
import { normalizeAccountId } from "./routing.js";
type ApprovalRequest = ExecApprovalRequest | PluginApprovalRequest;

View File

@@ -1,4 +1,3 @@
import type { ReplyPayload } from "../auto-reply/reply-payload.js";
import {
buildApprovalInteractiveReply,
type ExecApprovalReplyDecision,
@@ -10,6 +9,7 @@ import {
type PluginApprovalResolved,
} from "../infra/plugin-approvals.js";
import { normalizeOptionalString } from "../shared/string-coerce.js";
import type { ReplyPayload } from "./reply-payload.js";
const DEFAULT_ALLOWED_DECISIONS = ["allow-once", "allow-always", "deny"] as const;

View File

@@ -1,4 +1,3 @@
import type { ReplyPayload } from "../auto-reply/reply-payload.js";
import { getChannelPlugin, normalizeChannelId } from "../channels/plugins/index.js";
import {
createReplyPrefixContext,
@@ -11,6 +10,7 @@ import {
type CreateTypingCallbacksParams,
type TypingCallbacks,
} from "../channels/typing.js";
import type { ReplyPayload } from "./reply-payload.js";
export type ReplyPrefixContext = ReplyPrefixContextBundle["prefixContext"];
export type { ReplyPrefixContextBundle, ReplyPrefixOptions };

View File

@@ -109,7 +109,7 @@ export type {
export type { OpenClawConfig } from "../config/config.js";
export type { OutboundIdentity } from "../infra/outbound/identity.js";
export type { HistoryEntry } from "../auto-reply/reply/history.js";
export type { ReplyPayload } from "../auto-reply/reply-payload.js";
export type { ReplyPayload } from "./reply-payload.js";
export type { AllowlistMatch } from "../channels/allowlist-match.js";
export type {
BaseProbeResult,

View File

@@ -8,7 +8,7 @@ export {
DEFAULT_GROUP_HISTORY_LIMIT,
recordPendingHistoryEntryIfEnabled,
} from "../auto-reply/reply/history.js";
export type { ReplyPayload } from "../auto-reply/reply-payload.js";
export type { ReplyPayload } from "./reply-payload.js";
export { logTypingFailure } from "../channels/logging.js";
export type { AllowlistMatch } from "../channels/plugins/allowlist-match.js";
export { buildChannelConfigSchema } from "../channels/plugins/config-schema.js";

View File

@@ -91,7 +91,7 @@ export * from "./music-generation.js";
export type { SecretInput, SecretRef } from "../config/types.secrets.js";
export type { RuntimeEnv } from "../runtime.js";
export type { HookEntry } from "../hooks/types.js";
export type { ReplyPayload } from "../auto-reply/reply-payload.js";
export type { ReplyPayload } from "./reply-payload.js";
export type { WizardPrompter } from "../wizard/prompts.js";
export type { ContextEngineFactory } from "../context-engine/registry.js";
export type { DiagnosticEventPayload } from "../infra/diagnostic-events.js";

View File

@@ -5,7 +5,7 @@ export type {
} from "../channels/plugins/types.public.js";
export type { ChannelPlugin } from "../channels/plugins/types.plugin.js";
export type { OpenClawConfig } from "../config/config.js";
export type { ReplyPayload } from "../auto-reply/reply-payload.js";
export type { ReplyPayload } from "./reply-payload.js";
export type { ChannelSetupAdapter } from "../channels/plugins/types.adapters.js";
export type { OpenClawPluginApi, PluginRuntime } from "./channel-plugin-common.js";

View File

@@ -29,7 +29,7 @@ export {
readStringParam,
} from "../agents/tools/common.js";
export type { BlockReplyContext } from "../auto-reply/get-reply-options.types.js";
export type { ReplyPayload } from "../auto-reply/reply-payload.js";
export type { ReplyPayload } from "./reply-payload.js";
export { resolveAckReaction } from "../agents/identity.js";
export {
compileAllowlist,

View File

@@ -10,7 +10,7 @@ export {
recordPendingHistoryEntryIfEnabled,
} from "../auto-reply/reply/history.js";
export { listSkillCommandsForAgents } from "../auto-reply/skill-commands.js";
export type { ReplyPayload } from "../auto-reply/reply-payload.js";
export type { ReplyPayload } from "./reply-payload.js";
export type { ChatType } from "../channels/chat-type.js";
export { resolveControlCommandGate } from "../channels/command-gating.js";
export { logInboundDrop, logTypingFailure } from "../channels/logging.js";

View File

@@ -12,7 +12,7 @@ export {
recordPendingHistoryEntryIfEnabled,
} from "../auto-reply/reply/history.js";
export { isSilentReplyText, SILENT_REPLY_TOKEN } from "../auto-reply/tokens.js";
export type { ReplyPayload } from "../auto-reply/reply-payload.js";
export type { ReplyPayload } from "./reply-payload.js";
export { mergeAllowlist, summarizeMapping } from "../channels/allowlists/resolve-utils.js";
export {
resolveControlCommandGate,

View File

@@ -7,4 +7,4 @@ export {
} from "../auto-reply/chunk.js";
export type { ChunkMode } from "../auto-reply/chunk.js";
export { isSilentReplyText } from "../auto-reply/tokens.js";
export type { ReplyPayload } from "../auto-reply/reply-payload.js";
export type { ReplyPayload } from "./reply-payload.js";

View File

@@ -4,4 +4,4 @@ export {
dispatchReplyWithBufferedBlockDispatcher,
dispatchReplyWithDispatcher,
} from "../auto-reply/reply/provider-dispatcher.js";
export type { ReplyPayload } from "../auto-reply/reply-payload.js";
export type { ReplyPayload } from "./reply-payload.js";

View File

@@ -1,6 +1,7 @@
import { describe, expect, it, vi } from "vitest";
import {
countOutboundMedia,
createNormalizedOutboundDeliverer,
deliverFormattedTextWithAttachments,
deliverTextOrMediaReply,
hasOutboundMedia,
@@ -8,6 +9,7 @@ import {
hasOutboundText,
isReasoningReplyPayload,
isNumericTargetId,
normalizeOutboundReplyPayload,
resolveOutboundMediaUrls,
resolveSendableOutboundReplyParts,
resolveTextChunksWithFallback,
@@ -87,6 +89,45 @@ describe("sendPayloadWithChunkedTextAndMedia", () => {
});
});
describe("normalizeOutboundReplyPayload", () => {
it("strips internal-only local media trust flags from loose payload objects", () => {
expect(
normalizeOutboundReplyPayload({
text: "hello",
mediaUrl: "/tmp/reply.opus",
trustedLocalMedia: true,
sensitiveMedia: true,
replyToId: "abc123",
}),
).toEqual({
text: "hello",
mediaUrl: "/tmp/reply.opus",
sensitiveMedia: true,
replyToId: "abc123",
});
});
it("keeps the normalized deliverer from forwarding trustedLocalMedia", async () => {
const handler = vi.fn(async () => {});
const deliver = createNormalizedOutboundDeliverer(handler);
await deliver({
text: "hello",
mediaUrl: "/tmp/reply.opus",
trustedLocalMedia: true,
sensitiveMedia: true,
});
expect(handler).toHaveBeenCalledWith({
text: "hello",
mediaUrl: "/tmp/reply.opus",
sensitiveMedia: true,
replyToId: undefined,
mediaUrls: undefined,
});
});
});
describe("resolveOutboundMediaUrls", () => {
it.each([
{

View File

@@ -1,14 +1,16 @@
import type { ReplyPayload as InternalReplyPayload } from "../auto-reply/reply-payload.js";
import type { ChannelOutboundAdapter } from "../channels/plugins/outbound.types.js";
import { normalizeLowercaseStringOrEmpty, readStringValue } from "../shared/string-coerce.js";
export type { MediaPayload, MediaPayloadInput } from "../channels/plugins/media-payload.js";
export { buildMediaPayload } from "../channels/plugins/media-payload.js";
export type { ReplyPayload } from "../auto-reply/reply-payload.js";
export type ReplyPayload = Omit<InternalReplyPayload, "trustedLocalMedia">;
export type OutboundReplyPayload = {
text?: string;
mediaUrls?: string[];
mediaUrl?: string;
sensitiveMedia?: boolean;
replyToId?: string;
};
@@ -72,11 +74,13 @@ export function normalizeOutboundReplyPayload(
)
: undefined;
const mediaUrl = readStringValue(payload.mediaUrl);
const sensitiveMedia = payload.sensitiveMedia === true ? true : undefined;
const replyToId = readStringValue(payload.replyToId);
return {
text,
mediaUrls,
mediaUrl,
sensitiveMedia,
replyToId,
};
}

View File

@@ -54,7 +54,7 @@ export type {
} from "../auto-reply/reply/reply-dispatcher.js";
export { createReplyReferencePlanner } from "../auto-reply/reply/reply-reference.js";
export type { GetReplyOptions, BlockReplyContext } from "../auto-reply/get-reply-options.types.js";
export type { ReplyPayload } from "../auto-reply/reply-payload.js";
export type { ReplyPayload } from "./reply-payload.js";
export type { FinalizedMsgContext, MsgContext } from "../auto-reply/templating.js";
export { generateConversationLabel } from "../auto-reply/reply/conversation-label-generator.js";
export type { ConversationLabelParams } from "../auto-reply/reply/conversation-label-generator.js";

View File

@@ -3,7 +3,7 @@
import { createOptionalChannelSetupSurface } from "./channel-setup.js";
export type { ReplyPayload } from "../auto-reply/reply-payload.js";
export type { ReplyPayload } from "./reply-payload.js";
export { buildChannelConfigSchema } from "../channels/plugins/config-schema.js";
export {
applyAccountNameToChannelSection,

View File

@@ -1,4 +1,3 @@
import type { ReplyPayload } from "../auto-reply/reply-payload.js";
import type { OpenClawConfig } from "../config/types.openclaw.js";
import type { TtsAutoMode, TtsProvider } from "../config/types.tts.js";
import type {
@@ -8,6 +7,7 @@ import type {
TtsDirectiveParseResult,
} from "../tts/provider-types.js";
import type { ResolvedTtsConfig, ResolvedTtsModelOverrides } from "../tts/tts-types.js";
import type { ReplyPayload } from "./reply-payload.js";
export type { ResolvedTtsConfig, ResolvedTtsModelOverrides };
export type { TtsDirectiveOverrides, TtsDirectiveParseResult };

View File

@@ -3,7 +3,7 @@
import { createOptionalChannelSetupSurface } from "./channel-setup.js";
export type { ReplyPayload } from "../auto-reply/reply-payload.js";
export type { ReplyPayload } from "./reply-payload.js";
export { buildChannelConfigSchema } from "../channels/plugins/config-schema.js";
export type {
ChannelGatewayContext,

View File

@@ -2,7 +2,7 @@
// Keep this list additive and scoped to the bundled Zalo surface.
export { jsonResult, readStringParam } from "../agents/tools/common.js";
export type { ReplyPayload } from "../auto-reply/reply-payload.js";
export type { ReplyPayload } from "./reply-payload.js";
export {
deleteAccountFromConfigSection,
setAccountEnabledInConfigSection,

View File

@@ -3,7 +3,7 @@
import { createOptionalChannelSetupSurface } from "./channel-setup.js";
export type { ReplyPayload } from "../auto-reply/reply-payload.js";
export type { ReplyPayload } from "./reply-payload.js";
export { mergeAllowlist, summarizeMapping } from "../channels/allowlists/resolve-utils.js";
export {
resolveMentionGating,

View File

@@ -20,6 +20,7 @@ const AUDIO_TAG_RE = /\[\[\s*audio_as_voice\s*\]\]/gi;
const REPLY_TAG_RE = /\[\[\s*(?:reply_to_current|reply_to\s*:\s*([^\]\n]+))\s*\]\]/gi;
const INLINE_DIRECTIVE_TAG_WITH_PADDING_RE =
/\s*(?:\[\[\s*audio_as_voice\s*\]\]|\[\[\s*(?:reply_to_current|reply_to\s*:\s*[^\]\n]+)\s*\]\])\s*/gi;
const MAX_REPLY_DIRECTIVE_ID_LENGTH = 256;
function replacementPreservesWordBoundary(source: string, offset: number, length: number): string {
const before = source[offset - 1];
@@ -92,6 +93,33 @@ export function stripInlineDirectiveTagsForDisplay(text: string): StripInlineDir
};
}
function stripUnsafeReplyDirectiveChars(value: string): string {
let next = "";
for (const ch of value) {
const code = ch.charCodeAt(0);
if ((code >= 0 && code <= 31) || code === 127 || ch === "[" || ch === "]") {
continue;
}
next += ch;
}
return next;
}
export function sanitizeReplyDirectiveId(rawReplyToId?: string): string | undefined {
const trimmed = rawReplyToId?.trim();
if (!trimmed) {
return undefined;
}
const sanitized = stripUnsafeReplyDirectiveChars(trimmed).trim();
if (!sanitized) {
return undefined;
}
if (sanitized.length > MAX_REPLY_DIRECTIVE_ID_LENGTH) {
return sanitized.slice(0, MAX_REPLY_DIRECTIVE_ID_LENGTH);
}
return sanitized;
}
export function stripInlineDirectiveTagsForDelivery(text: string): StripInlineDirectiveTagsResult {
if (!text) {
return { text, changed: false };