diff --git a/CHANGELOG.md b/CHANGELOG.md index e761d22cb69..b8f008b1bdc 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -18,6 +18,7 @@ Docs: https://docs.openclaw.ai ### Fixes +- Gateway/pairing webchat: render `/pair qr` replies as structured media instead of raw markdown text, preserve inline reply threading and silent-control handling on media replies, avoid persisting sensitive QR images into transcript history, and keep local webchat media embedding behind internal-only trust markers. (#70047) Thanks @BunsDev. - OpenAI/Responses: keep embedded OpenAI Responses runs on HTTP when `models.providers.openai.baseUrl` points at a local mock or other non-public endpoint, so mocked/custom endpoints no longer drift onto the hardcoded public websocket transport. (#69815) Thanks @vincentkoc. - Channels/config: require resolved runtime config on channel send/action/client helpers and block runtime helper `loadConfig()` calls, so SecretRefs are resolved at startup/boundaries instead of being re-read during sends. - CLI/channels: preserve bundled setup promotion metadata when a loaded partial channel plugin omits it, so adding a non-default account still moves legacy single-account fields such as Telegram `streaming` into `accounts.default`. diff --git a/docs/.generated/plugin-sdk-api-baseline.sha256 b/docs/.generated/plugin-sdk-api-baseline.sha256 index 581bafd9b80..9d756bbdd91 100644 --- a/docs/.generated/plugin-sdk-api-baseline.sha256 +++ b/docs/.generated/plugin-sdk-api-baseline.sha256 @@ -1,2 +1,2 @@ -55b39075f07def786f5056b029921db64fcbdc5e2cab3d645215eccc857ba9a4 plugin-sdk-api-baseline.json -4a6b8f4afc9e6aa7c56b0cbab0886dacc4ead534c47761ab30eb76480d8fd673 plugin-sdk-api-baseline.jsonl +ba9b9d9b321b405fef89d4e95c1a3d629d1b956398a5b2a7f25b2a7654879783 plugin-sdk-api-baseline.json +8bbbee0ea2326148d4fd49f61fe74f83c5bb24c0742cfbf3609f43939fcd4c34 plugin-sdk-api-baseline.jsonl diff --git a/extensions/device-pair/index.test.ts b/extensions/device-pair/index.test.ts index d34447691d9..66d6a897984 100644 --- a/extensions/device-pair/index.test.ts +++ b/extensions/device-pair/index.test.ts @@ -251,6 +251,7 @@ describe("device-pair /pair qr", () => { gatewayClientScopes: ["operator.write", "operator.pairing"], }), ); + const payload = result as { text?: string; mediaUrl?: string; sensitiveMedia?: boolean }; const text = requireText(result); expect(pluginApiMocks.renderQrPngBase64).toHaveBeenCalledTimes(1); @@ -261,11 +262,12 @@ describe("device-pair /pair qr", () => { }, }); expect(text).toContain("Scan this QR code with the OpenClaw iOS app:"); - expect(text).toContain("![OpenClaw pairing QR](data:image/png;base64,ZmFrZXBuZw==)"); + expect(payload.mediaUrl).toBe("data:image/png;base64,ZmFrZXBuZw=="); + expect(payload.sensitiveMedia).toBe(true); expect(text).toContain("- Security: single-use bootstrap token"); expect(text).toContain("**Important:** Run `/pair cleanup` after pairing finishes."); expect(text).toContain("If this QR code leaks, run `/pair cleanup` immediately."); - expect(text).not.toContain("```"); + expect(text).not.toContain("![OpenClaw pairing QR]"); }); it("rejects qr setup for internal gateway callers without operator.pairing", async () => { diff --git a/extensions/device-pair/index.ts b/extensions/device-pair/index.ts index 4d60be47d62..8e928223af3 100644 --- a/extensions/device-pair/index.ts +++ b/extensions/device-pair/index.ts @@ -732,9 +732,9 @@ export default definePluginEntry({ autoNotifyArmed, expiresAtMs: payload.expiresAtMs, }), - "", - `![OpenClaw pairing QR](${qrDataUrl})`, ].join("\n"), + mediaUrl: qrDataUrl, + sensitiveMedia: true, }; } diff --git a/src/agents/pi-embedded-payloads.ts b/src/agents/pi-embedded-payloads.ts index 1186111db10..fe89378965a 100644 --- a/src/agents/pi-embedded-payloads.ts +++ b/src/agents/pi-embedded-payloads.ts @@ -2,6 +2,8 @@ export type BlockReplyPayload = { text?: string; mediaUrls?: string[]; audioAsVoice?: boolean; + trustedLocalMedia?: boolean; + sensitiveMedia?: boolean; isReasoning?: boolean; replyToId?: string; replyToTag?: boolean; diff --git a/src/agents/pi-embedded-subscribe.handlers.messages.test.ts b/src/agents/pi-embedded-subscribe.handlers.messages.test.ts index 1b19786a14a..db315eca0d7 100644 --- a/src/agents/pi-embedded-subscribe.handlers.messages.test.ts +++ b/src/agents/pi-embedded-subscribe.handlers.messages.test.ts @@ -251,6 +251,7 @@ describe("consumePendingToolMediaIntoReply", () => { const state = { pendingToolMediaUrls: ["/tmp/a.png", "/tmp/b.png"], pendingToolAudioAsVoice: false, + pendingToolTrustedLocalMedia: false, }; expect( @@ -269,6 +270,7 @@ describe("consumePendingToolMediaIntoReply", () => { const state = { pendingToolMediaUrls: ["/tmp/a.png"], pendingToolAudioAsVoice: true, + pendingToolTrustedLocalMedia: false, }; expect( @@ -290,6 +292,7 @@ describe("consumePendingToolMediaReply", () => { const state = { pendingToolMediaUrls: ["/tmp/reply.opus"], pendingToolAudioAsVoice: true, + pendingToolTrustedLocalMedia: false, }; expect(consumePendingToolMediaReply(state)).toEqual({ diff --git a/src/agents/pi-embedded-subscribe.handlers.messages.ts b/src/agents/pi-embedded-subscribe.handlers.messages.ts index a6847b47d34..27db0016e0e 100644 --- a/src/agents/pi-embedded-subscribe.handlers.messages.ts +++ b/src/agents/pi-embedded-subscribe.handlers.messages.ts @@ -178,20 +178,31 @@ export function resolveSilentReplyFallbackText(params: { } function clearPendingToolMedia( - state: Pick, + state: Pick< + EmbeddedPiSubscribeState, + "pendingToolMediaUrls" | "pendingToolAudioAsVoice" | "pendingToolTrustedLocalMedia" + >, ) { state.pendingToolMediaUrls = []; state.pendingToolAudioAsVoice = false; + state.pendingToolTrustedLocalMedia = false; } export function consumePendingToolMediaIntoReply( - state: Pick, + state: Pick< + EmbeddedPiSubscribeState, + "pendingToolMediaUrls" | "pendingToolAudioAsVoice" | "pendingToolTrustedLocalMedia" + >, payload: BlockReplyPayload, ): BlockReplyPayload { if (payload.isReasoning) { return payload; } - if (state.pendingToolMediaUrls.length === 0 && !state.pendingToolAudioAsVoice) { + if ( + state.pendingToolMediaUrls.length === 0 && + !state.pendingToolAudioAsVoice && + !state.pendingToolTrustedLocalMedia + ) { return payload; } const mergedMediaUrls = Array.from( @@ -201,15 +212,24 @@ export function consumePendingToolMediaIntoReply( ...payload, mediaUrls: mergedMediaUrls.length ? mergedMediaUrls : undefined, audioAsVoice: payload.audioAsVoice || state.pendingToolAudioAsVoice || undefined, + trustedLocalMedia: + payload.trustedLocalMedia || state.pendingToolTrustedLocalMedia || undefined, }; clearPendingToolMedia(state); return mergedPayload; } export function consumePendingToolMediaReply( - state: Pick, + state: Pick< + EmbeddedPiSubscribeState, + "pendingToolMediaUrls" | "pendingToolAudioAsVoice" | "pendingToolTrustedLocalMedia" + >, ): BlockReplyPayload | null { - if (state.pendingToolMediaUrls.length === 0 && !state.pendingToolAudioAsVoice) { + if ( + state.pendingToolMediaUrls.length === 0 && + !state.pendingToolAudioAsVoice && + !state.pendingToolTrustedLocalMedia + ) { return null; } const payload: BlockReplyPayload = { @@ -217,6 +237,7 @@ export function consumePendingToolMediaReply( ? Array.from(new Set(state.pendingToolMediaUrls)) : undefined, audioAsVoice: state.pendingToolAudioAsVoice || undefined, + trustedLocalMedia: state.pendingToolTrustedLocalMedia || undefined, }; clearPendingToolMedia(state); return payload; diff --git a/src/agents/pi-embedded-subscribe.handlers.tools.test.ts b/src/agents/pi-embedded-subscribe.handlers.tools.test.ts index 661f0d4ebe1..9d216daef4e 100644 --- a/src/agents/pi-embedded-subscribe.handlers.tools.test.ts +++ b/src/agents/pi-embedded-subscribe.handlers.tools.test.ts @@ -47,6 +47,7 @@ function createTestContext(): { pendingMessagingMediaUrls: new Map(), pendingToolMediaUrls: [], pendingToolAudioAsVoice: false, + pendingToolTrustedLocalMedia: false, deterministicApprovalPromptPending: false, replayState: { replayInvalid: false, hadPotentialSideEffects: false }, messagingToolSentTexts: [], diff --git a/src/agents/pi-embedded-subscribe.handlers.tools.ts b/src/agents/pi-embedded-subscribe.handlers.tools.ts index 7f765fb5f07..8c67904288a 100644 --- a/src/agents/pi-embedded-subscribe.handlers.tools.ts +++ b/src/agents/pi-embedded-subscribe.handlers.tools.ts @@ -293,7 +293,7 @@ function collectMessagingMediaUrlsFromToolResult(result: unknown): string[] { function queuePendingToolMedia( ctx: ToolHandlerContext, - mediaReply: { mediaUrls: string[]; audioAsVoice?: boolean }, + mediaReply: { mediaUrls: string[]; audioAsVoice?: boolean; trustedLocalMedia?: boolean }, ) { const seen = new Set(ctx.state.pendingToolMediaUrls); for (const mediaUrl of mediaReply.mediaUrls) { @@ -306,6 +306,9 @@ function queuePendingToolMedia( if (mediaReply.audioAsVoice) { ctx.state.pendingToolAudioAsVoice = true; } + if (mediaReply.trustedLocalMedia) { + ctx.state.pendingToolTrustedLocalMedia = true; + } } async function collectEmittedToolOutputMediaUrls( diff --git a/src/agents/pi-embedded-subscribe.handlers.types.ts b/src/agents/pi-embedded-subscribe.handlers.types.ts index 542e59b524a..e0e544e4b7b 100644 --- a/src/agents/pi-embedded-subscribe.handlers.types.ts +++ b/src/agents/pi-embedded-subscribe.handlers.types.ts @@ -81,6 +81,7 @@ export type EmbeddedPiSubscribeState = { pendingMessagingMediaUrls: Map; pendingToolMediaUrls: string[]; pendingToolAudioAsVoice: boolean; + pendingToolTrustedLocalMedia: boolean; deterministicApprovalPromptPending: boolean; deterministicApprovalPromptSent: boolean; lastAssistant?: AgentMessage; @@ -165,6 +166,7 @@ export type ToolHandlerState = Pick< | "pendingMessagingMediaUrls" | "pendingToolMediaUrls" | "pendingToolAudioAsVoice" + | "pendingToolTrustedLocalMedia" | "deterministicApprovalPromptPending" | "replayState" | "messagingToolSentTexts" diff --git a/src/agents/pi-embedded-subscribe.tools.media.test.ts b/src/agents/pi-embedded-subscribe.tools.media.test.ts index 443d07c2a34..8ffab15b574 100644 --- a/src/agents/pi-embedded-subscribe.tools.media.test.ts +++ b/src/agents/pi-embedded-subscribe.tools.media.test.ts @@ -51,6 +51,22 @@ describe("extractToolResultMediaPaths", () => { }); }); + it("extracts structured media trust markers", () => { + expect( + extractToolResultMediaArtifact({ + details: { + media: { + mediaUrl: "/tmp/reply.opus", + trustedLocalMedia: true, + }, + }, + }), + ).toEqual({ + mediaUrls: ["/tmp/reply.opus"], + trustedLocalMedia: true, + }); + }); + it("extracts MEDIA: path from text content block", () => { const result = { content: [ diff --git a/src/agents/pi-embedded-subscribe.tools.ts b/src/agents/pi-embedded-subscribe.tools.ts index 7021bcb3b72..14c2be9119f 100644 --- a/src/agents/pi-embedded-subscribe.tools.ts +++ b/src/agents/pi-embedded-subscribe.tools.ts @@ -249,6 +249,7 @@ export function filterToolResultMediaUrls( export type ToolResultMediaArtifact = { mediaUrls: string[]; audioAsVoice?: boolean; + trustedLocalMedia?: boolean; }; function readToolResultDetailsMedia( @@ -292,6 +293,7 @@ export function extractToolResultMediaArtifact( return { mediaUrls, ...(detailsMedia.audioAsVoice === true ? { audioAsVoice: true } : {}), + ...(detailsMedia.trustedLocalMedia === true ? { trustedLocalMedia: true } : {}), }; } } diff --git a/src/agents/pi-embedded-subscribe.ts b/src/agents/pi-embedded-subscribe.ts index 1fddc10b216..3197f575f2c 100644 --- a/src/agents/pi-embedded-subscribe.ts +++ b/src/agents/pi-embedded-subscribe.ts @@ -123,6 +123,7 @@ export function subscribeEmbeddedPiSession(params: SubscribeEmbeddedPiSessionPar pendingMessagingMediaUrls: new Map(), pendingToolMediaUrls: initialPendingToolMediaUrls, pendingToolAudioAsVoice: false, + pendingToolTrustedLocalMedia: false, deterministicApprovalPromptPending: false, deterministicApprovalPromptSent: false, }; diff --git a/src/agents/pi-tool-handler-state.test-helpers.ts b/src/agents/pi-tool-handler-state.test-helpers.ts index 600395e8987..aa8d6186f07 100644 --- a/src/agents/pi-tool-handler-state.test-helpers.ts +++ b/src/agents/pi-tool-handler-state.test-helpers.ts @@ -15,6 +15,7 @@ export function createBaseToolHandlerState() { pendingMessagingMediaUrls: new Map(), pendingToolMediaUrls: [] as string[], pendingToolAudioAsVoice: false, + pendingToolTrustedLocalMedia: false, deterministicApprovalPromptPending: false, messagingToolSentTexts: [] as string[], messagingToolSentTextsNormalized: [] as string[], diff --git a/src/agents/tools/tts-tool.test.ts b/src/agents/tools/tts-tool.test.ts index 7faa0790950..41cd1f6da58 100644 --- a/src/agents/tools/tts-tool.test.ts +++ b/src/agents/tools/tts-tool.test.ts @@ -35,6 +35,7 @@ describe("createTtsTool", () => { provider: "test", media: { mediaUrl: "/tmp/reply.opus", + trustedLocalMedia: true, audioAsVoice: true, }, }, diff --git a/src/agents/tools/tts-tool.ts b/src/agents/tools/tts-tool.ts index ff789f2787b..b192321cbbd 100644 --- a/src/agents/tools/tts-tool.ts +++ b/src/agents/tools/tts-tool.ts @@ -43,6 +43,7 @@ export function createTtsTool(opts?: { provider: result.provider, media: { mediaUrl: result.audioPath, + trustedLocalMedia: true, ...(result.voiceCompatible ? { audioAsVoice: true } : {}), }, }, diff --git a/src/auto-reply/reply-payload.ts b/src/auto-reply/reply-payload.ts index da257eca2ad..0aacb840371 100644 --- a/src/auto-reply/reply-payload.ts +++ b/src/auto-reply/reply-payload.ts @@ -8,6 +8,10 @@ export type ReplyPayload = { text?: string; mediaUrl?: string; mediaUrls?: string[]; + /** Internal-only trust signal for gateway webchat local media embedding. */ + trustedLocalMedia?: boolean; + /** Treat media as live-only content and avoid persisting the underlying media reference. */ + sensitiveMedia?: boolean; /** Channel-agnostic rich presentation. Core degrades or asks the channel renderer to map it. */ presentation?: MessagePresentation; /** Channel-agnostic delivery preferences, e.g. pin the sent message when supported. */ diff --git a/src/auto-reply/reply/commands-tts.ts b/src/auto-reply/reply/commands-tts.ts index 46a054d8280..b6567857bd1 100644 --- a/src/auto-reply/reply/commands-tts.ts +++ b/src/auto-reply/reply/commands-tts.ts @@ -167,6 +167,7 @@ export const handleTtsCommands: CommandHandler = async (params, allowTextCommand const payload: ReplyPayload = { mediaUrl: result.audioPath, audioAsVoice: result.voiceCompatible === true, + trustedLocalMedia: true, }; return { shouldContinue: false, reply: payload }; } diff --git a/src/gateway/server-methods/chat-webchat-media.test.ts b/src/gateway/server-methods/chat-webchat-media.test.ts index 33cc2662eb7..67de5784ede 100644 --- a/src/gateway/server-methods/chat-webchat-media.test.ts +++ b/src/gateway/server-methods/chat-webchat-media.test.ts @@ -4,7 +4,10 @@ import path from "node:path"; import { pathToFileURL } from "node:url"; import { afterEach, describe, expect, it, vi } from "vitest"; import { getDefaultLocalRoots } from "../../media/local-media-access.js"; -import { buildWebchatAudioContentBlocksFromReplyPayloads } from "./chat-webchat-media.js"; +import { + buildWebchatAssistantMessageFromReplyPayloads, + buildWebchatAudioContentBlocksFromReplyPayloads, +} from "./chat-webchat-media.js"; describe("buildWebchatAudioContentBlocksFromReplyPayloads", () => { let tmpDir: string | undefined; @@ -22,7 +25,7 @@ describe("buildWebchatAudioContentBlocksFromReplyPayloads", () => { fs.writeFileSync(audioPath, Buffer.from([0xff, 0xfb, 0x90, 0x00])); const blocks = await buildWebchatAudioContentBlocksFromReplyPayloads( - [{ mediaUrl: audioPath }], + [{ mediaUrl: audioPath, trustedLocalMedia: true }], { localRoots: [tmpDir] }, ); @@ -42,7 +45,7 @@ describe("buildWebchatAudioContentBlocksFromReplyPayloads", () => { it("skips remote URLs", async () => { const blocks = await buildWebchatAudioContentBlocksFromReplyPayloads([ - { mediaUrl: "https://example.com/a.mp3" }, + { mediaUrl: "https://example.com/a.mp3", trustedLocalMedia: true }, ]); expect(blocks).toHaveLength(0); }); @@ -53,7 +56,7 @@ describe("buildWebchatAudioContentBlocksFromReplyPayloads", () => { fs.writeFileSync(imagePath, Buffer.from([0x89, 0x50, 0x4e, 0x47])); const blocks = await buildWebchatAudioContentBlocksFromReplyPayloads( - [{ mediaUrl: imagePath }], + [{ mediaUrl: imagePath, trustedLocalMedia: true }], { localRoots: [tmpDir] }, ); @@ -66,7 +69,10 @@ describe("buildWebchatAudioContentBlocksFromReplyPayloads", () => { fs.writeFileSync(audioPath, Buffer.from([0x00])); const blocks = await buildWebchatAudioContentBlocksFromReplyPayloads( - [{ mediaUrl: audioPath }, { mediaUrl: audioPath }], + [ + { mediaUrl: audioPath, trustedLocalMedia: true }, + { mediaUrl: audioPath, trustedLocalMedia: true }, + ], { localRoots: [tmpDir] }, ); expect(blocks).toHaveLength(1); @@ -78,9 +84,12 @@ describe("buildWebchatAudioContentBlocksFromReplyPayloads", () => { fs.writeFileSync(audioPath, Buffer.from([0x01])); const fileUrl = pathToFileURL(audioPath).href; - const blocks = await buildWebchatAudioContentBlocksFromReplyPayloads([{ mediaUrl: fileUrl }], { - localRoots: [tmpDir], - }); + const blocks = await buildWebchatAudioContentBlocksFromReplyPayloads( + [{ mediaUrl: fileUrl, trustedLocalMedia: true }], + { + localRoots: [tmpDir], + }, + ); expect(blocks).toHaveLength(1); expect((blocks[0] as { type?: string }).type).toBe("audio"); @@ -94,6 +103,7 @@ describe("buildWebchatAudioContentBlocksFromReplyPayloads", () => { { text: "MEDIA:file://attacker/share/probe.mp3", mediaUrl: "file://attacker/share/probe.mp3", + trustedLocalMedia: true, }, ]); @@ -116,7 +126,7 @@ describe("buildWebchatAudioContentBlocksFromReplyPayloads", () => { const onLocalAudioAccessDenied = vi.fn(); const blocks = await buildWebchatAudioContentBlocksFromReplyPayloads( - [{ mediaUrl: audioPath }], + [{ mediaUrl: audioPath, trustedLocalMedia: true }], { localRoots: [allowedRoot], onLocalAudioAccessDenied, @@ -136,7 +146,9 @@ describe("buildWebchatAudioContentBlocksFromReplyPayloads", () => { const audioPath = path.join(tmpDir, "clip.mp3"); fs.writeFileSync(audioPath, Buffer.from([0x04])); - const blocks = await buildWebchatAudioContentBlocksFromReplyPayloads([{ mediaUrl: audioPath }]); + const blocks = await buildWebchatAudioContentBlocksFromReplyPayloads([ + { mediaUrl: audioPath, trustedLocalMedia: true }, + ]); expect(blocks).toHaveLength(1); expect((blocks[0] as { type?: string }).type).toBe("audio"); @@ -157,7 +169,7 @@ describe("buildWebchatAudioContentBlocksFromReplyPayloads", () => { const readSpy = vi.spyOn(fs, "readFileSync"); const blocks = await buildWebchatAudioContentBlocksFromReplyPayloads( - [{ mediaUrl: audioPath }], + [{ mediaUrl: audioPath, trustedLocalMedia: true }], { localRoots: [tmpDir] }, ); @@ -167,4 +179,121 @@ describe("buildWebchatAudioContentBlocksFromReplyPayloads", () => { statSpy.mockRestore(); readSpy.mockRestore(); }); + + it("rejects untrusted local audio paths", async () => { + tmpDir = fs.mkdtempSync(path.join(os.tmpdir(), "openclaw-webchat-audio-")); + const audioPath = path.join(tmpDir, "clip.mp3"); + fs.writeFileSync(audioPath, Buffer.from([0xff, 0xfb, 0x90, 0x00])); + + const blocks = await buildWebchatAudioContentBlocksFromReplyPayloads( + [{ mediaUrl: audioPath }], + { localRoots: [tmpDir] }, + ); + + expect(blocks).toHaveLength(0); + }); +}); + +describe("buildWebchatAssistantMessageFromReplyPayloads", () => { + it("converts image data URLs into webchat image blocks", async () => { + const message = await buildWebchatAssistantMessageFromReplyPayloads([ + { + text: "Scan this QR code with the OpenClaw iOS app:", + mediaUrl: "data:image/png;base64,cG5n", + }, + ]); + + expect(message).toEqual({ + transcriptText: "Scan this QR code with the OpenClaw iOS app:", + content: [ + { type: "text", text: "Scan this QR code with the OpenClaw iOS app:" }, + { type: "input_image", image_url: "data:image/png;base64,cG5n" }, + ], + }); + }); + + it("suppresses control tokens and falls back to synthetic image text", async () => { + const message = await buildWebchatAssistantMessageFromReplyPayloads([ + { + text: "NO_REPLY", + mediaUrl: "data:image/png;base64,cG5n", + }, + ]); + + expect(message).toEqual({ + transcriptText: "Image reply", + content: [ + { type: "text", text: "Image reply" }, + { type: "input_image", image_url: "data:image/png;base64,cG5n" }, + ], + }); + }); + + it("preserves reply directives in transcript text for media replies", async () => { + const message = await buildWebchatAssistantMessageFromReplyPayloads([ + { + replyToCurrent: true, + mediaUrl: "data:image/png;base64,cG5n", + }, + ]); + + expect(message).toEqual({ + transcriptText: "[[reply_to_current]]Image reply", + content: [ + { type: "text", text: "[[reply_to_current]]Image reply" }, + { type: "input_image", image_url: "data:image/png;base64,cG5n" }, + ], + }); + }); + + it("drops oversized data image URLs", async () => { + const hugeBase64 = "A".repeat(2_100_000); + const message = await buildWebchatAssistantMessageFromReplyPayloads([ + { + text: "too large", + mediaUrl: `data:image/png;base64,${hugeBase64}`, + }, + ]); + + expect(message).toBeNull(); + }); + + it("rejects remote image URLs", async () => { + const message = await buildWebchatAssistantMessageFromReplyPayloads([ + { + text: "remote", + mediaUrl: "https://example.com/final.png", + }, + ]); + + expect(message).toBeNull(); + }); + + it("rejects svg data URLs", async () => { + const message = await buildWebchatAssistantMessageFromReplyPayloads([ + { + text: "svg", + mediaUrl: "data:image/svg+xml;base64,PHN2Zy8+", + }, + ]); + + expect(message).toBeNull(); + }); + + it("sanitizes reply ids before embedding directive prefixes", async () => { + const message = await buildWebchatAssistantMessageFromReplyPayloads([ + { + replyToId: "abc]]\n[[audio_as_voice]]", + mediaUrl: "data:image/png;base64,cG5n", + }, + ]); + + expect(message).toEqual({ + transcriptText: "[[reply_to:abcaudio_as_voice]]Image reply", + content: [ + { type: "text", text: "[[reply_to:abcaudio_as_voice]]Image reply" }, + { type: "input_image", image_url: "data:image/png;base64,cG5n" }, + ], + }); + }); }); diff --git a/src/gateway/server-methods/chat-webchat-media.ts b/src/gateway/server-methods/chat-webchat-media.ts index d13113f3f71..c39e0077dd4 100644 --- a/src/gateway/server-methods/chat-webchat-media.ts +++ b/src/gateway/server-methods/chat-webchat-media.ts @@ -6,9 +6,22 @@ import { assertLocalMediaAllowed, LocalMediaAccessError } from "../../media/loca import { isAudioFileName } from "../../media/mime.js"; import { resolveSendableOutboundReplyParts } from "../../plugin-sdk/reply-payload.js"; import { normalizeLowercaseStringOrEmpty } from "../../shared/string-coerce.js"; +import { sanitizeReplyDirectiveId } from "../../utils/directive-tags.js"; +import { isSuppressedControlReplyText } from "../control-reply-text.js"; /** Cap embedded audio size to avoid multi‑MB payloads on the chat WebSocket. */ const MAX_WEBCHAT_AUDIO_BYTES = 15 * 1024 * 1024; +const MAX_WEBCHAT_IMAGE_DATA_URL_CHARS = 2_000_000; +const MAX_WEBCHAT_IMAGE_DATA_BYTES = 1_500_000; +const ALLOWED_WEBCHAT_DATA_IMAGE_MEDIA_TYPES = new Set([ + "image/apng", + "image/avif", + "image/bmp", + "image/gif", + "image/jpeg", + "image/png", + "image/webp", +]); const MIME_BY_EXT: Record = { ".aac": "audio/aac", @@ -26,6 +39,8 @@ type WebchatAudioEmbeddingOptions = { onLocalAudioAccessDenied?: (err: LocalMediaAccessError) => void; }; +type WebchatAssistantMediaOptions = WebchatAudioEmbeddingOptions; + /** Map `mediaUrl` strings to an absolute filesystem path for local embedding (plain paths or `file:` URLs). */ function resolveLocalMediaPathForEmbedding(raw: string): string | null { const trimmed = raw.trim(); @@ -62,9 +77,13 @@ function resolveLocalMediaPathForEmbedding(raw: string): string | null { /** Returns a readable local file path when it is a regular file and within the size cap (single stat before read). */ async function resolveLocalAudioFileForEmbedding( + payload: ReplyPayload, raw: string, options: WebchatAudioEmbeddingOptions | undefined, ): Promise { + if (payload.trustedLocalMedia !== true) { + return null; + } const resolved = resolveLocalMediaPathForEmbedding(raw); if (!resolved) { return null; @@ -92,6 +111,47 @@ function mimeTypeForPath(filePath: string): string { return MIME_BY_EXT[ext] ?? "audio/mpeg"; } +function estimateBase64DecodedBytes(base64: string): number { + const sanitized = base64.replace(/\s+/g, ""); + const padding = + sanitized.endsWith("==") ? 2 : sanitized.endsWith("=") ? 1 : 0; + return Math.floor((sanitized.length * 3) / 4) - padding; +} + +function resolveEmbeddableImageUrl(url: string): string | null { + const trimmed = url.trim(); + if (!trimmed) { + return null; + } + if (trimmed.length > MAX_WEBCHAT_IMAGE_DATA_URL_CHARS) { + return null; + } + const match = /^data:(image\/[a-z0-9.+-]+);base64,([a-z0-9+/=\s]+)$/i.exec(trimmed); + if (!match) { + return null; + } + const mediaType = normalizeLowercaseStringOrEmpty(match[1]); + const base64Data = match[2]; + if (!ALLOWED_WEBCHAT_DATA_IMAGE_MEDIA_TYPES.has(mediaType)) { + return null; + } + if (estimateBase64DecodedBytes(base64Data) > MAX_WEBCHAT_IMAGE_DATA_BYTES) { + return null; + } + return trimmed; +} + +function resolveReplyDirectivePrefix(payload: ReplyPayload): string { + const replyToId = sanitizeReplyDirectiveId(payload.replyToId); + if (replyToId) { + return `[[reply_to:${replyToId}]]`; + } + if (payload.replyToCurrent) { + return "[[reply_to_current]]"; + } + return ""; +} + /** * Build Control UI / transcript `content` blocks for local TTS (or other) audio files * referenced by slash-command / agent replies when the webchat path only had text aggregation. @@ -109,7 +169,7 @@ export async function buildWebchatAudioContentBlocksFromReplyPayloads( if (!url) { continue; } - const resolved = await resolveLocalAudioFileForEmbedding(url, options); + const resolved = await resolveLocalAudioFileForEmbedding(payload, url, options); if (!resolved || seen.has(resolved)) { continue; } @@ -123,6 +183,87 @@ export async function buildWebchatAudioContentBlocksFromReplyPayloads( return blocks; } +export async function buildWebchatAssistantMessageFromReplyPayloads( + payloads: ReplyPayload[], + options?: WebchatAssistantMediaOptions, +): Promise<{ content: Array>; transcriptText: string } | null> { + const content: Array> = []; + const transcriptTextParts: string[] = []; + const seenAudio = new Set(); + const seenImages = new Set(); + let hasAudio = false; + let hasImage = false; + + for (const payload of payloads) { + const visibleText = payload.text?.trim(); + const text = + visibleText && !isSuppressedControlReplyText(visibleText) ? visibleText : undefined; + const replyDirectivePrefix = resolveReplyDirectivePrefix(payload); + let payloadHasAudio = false; + let payloadHasImage = false; + const payloadMediaBlocks: Array> = []; + const parts = resolveSendableOutboundReplyParts(payload); + for (const raw of parts.mediaUrls) { + const url = raw.trim(); + if (!url) { + continue; + } + const resolvedAudioPath = await resolveLocalAudioFileForEmbedding(payload, url, options); + if (resolvedAudioPath) { + if (seenAudio.has(resolvedAudioPath)) { + continue; + } + seenAudio.add(resolvedAudioPath); + const block = tryReadLocalAudioContentBlock(resolvedAudioPath); + if (block) { + payloadMediaBlocks.push(block); + hasAudio = true; + payloadHasAudio = true; + } + continue; + } + const imageUrl = resolveEmbeddableImageUrl(url); + if (!imageUrl || seenImages.has(imageUrl)) { + continue; + } + seenImages.add(imageUrl); + payloadMediaBlocks.push({ type: "input_image", image_url: imageUrl }); + hasImage = true; + payloadHasImage = true; + } + const needsSyntheticText = + payloadMediaBlocks.length > 0 && (!text || replyDirectivePrefix) && transcriptTextParts.length === 0; + const syntheticText = needsSyntheticText + ? payloadHasAudio && payloadHasImage + ? "Media reply" + : payloadHasAudio + ? "Audio reply" + : "Image reply" + : undefined; + const blockText = text ?? syntheticText; + if (blockText) { + const fullText = replyDirectivePrefix ? `${replyDirectivePrefix}${blockText}` : blockText; + transcriptTextParts.push(fullText); + content.push({ type: "text", text: fullText }); + } else if (replyDirectivePrefix) { + transcriptTextParts.push(replyDirectivePrefix); + content.push({ type: "text", text: replyDirectivePrefix }); + } + content.push(...payloadMediaBlocks); + } + + if (!hasAudio && !hasImage) { + return null; + } + const transcriptText = + transcriptTextParts.join("\n\n").trim() || + (hasAudio && hasImage ? "Media reply" : hasAudio ? "Audio reply" : "Image reply"); + if (transcriptTextParts.length === 0) { + content.unshift({ type: "text", text: transcriptText }); + } + return { content, transcriptText }; +} + function tryReadLocalAudioContentBlock(filePath: string): Record | null { try { const buf = fs.readFileSync(filePath); diff --git a/src/gateway/server-methods/chat.directive-tags.test.ts b/src/gateway/server-methods/chat.directive-tags.test.ts index e871929da69..4cc6ea70e77 100644 --- a/src/gateway/server-methods/chat.directive-tags.test.ts +++ b/src/gateway/server-methods/chat.directive-tags.test.ts @@ -20,10 +20,23 @@ const mockState = vi.hoisted(() => ({ sessionId: "sess-1", mainSessionKey: "main", finalText: "[[reply_to_current]]", - finalPayload: null as { text?: string; mediaUrl?: string } | null, + finalPayload: null as { + text?: string; + mediaUrl?: string; + sensitiveMedia?: boolean; + replyToId?: string; + replyToCurrent?: boolean; + } | null, dispatchedReplies: [] as Array<{ kind: "tool" | "block" | "final"; - payload: { text?: string; mediaUrl?: string; mediaUrls?: string[] }; + payload: { + text?: string; + mediaUrl?: string; + mediaUrls?: string[]; + trustedLocalMedia?: boolean; + replyToId?: string; + replyToCurrent?: boolean; + }; }>, dispatchError: null as Error | null, triggerAgentRunStart: false, @@ -91,16 +104,28 @@ vi.mock("../../auto-reply/dispatch.js", () => ({ async (params: { ctx: MsgContext; dispatcher: { - sendFinalReply: (payload: { text?: string; mediaUrl?: string }) => boolean; + sendFinalReply: (payload: { + text?: string; + mediaUrl?: string; + sensitiveMedia?: boolean; + replyToId?: string; + replyToCurrent?: boolean; + }) => boolean; sendBlockReply: (payload: { text?: string; mediaUrl?: string; mediaUrls?: string[]; + trustedLocalMedia?: boolean; + replyToId?: string; + replyToCurrent?: boolean; }) => boolean; sendToolResult: (payload: { text?: string; mediaUrl?: string; mediaUrls?: string[]; + trustedLocalMedia?: boolean; + replyToId?: string; + replyToCurrent?: boolean; }) => boolean; markComplete: () => void; waitForIdle: () => Promise; @@ -130,9 +155,7 @@ vi.mock("../../auto-reply/dispatch.js", () => ({ params.dispatcher.sendBlockReply(reply.payload); continue; } - params.dispatcher.sendFinalReply({ - text: reply.payload.text ?? "", - }); + params.dispatcher.sendFinalReply(reply.payload); } } else { params.dispatcher.sendFinalReply(mockState.finalPayload ?? { text: mockState.finalText }); @@ -500,6 +523,7 @@ describe("chat directive tag stripping for non-streaming final payloads", () => payload: { mediaUrl: audioPath, mediaUrls: [audioPath], + trustedLocalMedia: true, }, }, ]; @@ -528,7 +552,7 @@ describe("chat directive tag stripping for non-streaming final payloads", () => expect(assistantUpdate).toMatchObject({ message: { role: "assistant", - idempotencyKey: "idem-agent-audio:assistant-audio", + idempotencyKey: "idem-agent-audio:assistant-media", content: [ { type: "text", text: "Audio reply" }, { @@ -544,6 +568,31 @@ describe("chat directive tag stripping for non-streaming final payloads", () => }); }); + it("renders image reply payloads as assistant image content instead of MEDIA text", async () => { + createTranscriptFixture("openclaw-chat-send-agent-image-"); + mockState.finalPayload = { + text: "Scan this QR code with the OpenClaw iOS app:", + mediaUrl: "data:image/png;base64,cG5n", + }; + const respond = vi.fn(); + const context = createChatContext(); + + const payload = await runNonStreamingChatSend({ + context, + respond, + idempotencyKey: "idem-agent-image", + }); + + expect(payload?.message).toMatchObject({ + role: "assistant", + content: [ + { type: "text", text: "Scan this QR code with the OpenClaw iOS app:" }, + { type: "input_image", image_url: "data:image/png;base64,cG5n" }, + ], + }); + expect(JSON.stringify(payload?.message)).not.toContain("MEDIA:data:image/png;base64,cG5n"); + }); + it("chat.inject keeps message defined when directive tag is the only content", async () => { createTranscriptFixture("openclaw-chat-inject-directive-only-"); const respond = vi.fn(); @@ -693,7 +742,7 @@ describe("chat directive tag stripping for non-streaming final payloads", () => respond, idempotencyKey: "idem-untrusted-context", }); - expect(extractFirstTextBlock(payload)).toBe("hello"); + expect(extractFirstTextBlock(payload)?.trim()).toBe("hello"); }); it("chat.send non-streaming final broadcasts and routes on the canonical session key", async () => { @@ -1867,7 +1916,7 @@ describe("chat directive tag stripping for non-streaming final payloads", () => it("preserves media-only final replies in the final broadcast message", async () => { createTranscriptFixture("openclaw-chat-send-media-only-final-"); - mockState.finalPayload = { mediaUrl: "https://example.com/final.png" }; + mockState.finalPayload = { mediaUrl: "data:image/png;base64,cG5n" }; const respond = vi.fn(); const context = createChatContext(); @@ -1877,14 +1926,20 @@ describe("chat directive tag stripping for non-streaming final payloads", () => idempotencyKey: "idem-media-only-final", }); - expect(extractFirstTextBlock(payload)).toBe("MEDIA:https://example.com/final.png"); + expect(payload?.message).toMatchObject({ + role: "assistant", + content: [ + { type: "text", text: "Image reply" }, + { type: "input_image", image_url: "data:image/png;base64,cG5n" }, + ], + }); }); it("strips NO_REPLY from transcript text when final replies only carry media", async () => { createTranscriptFixture("openclaw-chat-send-media-only-silent-final-"); mockState.finalPayload = { text: "NO_REPLY", - mediaUrl: "https://example.com/final.png", + mediaUrl: "data:image/png;base64,cG5n", }; const respond = vi.fn(); const context = createChatContext(); @@ -1895,7 +1950,122 @@ describe("chat directive tag stripping for non-streaming final payloads", () => idempotencyKey: "idem-media-only-silent-final", }); - expect(extractFirstTextBlock(payload)).toBe("MEDIA:https://example.com/final.png"); + expect(payload?.message).toMatchObject({ + role: "assistant", + content: [ + { type: "text", text: "Image reply" }, + { type: "input_image", image_url: "data:image/png;base64,cG5n" }, + ], + }); + }); + + it("preserves reply tags in transcript updates for media replies while stripping them from the broadcast", async () => { + createTranscriptFixture("openclaw-chat-send-media-reply-tags-"); + mockState.finalPayload = { + replyToCurrent: true, + mediaUrl: "data:image/png;base64,cG5n", + }; + const respond = vi.fn(); + const context = createChatContext(); + + const payload = await runNonStreamingChatSend({ + context, + respond, + idempotencyKey: "idem-media-reply-tags", + }); + + expect(payload?.message).toMatchObject({ + role: "assistant", + content: [ + { type: "text", text: "Image reply" }, + { type: "input_image", image_url: "data:image/png;base64,cG5n" }, + ], + }); + const transcriptUpdate = mockState.emittedTranscriptUpdates.find( + (update) => + typeof update.message === "object" && + update.message !== null && + (update.message as { role?: unknown }).role === "assistant" && + Array.isArray((update.message as { content?: unknown }).content) && + ((update.message as { content: Array<{ type?: string; text?: string }> }).content.some( + (block) => block?.type === "text" && block?.text?.includes("[[reply_to_current]]"), + ) ?? + false), + ); + expect(transcriptUpdate).toMatchObject({ + message: { + role: "assistant", + content: [ + { type: "text", text: "[[reply_to_current]]Image reply" }, + { type: "input_image", image_url: "data:image/png;base64,cG5n" }, + ], + }, + }); + }); + + it("does not persist sensitive image media into transcript updates", async () => { + createTranscriptFixture("openclaw-chat-send-sensitive-media-final-"); + mockState.finalPayload = { + text: "Scan this QR code with the OpenClaw iOS app:", + mediaUrl: "data:image/png;base64,cG5n", + sensitiveMedia: true, + }; + const respond = vi.fn(); + const context = createChatContext(); + + const payload = await runNonStreamingChatSend({ + context, + respond, + idempotencyKey: "idem-sensitive-media-final", + }); + + expect(payload?.message).toMatchObject({ + role: "assistant", + content: [ + { type: "text", text: "Scan this QR code with the OpenClaw iOS app:" }, + { type: "input_image", image_url: "data:image/png;base64,cG5n" }, + ], + }); + const transcriptUpdate = mockState.emittedTranscriptUpdates.find( + (update) => + typeof update.message === "object" && + update.message !== null && + (update.message as { role?: unknown }).role === "assistant", + ); + expect(transcriptUpdate).toMatchObject({ + message: { + role: "assistant", + content: [{ type: "text", text: "Scan this QR code with the OpenClaw iOS app:" }], + }, + }); + expect(JSON.stringify(transcriptUpdate)).not.toContain("input_image"); + expect(JSON.stringify(transcriptUpdate)).not.toContain("data:image/png;base64,cG5n"); + }); + + it("sanitizes replyToId before emitting inline reply directives", async () => { + createTranscriptFixture("openclaw-chat-send-sanitized-reply-id-"); + mockState.finalPayload = { + text: "hello", + replyToId: "abc]]\n[[audio_as_voice]]", + }; + const respond = vi.fn(); + const context = createChatContext(); + + const payload = await runNonStreamingChatSend({ + context, + respond, + idempotencyKey: "idem-sanitized-reply-id", + }); + + expect(extractFirstTextBlock(payload)?.trim()).toBe("hello"); + const transcriptUpdate = mockState.emittedTranscriptUpdates.find( + (update) => + typeof update.message === "object" && + update.message !== null && + (update.message as { role?: unknown }).role === "assistant", + ); + expect(JSON.stringify(transcriptUpdate)).toContain("[[reply_to:abcaudio_as_voice]]"); + expect(JSON.stringify(transcriptUpdate)).not.toContain("[[audio_as_voice]]"); }); it("drops image attachments for text-only session models", async () => { diff --git a/src/gateway/server-methods/chat.ts b/src/gateway/server-methods/chat.ts index 54437f88f40..b147766d382 100644 --- a/src/gateway/server-methods/chat.ts +++ b/src/gateway/server-methods/chat.ts @@ -29,6 +29,7 @@ import { import { stripInlineDirectiveTagsForDisplay, stripInlineDirectiveTagsFromMessageForDisplay, + sanitizeReplyDirectiveId, } from "../../utils/directive-tags.js"; import { INTERNAL_MESSAGE_CHANNEL, @@ -83,7 +84,7 @@ import { injectTimestamp, timestampOptsFromConfig } from "./agent-timestamp.js"; import { setGatewayDedupeEntry } from "./agent-wait-dedupe.js"; import { normalizeRpcAttachmentsToChatAttachments } from "./attachment-normalize.js"; import { appendInjectedAssistantMessageToTranscript } from "./chat-transcript-inject.js"; -import { buildWebchatAudioContentBlocksFromReplyPayloads } from "./chat-webchat-media.js"; +import { buildWebchatAssistantMessageFromReplyPayloads } from "./chat-webchat-media.js"; import type { GatewayRequestContext, GatewayRequestHandlerOptions, @@ -123,26 +124,19 @@ function isMediaBearingPayload(payload: ReplyPayload): boolean { return false; } -async function buildWebchatAudioOnlyAssistantMessage( +async function buildWebchatAssistantMediaMessage( payloads: ReplyPayload[], options?: { localRoots?: readonly string[]; onLocalAudioAccessDenied?: (message: string) => void; }, ): Promise<{ content: Array>; transcriptText: string } | null> { - const audioBlocks = await buildWebchatAudioContentBlocksFromReplyPayloads(payloads, { + return buildWebchatAssistantMessageFromReplyPayloads(payloads, { localRoots: options?.localRoots, onLocalAudioAccessDenied: (err) => { options?.onLocalAudioAccessDenied?.(formatForLog(err)); }, }); - if (audioBlocks.length === 0) { - return null; - } - return { - transcriptText: "Audio reply", - content: [{ type: "text", text: "Audio reply" }, ...audioBlocks], - }; } export const DEFAULT_CHAT_HISTORY_TEXT_MAX_CHARS = 8_000; @@ -225,8 +219,9 @@ function buildTranscriptReplyText(payloads: ReplyPayload[]): string { .map((payload) => { const parts = resolveSendableOutboundReplyParts(payload); const lines: string[] = []; - if (typeof payload.replyToId === "string" && payload.replyToId.trim()) { - lines.push(`[[reply_to:${payload.replyToId.trim()}]]`); + const replyToId = sanitizeReplyDirectiveId(payload.replyToId); + if (replyToId) { + lines.push(`[[reply_to:${replyToId}]]`); } else if (payload.replyToCurrent) { lines.push("[[reply_to_current]]"); } @@ -235,6 +230,9 @@ function buildTranscriptReplyText(payloads: ReplyPayload[]): string { lines.push(text); } for (const mediaUrl of parts.mediaUrls) { + if (payload.sensitiveMedia === true) { + continue; + } const trimmed = mediaUrl.trim(); if (trimmed) { lines.push(`MEDIA:${trimmed}`); @@ -249,6 +247,10 @@ function buildTranscriptReplyText(payloads: ReplyPayload[]): string { return chunks.join("\n\n").trim(); } +function hasSensitiveMediaPayload(payloads: ReplyPayload[]): boolean { + return payloads.some((payload) => payload.sensitiveMedia === true && isMediaBearingPayload(payload)); +} + function resolveChatSendOriginatingRoute(params: { client?: { mode?: string | null; id?: string | null } | null; deliver?: boolean; @@ -2036,7 +2038,7 @@ export const chatHandlers: GatewayRequestHandlers = { channel: INTERNAL_MESSAGE_CHANNEL, }); const deliveredReplies: Array<{ payload: ReplyPayload; kind: "block" | "final" }> = []; - let appendedWebchatAgentAudio = false; + let appendedWebchatAgentMedia = false; let userTranscriptUpdatePromise: Promise | null = null; const emitUserTranscriptUpdate = async () => { if (userTranscriptUpdatePromise) { @@ -2098,37 +2100,37 @@ export const chatHandlers: GatewayRequestHandlers = { savedImages: await persistedImagesPromise, }); }; - const appendWebchatAgentAudioTranscriptIfNeeded = async (payload: ReplyPayload) => { - if (!agentRunStarted || appendedWebchatAgentAudio || !isMediaBearingPayload(payload)) { + const appendWebchatAgentMediaTranscriptIfNeeded = async (payload: ReplyPayload) => { + if (!agentRunStarted || appendedWebchatAgentMedia || !isMediaBearingPayload(payload)) { return; } - const audioMessage = await buildWebchatAudioOnlyAssistantMessage([payload], { + const mediaMessage = await buildWebchatAssistantMediaMessage([payload], { localRoots: getAgentScopedMediaLocalRoots(cfg, agentId), onLocalAudioAccessDenied: (message) => { context.logGateway.warn(`webchat audio embedding denied local path: ${message}`); }, }); - if (!audioMessage) { + if (!mediaMessage) { return; } const { storePath: latestStorePath, entry: latestEntry } = loadSessionEntry(sessionKey); const sessionId = latestEntry?.sessionId ?? entry?.sessionId ?? clientRunId; const appended = appendAssistantTranscriptMessage({ - message: audioMessage.transcriptText, - content: audioMessage.content, + message: mediaMessage.transcriptText, + ...(payload.sensitiveMedia === true ? {} : { content: mediaMessage.content }), sessionId, storePath: latestStorePath, sessionFile: latestEntry?.sessionFile, agentId, createIfMissing: true, - idempotencyKey: `${clientRunId}:assistant-audio`, + idempotencyKey: `${clientRunId}:assistant-media`, }); if (appended.ok) { - appendedWebchatAgentAudio = true; + appendedWebchatAgentMedia = true; return; } context.logGateway.warn( - `webchat transcript append failed for audio reply: ${appended.error ?? "unknown error"}`, + `webchat transcript append failed for media reply: ${appended.error ?? "unknown error"}`, ); }; const dispatcher = createReplyDispatcher({ @@ -2141,7 +2143,7 @@ export const chatHandlers: GatewayRequestHandlers = { case "block": case "final": deliveredReplies.push({ payload, kind: info.kind }); - await appendWebchatAgentAudioTranscriptIfNeeded(payload); + await appendWebchatAgentMediaTranscriptIfNeeded(payload); break; case "tool": // Tool results that carry audio (e.g. the TTS tool) must be promoted @@ -2231,18 +2233,25 @@ export const chatHandlers: GatewayRequestHandlers = { sessionKey, }); } else { - const combinedReply = buildTranscriptReplyText( - deliveredReplies - .filter((entry) => entry.kind === "final") - .map((entry) => entry.payload), - ); + const finalPayloads = deliveredReplies + .filter((entry) => entry.kind === "final") + .map((entry) => entry.payload); + const combinedReply = buildTranscriptReplyText(finalPayloads); + const mediaMessage = await buildWebchatAssistantMediaMessage(finalPayloads, { + localRoots: getAgentScopedMediaLocalRoots(cfg, agentId), + onLocalAudioAccessDenied: (message) => { + context.logGateway.warn(`webchat audio embedding denied local path: ${message}`); + }, + }); + const hasSensitiveMedia = hasSensitiveMediaPayload(finalPayloads); let message: Record | undefined; - if (combinedReply) { + if (mediaMessage || combinedReply) { const { storePath: latestStorePath, entry: latestEntry } = loadSessionEntry(sessionKey); const sessionId = latestEntry?.sessionId ?? entry?.sessionId ?? clientRunId; const appended = appendAssistantTranscriptMessage({ - message: combinedReply, + message: mediaMessage?.transcriptText ?? combinedReply, + ...(mediaMessage && !hasSensitiveMedia ? { content: mediaMessage.content } : {}), sessionId, storePath: latestStorePath, sessionFile: latestEntry?.sessionFile, @@ -2250,7 +2259,14 @@ export const chatHandlers: GatewayRequestHandlers = { createIfMissing: true, }); if (appended.ok) { - message = appended.message; + if (hasSensitiveMedia && mediaMessage) { + message = { + ...appended.message, + content: mediaMessage.content, + }; + } else { + message = appended.message; + } } else { context.logGateway.warn( `webchat transcript append failed: ${appended.error ?? "unknown error"}`, @@ -2258,7 +2274,7 @@ export const chatHandlers: GatewayRequestHandlers = { const now = Date.now(); message = { role: "assistant", - content: [{ type: "text", text: combinedReply }], + content: mediaMessage?.content ?? [{ type: "text", text: combinedReply }], timestamp: now, // Keep this compatible with Pi stopReason enums even though this message isn't // persisted to the transcript due to the append failure. diff --git a/src/plugin-sdk/approval-client-helpers.ts b/src/plugin-sdk/approval-client-helpers.ts index 4988ebcee5a..3c73703ef7f 100644 --- a/src/plugin-sdk/approval-client-helpers.ts +++ b/src/plugin-sdk/approval-client-helpers.ts @@ -1,4 +1,3 @@ -import type { ReplyPayload } from "../auto-reply/reply-payload.js"; import type { ExecApprovalForwardTarget } from "../config/types.approvals.js"; import { matchesApprovalRequestFilters } from "../infra/approval-request-filters.js"; import { getExecApprovalReplyMetadata } from "../infra/exec-approval-reply.js"; @@ -9,6 +8,7 @@ import { normalizeOptionalString, } from "../shared/string-coerce.js"; import type { OpenClawConfig } from "./config-runtime.js"; +import type { ReplyPayload } from "./reply-payload.js"; import { normalizeAccountId } from "./routing.js"; type ApprovalRequest = ExecApprovalRequest | PluginApprovalRequest; diff --git a/src/plugin-sdk/approval-renderers.ts b/src/plugin-sdk/approval-renderers.ts index e36c0d127d5..e693e66e6e6 100644 --- a/src/plugin-sdk/approval-renderers.ts +++ b/src/plugin-sdk/approval-renderers.ts @@ -1,4 +1,3 @@ -import type { ReplyPayload } from "../auto-reply/reply-payload.js"; import { buildApprovalInteractiveReply, type ExecApprovalReplyDecision, @@ -10,6 +9,7 @@ import { type PluginApprovalResolved, } from "../infra/plugin-approvals.js"; import { normalizeOptionalString } from "../shared/string-coerce.js"; +import type { ReplyPayload } from "./reply-payload.js"; const DEFAULT_ALLOWED_DECISIONS = ["allow-once", "allow-always", "deny"] as const; diff --git a/src/plugin-sdk/channel-reply-pipeline.ts b/src/plugin-sdk/channel-reply-pipeline.ts index d91bb20827f..8516e28b9ec 100644 --- a/src/plugin-sdk/channel-reply-pipeline.ts +++ b/src/plugin-sdk/channel-reply-pipeline.ts @@ -1,4 +1,3 @@ -import type { ReplyPayload } from "../auto-reply/reply-payload.js"; import { getChannelPlugin, normalizeChannelId } from "../channels/plugins/index.js"; import { createReplyPrefixContext, @@ -11,6 +10,7 @@ import { type CreateTypingCallbacksParams, type TypingCallbacks, } from "../channels/typing.js"; +import type { ReplyPayload } from "./reply-payload.js"; export type ReplyPrefixContext = ReplyPrefixContextBundle["prefixContext"]; export type { ReplyPrefixContextBundle, ReplyPrefixOptions }; diff --git a/src/plugin-sdk/core.ts b/src/plugin-sdk/core.ts index a38db81b67b..80c325606fd 100644 --- a/src/plugin-sdk/core.ts +++ b/src/plugin-sdk/core.ts @@ -109,7 +109,7 @@ export type { export type { OpenClawConfig } from "../config/config.js"; export type { OutboundIdentity } from "../infra/outbound/identity.js"; export type { HistoryEntry } from "../auto-reply/reply/history.js"; -export type { ReplyPayload } from "../auto-reply/reply-payload.js"; +export type { ReplyPayload } from "./reply-payload.js"; export type { AllowlistMatch } from "../channels/allowlist-match.js"; export type { BaseProbeResult, diff --git a/src/plugin-sdk/feishu.ts b/src/plugin-sdk/feishu.ts index 6966c97434a..36fa311f169 100644 --- a/src/plugin-sdk/feishu.ts +++ b/src/plugin-sdk/feishu.ts @@ -8,7 +8,7 @@ export { DEFAULT_GROUP_HISTORY_LIMIT, recordPendingHistoryEntryIfEnabled, } from "../auto-reply/reply/history.js"; -export type { ReplyPayload } from "../auto-reply/reply-payload.js"; +export type { ReplyPayload } from "./reply-payload.js"; export { logTypingFailure } from "../channels/logging.js"; export type { AllowlistMatch } from "../channels/plugins/allowlist-match.js"; export { buildChannelConfigSchema } from "../channels/plugins/config-schema.js"; diff --git a/src/plugin-sdk/index.ts b/src/plugin-sdk/index.ts index 87d5c3c3fcd..1812f389082 100644 --- a/src/plugin-sdk/index.ts +++ b/src/plugin-sdk/index.ts @@ -91,7 +91,7 @@ export * from "./music-generation.js"; export type { SecretInput, SecretRef } from "../config/types.secrets.js"; export type { RuntimeEnv } from "../runtime.js"; export type { HookEntry } from "../hooks/types.js"; -export type { ReplyPayload } from "../auto-reply/reply-payload.js"; +export type { ReplyPayload } from "./reply-payload.js"; export type { WizardPrompter } from "../wizard/prompts.js"; export type { ContextEngineFactory } from "../context-engine/registry.js"; export type { DiagnosticEventPayload } from "../infra/diagnostic-events.js"; diff --git a/src/plugin-sdk/line.ts b/src/plugin-sdk/line.ts index 7ae99f5c058..092ab17677a 100644 --- a/src/plugin-sdk/line.ts +++ b/src/plugin-sdk/line.ts @@ -5,7 +5,7 @@ export type { } from "../channels/plugins/types.public.js"; export type { ChannelPlugin } from "../channels/plugins/types.plugin.js"; export type { OpenClawConfig } from "../config/config.js"; -export type { ReplyPayload } from "../auto-reply/reply-payload.js"; +export type { ReplyPayload } from "./reply-payload.js"; export type { ChannelSetupAdapter } from "../channels/plugins/types.adapters.js"; export type { OpenClawPluginApi, PluginRuntime } from "./channel-plugin-common.js"; diff --git a/src/plugin-sdk/matrix.ts b/src/plugin-sdk/matrix.ts index d5915c9538a..8677d811253 100644 --- a/src/plugin-sdk/matrix.ts +++ b/src/plugin-sdk/matrix.ts @@ -29,7 +29,7 @@ export { readStringParam, } from "../agents/tools/common.js"; export type { BlockReplyContext } from "../auto-reply/get-reply-options.types.js"; -export type { ReplyPayload } from "../auto-reply/reply-payload.js"; +export type { ReplyPayload } from "./reply-payload.js"; export { resolveAckReaction } from "../agents/identity.js"; export { compileAllowlist, diff --git a/src/plugin-sdk/mattermost.ts b/src/plugin-sdk/mattermost.ts index 2a771521ccf..8b0f35c4e84 100644 --- a/src/plugin-sdk/mattermost.ts +++ b/src/plugin-sdk/mattermost.ts @@ -10,7 +10,7 @@ export { recordPendingHistoryEntryIfEnabled, } from "../auto-reply/reply/history.js"; export { listSkillCommandsForAgents } from "../auto-reply/skill-commands.js"; -export type { ReplyPayload } from "../auto-reply/reply-payload.js"; +export type { ReplyPayload } from "./reply-payload.js"; export type { ChatType } from "../channels/chat-type.js"; export { resolveControlCommandGate } from "../channels/command-gating.js"; export { logInboundDrop, logTypingFailure } from "../channels/logging.js"; diff --git a/src/plugin-sdk/msteams.ts b/src/plugin-sdk/msteams.ts index 81459ea3b3a..835e53571c8 100644 --- a/src/plugin-sdk/msteams.ts +++ b/src/plugin-sdk/msteams.ts @@ -12,7 +12,7 @@ export { recordPendingHistoryEntryIfEnabled, } from "../auto-reply/reply/history.js"; export { isSilentReplyText, SILENT_REPLY_TOKEN } from "../auto-reply/tokens.js"; -export type { ReplyPayload } from "../auto-reply/reply-payload.js"; +export type { ReplyPayload } from "./reply-payload.js"; export { mergeAllowlist, summarizeMapping } from "../channels/allowlists/resolve-utils.js"; export { resolveControlCommandGate, diff --git a/src/plugin-sdk/reply-chunking.ts b/src/plugin-sdk/reply-chunking.ts index 456b689e090..f572c0835ad 100644 --- a/src/plugin-sdk/reply-chunking.ts +++ b/src/plugin-sdk/reply-chunking.ts @@ -7,4 +7,4 @@ export { } from "../auto-reply/chunk.js"; export type { ChunkMode } from "../auto-reply/chunk.js"; export { isSilentReplyText } from "../auto-reply/tokens.js"; -export type { ReplyPayload } from "../auto-reply/reply-payload.js"; +export type { ReplyPayload } from "./reply-payload.js"; diff --git a/src/plugin-sdk/reply-dispatch-runtime.ts b/src/plugin-sdk/reply-dispatch-runtime.ts index 9d7054b38ea..13b25f6a8bc 100644 --- a/src/plugin-sdk/reply-dispatch-runtime.ts +++ b/src/plugin-sdk/reply-dispatch-runtime.ts @@ -4,4 +4,4 @@ export { dispatchReplyWithBufferedBlockDispatcher, dispatchReplyWithDispatcher, } from "../auto-reply/reply/provider-dispatcher.js"; -export type { ReplyPayload } from "../auto-reply/reply-payload.js"; +export type { ReplyPayload } from "./reply-payload.js"; diff --git a/src/plugin-sdk/reply-payload.test.ts b/src/plugin-sdk/reply-payload.test.ts index 3f6cb300020..66338a5f839 100644 --- a/src/plugin-sdk/reply-payload.test.ts +++ b/src/plugin-sdk/reply-payload.test.ts @@ -1,6 +1,7 @@ import { describe, expect, it, vi } from "vitest"; import { countOutboundMedia, + createNormalizedOutboundDeliverer, deliverFormattedTextWithAttachments, deliverTextOrMediaReply, hasOutboundMedia, @@ -8,6 +9,7 @@ import { hasOutboundText, isReasoningReplyPayload, isNumericTargetId, + normalizeOutboundReplyPayload, resolveOutboundMediaUrls, resolveSendableOutboundReplyParts, resolveTextChunksWithFallback, @@ -87,6 +89,45 @@ describe("sendPayloadWithChunkedTextAndMedia", () => { }); }); +describe("normalizeOutboundReplyPayload", () => { + it("strips internal-only local media trust flags from loose payload objects", () => { + expect( + normalizeOutboundReplyPayload({ + text: "hello", + mediaUrl: "/tmp/reply.opus", + trustedLocalMedia: true, + sensitiveMedia: true, + replyToId: "abc123", + }), + ).toEqual({ + text: "hello", + mediaUrl: "/tmp/reply.opus", + sensitiveMedia: true, + replyToId: "abc123", + }); + }); + + it("keeps the normalized deliverer from forwarding trustedLocalMedia", async () => { + const handler = vi.fn(async () => {}); + const deliver = createNormalizedOutboundDeliverer(handler); + + await deliver({ + text: "hello", + mediaUrl: "/tmp/reply.opus", + trustedLocalMedia: true, + sensitiveMedia: true, + }); + + expect(handler).toHaveBeenCalledWith({ + text: "hello", + mediaUrl: "/tmp/reply.opus", + sensitiveMedia: true, + replyToId: undefined, + mediaUrls: undefined, + }); + }); +}); + describe("resolveOutboundMediaUrls", () => { it.each([ { diff --git a/src/plugin-sdk/reply-payload.ts b/src/plugin-sdk/reply-payload.ts index ec564a583b7..231e34f8f5d 100644 --- a/src/plugin-sdk/reply-payload.ts +++ b/src/plugin-sdk/reply-payload.ts @@ -1,14 +1,16 @@ +import type { ReplyPayload as InternalReplyPayload } from "../auto-reply/reply-payload.js"; import type { ChannelOutboundAdapter } from "../channels/plugins/outbound.types.js"; import { normalizeLowercaseStringOrEmpty, readStringValue } from "../shared/string-coerce.js"; export type { MediaPayload, MediaPayloadInput } from "../channels/plugins/media-payload.js"; export { buildMediaPayload } from "../channels/plugins/media-payload.js"; -export type { ReplyPayload } from "../auto-reply/reply-payload.js"; +export type ReplyPayload = Omit; export type OutboundReplyPayload = { text?: string; mediaUrls?: string[]; mediaUrl?: string; + sensitiveMedia?: boolean; replyToId?: string; }; @@ -72,11 +74,13 @@ export function normalizeOutboundReplyPayload( ) : undefined; const mediaUrl = readStringValue(payload.mediaUrl); + const sensitiveMedia = payload.sensitiveMedia === true ? true : undefined; const replyToId = readStringValue(payload.replyToId); return { text, mediaUrls, mediaUrl, + sensitiveMedia, replyToId, }; } diff --git a/src/plugin-sdk/reply-runtime.ts b/src/plugin-sdk/reply-runtime.ts index ade9710f2cd..286ca6cb18d 100644 --- a/src/plugin-sdk/reply-runtime.ts +++ b/src/plugin-sdk/reply-runtime.ts @@ -54,7 +54,7 @@ export type { } from "../auto-reply/reply/reply-dispatcher.js"; export { createReplyReferencePlanner } from "../auto-reply/reply/reply-reference.js"; export type { GetReplyOptions, BlockReplyContext } from "../auto-reply/get-reply-options.types.js"; -export type { ReplyPayload } from "../auto-reply/reply-payload.js"; +export type { ReplyPayload } from "./reply-payload.js"; export type { FinalizedMsgContext, MsgContext } from "../auto-reply/templating.js"; export { generateConversationLabel } from "../auto-reply/reply/conversation-label-generator.js"; export type { ConversationLabelParams } from "../auto-reply/reply/conversation-label-generator.js"; diff --git a/src/plugin-sdk/tlon.ts b/src/plugin-sdk/tlon.ts index c21806bfb2d..36389fe64aa 100644 --- a/src/plugin-sdk/tlon.ts +++ b/src/plugin-sdk/tlon.ts @@ -3,7 +3,7 @@ import { createOptionalChannelSetupSurface } from "./channel-setup.js"; -export type { ReplyPayload } from "../auto-reply/reply-payload.js"; +export type { ReplyPayload } from "./reply-payload.js"; export { buildChannelConfigSchema } from "../channels/plugins/config-schema.js"; export { applyAccountNameToChannelSection, diff --git a/src/plugin-sdk/tts-runtime.types.ts b/src/plugin-sdk/tts-runtime.types.ts index 2a8c5b61458..f57e043a5ef 100644 --- a/src/plugin-sdk/tts-runtime.types.ts +++ b/src/plugin-sdk/tts-runtime.types.ts @@ -1,4 +1,3 @@ -import type { ReplyPayload } from "../auto-reply/reply-payload.js"; import type { OpenClawConfig } from "../config/types.openclaw.js"; import type { TtsAutoMode, TtsProvider } from "../config/types.tts.js"; import type { @@ -8,6 +7,7 @@ import type { TtsDirectiveParseResult, } from "../tts/provider-types.js"; import type { ResolvedTtsConfig, ResolvedTtsModelOverrides } from "../tts/tts-types.js"; +import type { ReplyPayload } from "./reply-payload.js"; export type { ResolvedTtsConfig, ResolvedTtsModelOverrides }; export type { TtsDirectiveOverrides, TtsDirectiveParseResult }; diff --git a/src/plugin-sdk/twitch.ts b/src/plugin-sdk/twitch.ts index f889fb01083..0ddc98853e7 100644 --- a/src/plugin-sdk/twitch.ts +++ b/src/plugin-sdk/twitch.ts @@ -3,7 +3,7 @@ import { createOptionalChannelSetupSurface } from "./channel-setup.js"; -export type { ReplyPayload } from "../auto-reply/reply-payload.js"; +export type { ReplyPayload } from "./reply-payload.js"; export { buildChannelConfigSchema } from "../channels/plugins/config-schema.js"; export type { ChannelGatewayContext, diff --git a/src/plugin-sdk/zalo.ts b/src/plugin-sdk/zalo.ts index fcd97a3e619..1d8f7d7ff4c 100644 --- a/src/plugin-sdk/zalo.ts +++ b/src/plugin-sdk/zalo.ts @@ -2,7 +2,7 @@ // Keep this list additive and scoped to the bundled Zalo surface. export { jsonResult, readStringParam } from "../agents/tools/common.js"; -export type { ReplyPayload } from "../auto-reply/reply-payload.js"; +export type { ReplyPayload } from "./reply-payload.js"; export { deleteAccountFromConfigSection, setAccountEnabledInConfigSection, diff --git a/src/plugin-sdk/zalouser.ts b/src/plugin-sdk/zalouser.ts index c476090fcff..f920c8a2fb3 100644 --- a/src/plugin-sdk/zalouser.ts +++ b/src/plugin-sdk/zalouser.ts @@ -3,7 +3,7 @@ import { createOptionalChannelSetupSurface } from "./channel-setup.js"; -export type { ReplyPayload } from "../auto-reply/reply-payload.js"; +export type { ReplyPayload } from "./reply-payload.js"; export { mergeAllowlist, summarizeMapping } from "../channels/allowlists/resolve-utils.js"; export { resolveMentionGating, diff --git a/src/utils/directive-tags.ts b/src/utils/directive-tags.ts index ce5dccfabc5..6f215ddd2f3 100644 --- a/src/utils/directive-tags.ts +++ b/src/utils/directive-tags.ts @@ -20,6 +20,7 @@ const AUDIO_TAG_RE = /\[\[\s*audio_as_voice\s*\]\]/gi; const REPLY_TAG_RE = /\[\[\s*(?:reply_to_current|reply_to\s*:\s*([^\]\n]+))\s*\]\]/gi; const INLINE_DIRECTIVE_TAG_WITH_PADDING_RE = /\s*(?:\[\[\s*audio_as_voice\s*\]\]|\[\[\s*(?:reply_to_current|reply_to\s*:\s*[^\]\n]+)\s*\]\])\s*/gi; +const MAX_REPLY_DIRECTIVE_ID_LENGTH = 256; function replacementPreservesWordBoundary(source: string, offset: number, length: number): string { const before = source[offset - 1]; @@ -92,6 +93,33 @@ export function stripInlineDirectiveTagsForDisplay(text: string): StripInlineDir }; } +function stripUnsafeReplyDirectiveChars(value: string): string { + let next = ""; + for (const ch of value) { + const code = ch.charCodeAt(0); + if ((code >= 0 && code <= 31) || code === 127 || ch === "[" || ch === "]") { + continue; + } + next += ch; + } + return next; +} + +export function sanitizeReplyDirectiveId(rawReplyToId?: string): string | undefined { + const trimmed = rawReplyToId?.trim(); + if (!trimmed) { + return undefined; + } + const sanitized = stripUnsafeReplyDirectiveChars(trimmed).trim(); + if (!sanitized) { + return undefined; + } + if (sanitized.length > MAX_REPLY_DIRECTIVE_ID_LENGTH) { + return sanitized.slice(0, MAX_REPLY_DIRECTIVE_ID_LENGTH); + } + return sanitized; +} + export function stripInlineDirectiveTagsForDelivery(text: string): StripInlineDirectiveTagsResult { if (!text) { return { text, changed: false };