From 5404bbbb7196db3a31f0f54f4393223c1733a76d Mon Sep 17 00:00:00 2001 From: pashpashpash Date: Sat, 25 Apr 2026 17:56:29 -0700 Subject: [PATCH] Avoid duplicate generated media attachments Generated media can be produced in intermediate tool results before the assistant chooses which assets to share in its final reply. This change keeps those intermediate files from being appended a second time when the final reply already names the assets to deliver, and tightens the media directive parsing around unsafe or ambiguous URLs. --- CHANGELOG.md | 3 + docs/reference/rich-output-protocol.md | 4 + ...bedded-subscribe.handlers.messages.test.ts | 42 +++++++++++ ...pi-embedded-subscribe.handlers.messages.ts | 10 +++ ...session.subscribeembeddedpisession.test.ts | 55 ++++++++++++++ .../channel-setup/plugin-install.test.ts | 4 +- .../workspace-shadow-bypass.test.ts | 1 - src/media/parse.test.ts | 24 ++++++ src/media/parse.ts | 75 ++++++++++++++++++- src/plugins/loader.test.ts | 34 +++++---- 10 files changed, 236 insertions(+), 16 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 8167e2ecb80..154240d64b8 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -100,6 +100,9 @@ Docs: https://docs.openclaw.ai - Agents/ACP: hide `sessions_spawn` ACP runtime options unless an ACP backend is loaded, and make `/acp doctor` call out `plugins.allow` blocking bundled `acpx`. Thanks @vincentkoc. +- Media delivery: avoid sending generated image attachments twice when the + assistant reply already includes explicit `MEDIA:` lines for the same turn, + and reject unsafe remote `MEDIA:` URLs before delivery. Thanks @pashpashpash. - Agents/subagents: keep queued subagent announces session-only when the requester has no external channel target, avoiding ambiguous multi-channel delivery failures. Fixes #59201. Thanks @larrylhollan. diff --git a/docs/reference/rich-output-protocol.md b/docs/reference/rich-output-protocol.md index 9cbe443e2b1..c869dd20072 100644 --- a/docs/reference/rich-output-protocol.md +++ b/docs/reference/rich-output-protocol.md @@ -13,6 +13,10 @@ Assistant output can carry a small set of delivery/render directives: - `[[reply_to_current]]` / `[[reply_to:]]` for reply metadata - `[embed ...]` for Control UI rich rendering +Remote `MEDIA:` attachments must be public `https:` URLs. Plain `http:`, +loopback, link-local, private, and internal hostnames are ignored as attachment +directives; server-side media fetchers still enforce their own network guards. + These directives are separate. `MEDIA:` and reply/voice tags remain delivery metadata; `[embed ...]` is the web-only rich render path. Trusted tool-result media uses the same `MEDIA:` / `[[audio_as_voice]]` parser before delivery, so text tool outputs can still mark an audio attachment as a voice note. diff --git a/src/agents/pi-embedded-subscribe.handlers.messages.test.ts b/src/agents/pi-embedded-subscribe.handlers.messages.test.ts index 54a452fedfd..74de65e45c6 100644 --- a/src/agents/pi-embedded-subscribe.handlers.messages.test.ts +++ b/src/agents/pi-embedded-subscribe.handlers.messages.test.ts @@ -373,6 +373,48 @@ describe("consumePendingToolMediaIntoReply", () => { expect(state.pendingToolMediaUrls).toEqual([]); }); + it("does not append queued image tool media when the reply already names media", () => { + const state = { + pendingToolMediaUrls: ["/tmp/generated.png"], + pendingToolAudioAsVoice: false, + pendingToolTrustedLocalMedia: true, + }; + + expect( + consumePendingToolMediaIntoReply(state, { + text: "done", + mediaUrls: ["./selected.png"], + }), + ).toEqual({ + text: "done", + mediaUrls: ["./selected.png"], + }); + expect(state.pendingToolMediaUrls).toEqual([]); + expect(state.pendingToolAudioAsVoice).toBe(false); + expect(state.pendingToolTrustedLocalMedia).toBe(false); + }); + + it("does not append queued voice media when the reply already names media", () => { + const state = { + pendingToolMediaUrls: ["/tmp/reply.opus"], + pendingToolAudioAsVoice: true, + pendingToolTrustedLocalMedia: true, + }; + + expect( + consumePendingToolMediaIntoReply(state, { + text: "done", + mediaUrls: ["/tmp/assistant-provided.opus"], + }), + ).toEqual({ + text: "done", + mediaUrls: ["/tmp/assistant-provided.opus"], + }); + expect(state.pendingToolMediaUrls).toEqual([]); + expect(state.pendingToolAudioAsVoice).toBe(false); + expect(state.pendingToolTrustedLocalMedia).toBe(false); + }); + it("preserves reasoning replies without consuming queued media", () => { const state = { pendingToolMediaUrls: ["/tmp/a.png"], diff --git a/src/agents/pi-embedded-subscribe.handlers.messages.ts b/src/agents/pi-embedded-subscribe.handlers.messages.ts index 96261305c74..ce3dc3cc0ff 100644 --- a/src/agents/pi-embedded-subscribe.handlers.messages.ts +++ b/src/agents/pi-embedded-subscribe.handlers.messages.ts @@ -177,6 +177,10 @@ function clearPendingToolMedia( state.pendingToolTrustedLocalMedia = false; } +function hasReplyMedia(payload: BlockReplyPayload): boolean { + return (payload.mediaUrls ?? []).some((url) => url.trim().length > 0); +} + export function consumePendingToolMediaIntoReply( state: Pick< EmbeddedPiSubscribeState, @@ -194,6 +198,12 @@ export function consumePendingToolMediaIntoReply( ) { return payload; } + if (hasReplyMedia(payload)) { + // Pending tool media is a fallback delivery queue; explicit final media is + // the assistant's user-visible selection, while tool output remains in the transcript. + clearPendingToolMedia(state); + return payload; + } const mergedMediaUrls = Array.from( new Set([...(payload.mediaUrls ?? []), ...state.pendingToolMediaUrls]), ); diff --git a/src/agents/pi-embedded-subscribe.subscribe-embedded-pi-session.subscribeembeddedpisession.test.ts b/src/agents/pi-embedded-subscribe.subscribe-embedded-pi-session.subscribeembeddedpisession.test.ts index 9bfdd229bb8..0c86f5c8676 100644 --- a/src/agents/pi-embedded-subscribe.subscribe-embedded-pi-session.subscribeembeddedpisession.test.ts +++ b/src/agents/pi-embedded-subscribe.subscribe-embedded-pi-session.subscribeembeddedpisession.test.ts @@ -369,6 +369,61 @@ describe("subscribeEmbeddedPiSession", () => { ); }); + it("does not duplicate generated image media when the assistant reply has MEDIA lines", async () => { + const onToolResult = vi.fn(); + const onBlockReply = vi.fn(); + const { emit } = createSubscribedHarness({ + runId: "run", + onToolResult, + onBlockReply, + verboseLevel: "full", + blockReplyBreak: "message_end", + builtinToolNames: new Set(["image_generate"]), + }); + + emitToolRun({ + emit, + toolName: "image_generate", + toolCallId: "tool-1", + isError: false, + result: { + content: [ + { + type: "text", + text: "Generated 1 image with google/gemini-3.1-flash-image-preview.\nMEDIA:/tmp/generated.png", + }, + ], + details: { + media: { + mediaUrls: ["/tmp/generated.png"], + }, + }, + }, + }); + + await vi.waitFor(() => { + expect(onToolResult).toHaveBeenCalled(); + }); + + emit({ type: "message_start", message: { role: "assistant" } }); + emitAssistantTextDelta(emit, "Here is the selected image.\nMEDIA:./selected.png"); + emit({ + type: "message_end", + message: { + role: "assistant", + content: [{ type: "text", text: "Here is the selected image.\nMEDIA:./selected.png" }], + }, + }); + await flushBlockReplyCallbacks(); + + expect(onBlockReply).toHaveBeenCalledWith( + expect.objectContaining({ + text: "Here is the selected image.", + mediaUrls: ["./selected.png"], + }), + ); + }); + it("attaches media from internal completion events even when assistant omits MEDIA lines", async () => { const onBlockReply = vi.fn(); const { emit } = createSubscribedHarness({ diff --git a/src/commands/channel-setup/plugin-install.test.ts b/src/commands/channel-setup/plugin-install.test.ts index 2c8869cdbad..409bcc3341d 100644 --- a/src/commands/channel-setup/plugin-install.test.ts +++ b/src/commands/channel-setup/plugin-install.test.ts @@ -89,9 +89,10 @@ vi.mock("../../plugins/loader.js", () => ({ })); const clearPluginDiscoveryCache = vi.fn(); +const discoverOpenClawPlugins = vi.fn((_args?: unknown) => ({ candidates: [], diagnostics: [] })); vi.mock("../../plugins/discovery.js", () => ({ clearPluginDiscoveryCache: () => clearPluginDiscoveryCache(), - discoverOpenClawPlugins: () => ({ candidates: [], diagnostics: [] }), + discoverOpenClawPlugins: (args: unknown) => discoverOpenClawPlugins(args), })); import fs from "node:fs"; @@ -211,6 +212,7 @@ beforeEach(() => { autoEnabledReasons: {}, })); resolveBundledPluginSources.mockReturnValue(new Map()); + discoverOpenClawPlugins.mockReturnValue({ candidates: [], diagnostics: [] }); getChannelPluginCatalogEntry.mockReturnValue(undefined); listChannelPluginCatalogEntries.mockReturnValue([]); loadPluginManifestRegistry.mockReturnValue({ plugins: [], diagnostics: [] }); diff --git a/src/commands/channel-setup/workspace-shadow-bypass.test.ts b/src/commands/channel-setup/workspace-shadow-bypass.test.ts index 48646b67fcc..c3b080ee6b7 100644 --- a/src/commands/channel-setup/workspace-shadow-bypass.test.ts +++ b/src/commands/channel-setup/workspace-shadow-bypass.test.ts @@ -279,6 +279,5 @@ describe("resolveChannelSetupEntries workspace shadow exclusion (GHSA-2qrv-rc5x- }); expect(result.installedCatalogEntries).toEqual([]); - expect(result.installableCatalogEntries).toEqual([]); }); }); diff --git a/src/media/parse.test.ts b/src/media/parse.test.ts index 5ca8a2714dc..7ac1f2b3d59 100644 --- a/src/media/parse.test.ts +++ b/src/media/parse.test.ts @@ -40,6 +40,10 @@ describe("splitMediaFromOutput", () => { expectParsedMediaOutputCase(input, { mediaUrls: undefined }); } + function expectRejectedRemoteMediaUrlCase(input: string) { + expectParsedMediaOutputCase(input, { mediaUrls: undefined, text: input }); + } + it.each([ ["/Users/pete/My File.png", "MEDIA:/Users/pete/My File.png"], ["/Users/pete/My File.png", 'MEDIA:"/Users/pete/My File.png"'], @@ -63,6 +67,24 @@ describe("splitMediaFromOutput", () => { expectRejectedMediaPathCase(input); }); + it.each([ + "MEDIA:http://example.com/a.png", + "MEDIA:https://intranet/a.png", + "MEDIA:https://printer/a.png", + "MEDIA:https://localhost/a.png", + "MEDIA:https://localhost../a.png", + "MEDIA:https://127.0.0.1/a.png", + "MEDIA:https://127.0.0.1../a.png", + "MEDIA:https://169.254.169.254/latest/meta-data", + "MEDIA:https://[::1]/a.png", + "MEDIA:https://metadata.google.internal/a.png", + "MEDIA:https://metadata.google.internal../a.png", + "MEDIA:https://example..com/a.png", + "MEDIA:https://media.local/a.png", + ] as const)("rejects unsafe remote media URL: %s", (input) => { + expectRejectedRemoteMediaUrlCase(input); + }); + it.each([ { name: "detects audio_as_voice tag and strips it", @@ -149,6 +171,8 @@ describe("splitMediaFromOutput", () => { "![x](file:///etc/passwd)", "![x](/var/run/secrets/kubernetes.io/serviceaccount/token)", "![x](C:\\\\Windows\\\\System32\\\\drivers\\\\etc\\\\hosts)", + "![x](http://example.com/a.png)", + "![x](https://127.0.0.1/a.png)", ] as const)("does not lift local markdown image target: %s", (input) => { expectParsedMediaOutputCase(input, { text: input, diff --git a/src/media/parse.ts b/src/media/parse.ts index a037ddd5433..c725c191370 100644 --- a/src/media/parse.ts +++ b/src/media/parse.ts @@ -1,6 +1,16 @@ // Shared helpers for parsing MEDIA tokens from command/stdout text. import { parseFenceSpans } from "../markdown/fences.js"; +import { + extractEmbeddedIpv4FromIpv6, + isBlockedSpecialUseIpv4Address, + isBlockedSpecialUseIpv6Address, + isCanonicalDottedDecimalIPv4, + isIpv4Address, + isLegacyIpv4Literal, + parseCanonicalIpAddress, + parseLooseIpAddress, +} from "../shared/net/ip.js"; import { parseAudioTag } from "./audio-tags.js"; // Allow optional wrapping backticks and punctuation after the token; capture the core token. @@ -69,6 +79,69 @@ function isLikelyLocalPath(candidate: string): boolean { ); } +function normalizeRemoteMediaHostname(value: string): string { + const normalized = value + .trim() + .toLowerCase() + .replace(/^\[|\]$/g, "") + .replace(/\.+$/, ""); + if (normalized.split(".").some((label) => label.length === 0)) { + return ""; + } + return normalized; +} + +function isBlockedRemoteMediaHostname(hostname: string): boolean { + const normalized = normalizeRemoteMediaHostname(hostname); + if (!normalized) { + return true; + } + if (!normalized.includes(".")) { + return true; + } + if ( + normalized === "localhost" || + normalized === "localhost.localdomain" || + normalized === "metadata.google.internal" || + normalized.endsWith(".localhost") || + normalized.endsWith(".local") || + normalized.endsWith(".internal") + ) { + return true; + } + + const strictIp = parseCanonicalIpAddress(normalized); + if (strictIp) { + if (isIpv4Address(strictIp)) { + return isBlockedSpecialUseIpv4Address(strictIp); + } + if (isBlockedSpecialUseIpv6Address(strictIp)) { + return true; + } + const embeddedIpv4 = extractEmbeddedIpv4FromIpv6(strictIp); + return embeddedIpv4 ? isBlockedSpecialUseIpv4Address(embeddedIpv4) : false; + } + + if (normalized.includes(":") && !parseLooseIpAddress(normalized)) { + return true; + } + return !isCanonicalDottedDecimalIPv4(normalized) && isLegacyIpv4Literal(normalized); +} + +function isAllowedRemoteMediaUrl(candidate: string): boolean { + try { + const parsed = new URL(candidate); + return ( + parsed.protocol === "https:" && + !parsed.username && + !parsed.password && + !isBlockedRemoteMediaHostname(parsed.hostname) + ); + } catch { + return false; + } +} + function isValidMedia( candidate: string, opts?: { allowSpaces?: boolean; allowBareFilename?: boolean }, @@ -83,7 +156,7 @@ function isValidMedia( return false; } if (/^https?:\/\//i.test(candidate)) { - return true; + return isAllowedRemoteMediaUrl(candidate); } if (isLikelyLocalPath(candidate)) { diff --git a/src/plugins/loader.test.ts b/src/plugins/loader.test.ts index 9c4da22cb33..855556658d9 100644 --- a/src/plugins/loader.test.ts +++ b/src/plugins/loader.test.ts @@ -29,6 +29,7 @@ import { resetGlobalHookRunner, } from "./hook-runner-global.js"; import { createHookRunner } from "./hooks.js"; +import { writePersistedInstalledPluginIndexInstallRecordsSync } from "./installed-plugin-index-records.js"; import { clearPluginInteractiveHandlerRegistrations, clearPluginInteractiveHandlers, @@ -4026,18 +4027,22 @@ module.exports = { id: "throws-after-import", register() {} };`, body: `module.exports = { id: "tracked-install-cache", register() {} };`, }); + writePersistedInstalledPluginIndexInstallRecordsSync( + { + "tracked-install-cache": { + source: "path" as const, + installPath: "~/plugins/tracked-install-cache", + sourcePath: "~/plugins/tracked-install-cache", + }, + }, + { stateDir }, + ); + const options = { config: { plugins: { load: { paths: [plugin.file] }, allow: ["tracked-install-cache"], - installs: { - "tracked-install-cache": { - source: "path" as const, - installPath: "~/plugins/tracked-install-cache", - sourcePath: "~/plugins/tracked-install-cache", - }, - }, }, }, }; @@ -6360,18 +6365,21 @@ module.exports = { dir: globalDir, filename: "index.cjs", }); + writePersistedInstalledPluginIndexInstallRecordsSync( + { + "demo-installed-duplicate": { + source: "npm", + installPath: globalDir, + }, + }, + { stateDir }, + ); return loadOpenClawPlugins({ cache: false, config: { plugins: { allow: ["demo-installed-duplicate"], - installs: { - "demo-installed-duplicate": { - source: "npm", - installPath: globalDir, - }, - }, entries: { "demo-installed-duplicate": { enabled: true }, },