From 24032dcc0ef75e63fc025e7e9502cc46f88894a4 Mon Sep 17 00:00:00 2001 From: scoootscooob <167050519+scoootscooob@users.noreply.github.com> Date: Sun, 22 Mar 2026 15:18:16 -0700 Subject: [PATCH] Reply: fix generated image delivery to Discord (#52489) --- .../discord/src/actions/runtime.messaging.ts | 1 + .../discord/src/actions/runtime.test.ts | 22 ++++++ extensions/discord/src/send.outbound.ts | 3 + .../send.sends-basic-channel-messages.test.ts | 21 ++++++ extensions/discord/src/send.shared.ts | 10 ++- src/agents/tools/image-generate-tool.test.ts | 2 + src/auto-reply/reply/reply-delivery.test.ts | 63 ++++++++++++++++ src/auto-reply/reply/reply-delivery.ts | 7 +- src/image-generation/providers/openai.test.ts | 75 +++++++++++++++++++ src/image-generation/providers/openai.ts | 42 ++++++++++- 10 files changed, 242 insertions(+), 4 deletions(-) create mode 100644 src/auto-reply/reply/reply-delivery.test.ts diff --git a/extensions/discord/src/actions/runtime.messaging.ts b/extensions/discord/src/actions/runtime.messaging.ts index 15a074e0073..757bf90f056 100644 --- a/extensions/discord/src/actions/runtime.messaging.ts +++ b/extensions/discord/src/actions/runtime.messaging.ts @@ -386,6 +386,7 @@ export async function handleDiscordMessagingAction( ...cfgOptions, ...(accountId ? { accountId } : {}), mediaUrl, + filename: filename ?? undefined, mediaLocalRoots: options?.mediaLocalRoots, replyTo, components, diff --git a/extensions/discord/src/actions/runtime.test.ts b/extensions/discord/src/actions/runtime.test.ts index 2eb76c9d426..2dcf8491505 100644 --- a/extensions/discord/src/actions/runtime.test.ts +++ b/extensions/discord/src/actions/runtime.test.ts @@ -395,6 +395,28 @@ describe("handleDiscordMessagingAction", () => { ); }); + it("forwards the optional filename into sendMessageDiscord", async () => { + sendMessageDiscord.mockClear(); + await handleDiscordMessagingAction( + "sendMessage", + { + to: "channel:123", + content: "hello", + mediaUrl: "/tmp/generated-image", + filename: "image.png", + }, + enableAllActions, + ); + expect(sendMessageDiscord).toHaveBeenCalledWith( + "channel:123", + "hello", + expect.objectContaining({ + mediaUrl: "/tmp/generated-image", + filename: "image.png", + }), + ); + }); + it("rejects voice messages that include content", async () => { await expect( handleDiscordMessagingAction( diff --git a/extensions/discord/src/send.outbound.ts b/extensions/discord/src/send.outbound.ts index e0a674d557e..d0790734ed1 100644 --- a/extensions/discord/src/send.outbound.ts +++ b/extensions/discord/src/send.outbound.ts @@ -48,6 +48,7 @@ type DiscordSendOpts = { token?: string; accountId?: string; mediaUrl?: string; + filename?: string; mediaLocalRoots?: readonly string[]; verbose?: boolean; rest?: RequestClient; @@ -214,6 +215,7 @@ export async function sendMessageDiscord( threadId, mediaCaption ?? "", opts.mediaUrl, + opts.filename, opts.mediaLocalRoots, mediaMaxBytes, undefined, @@ -275,6 +277,7 @@ export async function sendMessageDiscord( channelId, textWithMentions, opts.mediaUrl, + opts.filename, opts.mediaLocalRoots, mediaMaxBytes, opts.replyTo, diff --git a/extensions/discord/src/send.sends-basic-channel-messages.test.ts b/extensions/discord/src/send.sends-basic-channel-messages.test.ts index 54c45c6f483..7429805cf52 100644 --- a/extensions/discord/src/send.sends-basic-channel-messages.test.ts +++ b/extensions/discord/src/send.sends-basic-channel-messages.test.ts @@ -272,6 +272,27 @@ describe("sendMessageDiscord", () => { ); }); + it("prefers the caller-provided filename for media attachments", async () => { + const { rest, postMock } = makeDiscordRest(); + postMock.mockResolvedValue({ id: "msg", channel_id: "789" }); + + await sendMessageDiscord("channel:789", "photo", { + rest, + token: "t", + mediaUrl: "file:///tmp/generated-image", + filename: "renderable.png", + }); + + expect(postMock).toHaveBeenCalledWith( + Routes.channelMessages("789"), + expect.objectContaining({ + body: expect.objectContaining({ + files: [expect.objectContaining({ name: "renderable.png" })], + }), + }), + ); + }); + it("uses configured discord mediaMaxMb for uploads", async () => { const { rest, postMock } = makeDiscordRest(); postMock.mockResolvedValue({ id: "msg", channel_id: "789" }); diff --git a/extensions/discord/src/send.shared.ts b/extensions/discord/src/send.shared.ts index 8cdc8ce2805..65370e7d865 100644 --- a/extensions/discord/src/send.shared.ts +++ b/extensions/discord/src/send.shared.ts @@ -12,6 +12,7 @@ import { Routes, type APIChannel, type APIEmbed } from "discord-api-types/v10"; import { loadConfig, type OpenClawConfig } from "openclaw/plugin-sdk/config-runtime"; import type { RetryRunner } from "openclaw/plugin-sdk/infra-runtime"; import { buildOutboundMediaLoadOptions } from "openclaw/plugin-sdk/media-runtime"; +import { extensionForMime } from "openclaw/plugin-sdk/media-runtime"; import { normalizePollDurationHours, normalizePollInput, @@ -416,6 +417,7 @@ async function sendDiscordMedia( channelId: string, text: string, mediaUrl: string, + filename: string | undefined, mediaLocalRoots: readonly string[] | undefined, maxBytes: number | undefined, replyTo: string | undefined, @@ -430,6 +432,12 @@ async function sendDiscordMedia( mediaUrl, buildOutboundMediaLoadOptions({ maxBytes, mediaLocalRoots }), ); + const requestedFileName = filename?.trim(); + const resolvedFileName = + requestedFileName || + media.fileName || + (media.contentType ? `upload${extensionForMime(media.contentType) ?? ""}` : "") || + "upload"; const chunks = text ? buildDiscordTextChunks(text, { maxLinesPerMessage, chunkMode }) : []; const caption = chunks[0] ?? ""; const messageReference = replyTo ? { message_id: replyTo, fail_if_not_exists: false } : undefined; @@ -449,7 +457,7 @@ async function sendDiscordMedia( files: [ { data: fileData, - name: media.fileName ?? "upload", + name: resolvedFileName, }, ], }); diff --git a/src/agents/tools/image-generate-tool.test.ts b/src/agents/tools/image-generate-tool.test.ts index dfd782b597c..5fda73a539d 100644 --- a/src/agents/tools/image-generate-tool.test.ts +++ b/src/agents/tools/image-generate-tool.test.ts @@ -43,6 +43,7 @@ function stubImageGenerationProviders() { generate: { maxCount: 4, supportsSize: true, + supportsAspectRatio: true, }, edit: { enabled: false, @@ -50,6 +51,7 @@ function stubImageGenerationProviders() { }, geometry: { sizes: ["1024x1024", "1024x1536", "1536x1024"], + aspectRatios: ["1:1", "16:9"], }, }, generateImage: vi.fn(async () => { diff --git a/src/auto-reply/reply/reply-delivery.test.ts b/src/auto-reply/reply/reply-delivery.test.ts new file mode 100644 index 00000000000..71ae4650ebe --- /dev/null +++ b/src/auto-reply/reply/reply-delivery.test.ts @@ -0,0 +1,63 @@ +import { describe, expect, it, vi } from "vitest"; +import { createBlockReplyDeliveryHandler } from "./reply-delivery.js"; +import type { TypingSignaler } from "./typing-mode.js"; + +describe("createBlockReplyDeliveryHandler", () => { + it("sends media-bearing block replies even when block streaming is disabled", async () => { + const onBlockReply = vi.fn(async () => {}); + const normalizeStreamingText = vi.fn((payload: { text?: string }) => ({ + text: payload.text, + skip: false, + })); + const typingSignals = { + signalTextDelta: vi.fn(async () => {}), + } as unknown as TypingSignaler; + + const handler = createBlockReplyDeliveryHandler({ + onBlockReply, + normalizeStreamingText, + applyReplyToMode: (payload) => payload, + typingSignals, + blockStreamingEnabled: false, + blockReplyPipeline: null, + directlySentBlockKeys: new Set(), + }); + + await handler({ + text: "here's the vibe", + mediaUrls: ["/tmp/generated.png"], + replyToCurrent: true, + }); + + expect(onBlockReply).toHaveBeenCalledWith({ + text: undefined, + mediaUrl: "/tmp/generated.png", + mediaUrls: ["/tmp/generated.png"], + replyToCurrent: true, + replyToId: undefined, + replyToTag: undefined, + audioAsVoice: false, + }); + expect(typingSignals.signalTextDelta).toHaveBeenCalledWith("here's the vibe"); + }); + + it("keeps text-only block replies buffered when block streaming is disabled", async () => { + const onBlockReply = vi.fn(async () => {}); + + const handler = createBlockReplyDeliveryHandler({ + onBlockReply, + normalizeStreamingText: (payload) => ({ text: payload.text, skip: false }), + applyReplyToMode: (payload) => payload, + typingSignals: { + signalTextDelta: vi.fn(async () => {}), + } as unknown as TypingSignaler, + blockStreamingEnabled: false, + blockReplyPipeline: null, + directlySentBlockKeys: new Set(), + }); + + await handler({ text: "text only" }); + + expect(onBlockReply).not.toHaveBeenCalled(); + }); +}); diff --git a/src/auto-reply/reply/reply-delivery.ts b/src/auto-reply/reply/reply-delivery.ts index ee19d2d0934..9ffc108c86c 100644 --- a/src/auto-reply/reply/reply-delivery.ts +++ b/src/auto-reply/reply/reply-delivery.ts @@ -128,7 +128,12 @@ export function createBlockReplyDeliveryHandler(params: { // Track sent key to avoid duplicate in final payloads. params.directlySentBlockKeys.add(createBlockReplyContentKey(blockPayload)); await params.onBlockReply(blockPayload); + } else if (blockHasMedia) { + // When block streaming is disabled, text-only block replies are accumulated into the + // final response. Media cannot be reconstructed later, so send it immediately and let + // the assistant's final text arrive through the normal final-reply path. + await params.onBlockReply({ ...blockPayload, text: undefined }); } - // When streaming is disabled entirely, blocks are accumulated in final text instead. + // When streaming is disabled entirely, text-only blocks are accumulated in final text. }; } diff --git a/src/image-generation/providers/openai.test.ts b/src/image-generation/providers/openai.test.ts index a128d6c6e04..0f26ad624ec 100644 --- a/src/image-generation/providers/openai.test.ts +++ b/src/image-generation/providers/openai.test.ts @@ -67,6 +67,81 @@ describe("OpenAI image-generation provider", () => { }); }); + it("maps supported aspect ratios onto OpenAI size presets", async () => { + vi.spyOn(modelAuth, "resolveApiKeyForProvider").mockResolvedValue({ + apiKey: "sk-test", + source: "env", + mode: "api-key", + }); + const fetchMock = vi.fn().mockResolvedValue({ + ok: true, + json: async () => ({ + data: [{ b64_json: Buffer.from("png-data").toString("base64") }], + }), + }); + vi.stubGlobal("fetch", fetchMock); + + const provider = buildOpenAIImageGenerationProvider(); + await provider.generateImage({ + provider: "openai", + model: "gpt-image-1.5", + prompt: "draw a portrait", + aspectRatio: "9:16", + cfg: {}, + authStore: { version: 1, profiles: {} }, + }); + + expect(fetchMock).toHaveBeenCalledWith( + "https://api.openai.com/v1/images/generations", + expect.objectContaining({ + body: JSON.stringify({ + model: "gpt-image-1.5", + prompt: "draw a portrait", + n: 1, + size: "1024x1536", + }), + }), + ); + }); + + it("prefers an explicit size over aspect ratio mapping", async () => { + vi.spyOn(modelAuth, "resolveApiKeyForProvider").mockResolvedValue({ + apiKey: "sk-test", + source: "env", + mode: "api-key", + }); + const fetchMock = vi.fn().mockResolvedValue({ + ok: true, + json: async () => ({ + data: [{ b64_json: Buffer.from("png-data").toString("base64") }], + }), + }); + vi.stubGlobal("fetch", fetchMock); + + const provider = buildOpenAIImageGenerationProvider(); + await provider.generateImage({ + provider: "openai", + model: "gpt-image-1.5", + prompt: "draw a landscape", + size: "1024x1024", + aspectRatio: "16:9", + cfg: {}, + authStore: { version: 1, profiles: {} }, + }); + + expect(fetchMock).toHaveBeenCalledWith( + "https://api.openai.com/v1/images/generations", + expect.objectContaining({ + body: JSON.stringify({ + model: "gpt-image-1.5", + prompt: "draw a landscape", + n: 1, + size: "1024x1024", + }), + }), + ); + }); + it("rejects reference-image edits for now", async () => { const provider = buildOpenAIImageGenerationProvider(); diff --git a/src/image-generation/providers/openai.ts b/src/image-generation/providers/openai.ts index e4ceb5caedf..5dcae26b3d9 100644 --- a/src/image-generation/providers/openai.ts +++ b/src/image-generation/providers/openai.ts @@ -6,6 +6,18 @@ const DEFAULT_OPENAI_IMAGE_BASE_URL = "https://api.openai.com/v1"; const DEFAULT_OUTPUT_MIME = "image/png"; const DEFAULT_SIZE = "1024x1024"; const OPENAI_SUPPORTED_SIZES = ["1024x1024", "1024x1536", "1536x1024"] as const; +const OPENAI_SUPPORTED_ASPECT_RATIOS = [ + "1:1", + "2:3", + "3:2", + "3:4", + "4:3", + "4:5", + "5:4", + "9:16", + "16:9", + "21:9", +] as const; type OpenAIImageApiResponse = { data?: Array<{ @@ -19,6 +31,31 @@ function resolveOpenAIBaseUrl(cfg: Parameters[0 return direct || DEFAULT_OPENAI_IMAGE_BASE_URL; } +function resolveOpenAISize(params: { size?: string; aspectRatio?: string }): string { + const explicitSize = params.size?.trim(); + if (explicitSize) { + return explicitSize; + } + + switch (params.aspectRatio?.trim()) { + case "1:1": + return "1024x1024"; + case "2:3": + case "3:4": + case "4:5": + case "9:16": + return "1024x1536"; + case "3:2": + case "4:3": + case "5:4": + case "16:9": + case "21:9": + return "1536x1024"; + default: + return DEFAULT_SIZE; + } +} + export function buildOpenAIImageGenerationProvider(): ImageGenerationProviderPlugin { return { id: "openai", @@ -29,7 +66,7 @@ export function buildOpenAIImageGenerationProvider(): ImageGenerationProviderPlu generate: { maxCount: 4, supportsSize: true, - supportsAspectRatio: false, + supportsAspectRatio: true, supportsResolution: false, }, edit: { @@ -42,6 +79,7 @@ export function buildOpenAIImageGenerationProvider(): ImageGenerationProviderPlu }, geometry: { sizes: [...OPENAI_SUPPORTED_SIZES], + aspectRatios: [...OPENAI_SUPPORTED_ASPECT_RATIOS], }, }, async generateImage(req) { @@ -75,7 +113,7 @@ export function buildOpenAIImageGenerationProvider(): ImageGenerationProviderPlu model: req.model || DEFAULT_OPENAI_IMAGE_MODEL, prompt: req.prompt, n: req.count ?? 1, - size: req.size ?? DEFAULT_SIZE, + size: resolveOpenAISize({ size: req.size, aspectRatio: req.aspectRatio }), }), signal: controller.signal, }).finally(() => {