diff --git a/CHANGELOG.md b/CHANGELOG.md index 7bb4c3d2cf1..7af583a995f 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -18,6 +18,7 @@ Docs: https://docs.openclaw.ai - MiniMax music generation: switch the bundled default model from the unsupported `music-2.5+` id to the current `music-2.6` API model. Fixes #64870 and addresses the music default from #62315. Thanks @noahclanman and @edwardzheng1. - Google media generation: strip a configured trailing `/v1beta` from Google music/video provider base URLs before calling the Google GenAI SDK, preventing doubled `/v1beta/v1beta` paths. Fixes #63240. (#63258) Thanks @Hybirdss. +- Discord: restore direct-message voice-note preflight transcription, classify Ogg/Opus voice attachments that arrive without a content type as audio based on their file extension, and skip partial attachments that have no usable URL. Fixes #61314 and #64803. - Google Chat: preserve reply text when a typing indicator message is deleted or can no longer be updated, so media captions and first text chunks are resent instead of silently disappearing. (#71498) Thanks @colin-lgtm. - Cron: tolerate malformed legacy job rows in startup, main-session system-event payloads, and human-readable `cron list` output so missing `state`, `payload.text`, or display fields no longer crash the scheduler or CLI. Fixes #66016, #65916, #64137, #57872, #59968, #63813, #52804, and #43163. (#71509) Thanks @vincentkoc. - CLI/models: make `openclaw models scan` fall back to public OpenRouter free-model metadata when no `OPENROUTER_API_KEY` is configured, avoid config secret resolution for explicit `--no-probe` scans, and apply the scan timeout to the OpenRouter catalog request. 
diff --git a/extensions/discord/src/monitor/message-handler.preflight.test.ts b/extensions/discord/src/monitor/message-handler.preflight.test.ts index 84457031e5d..d200b913a77 100644 --- a/extensions/discord/src/monitor/message-handler.preflight.test.ts +++ b/extensions/discord/src/monitor/message-handler.preflight.test.ts @@ -366,6 +366,47 @@ describe("preflightDiscordMessage", () => { }); }); + it("preflights direct-message voice notes without mention gating", async () => { + transcribeFirstAudioMock.mockResolvedValue("hello openclaw from dm audio"); + + const result = await runDmPreflight({ + channelId: "dm-channel-audio-1", + message: createDiscordMessage({ + id: "m-dm-audio-1", + channelId: "dm-channel-audio-1", + content: "", + attachments: [ + { + id: "att-dm-audio-1", + url: "https://cdn.discordapp.com/attachments/voice.ogg", + content_type: "audio/ogg", + filename: "voice.ogg", + }, + ], + author: { + id: "user-1", + bot: false, + username: "alice", + }, + }), + discordConfig: { + dmPolicy: "open", + } as DiscordConfig, + }); + + expect(transcribeFirstAudioMock).toHaveBeenCalledTimes(1); + expect(transcribeFirstAudioMock).toHaveBeenCalledWith( + expect.objectContaining({ + ctx: expect.objectContaining({ + MediaUrls: ["https://cdn.discordapp.com/attachments/voice.ogg"], + MediaTypes: ["audio/ogg"], + }), + }), + ); + expect(result).not.toBeNull(); + expect(result?.isDirectMessage).toBe(true); + }); + it("falls back to the default discord account for omitted-account dm authorization", async () => { const message = createDiscordMessage({ id: "m-dm-default-account", diff --git a/extensions/discord/src/monitor/message-utils.test.ts b/extensions/discord/src/monitor/message-utils.test.ts index 966d4cfe1b0..6d33b0b385d 100644 --- a/extensions/discord/src/monitor/message-utils.test.ts +++ b/extensions/discord/src/monitor/message-utils.test.ts @@ -517,6 +517,49 @@ describe("resolveMediaList", () => { expectAttachmentImageFallback({ result, attachment }); }); + 
it("skips attachments without a usable URL", async () => { + const result = await resolveMediaList( + asMessage({ + attachments: [ + { + id: "att-missing-url", + filename: "voice.ogg", + content_type: "audio/ogg", + }, + ], + }), + 512, + ); + + expect(fetchRemoteMedia).not.toHaveBeenCalled(); + expect(saveMediaBuffer).not.toHaveBeenCalled(); + expect(result).toEqual([]); + }); + + it("classifies audio attachments by filename when content type is missing", async () => { + const attachment = { + id: "att-audio-fallback", + url: "https://cdn.discordapp.com/attachments/1/voice.ogg", + filename: "voice.ogg", + }; + fetchRemoteMedia.mockRejectedValueOnce(new Error("blocked by ssrf guard")); + + const result = await resolveMediaList( + asMessage({ + attachments: [attachment], + }), + 512, + ); + + expect(result).toEqual([ + { + path: attachment.url, + contentType: undefined, + placeholder: "", + }, + ]); + }); + it("falls back to URL when saveMediaBuffer fails", async () => { const attachment = { id: "att-save-fail", diff --git a/extensions/discord/src/monitor/message-utils.ts b/extensions/discord/src/monitor/message-utils.ts index 2abfdda30a9..4ac0c8587a8 100644 --- a/extensions/discord/src/monitor/message-utils.ts +++ b/extensions/discord/src/monitor/message-utils.ts @@ -1,5 +1,6 @@ import type { ChannelType, Client, Message } from "@buape/carbon"; import { StickerFormatType, type APIAttachment, type APIStickerItem } from "discord-api-types/v10"; +import { getFileExtension } from "openclaw/plugin-sdk/media-mime"; import { fetchRemoteMedia, type FetchLike } from "openclaw/plugin-sdk/media-runtime"; import { saveMediaBuffer } from "openclaw/plugin-sdk/media-runtime"; import { buildMediaPayload } from "openclaw/plugin-sdk/reply-payload"; @@ -26,6 +27,23 @@ const DISCORD_MEDIA_SSRF_POLICY: SsrFPolicy = { allowRfc2544BenchmarkRange: true, }; +const AUDIO_ATTACHMENT_EXTENSIONS = new Set([ + ".aac", + ".caf", + ".flac", + ".m4a", + ".mp3", + ".oga", + ".ogg", + ".opus", + 
".wav", +]); + +function isDiscordAudioAttachmentFileName(fileName?: string | null): boolean { + const ext = getFileExtension(fileName); + return Boolean(ext && AUDIO_ATTACHMENT_EXTENSIONS.has(ext)); +} + function mergeHostnameList(...lists: Array): string[] | undefined { const merged = lists .flatMap((list) => list ?? []) @@ -381,10 +399,17 @@ async function appendResolvedMediaFromAttachments(params: { return; } for (const attachment of attachments) { + const attachmentUrl = normalizeOptionalString(attachment.url); + if (!attachmentUrl) { + logVerbose( + `${params.errorPrefix} ${attachment.id ?? attachment.filename ?? "attachment"}: missing url`, + ); + continue; + } try { const fetched = await fetchDiscordMedia({ - url: attachment.url, - filePathHint: attachment.filename ?? attachment.url, + url: attachmentUrl, + filePathHint: attachment.filename ?? attachmentUrl, maxBytes: params.maxBytes, fetchImpl: params.fetchImpl, ssrfPolicy: params.ssrfPolicy, @@ -404,11 +429,11 @@ async function appendResolvedMediaFromAttachments(params: { placeholder: inferPlaceholder(attachment), }); } catch (err) { - const id = attachment.id ?? attachment.url; + const id = attachment.id ?? attachmentUrl; logVerbose(`${params.errorPrefix} ${id}: ${String(err)}`); // Preserve attachment context even when remote fetch is blocked/fails. params.out.push({ - path: attachment.url, + path: attachmentUrl, contentType: attachment.content_type, placeholder: inferPlaceholder(attachment), }); @@ -553,6 +578,9 @@ function inferPlaceholder(attachment: APIAttachment): string { if (mime.startsWith("audio/")) { return ""; } + if (isDiscordAudioAttachmentFileName(attachment.filename ?? 
attachment.url)) { + return ""; + } return ""; } diff --git a/extensions/discord/src/monitor/preflight-audio.test.ts b/extensions/discord/src/monitor/preflight-audio.test.ts new file mode 100644 index 00000000000..be42324d893 --- /dev/null +++ b/extensions/discord/src/monitor/preflight-audio.test.ts @@ -0,0 +1,127 @@ +import { beforeEach, describe, expect, it, vi } from "vitest"; + +const transcribeFirstAudioMock = vi.hoisted(() => vi.fn()); + +vi.mock("./preflight-audio.runtime.js", () => ({ + transcribeFirstAudio: transcribeFirstAudioMock, +})); + +import { resolveDiscordPreflightAudioMentionContext } from "./preflight-audio.js"; + +const cfg = {} as import("openclaw/plugin-sdk/config-runtime").OpenClawConfig; + +describe("resolveDiscordPreflightAudioMentionContext", () => { + beforeEach(() => { + transcribeFirstAudioMock.mockReset(); + }); + + it("preflights direct-message audio without requiring a mention", async () => { + transcribeFirstAudioMock.mockResolvedValue("hello from dm"); + + const result = await resolveDiscordPreflightAudioMentionContext({ + message: { + attachments: [ + { + url: "https://cdn.discordapp.com/attachments/voice.ogg", + content_type: "audio/ogg", + filename: "voice.ogg", + }, + ], + }, + isDirectMessage: true, + shouldRequireMention: false, + mentionRegexes: [], + cfg, + }); + + expect(transcribeFirstAudioMock).toHaveBeenCalledWith( + expect.objectContaining({ + ctx: expect.objectContaining({ + MediaUrls: ["https://cdn.discordapp.com/attachments/voice.ogg"], + MediaTypes: ["audio/ogg"], + }), + }), + ); + expect(result).toEqual({ + hasAudioAttachment: true, + hasTypedText: false, + transcript: "hello from dm", + }); + }); + + it("preflights audio by filename when Discord omits content type", async () => { + transcribeFirstAudioMock.mockResolvedValue("filename transcript"); + + await resolveDiscordPreflightAudioMentionContext({ + message: { + attachments: [ + { + url: "https://cdn.discordapp.com/attachments/voice.opus", + filename: 
"voice.opus", + }, + ], + }, + isDirectMessage: true, + shouldRequireMention: false, + mentionRegexes: [], + cfg, + }); + + expect(transcribeFirstAudioMock).toHaveBeenCalledWith( + expect.objectContaining({ + ctx: expect.objectContaining({ + MediaUrls: ["https://cdn.discordapp.com/attachments/voice.opus"], + MediaTypes: ["audio/opus"], + }), + }), + ); + }); + + it("does not preflight typed direct-message audio", async () => { + const result = await resolveDiscordPreflightAudioMentionContext({ + message: { + content: "typed caption", + attachments: [ + { + url: "https://cdn.discordapp.com/attachments/voice.ogg", + content_type: "audio/ogg", + filename: "voice.ogg", + }, + ], + }, + isDirectMessage: true, + shouldRequireMention: false, + mentionRegexes: [], + cfg, + }); + + expect(transcribeFirstAudioMock).not.toHaveBeenCalled(); + expect(result).toEqual({ + hasAudioAttachment: true, + hasTypedText: true, + }); + }); + + it("ignores URL-less audio attachments", async () => { + const result = await resolveDiscordPreflightAudioMentionContext({ + message: { + attachments: [ + { + content_type: "audio/ogg", + filename: "voice.ogg", + }, + ], + }, + isDirectMessage: true, + shouldRequireMention: false, + mentionRegexes: [], + cfg, + }); + + expect(transcribeFirstAudioMock).not.toHaveBeenCalled(); + expect(result).toEqual({ + hasAudioAttachment: false, + hasTypedText: false, + }); + }); +}); diff --git a/extensions/discord/src/monitor/preflight-audio.ts b/extensions/discord/src/monitor/preflight-audio.ts index 03e883948a1..d1d6acb3624 100644 --- a/extensions/discord/src/monitor/preflight-audio.ts +++ b/extensions/discord/src/monitor/preflight-audio.ts @@ -1,4 +1,5 @@ import type { OpenClawConfig } from "openclaw/plugin-sdk/config-runtime"; +import { getFileExtension } from "openclaw/plugin-sdk/media-mime"; import { logVerbose } from "openclaw/plugin-sdk/runtime-env"; type DiscordPreflightAudioRuntime = typeof import("./preflight-audio.runtime.js"); @@ -12,16 +13,40 @@ 
function loadDiscordPreflightAudioRuntime(): Promise att.content_type?.startsWith("audio/")); + return attachments.filter( + (att) => typeof att.url === "string" && att.url.length > 0 && inferAudioAttachmentMime(att), + ); } export async function resolveDiscordPreflightAudioMentionContext(params: { @@ -43,12 +68,10 @@ export async function resolveDiscordPreflightAudioMentionContext(params: { const hasAudioAttachment = audioAttachments.length > 0; const hasTypedText = Boolean(params.message.content?.trim()); const needsPreflightTranscription = - !params.isDirectMessage && - params.shouldRequireMention && hasAudioAttachment && // `baseText` includes media placeholders; gate on typed text only. !hasTypedText && - params.mentionRegexes.length > 0; + (params.isDirectMessage || (params.shouldRequireMention && params.mentionRegexes.length > 0)); let transcript: string | undefined; if (needsPreflightTranscription) { @@ -74,7 +97,7 @@ export async function resolveDiscordPreflightAudioMentionContext(params: { ctx: { MediaUrls: audioUrls, MediaTypes: audioAttachments - .map((att) => att.content_type) + .map((att) => inferAudioAttachmentMime(att)) .filter((contentType): contentType is string => Boolean(contentType)), }, cfg: params.cfg,