diff --git a/src/agents/tools/discord-actions-messaging.ts b/src/agents/tools/discord-actions-messaging.ts index 60fcb234953..deec50731c0 100644 --- a/src/agents/tools/discord-actions-messaging.ts +++ b/src/agents/tools/discord-actions-messaging.ts @@ -18,6 +18,7 @@ import { sendMessageDiscord, sendPollDiscord, sendStickerDiscord, + sendVoiceMessageDiscord, unpinMessageDiscord, } from "../../discord/send.js"; import { resolveDiscordChannelId } from "../../discord/targets.js"; @@ -230,11 +231,25 @@ export async function handleDiscordMessagingAction( const to = readStringParam(params, "to", { required: true }); const content = readStringParam(params, "content", { required: true, + allowEmpty: true, }); const mediaUrl = readStringParam(params, "mediaUrl"); const replyTo = readStringParam(params, "replyTo"); + const asVoice = params.asVoice === true; const embeds = Array.isArray(params.embeds) && params.embeds.length > 0 ? params.embeds : undefined; + + // Handle voice message sending + if (asVoice && mediaUrl) { + // Voice messages require a local file path or downloadable URL + // They cannot include text content (Discord limitation) + const result = await sendVoiceMessageDiscord(to, mediaUrl, { + ...(accountId ? { accountId } : {}), + replyTo, + }); + return jsonResult({ ok: true, result, voiceMessage: true }); + } + const result = await sendMessageDiscord(to, content, { ...(accountId ? { accountId } : {}), mediaUrl, diff --git a/src/channels/plugins/actions/discord/handle-action.ts b/src/channels/plugins/actions/discord/handle-action.ts index 1e717967191..dcee3a02c59 100644 --- a/src/channels/plugins/actions/discord/handle-action.ts +++ b/src/channels/plugins/actions/discord/handle-action.ts @@ -41,6 +41,7 @@ export async function handleDiscordMessageAction( const mediaUrl = readStringParam(params, "media", { trim: false }); const replyTo = readStringParam(params, "replyTo"); const embeds = Array.isArray(params.embeds) ? 
params.embeds : undefined; + const asVoice = params.asVoice === true; return await handleDiscordAction( { action: "sendMessage", @@ -50,6 +51,7 @@ export async function handleDiscordMessageAction( mediaUrl: mediaUrl ?? undefined, replyTo: replyTo ?? undefined, embeds, + asVoice, }, cfg, ); diff --git a/src/discord/send.outbound.ts b/src/discord/send.outbound.ts index c639e551835..83dd23d0def 100644 --- a/src/discord/send.outbound.ts +++ b/src/discord/send.outbound.ts @@ -1,6 +1,7 @@ import type { RequestClient } from "@buape/carbon"; import type { APIChannel } from "discord-api-types/v10"; import { ChannelType, Routes } from "discord-api-types/v10"; +import fs from "node:fs/promises"; import type { RetryConfig } from "../infra/retry.js"; import type { PollInput } from "../polls.js"; import type { DiscordSendResult } from "./send.types.js"; @@ -21,6 +22,11 @@ import { sendDiscordMedia, sendDiscordText, } from "./send.shared.js"; +import { + ensureOggOpus, + getVoiceMessageMetadata, + sendDiscordVoiceMessage, +} from "./voice-message.js"; type DiscordSendOpts = { token?: string; @@ -31,6 +37,7 @@ type DiscordSendOpts = { replyTo?: string; retry?: RetryConfig; embeds?: unknown[]; + silent?: boolean; }; /** Discord thread names are capped at 100 characters. 
*/ @@ -131,6 +138,7 @@ export async function sendMessageDiscord( accountInfo.config.maxLinesPerMessage, undefined, chunkMode, + opts.silent, ); for (const chunk of afterMediaChunks) { await sendDiscordText( @@ -142,6 +150,7 @@ export async function sendMessageDiscord( accountInfo.config.maxLinesPerMessage, undefined, chunkMode, + opts.silent, ); } } else { @@ -155,6 +164,7 @@ export async function sendMessageDiscord( accountInfo.config.maxLinesPerMessage, undefined, chunkMode, + opts.silent, ); } } @@ -191,6 +201,7 @@ export async function sendMessageDiscord( accountInfo.config.maxLinesPerMessage, opts.embeds, chunkMode, + opts.silent, ); } else { result = await sendDiscordText( @@ -202,6 +213,7 @@ export async function sendMessageDiscord( accountInfo.config.maxLinesPerMessage, opts.embeds, chunkMode, + opts.silent, ); } } catch (err) { @@ -277,3 +289,87 @@ export async function sendPollDiscord( channelId: String(res.channel_id ?? channelId), }; } + +/** Options accepted by sendVoiceMessageDiscord. */ +type VoiceMessageOpts = { + token?: string; + accountId?: string; + verbose?: boolean; + rest?: RequestClient; + replyTo?: string; + retry?: RetryConfig; + silent?: boolean; +}; + +/** + * Send a voice message to Discord. + * + * Voice messages are a special Discord feature that displays audio with a waveform + * visualization. They require OGG/Opus format and cannot include text content. 
+ * + * @param to - Recipient (user ID for DM or channel ID) + * @param audioPath - Path to local audio file (will be converted to OGG/Opus if needed) + * @param opts - Send options + * @returns The sent message's ID ("unknown" if the response carries none) and its channel ID (falls back to the resolved channel ID) + * @throws Errors raised inside the try block (metadata probing, file read, upload/send) are rethrown through buildDiscordSendError; ensureOggOpus runs before the try block, so conversion errors propagate unwrapped + */ +export async function sendVoiceMessageDiscord( + to: string, + audioPath: string, + opts: VoiceMessageOpts = {}, +): Promise { + const cfg = loadConfig(); + const accountInfo = resolveDiscordAccount({ + cfg, + accountId: opts.accountId, + }); + const { token, rest, request } = createDiscordClient(opts, cfg); + const recipient = await parseAndResolveRecipient(to, opts.accountId); + const { channelId } = await resolveChannelId(rest, recipient, request); + + // Convert to OGG/Opus if needed + const { path: oggPath, cleanup } = await ensureOggOpus(audioPath); + + try { + // Get voice message metadata (duration and waveform) + const metadata = await getVoiceMessageMetadata(oggPath); + + // Read the audio file + const audioBuffer = await fs.readFile(oggPath); + + // Send the voice message + const result = await sendDiscordVoiceMessage( + rest, + channelId, + audioBuffer, + metadata, + opts.replyTo, + request, + opts.silent, + ); + + recordChannelActivity({ + channel: "discord", + accountId: accountInfo.accountId, + direction: "outbound", + }); + + return { + messageId: result.id ? String(result.id) : "unknown", + channelId: String(result.channel_id ?? 
channelId), + }; + } catch (err) { + throw await buildDiscordSendError(err, { + channelId, + rest, + token, + hasMedia: true, + }); + } finally { + // Clean up temporary OGG file if we created one + if (cleanup) { + try { + await fs.unlink(oggPath); + } catch { + // Ignore cleanup errors + } + } + } +} diff --git a/src/discord/send.ts b/src/discord/send.ts index ef4a8d6467d..adc27c8c17d 100644 --- a/src/discord/send.ts +++ b/src/discord/send.ts @@ -37,7 +37,12 @@ export { searchMessagesDiscord, unpinMessageDiscord, } from "./send.messages.js"; -export { sendMessageDiscord, sendPollDiscord, sendStickerDiscord } from "./send.outbound.js"; +export { + sendMessageDiscord, + sendPollDiscord, + sendStickerDiscord, + sendVoiceMessageDiscord, +} from "./send.outbound.js"; export { fetchChannelPermissionsDiscord, fetchReactionsDiscord, diff --git a/src/discord/voice-message.ts b/src/discord/voice-message.ts new file mode 100644 index 00000000000..48b8ab2e416 --- /dev/null +++ b/src/discord/voice-message.ts @@ -0,0 +1,325 @@ +/** + * Discord Voice Message Support + * + * Implements sending voice messages via Discord's API. + * Voice messages require: + * - OGG/Opus format audio + * - Waveform data (base64 encoded, up to 256 samples, 0-255 values) + * - Duration in seconds + * - Message flag 8192 (IS_VOICE_MESSAGE) + * - No other content (text, embeds, etc.) 
/**
 * Runtime requirements: the `ffprobe` and `ffmpeg` binaries must be on PATH.
 * They are invoked to read duration/codec, transcode to OGG/Opus and extract PCM.
 */

import type { RequestClient } from "@buape/carbon";
import { execFile } from "node:child_process";
import { randomUUID } from "node:crypto";
import fs from "node:fs/promises";
import os from "node:os";
import path from "node:path";
import { promisify } from "node:util";
import type { RetryRunner } from "../infra/retry-policy.js";

const execFileAsync = promisify(execFile);

/** Discord message flag IS_VOICE_MESSAGE (1 << 13 = 8192). */
const DISCORD_VOICE_MESSAGE_FLAG = 1 << 13;
/** Discord message flag SUPPRESS_NOTIFICATIONS (1 << 12 = 4096); set for silent sends. */
const SUPPRESS_NOTIFICATIONS_FLAG = 1 << 12;
/** Number of amplitude points in the waveform sent alongside the audio. */
const WAVEFORM_SAMPLES = 256;
/** Raw PCM is extracted as signed 16-bit little-endian samples. */
const PCM_BYTES_PER_SAMPLE = 2;
const PCM_MAX_AMPLITUDE = 32767;

export type VoiceMessageMetadata = {
  durationSecs: number;
  waveform: string; // base64 encoded
};

/**
 * Build a unique path inside the OS temp directory.
 * A random UUID (instead of a timestamp) keeps concurrent sends from clobbering
 * each other's intermediate files.
 */
function tempFilePath(prefix: string, extension: string): string {
  return path.join(os.tmpdir(), `${prefix}-${randomUUID()}${extension}`);
}

/** Best-effort file removal; a missing file or failed unlink is deliberately ignored. */
async function removeQuietly(filePath: string): Promise<void> {
  try {
    await fs.unlink(filePath);
  } catch {
    // Ignore cleanup errors
  }
}

/**
 * Get audio duration using ffprobe.
 *
 * @param filePath - Audio file to probe
 * @returns Duration in seconds, rounded to 2 decimal places
 * @throws Error prefixed with "Failed to get audio duration" when ffprobe fails
 *   or prints something that is not a finite number
 */
export async function getAudioDuration(filePath: string): Promise<number> {
  try {
    const { stdout } = await execFileAsync("ffprobe", [
      "-v",
      "error",
      "-show_entries",
      "format=duration",
      "-of",
      "csv=p=0",
      filePath,
    ]);
    const duration = Number.parseFloat(stdout.trim());
    if (!Number.isFinite(duration)) {
      throw new Error("Could not parse duration");
    }
    return Math.round(duration * 100) / 100;
  } catch (err) {
    throw new Error(
      `Failed to get audio duration: ${err instanceof Error ? err.message : String(err)}`,
    );
  }
}

/**
 * Generate waveform data for an audio file.
 *
 * @param filePath - Audio file to analyse
 * @returns Base64 encoded byte array of WAVEFORM_SAMPLES amplitude values (0-255).
 *   Never rejects: if PCM extraction fails, a synthetic placeholder waveform is
 *   returned so the message can still be sent.
 */
export async function generateWaveform(filePath: string): Promise<string> {
  try {
    return await generateWaveformFromPcm(filePath);
  } catch {
    return generatePlaceholderWaveform();
  }
}

/**
 * Reduce raw signed 16-bit little-endian mono PCM to `sampleCount` amplitude bytes.
 *
 * Each output byte is the mean absolute amplitude of one segment, scaled to 0-255.
 * Samples are read with `readInt16LE`, so the result does not depend on the
 * buffer's alignment or on host endianness. A trailing odd byte is ignored, and
 * inputs shorter than `sampleCount` samples are zero-padded at the end.
 */
function computeWaveform(pcm: Buffer, sampleCount: number = WAVEFORM_SAMPLES): string {
  const totalSamples = Math.floor(pcm.byteLength / PCM_BYTES_PER_SAMPLE);
  const step = Math.max(1, Math.floor(totalSamples / sampleCount));
  // Zero-initialised, so segments we never reach are already padded.
  const waveform = new Uint8Array(sampleCount);

  for (let i = 0; i < sampleCount; i++) {
    const start = i * step;
    if (start >= totalSamples) {
      break;
    }
    const end = Math.min(start + step, totalSamples);
    let sum = 0;
    for (let j = start; j < end; j++) {
      sum += Math.abs(pcm.readInt16LE(j * PCM_BYTES_PER_SAMPLE));
    }
    const average = sum / (end - start);
    // -32768 has magnitude 32768, hence the clamp.
    waveform[i] = Math.min(255, Math.round((average / PCM_MAX_AMPLITUDE) * 255));
  }

  return Buffer.from(waveform).toString("base64");
}

/**
 * Generate waveform by extracting raw PCM data (mono, 8 kHz, s16le) with ffmpeg
 * into a temporary file and sampling its amplitudes. The temporary file is
 * removed whether or not extraction succeeds.
 */
async function generateWaveformFromPcm(filePath: string): Promise<string> {
  const tempPcm = tempFilePath("waveform", ".raw");

  try {
    await execFileAsync("ffmpeg", [
      "-y",
      "-i",
      filePath,
      "-f",
      "s16le",
      "-acodec",
      "pcm_s16le",
      "-ac",
      "1",
      "-ar",
      "8000",
      tempPcm,
    ]);
    return computeWaveform(await fs.readFile(tempPcm));
  } finally {
    await removeQuietly(tempPcm);
  }
}

/**
 * Generate a placeholder waveform (for when audio processing fails):
 * a sine-like pattern centred on 128 with amplitude 64.
 */
function generatePlaceholderWaveform(): string {
  const waveform = Uint8Array.from({ length: WAVEFORM_SAMPLES }, (_, i) => {
    const value = Math.round(128 + 64 * Math.sin((i / WAVEFORM_SAMPLES) * Math.PI * 8));
    return Math.min(255, Math.max(0, value));
  });
  return Buffer.from(waveform).toString("base64");
}

/** Report whether the first audio stream is Opus; a failed probe counts as "no". */
async function isOpusStream(filePath: string): Promise<boolean> {
  try {
    const { stdout } = await execFileAsync("ffprobe", [
      "-v",
      "error",
      "-select_streams",
      "a:0",
      "-show_entries",
      "stream=codec_name",
      "-of",
      "csv=p=0",
      filePath,
    ]);
    return stdout.trim().toLowerCase() === "opus";
  } catch {
    return false;
  }
}

/**
 * Convert audio file to OGG/Opus format if needed.
 *
 * A `.ogg` file is only reused when its codec is Opus (Vorbis won't play on
 * mobile); everything else is transcoded into a uniquely named temp file.
 *
 * NOTE(review): `filePath` is handed to ffmpeg/ffprobe unmodified; callers should
 * validate it if it can originate from untrusted input.
 *
 * @param filePath - Source audio file
 * @returns `path` of the OGG/Opus file and `cleanup: true` when that file is a
 *   temporary one the caller must delete
 * @throws The ffmpeg error when transcoding fails (any partial output is removed first)
 */
export async function ensureOggOpus(filePath: string): Promise<{ path: string; cleanup: boolean }> {
  const isOgg = path.extname(filePath).toLowerCase() === ".ogg";
  if (isOgg && (await isOpusStream(filePath))) {
    return { path: filePath, cleanup: false };
  }

  const outputPath = tempFilePath("voice", ".ogg");
  try {
    await execFileAsync("ffmpeg", [
      "-y",
      "-i",
      filePath,
      "-c:a",
      "libopus",
      "-b:a",
      "64k",
      outputPath,
    ]);
  } catch (err) {
    // The caller never learns this path, so a partial file would leak otherwise.
    await removeQuietly(outputPath);
    throw err;
  }

  return { path: outputPath, cleanup: true };
}

/**
 * Get voice message metadata (duration and waveform); both are computed in parallel.
 *
 * @throws When the duration cannot be determined (waveform generation never rejects)
 */
export async function getVoiceMessageMetadata(filePath: string): Promise<VoiceMessageMetadata> {
  const [durationSecs, waveform] = await Promise.all([
    getAudioDuration(filePath),
    generateWaveform(filePath),
  ]);

  return { durationSecs, waveform };
}

type UploadUrlResponse = {
  attachments: Array<{
    id: number;
    upload_url: string;
    upload_filename: string;
  }>;
};

type VoiceMessagePayload = {
  flags: number;
  attachments: Array<{
    id: string;
    filename: string;
    uploaded_filename: string;
    duration_secs: number;
    waveform: string;
  }>;
  message_reference?: { message_id: string; fail_if_not_exists: boolean };
};

/**
 * Send a voice message to Discord.
 *
 * This follows Discord's voice message protocol:
 * 1. Request upload URL from Discord
 * 2. Upload the OGG file to the provided URL
 * 3. Send the message with the IS_VOICE_MESSAGE flag and attachment metadata
 *
 * @param rest - Client used for the two Discord REST calls
 * @param channelId - Target channel
 * @param audioBuffer - OGG/Opus audio bytes
 * @param metadata - Duration and base64 waveform attached to the message
 * @param replyTo - Optional message ID to reply to
 * @param request - Runner that wraps each REST call (labelled "voice-upload-url" / "voice-message")
 * @param silent - When true, SUPPRESS_NOTIFICATIONS is OR-ed into the message flags
 * @returns The created message's `id` and `channel_id`
 * @throws When Discord returns no upload slot or the PUT upload responds with a non-2xx status
 */
export async function sendDiscordVoiceMessage(
  rest: RequestClient,
  channelId: string,
  audioBuffer: Buffer,
  metadata: VoiceMessageMetadata,
  replyTo: string | undefined,
  request: RetryRunner,
  silent?: boolean,
): Promise<{ id: string; channel_id: string }> {
  const filename = "voice-message.ogg";

  // Step 1: Request upload URL
  const uploadUrlResponse = (await request(
    () =>
      rest.post(`/channels/${channelId}/attachments`, {
        body: {
          files: [
            {
              filename,
              file_size: audioBuffer.byteLength,
              id: "0",
            },
          ],
        },
      }) as Promise<UploadUrlResponse>,
    "voice-upload-url",
  )) as UploadUrlResponse;

  const uploadSlot = uploadUrlResponse.attachments?.[0];
  if (!uploadSlot) {
    throw new Error("Failed to get upload URL for voice message");
  }

  // Step 2: Upload the file to Discord's CDN
  const uploadResponse = await fetch(uploadSlot.upload_url, {
    method: "PUT",
    headers: {
      "Content-Type": "audio/ogg",
    },
    body: new Uint8Array(audioBuffer),
  });

  if (!uploadResponse.ok) {
    throw new Error(`Failed to upload voice message: ${uploadResponse.status}`);
  }

  // Step 3: Send the message with voice message flag and metadata
  const messagePayload: VoiceMessagePayload = {
    flags: silent
      ? DISCORD_VOICE_MESSAGE_FLAG | SUPPRESS_NOTIFICATIONS_FLAG
      : DISCORD_VOICE_MESSAGE_FLAG,
    attachments: [
      {
        id: "0",
        filename,
        uploaded_filename: uploadSlot.upload_filename,
        duration_secs: metadata.durationSecs,
        waveform: metadata.waveform,
      },
    ],
  };

  // Note: Voice messages cannot have content, but can have message_reference for replies
  if (replyTo) {
    messagePayload.message_reference = {
      message_id: replyTo,
      fail_if_not_exists: false,
    };
  }

  return (await request(
    () =>
      rest.post(`/channels/${channelId}/messages`, {
        body: messagePayload,
      }) as Promise<{ id: string; channel_id: string }>,
    "voice-message",
  )) as { id: string; channel_id: string };
}