/** * Discord Voice Message Support * * Implements sending voice messages via Discord's API. * Voice messages require: * - OGG/Opus format audio * - Waveform data (base64 encoded, up to 256 samples, 0-255 values) * - Duration in seconds * - Message flag 8192 (IS_VOICE_MESSAGE) * - No other content (text, embeds, etc.) */ import crypto from "node:crypto"; import fs from "node:fs/promises"; import path from "node:path"; import { RateLimitError, type RequestClient } from "@buape/carbon"; import type { RetryRunner } from "openclaw/plugin-sdk/infra-runtime"; import { resolvePreferredOpenClawTmpDir } from "openclaw/plugin-sdk/infra-runtime"; import { parseFfprobeCodecAndSampleRate, runFfmpeg, runFfprobe, } from "openclaw/plugin-sdk/media-runtime"; import { MEDIA_FFMPEG_MAX_AUDIO_DURATION_SECS } from "openclaw/plugin-sdk/media-runtime"; import { unlinkIfExists } from "openclaw/plugin-sdk/media-runtime"; const DISCORD_VOICE_MESSAGE_FLAG = 1 << 13; const SUPPRESS_NOTIFICATIONS_FLAG = 1 << 12; const WAVEFORM_SAMPLES = 256; const DISCORD_OPUS_SAMPLE_RATE_HZ = 48_000; export type VoiceMessageMetadata = { durationSecs: number; waveform: string; // base64 encoded }; /** * Get audio duration using ffprobe */ export async function getAudioDuration(filePath: string): Promise { try { const stdout = await runFfprobe([ "-v", "error", "-show_entries", "format=duration", "-of", "csv=p=0", filePath, ]); const duration = parseFloat(stdout.trim()); if (isNaN(duration)) { throw new Error("Could not parse duration"); } return Math.round(duration * 100) / 100; // Round to 2 decimal places } catch (err) { const errMessage = err instanceof Error ? err.message : String(err); throw new Error(`Failed to get audio duration: ${errMessage}`, { cause: err }); } } /** * Generate waveform data from audio file using ffmpeg * Returns base64 encoded byte array of amplitude samples (0-255) */ export async function generateWaveform(filePath: string): Promise { try { // Extract raw PCM and sample amplitude values return await generateWaveformFromPcm(filePath); } catch { // If PCM extraction fails, generate a placeholder waveform return generatePlaceholderWaveform(); } } /** * Generate waveform by extracting raw PCM data and sampling amplitudes */ async function generateWaveformFromPcm(filePath: string): Promise { const tempDir = resolvePreferredOpenClawTmpDir(); const tempPcm = path.join(tempDir, `waveform-${crypto.randomUUID()}.raw`); try { // Convert to raw 16-bit signed PCM, mono, 8kHz await runFfmpeg([ "-y", "-i", filePath, "-vn", "-sn", "-dn", "-t", String(MEDIA_FFMPEG_MAX_AUDIO_DURATION_SECS), "-f", "s16le", "-acodec", "pcm_s16le", "-ac", "1", "-ar", "8000", tempPcm, ]); const pcmData = await fs.readFile(tempPcm); const samples = new Int16Array(pcmData.buffer, pcmData.byteOffset, pcmData.byteLength / 2); // Sample the PCM data to get WAVEFORM_SAMPLES points const step = Math.max(1, Math.floor(samples.length / WAVEFORM_SAMPLES)); const waveform: number[] = []; for (let i = 0; i < WAVEFORM_SAMPLES && i * step < samples.length; i++) { // Get average absolute amplitude for this segment let sum = 0; let count = 0; for (let j = 0; j < step && i * step + j < samples.length; j++) { sum += Math.abs(samples[i * step + j]); count++; } const avg = count > 0 ? sum / count : 0; // Normalize to 0-255 (16-bit signed max is 32767) const normalized = Math.min(255, Math.round((avg / 32767) * 255)); waveform.push(normalized); } // Pad with zeros if we don't have enough samples while (waveform.length < WAVEFORM_SAMPLES) { waveform.push(0); } return Buffer.from(waveform).toString("base64"); } finally { await unlinkIfExists(tempPcm); } } /** * Generate a placeholder waveform (for when audio processing fails) */ function generatePlaceholderWaveform(): string { // Generate a simple sine-wave-like pattern const waveform: number[] = []; for (let i = 0; i < WAVEFORM_SAMPLES; i++) { const value = Math.round(128 + 64 * Math.sin((i / WAVEFORM_SAMPLES) * Math.PI * 8)); waveform.push(Math.min(255, Math.max(0, value))); } return Buffer.from(waveform).toString("base64"); } /** * Convert audio file to OGG/Opus format if needed * Returns path to the OGG file (may be same as input if already OGG/Opus) */ export async function ensureOggOpus(filePath: string): Promise<{ path: string; cleanup: boolean }> { const trimmed = filePath.trim(); // Defense-in-depth: callers should never hand ffmpeg/ffprobe a URL/protocol path. if (/^[a-z][a-z0-9+.-]*:\/\//i.test(trimmed)) { throw new Error( `Voice message conversion requires a local file path; received a URL/protocol source: ${trimmed}`, ); } const ext = path.extname(filePath).toLowerCase(); // Check if already OGG if (ext === ".ogg") { // Fast-path only when the file is Opus at Discord's expected 48kHz. try { const stdout = await runFfprobe([ "-v", "error", "-select_streams", "a:0", "-show_entries", "stream=codec_name,sample_rate", "-of", "csv=p=0", filePath, ]); const { codec, sampleRateHz } = parseFfprobeCodecAndSampleRate(stdout); if (codec === "opus" && sampleRateHz === DISCORD_OPUS_SAMPLE_RATE_HZ) { return { path: filePath, cleanup: false }; } } catch { // If probe fails, convert anyway } } // Convert to OGG/Opus // Always resample to 48kHz to ensure Discord voice messages play at correct speed // (Discord expects 48kHz; lower sample rates like 24kHz from some TTS providers cause 0.5x playback) const tempDir = resolvePreferredOpenClawTmpDir(); const outputPath = path.join(tempDir, `voice-${crypto.randomUUID()}.ogg`); await runFfmpeg([ "-y", "-i", filePath, "-vn", "-sn", "-dn", "-t", String(MEDIA_FFMPEG_MAX_AUDIO_DURATION_SECS), "-ar", String(DISCORD_OPUS_SAMPLE_RATE_HZ), "-c:a", "libopus", "-b:a", "64k", outputPath, ]); return { path: outputPath, cleanup: true }; } /** * Get voice message metadata (duration and waveform) */ export async function getVoiceMessageMetadata(filePath: string): Promise { const [durationSecs, waveform] = await Promise.all([ getAudioDuration(filePath), generateWaveform(filePath), ]); return { durationSecs, waveform }; } type UploadUrlResponse = { attachments: Array<{ id: number; upload_url: string; upload_filename: string; }>; }; /** * Send a voice message to Discord * * This follows Discord's voice message protocol: * 1. Request upload URL from Discord * 2. Upload the OGG file to the provided URL * 3. Send the message with flag 8192 and attachment metadata */ export async function sendDiscordVoiceMessage( rest: RequestClient, channelId: string, audioBuffer: Buffer, metadata: VoiceMessageMetadata, replyTo: string | undefined, request: RetryRunner, silent?: boolean, token?: string, ): Promise<{ id: string; channel_id: string }> { const filename = "voice-message.ogg"; const fileSize = audioBuffer.byteLength; // Step 1: Request upload URL from Discord // Must use fetch() directly instead of rest.post() because @buape/carbon's // RequestClient auto-converts requests to multipart/form-data when the body // contains a "files" key. Discord's /attachments endpoint expects JSON, so // the auto-conversion causes HTTP 400 "Expected Content-Type application/json". const botToken = token; if (!botToken) { throw new Error("Discord bot token is required for voice message upload"); } const uploadUrlResponse = await request(async () => { const url = `${rest.options?.baseUrl ?? "https://discord.com/api"}/channels/${channelId}/attachments`; const res = await fetch(url, { method: "POST", headers: { Authorization: `Bot ${botToken}`, "Content-Type": "application/json", }, body: JSON.stringify({ files: [{ filename, file_size: fileSize, id: "0" }], }), }); if (!res.ok) { if (res.status === 429) { const retryData = (await res.json().catch(() => ({}))) as { message?: string; retry_after?: number; global?: boolean; }; throw new RateLimitError(res, { message: retryData.message ?? "You are being rate limited.", retry_after: retryData.retry_after ?? 1, global: retryData.global ?? false, }); } const errorBody = (await res.json().catch(() => null)) as { code?: number; message?: string; } | null; const err = new Error(`Upload URL request failed: ${res.status} ${errorBody?.message ?? ""}`); if (errorBody?.code !== undefined) { (err as Error & { code: number }).code = errorBody.code; } throw err; } return (await res.json()) as UploadUrlResponse; }, "voice-upload-url"); if (!uploadUrlResponse.attachments?.[0]) { throw new Error("Failed to get upload URL for voice message"); } const { upload_url, upload_filename } = uploadUrlResponse.attachments[0]; // Step 2: Upload the file to Discord's CDN // Note: Not wrapped in retry runner - upload URLs are single-use and CDN behavior differs const uploadResponse = await fetch(upload_url, { method: "PUT", headers: { "Content-Type": "audio/ogg", }, body: new Uint8Array(audioBuffer), }); if (!uploadResponse.ok) { throw new Error(`Failed to upload voice message: ${uploadResponse.status}`); } // Step 3: Send the message with voice message flag and metadata const flags = silent ? DISCORD_VOICE_MESSAGE_FLAG | SUPPRESS_NOTIFICATIONS_FLAG : DISCORD_VOICE_MESSAGE_FLAG; const messagePayload: { flags: number; attachments: Array<{ id: string; filename: string; uploaded_filename: string; duration_secs: number; waveform: string; }>; message_reference?: { message_id: string; fail_if_not_exists: boolean }; } = { flags, attachments: [ { id: "0", filename, uploaded_filename: upload_filename, duration_secs: metadata.durationSecs, waveform: metadata.waveform, }, ], }; // Note: Voice messages cannot have content, but can have message_reference for replies if (replyTo) { messagePayload.message_reference = { message_id: replyTo, fail_if_not_exists: false, }; } const res = (await request( () => rest.post(`/channels/${channelId}/messages`, { body: messagePayload, }) as Promise<{ id: string; channel_id: string }>, "voice-message", )) as { id: string; channel_id: string }; return res; }