mirror of
https://github.com/openclaw/openclaw.git
synced 2026-03-28 02:12:07 +00:00
369 lines
11 KiB
TypeScript
369 lines
11 KiB
TypeScript
/**
|
|
* Discord Voice Message Support
|
|
*
|
|
* Implements sending voice messages via Discord's API.
|
|
* Voice messages require:
|
|
* - OGG/Opus format audio
|
|
* - Waveform data (base64 encoded, up to 256 samples, 0-255 values)
|
|
* - Duration in seconds
|
|
* - Message flag 8192 (IS_VOICE_MESSAGE)
|
|
* - No other content (text, embeds, etc.)
|
|
*/
|
|
|
|
import crypto from "node:crypto";
|
|
import fs from "node:fs/promises";
|
|
import path from "node:path";
|
|
import { RateLimitError, type RequestClient } from "@buape/carbon";
|
|
import type { RetryRunner } from "openclaw/plugin-sdk/infra-runtime";
|
|
import { resolvePreferredOpenClawTmpDir } from "openclaw/plugin-sdk/infra-runtime";
|
|
import {
|
|
parseFfprobeCodecAndSampleRate,
|
|
runFfmpeg,
|
|
runFfprobe,
|
|
} from "openclaw/plugin-sdk/media-runtime";
|
|
import { MEDIA_FFMPEG_MAX_AUDIO_DURATION_SECS } from "openclaw/plugin-sdk/media-runtime";
|
|
import { unlinkIfExists } from "openclaw/plugin-sdk/media-runtime";
|
|
|
|
const DISCORD_VOICE_MESSAGE_FLAG = 1 << 13;
|
|
const SUPPRESS_NOTIFICATIONS_FLAG = 1 << 12;
|
|
const WAVEFORM_SAMPLES = 256;
|
|
const DISCORD_OPUS_SAMPLE_RATE_HZ = 48_000;
|
|
|
|
export type VoiceMessageMetadata = {
|
|
durationSecs: number;
|
|
waveform: string; // base64 encoded
|
|
};
|
|
|
|
/**
|
|
* Get audio duration using ffprobe
|
|
*/
|
|
export async function getAudioDuration(filePath: string): Promise<number> {
|
|
try {
|
|
const stdout = await runFfprobe([
|
|
"-v",
|
|
"error",
|
|
"-show_entries",
|
|
"format=duration",
|
|
"-of",
|
|
"csv=p=0",
|
|
filePath,
|
|
]);
|
|
const duration = parseFloat(stdout.trim());
|
|
if (isNaN(duration)) {
|
|
throw new Error("Could not parse duration");
|
|
}
|
|
return Math.round(duration * 100) / 100; // Round to 2 decimal places
|
|
} catch (err) {
|
|
const errMessage = err instanceof Error ? err.message : String(err);
|
|
throw new Error(`Failed to get audio duration: ${errMessage}`, { cause: err });
|
|
}
|
|
}
|
|
|
|
/**
|
|
* Generate waveform data from audio file using ffmpeg
|
|
* Returns base64 encoded byte array of amplitude samples (0-255)
|
|
*/
|
|
export async function generateWaveform(filePath: string): Promise<string> {
|
|
try {
|
|
// Extract raw PCM and sample amplitude values
|
|
return await generateWaveformFromPcm(filePath);
|
|
} catch {
|
|
// If PCM extraction fails, generate a placeholder waveform
|
|
return generatePlaceholderWaveform();
|
|
}
|
|
}
|
|
|
|
/**
|
|
* Generate waveform by extracting raw PCM data and sampling amplitudes
|
|
*/
|
|
async function generateWaveformFromPcm(filePath: string): Promise<string> {
|
|
const tempDir = resolvePreferredOpenClawTmpDir();
|
|
const tempPcm = path.join(tempDir, `waveform-${crypto.randomUUID()}.raw`);
|
|
|
|
try {
|
|
// Convert to raw 16-bit signed PCM, mono, 8kHz
|
|
await runFfmpeg([
|
|
"-y",
|
|
"-i",
|
|
filePath,
|
|
"-vn",
|
|
"-sn",
|
|
"-dn",
|
|
"-t",
|
|
String(MEDIA_FFMPEG_MAX_AUDIO_DURATION_SECS),
|
|
"-f",
|
|
"s16le",
|
|
"-acodec",
|
|
"pcm_s16le",
|
|
"-ac",
|
|
"1",
|
|
"-ar",
|
|
"8000",
|
|
tempPcm,
|
|
]);
|
|
|
|
const pcmData = await fs.readFile(tempPcm);
|
|
const samples = new Int16Array(pcmData.buffer, pcmData.byteOffset, pcmData.byteLength / 2);
|
|
|
|
// Sample the PCM data to get WAVEFORM_SAMPLES points
|
|
const step = Math.max(1, Math.floor(samples.length / WAVEFORM_SAMPLES));
|
|
const waveform: number[] = [];
|
|
|
|
for (let i = 0; i < WAVEFORM_SAMPLES && i * step < samples.length; i++) {
|
|
// Get average absolute amplitude for this segment
|
|
let sum = 0;
|
|
let count = 0;
|
|
for (let j = 0; j < step && i * step + j < samples.length; j++) {
|
|
sum += Math.abs(samples[i * step + j]);
|
|
count++;
|
|
}
|
|
const avg = count > 0 ? sum / count : 0;
|
|
// Normalize to 0-255 (16-bit signed max is 32767)
|
|
const normalized = Math.min(255, Math.round((avg / 32767) * 255));
|
|
waveform.push(normalized);
|
|
}
|
|
|
|
// Pad with zeros if we don't have enough samples
|
|
while (waveform.length < WAVEFORM_SAMPLES) {
|
|
waveform.push(0);
|
|
}
|
|
|
|
return Buffer.from(waveform).toString("base64");
|
|
} finally {
|
|
await unlinkIfExists(tempPcm);
|
|
}
|
|
}
|
|
|
|
/**
|
|
* Generate a placeholder waveform (for when audio processing fails)
|
|
*/
|
|
function generatePlaceholderWaveform(): string {
|
|
// Generate a simple sine-wave-like pattern
|
|
const waveform: number[] = [];
|
|
for (let i = 0; i < WAVEFORM_SAMPLES; i++) {
|
|
const value = Math.round(128 + 64 * Math.sin((i / WAVEFORM_SAMPLES) * Math.PI * 8));
|
|
waveform.push(Math.min(255, Math.max(0, value)));
|
|
}
|
|
return Buffer.from(waveform).toString("base64");
|
|
}
|
|
|
|
/**
|
|
* Convert audio file to OGG/Opus format if needed
|
|
* Returns path to the OGG file (may be same as input if already OGG/Opus)
|
|
*/
|
|
export async function ensureOggOpus(filePath: string): Promise<{ path: string; cleanup: boolean }> {
|
|
const trimmed = filePath.trim();
|
|
// Defense-in-depth: callers should never hand ffmpeg/ffprobe a URL/protocol path.
|
|
if (/^[a-z][a-z0-9+.-]*:\/\//i.test(trimmed)) {
|
|
throw new Error(
|
|
`Voice message conversion requires a local file path; received a URL/protocol source: ${trimmed}`,
|
|
);
|
|
}
|
|
|
|
const ext = path.extname(filePath).toLowerCase();
|
|
|
|
// Check if already OGG
|
|
if (ext === ".ogg") {
|
|
// Fast-path only when the file is Opus at Discord's expected 48kHz.
|
|
try {
|
|
const stdout = await runFfprobe([
|
|
"-v",
|
|
"error",
|
|
"-select_streams",
|
|
"a:0",
|
|
"-show_entries",
|
|
"stream=codec_name,sample_rate",
|
|
"-of",
|
|
"csv=p=0",
|
|
filePath,
|
|
]);
|
|
const { codec, sampleRateHz } = parseFfprobeCodecAndSampleRate(stdout);
|
|
if (codec === "opus" && sampleRateHz === DISCORD_OPUS_SAMPLE_RATE_HZ) {
|
|
return { path: filePath, cleanup: false };
|
|
}
|
|
} catch {
|
|
// If probe fails, convert anyway
|
|
}
|
|
}
|
|
|
|
// Convert to OGG/Opus
|
|
// Always resample to 48kHz to ensure Discord voice messages play at correct speed
|
|
// (Discord expects 48kHz; lower sample rates like 24kHz from some TTS providers cause 0.5x playback)
|
|
const tempDir = resolvePreferredOpenClawTmpDir();
|
|
const outputPath = path.join(tempDir, `voice-${crypto.randomUUID()}.ogg`);
|
|
|
|
await runFfmpeg([
|
|
"-y",
|
|
"-i",
|
|
filePath,
|
|
"-vn",
|
|
"-sn",
|
|
"-dn",
|
|
"-t",
|
|
String(MEDIA_FFMPEG_MAX_AUDIO_DURATION_SECS),
|
|
"-ar",
|
|
String(DISCORD_OPUS_SAMPLE_RATE_HZ),
|
|
"-c:a",
|
|
"libopus",
|
|
"-b:a",
|
|
"64k",
|
|
outputPath,
|
|
]);
|
|
|
|
return { path: outputPath, cleanup: true };
|
|
}
|
|
|
|
/**
|
|
* Get voice message metadata (duration and waveform)
|
|
*/
|
|
export async function getVoiceMessageMetadata(filePath: string): Promise<VoiceMessageMetadata> {
|
|
const [durationSecs, waveform] = await Promise.all([
|
|
getAudioDuration(filePath),
|
|
generateWaveform(filePath),
|
|
]);
|
|
|
|
return { durationSecs, waveform };
|
|
}
|
|
|
|
type UploadUrlResponse = {
|
|
attachments: Array<{
|
|
id: number;
|
|
upload_url: string;
|
|
upload_filename: string;
|
|
}>;
|
|
};
|
|
|
|
/**
|
|
* Send a voice message to Discord
|
|
*
|
|
* This follows Discord's voice message protocol:
|
|
* 1. Request upload URL from Discord
|
|
* 2. Upload the OGG file to the provided URL
|
|
* 3. Send the message with flag 8192 and attachment metadata
|
|
*/
|
|
export async function sendDiscordVoiceMessage(
|
|
rest: RequestClient,
|
|
channelId: string,
|
|
audioBuffer: Buffer,
|
|
metadata: VoiceMessageMetadata,
|
|
replyTo: string | undefined,
|
|
request: RetryRunner,
|
|
silent?: boolean,
|
|
token?: string,
|
|
): Promise<{ id: string; channel_id: string }> {
|
|
const filename = "voice-message.ogg";
|
|
const fileSize = audioBuffer.byteLength;
|
|
|
|
// Step 1: Request upload URL from Discord
|
|
// Must use fetch() directly instead of rest.post() because @buape/carbon's
|
|
// RequestClient auto-converts requests to multipart/form-data when the body
|
|
// contains a "files" key. Discord's /attachments endpoint expects JSON, so
|
|
// the auto-conversion causes HTTP 400 "Expected Content-Type application/json".
|
|
const botToken = token;
|
|
if (!botToken) {
|
|
throw new Error("Discord bot token is required for voice message upload");
|
|
}
|
|
const uploadUrlResponse = await request(async () => {
|
|
const url = `${rest.options?.baseUrl ?? "https://discord.com/api"}/channels/${channelId}/attachments`;
|
|
const res = await fetch(url, {
|
|
method: "POST",
|
|
headers: {
|
|
Authorization: `Bot ${botToken}`,
|
|
"Content-Type": "application/json",
|
|
},
|
|
body: JSON.stringify({
|
|
files: [{ filename, file_size: fileSize, id: "0" }],
|
|
}),
|
|
});
|
|
if (!res.ok) {
|
|
if (res.status === 429) {
|
|
const retryData = (await res.json().catch(() => ({}))) as {
|
|
message?: string;
|
|
retry_after?: number;
|
|
global?: boolean;
|
|
};
|
|
throw new RateLimitError(res, {
|
|
message: retryData.message ?? "You are being rate limited.",
|
|
retry_after: retryData.retry_after ?? 1,
|
|
global: retryData.global ?? false,
|
|
});
|
|
}
|
|
const errorBody = (await res.json().catch(() => null)) as {
|
|
code?: number;
|
|
message?: string;
|
|
} | null;
|
|
const err = new Error(`Upload URL request failed: ${res.status} ${errorBody?.message ?? ""}`);
|
|
if (errorBody?.code !== undefined) {
|
|
(err as Error & { code: number }).code = errorBody.code;
|
|
}
|
|
throw err;
|
|
}
|
|
return (await res.json()) as UploadUrlResponse;
|
|
}, "voice-upload-url");
|
|
|
|
if (!uploadUrlResponse.attachments?.[0]) {
|
|
throw new Error("Failed to get upload URL for voice message");
|
|
}
|
|
|
|
const { upload_url, upload_filename } = uploadUrlResponse.attachments[0];
|
|
|
|
// Step 2: Upload the file to Discord's CDN
|
|
// Note: Not wrapped in retry runner - upload URLs are single-use and CDN behavior differs
|
|
const uploadResponse = await fetch(upload_url, {
|
|
method: "PUT",
|
|
headers: {
|
|
"Content-Type": "audio/ogg",
|
|
},
|
|
body: new Uint8Array(audioBuffer),
|
|
});
|
|
|
|
if (!uploadResponse.ok) {
|
|
throw new Error(`Failed to upload voice message: ${uploadResponse.status}`);
|
|
}
|
|
|
|
// Step 3: Send the message with voice message flag and metadata
|
|
const flags = silent
|
|
? DISCORD_VOICE_MESSAGE_FLAG | SUPPRESS_NOTIFICATIONS_FLAG
|
|
: DISCORD_VOICE_MESSAGE_FLAG;
|
|
const messagePayload: {
|
|
flags: number;
|
|
attachments: Array<{
|
|
id: string;
|
|
filename: string;
|
|
uploaded_filename: string;
|
|
duration_secs: number;
|
|
waveform: string;
|
|
}>;
|
|
message_reference?: { message_id: string; fail_if_not_exists: boolean };
|
|
} = {
|
|
flags,
|
|
attachments: [
|
|
{
|
|
id: "0",
|
|
filename,
|
|
uploaded_filename: upload_filename,
|
|
duration_secs: metadata.durationSecs,
|
|
waveform: metadata.waveform,
|
|
},
|
|
],
|
|
};
|
|
|
|
// Note: Voice messages cannot have content, but can have message_reference for replies
|
|
if (replyTo) {
|
|
messagePayload.message_reference = {
|
|
message_id: replyTo,
|
|
fail_if_not_exists: false,
|
|
};
|
|
}
|
|
|
|
const res = (await request(
|
|
() =>
|
|
rest.post(`/channels/${channelId}/messages`, {
|
|
body: messagePayload,
|
|
}) as Promise<{ id: string; channel_id: string }>,
|
|
"voice-message",
|
|
)) as { id: string; channel_id: string };
|
|
|
|
return res;
|
|
}
|