Files
2026-05-01 16:25:10 +01:00

267 lines
7.1 KiB
TypeScript

import * as crypto from "node:crypto";
import { fetchWithSsrFGuard } from "openclaw/plugin-sdk/ssrf-runtime";
/** Audio encoding names accepted by the `encoding` option of the TTS helpers in this module. */
export type VolcengineTtsEncoding = "ogg_opus" | "mp3" | "pcm" | "wav";
/**
 * Options accepted by `volcengineTTS` and the two provider-specific helpers.
 *
 * `volcengineTTS` takes the BytePlus Seed Speech path when `apiKey` is set, and otherwise the
 * legacy Volcengine path when both `appId` and `token` are set.
 */
type VolcengineTTSParams = {
/** Text to synthesize; sent as the request text on both paths. */
text: string;
/** Seed Speech credential; sent as the `X-Api-Key` header. Selects the Seed Speech path. */
apiKey?: string;
/** Legacy credential; sent as `app.appid` in the legacy payload. */
appId?: string;
/** Legacy credential; sent as `app.token` in the payload and in the `Authorization` header. */
token?: string;
/** Voice identifier (`speaker` on the Seed path, `voice_type` on the legacy path). Each path has its own default. */
voice?: string;
/** Legacy only: sent as `app.cluster`. */
cluster?: string;
/** Seed only: sent as the `X-Api-Resource-Id` header. */
resourceId?: string;
/** Seed only: sent as the `X-Api-App-Key` header. */
appKey?: string;
/** Endpoint override. Its hostname is also the only entry in the fetch hostname allowlist. */
baseUrl?: string;
/** Defaults to 1.0. The Seed path only sends it when it differs from 1.0; the legacy path always sends it. */
speedRatio?: number;
/** Legacy only: sent as `audio.volume_ratio` (default 1.0). Not read by the Seed path. */
volumeRatio?: number;
/** Legacy only: sent as `audio.pitch_ratio` (default 1.0). Not read by the Seed path. */
pitchRatio?: number;
/** Optional emotion tag; included in the request only when non-empty. */
emotion?: string;
/** Requested audio encoding (default "ogg_opus"). The Seed path requests "pcm" when this is "wav". */
encoding?: VolcengineTtsEncoding;
/** Timeout in milliseconds handed to `fetchWithSsrFGuard` (default 30000). */
timeoutMs?: number;
};
/** Speaker used by `seedSpeechTTS` when no `voice` is supplied. */
const DEFAULT_SEED_VOICE = "en_female_anna_mars_bigtts";
/** `voice_type` used by `legacyVolcengineTTS` when no `voice` is supplied. */
const DEFAULT_LEGACY_VOICE = "zh_female_xiaohe_uranus_bigtts";
/** Default `app.cluster` value for the legacy request payload. */
const DEFAULT_CLUSTER = "volcano_tts";
/** Default `X-Api-Resource-Id` header value on the Seed Speech path. */
const DEFAULT_SEED_TTS_RESOURCE_ID = "seed-tts-1.0";
/**
 * Default `X-Api-App-Key` header value on the Seed Speech path.
 * NOTE(review): the origin of this value is not visible in this file — confirm it is a
 * public, non-secret identifier.
 */
const DEFAULT_SEED_TTS_APP_KEY = "aGjiRDfUWi";
/** Endpoint used by `seedSpeechTTS` when `baseUrl` is omitted. */
const BYTEPLUS_SEED_TTS_URL =
"https://voice.ap-southeast-1.bytepluses.com/api/v3/tts/unidirectional";
/** Endpoint used by `legacyVolcengineTTS` when `baseUrl` is omitted. */
const VOLCENGINE_LEGACY_TTS_URL = "https://openspeech.bytedance.com/api/v1/tts";
/** Normalized view of one provider response (or one streamed frame), as built by `toTtsResponse`. */
type VolcengineTtsResponse = {
/** Numeric status code, taken from top-level `code` or, failing that, `header.code`. */
code?: number;
/** Status text, taken from top-level `message` or, failing that, `header.message`. */
message?: string;
/** Top-level `data` string; callers in this file decode it as base64 audio. */
data?: string;
};
/**
 * Parses `text` as JSON and requires the result to be a plain (non-array, non-null) object.
 *
 * @param text Raw JSON text.
 * @param providerName Provider label used as the prefix of the error message.
 * @returns The parsed object.
 * @throws Error prefixed with `<providerName> TTS: failed to parse response JSON:` when the
 *   text is not valid JSON or the top-level value is not an object; the underlying error is
 *   attached as `cause`.
 */
function parseJsonObject(text: string, providerName: string): Record<string, unknown> {
  try {
    const value: unknown = JSON.parse(text);
    const isPlainObject = typeof value === "object" && value !== null && !Array.isArray(value);
    if (isPlainObject) {
      return value as Record<string, unknown>;
    }
    // Thrown inside the try on purpose so it is wrapped with the same prefix as syntax errors.
    throw new Error("expected JSON object");
  } catch (err) {
    let detail: string;
    if (err instanceof Error) {
      detail = err.message;
    } else {
      detail = String(err);
    }
    throw new Error(`${providerName} TTS: failed to parse response JSON: ${detail}`, {
      cause: err,
    });
  }
}
/**
 * Projects a parsed response object onto the `code` / `message` / `data` triple.
 *
 * `code` and `message` are read from the top level first and fall back to the same key under
 * a nested `header` object. `data` is read from the top level only. Any field whose value has
 * the wrong type comes back as `undefined`.
 *
 * @param parsed A parsed JSON object.
 * @returns The normalized response; all three keys are always present.
 */
function toTtsResponse(parsed: Record<string, unknown>): VolcengineTtsResponse {
  const isNumber = (value: unknown): value is number => typeof value === "number";
  const isString = (value: unknown): value is string => typeof value === "string";

  const nested = parsed.header;
  const header =
    nested !== null && typeof nested === "object" && !Array.isArray(nested)
      ? (nested as Record<string, unknown>)
      : undefined;

  // Candidates are listed in priority order: top-level value first, then the header value.
  return {
    code: [parsed.code, header?.code].find(isNumber),
    message: [parsed.message, header?.message].find(isString),
    data: isString(parsed.data) ? parsed.data : undefined,
  };
}
/**
 * Parses the body of a legacy Volcengine TTS response into its normalized form.
 *
 * @param text Raw response body.
 * @throws Error (prefixed with "Volcengine TTS:") when the body is not a JSON object.
 */
function parseLegacyTtsResponse(text: string): VolcengineTtsResponse {
  const envelope = parseJsonObject(text, "Volcengine");
  return toTtsResponse(envelope);
}
/**
 * Splits a BytePlus Seed Speech response body into normalized frames.
 *
 * The whole (trimmed) body is first tried as a single JSON object. If that fails, the body is
 * treated as one JSON object per non-blank line, with an optional leading `data:` stripped
 * from each line.
 *
 * @param text Raw response body.
 * @returns One entry per frame, in order; an empty array for a blank body.
 * @throws Error when the line-by-line fallback meets a line that is not a JSON object.
 */
function parseSeedTtsFrames(text: string): VolcengineTtsResponse[] {
  const body = text.trim();
  if (body.length === 0) {
    return [];
  }

  let whole: VolcengineTtsResponse | undefined;
  try {
    whole = toTtsResponse(parseJsonObject(body, "BytePlus Seed Speech"));
  } catch {
    // The HTTP API streams JSON frames; Response.text() preserves line breaks,
    // so fall through and parse the body one line at a time.
    whole = undefined;
  }
  if (whole !== undefined) {
    return [whole];
  }

  return body
    .split(/\r?\n/)
    .map((line) => line.trim())
    .filter((line) => line.length > 0)
    .map((line) => (line.startsWith("data:") ? line.slice("data:".length).trim() : line))
    .map((json) => toTtsResponse(parseJsonObject(json, "BytePlus Seed Speech")));
}
/**
 * Builds a single-entry hostname allowlist from a URL.
 *
 * @param url Absolute URL; the `URL` constructor throws if it cannot be parsed.
 * @returns An array containing only the URL's hostname.
 */
function hostnameAllowlist(url: string): string[] {
  const { hostname } = new URL(url);
  return [hostname];
}
/**
 * Maps a requested encoding to the format string sent to the Seed Speech endpoint.
 * "wav" is requested as "pcm"; every other encoding is passed through unchanged.
 */
function seedAudioFormat(encoding: VolcengineTtsEncoding): "ogg_opus" | "mp3" | "pcm" {
  if (encoding === "wav") {
    return "pcm";
  }
  return encoding;
}
/**
 * Synthesizes `text` through the BytePlus Seed Speech HTTP endpoint and returns the audio bytes.
 *
 * The response body is read in full and split into JSON frames. The base64 `data` of every
 * frame with `code === 0` is decoded, and the decoded chunks are concatenated in order.
 *
 * Notes:
 * - `encoding: "wav"` is requested from the service as "pcm"; the decoded bytes are returned
 *   as-is and this function adds no container header.
 * - `volumeRatio` and `pitchRatio` are not used on this path.
 * - The requested sample rate is fixed at 24000.
 *
 * @param params Synthesis options; `apiKey` is required.
 * @returns The concatenated audio bytes.
 * @throws Error when a frame carries an unexpected code, when the HTTP status is not OK,
 *   when no audio data was received, or when the body cannot be parsed. If the HTTP status
 *   is not OK and the body is not JSON, the error reports the HTTP status and keeps the
 *   parse error as `cause`.
 */
async function seedSpeechTTS(params: VolcengineTTSParams & { apiKey: string }): Promise<Buffer> {
  const {
    text,
    apiKey,
    voice = DEFAULT_SEED_VOICE,
    resourceId = DEFAULT_SEED_TTS_RESOURCE_ID,
    appKey = DEFAULT_SEED_TTS_APP_KEY,
    baseUrl = BYTEPLUS_SEED_TTS_URL,
    speedRatio = 1.0,
    emotion,
    encoding = "ogg_opus",
    timeoutMs = 30_000,
  } = params;
  const audioFormat = seedAudioFormat(encoding);
  const payload = JSON.stringify({
    user: { uid: "openclaw" },
    req_params: {
      text,
      speaker: voice,
      audio_params: {
        format: audioFormat,
        sample_rate: 24_000,
      },
      // Optional tuning fields are only sent when they differ from the defaults.
      ...(speedRatio !== 1.0 ? { speed_ratio: speedRatio } : {}),
      ...(emotion ? { emotion } : {}),
    },
  });
  const { response, release } = await fetchWithSsrFGuard({
    url: baseUrl,
    init: {
      method: "POST",
      headers: {
        "Content-Type": "application/json",
        Connection: "keep-alive",
        "X-Api-Key": apiKey,
        "X-Api-Resource-Id": resourceId,
        "X-Api-App-Key": appKey,
      },
      body: payload,
    },
    timeoutMs,
    // Only the host of the configured endpoint may be contacted.
    policy: { hostnameAllowlist: hostnameAllowlist(baseUrl) },
    auditContext: "volcengine.tts",
  });
  try {
    const raw = await response.text();
    let frames: ReturnType<typeof parseSeedTtsFrames>;
    try {
      frames = parseSeedTtsFrames(raw);
    } catch (err) {
      // A failed HTTP request may carry a non-JSON body (for example an HTML error page from
      // a gateway). Report the HTTP status in that case instead of a bare JSON parse error.
      if (!response.ok) {
        throw new Error(
          `BytePlus Seed Speech TTS error ${response.status}: response body is not valid JSON`,
          { cause: err },
        );
      }
      throw err;
    }
    const chunks: Buffer[] = [];
    for (const frame of frames) {
      if (frame.code === 0) {
        // Audio frame; frames with code 0 but no data are skipped.
        if (frame.data) {
          chunks.push(Buffer.from(frame.data, "base64"));
        }
        continue;
      }
      // Code 20000000 is accepted as a non-error frame without audio.
      // NOTE(review): presumably the end-of-stream success marker — confirm against the API docs.
      if (frame.code === 20000000) {
        continue;
      }
      throw new Error(
        `BytePlus Seed Speech TTS error ${frame.code ?? response.status}: ${
          frame.message ?? "unknown"
        }`,
      );
    }
    if (!response.ok || chunks.length === 0) {
      throw new Error(`BytePlus Seed Speech TTS error ${response.status}: no audio data`);
    }
    return Buffer.concat(chunks);
  } finally {
    // Always hand the guarded connection back, whether synthesis succeeded or threw.
    await release();
  }
}
/**
 * Synthesizes `text` through the legacy Volcengine TTS HTTP endpoint and returns the audio bytes.
 *
 * The request is sent with `operation: "query"` and a fresh random `reqid`. The response is
 * treated as successful only when the HTTP status is OK, the response `code` is 3000, and a
 * non-empty `data` string is present; `data` is then decoded from base64.
 *
 * @param params Synthesis options; `appId` and `token` are required.
 * @returns The decoded audio bytes.
 * @throws Error when the HTTP status is not OK, the response code is not 3000, no data was
 *   returned, or the body cannot be parsed. If the HTTP status is not OK and the body is not
 *   JSON, the error reports the HTTP status and keeps the parse error as `cause`.
 */
async function legacyVolcengineTTS(
  params: VolcengineTTSParams & { appId: string; token: string },
): Promise<Buffer> {
  const {
    text,
    appId,
    token,
    voice = DEFAULT_LEGACY_VOICE,
    cluster = DEFAULT_CLUSTER,
    baseUrl = VOLCENGINE_LEGACY_TTS_URL,
    speedRatio = 1.0,
    volumeRatio = 1.0,
    pitchRatio = 1.0,
    emotion,
    encoding = "ogg_opus",
    timeoutMs = 30_000,
  } = params;
  const payload = JSON.stringify({
    app: { appid: appId, token, cluster },
    user: { uid: "openclaw" },
    audio: {
      voice_type: voice,
      encoding,
      speed_ratio: speedRatio,
      volume_ratio: volumeRatio,
      pitch_ratio: pitchRatio,
      ...(emotion ? { emotion } : {}),
    },
    request: {
      reqid: crypto.randomUUID(),
      text,
      text_type: "plain",
      operation: "query",
    },
  });
  const { response, release } = await fetchWithSsrFGuard({
    url: baseUrl,
    init: {
      method: "POST",
      headers: {
        "Content-Type": "application/json",
        // NOTE(review): the semicolon after "Bearer" looks deliberate (provider-specific
        // header format) — confirm against the provider docs before "fixing" it.
        Authorization: `Bearer;${token}`,
      },
      body: payload,
    },
    timeoutMs,
    // Only the host of the configured endpoint may be contacted.
    policy: { hostnameAllowlist: hostnameAllowlist(baseUrl) },
    auditContext: "volcengine.tts",
  });
  try {
    const raw = await response.text();
    let body: ReturnType<typeof parseLegacyTtsResponse>;
    try {
      body = parseLegacyTtsResponse(raw);
    } catch (err) {
      // A failed HTTP request may carry a non-JSON body (for example an HTML error page from
      // a gateway). Report the HTTP status in that case instead of a bare JSON parse error.
      if (!response.ok) {
        throw new Error(
          `Volcengine TTS error ${response.status}: response body is not valid JSON`,
          { cause: err },
        );
      }
      throw err;
    }
    if (!response.ok || body.code !== 3000 || !body.data) {
      throw new Error(
        `Volcengine TTS error ${body.code ?? response.status}: ${body.message ?? "unknown"}`,
      );
    }
    return Buffer.from(body.data, "base64");
  } finally {
    // Always hand the guarded connection back, whether synthesis succeeded or threw.
    await release();
  }
}
/**
 * Entry point: synthesizes speech with whichever credentials are present.
 *
 * A non-empty `apiKey` selects the BytePlus Seed Speech path. Otherwise a non-empty
 * `appId` together with a non-empty `token` selects the legacy Volcengine path.
 *
 * @param params Synthesis options and credentials.
 * @returns The synthesized audio bytes.
 * @throws Error when neither credential set is available, or when the selected path fails.
 */
export async function volcengineTTS(params: VolcengineTTSParams): Promise<Buffer> {
  const { apiKey, appId, token } = params;

  if (apiKey) {
    return seedSpeechTTS({ ...params, apiKey });
  }

  if (!appId || !token) {
    throw new Error(
      "Volcengine TTS credentials missing. Set a BytePlus Seed Speech API key or legacy AppID/token.",
    );
  }

  return legacyVolcengineTTS({ ...params, appId, token });
}