mirror of
https://github.com/openclaw/openclaw.git
synced 2026-03-28 10:22:32 +00:00
refactor: clean plugin capability boundaries
This commit is contained in:
@@ -5,33 +5,6 @@
|
||||
"category": "legacy",
|
||||
"entrypoint": "index",
|
||||
"exports": [
|
||||
{
|
||||
"declaration": "export function buildFalImageGenerationProvider(): ImageGenerationProvider;",
|
||||
"exportName": "buildFalImageGenerationProvider",
|
||||
"kind": "function",
|
||||
"source": {
|
||||
"line": 190,
|
||||
"path": "extensions/fal/image-generation-provider.ts"
|
||||
}
|
||||
},
|
||||
{
|
||||
"declaration": "export function buildGoogleImageGenerationProvider(): ImageGenerationProvider;",
|
||||
"exportName": "buildGoogleImageGenerationProvider",
|
||||
"kind": "function",
|
||||
"source": {
|
||||
"line": 98,
|
||||
"path": "extensions/google/image-generation-provider.ts"
|
||||
}
|
||||
},
|
||||
{
|
||||
"declaration": "export function buildOpenAIImageGenerationProvider(): ImageGenerationProvider;",
|
||||
"exportName": "buildOpenAIImageGenerationProvider",
|
||||
"kind": "function",
|
||||
"source": {
|
||||
"line": 22,
|
||||
"path": "extensions/openai/image-generation-provider.ts"
|
||||
}
|
||||
},
|
||||
{
|
||||
"declaration": "export function delegateCompactionToRuntime(params: { sessionId: string; sessionKey?: string | undefined; sessionFile: string; tokenBudget?: number | undefined; force?: boolean | undefined; currentTokenCount?: number | undefined; compactionTarget?: \"budget\" | ... 1 more ... | undefined; customInstructions?: string | undefined; runtimeContext?: ContextEngineRuntimeContext | undefined; }): Promise<...>;",
|
||||
"exportName": "delegateCompactionToRuntime",
|
||||
@@ -923,7 +896,7 @@
|
||||
"exportName": "createMessageToolButtonsSchema",
|
||||
"kind": "function",
|
||||
"source": {
|
||||
"line": 11,
|
||||
"line": 12,
|
||||
"path": "src/plugin-sdk/channel-actions.ts"
|
||||
}
|
||||
},
|
||||
@@ -932,7 +905,7 @@
|
||||
"exportName": "createMessageToolCardSchema",
|
||||
"kind": "function",
|
||||
"source": {
|
||||
"line": 29,
|
||||
"line": 30,
|
||||
"path": "src/plugin-sdk/channel-actions.ts"
|
||||
}
|
||||
},
|
||||
@@ -954,6 +927,15 @@
|
||||
"path": "src/channels/plugins/actions/shared.ts"
|
||||
}
|
||||
},
|
||||
{
|
||||
"declaration": "export function optionalStringEnum<T extends readonly string[]>(values: T, options?: StringEnumOptions<T>): TOptional<TUnsafe<T[number]>>;",
|
||||
"exportName": "optionalStringEnum",
|
||||
"kind": "function",
|
||||
"source": {
|
||||
"line": 31,
|
||||
"path": "src/agents/schema/typebox.ts"
|
||||
}
|
||||
},
|
||||
{
|
||||
"declaration": "export function resolveReactionMessageId(params: { args: Record<string, unknown>; toolContext?: ReactionToolContext | undefined; }): string | number | undefined;",
|
||||
"exportName": "resolveReactionMessageId",
|
||||
@@ -962,6 +944,15 @@
|
||||
"line": 7,
|
||||
"path": "src/channels/plugins/actions/reaction-message-id.ts"
|
||||
}
|
||||
},
|
||||
{
|
||||
"declaration": "export function stringEnum<T extends readonly string[]>(values: T, options?: StringEnumOptions<T>): TUnsafe<T[number]>;",
|
||||
"exportName": "stringEnum",
|
||||
"kind": "function",
|
||||
"source": {
|
||||
"line": 15,
|
||||
"path": "src/agents/schema/typebox.ts"
|
||||
}
|
||||
}
|
||||
],
|
||||
"importSpecifier": "openclaw/plugin-sdk/channel-actions",
|
||||
|
||||
@@ -1,7 +1,4 @@
|
||||
{"category":"legacy","entrypoint":"index","importSpecifier":"openclaw/plugin-sdk","recordType":"module","sourceLine":1,"sourcePath":"src/plugin-sdk/index.ts"}
|
||||
{"declaration":"export function buildFalImageGenerationProvider(): ImageGenerationProvider;","entrypoint":"index","exportName":"buildFalImageGenerationProvider","importSpecifier":"openclaw/plugin-sdk","kind":"function","recordType":"export","sourceLine":190,"sourcePath":"extensions/fal/image-generation-provider.ts"}
|
||||
{"declaration":"export function buildGoogleImageGenerationProvider(): ImageGenerationProvider;","entrypoint":"index","exportName":"buildGoogleImageGenerationProvider","importSpecifier":"openclaw/plugin-sdk","kind":"function","recordType":"export","sourceLine":98,"sourcePath":"extensions/google/image-generation-provider.ts"}
|
||||
{"declaration":"export function buildOpenAIImageGenerationProvider(): ImageGenerationProvider;","entrypoint":"index","exportName":"buildOpenAIImageGenerationProvider","importSpecifier":"openclaw/plugin-sdk","kind":"function","recordType":"export","sourceLine":22,"sourcePath":"extensions/openai/image-generation-provider.ts"}
|
||||
{"declaration":"export function delegateCompactionToRuntime(params: { sessionId: string; sessionKey?: string | undefined; sessionFile: string; tokenBudget?: number | undefined; force?: boolean | undefined; currentTokenCount?: number | undefined; compactionTarget?: \"budget\" | ... 1 more ... | undefined; customInstructions?: string | undefined; runtimeContext?: ContextEngineRuntimeContext | undefined; }): Promise<...>;","entrypoint":"index","exportName":"delegateCompactionToRuntime","importSpecifier":"openclaw/plugin-sdk","kind":"function","recordType":"export","sourceLine":16,"sourcePath":"src/context-engine/delegate.ts"}
|
||||
{"declaration":"export function emptyPluginConfigSchema(): OpenClawPluginConfigSchema;","entrypoint":"index","exportName":"emptyPluginConfigSchema","importSpecifier":"openclaw/plugin-sdk","kind":"function","recordType":"export","sourceLine":13,"sourcePath":"src/plugins/config-schema.ts"}
|
||||
{"declaration":"export function onDiagnosticEvent(listener: (evt: DiagnosticEventPayload) => void): () => void;","entrypoint":"index","exportName":"onDiagnosticEvent","importSpecifier":"openclaw/plugin-sdk","kind":"function","recordType":"export","sourceLine":229,"sourcePath":"src/infra/diagnostic-events.ts"}
|
||||
@@ -100,11 +97,13 @@
|
||||
{"declaration":"export type BasicAllowlistResolutionEntry = BasicAllowlistResolutionEntry;","entrypoint":"allow-from","exportName":"BasicAllowlistResolutionEntry","importSpecifier":"openclaw/plugin-sdk/allow-from","kind":"type","recordType":"export","sourceLine":129,"sourcePath":"src/plugin-sdk/allow-from.ts"}
|
||||
{"declaration":"export type CompiledAllowlist = CompiledAllowlist;","entrypoint":"allow-from","exportName":"CompiledAllowlist","importSpecifier":"openclaw/plugin-sdk/allow-from","kind":"type","recordType":"export","sourceLine":19,"sourcePath":"src/channels/allowlist-match.ts"}
|
||||
{"category":"channel","entrypoint":"channel-actions","importSpecifier":"openclaw/plugin-sdk/channel-actions","recordType":"module","sourceLine":1,"sourcePath":"src/plugin-sdk/channel-actions.ts"}
|
||||
{"declaration":"export function createMessageToolButtonsSchema(): TSchema;","entrypoint":"channel-actions","exportName":"createMessageToolButtonsSchema","importSpecifier":"openclaw/plugin-sdk/channel-actions","kind":"function","recordType":"export","sourceLine":11,"sourcePath":"src/plugin-sdk/channel-actions.ts"}
|
||||
{"declaration":"export function createMessageToolCardSchema(): TSchema;","entrypoint":"channel-actions","exportName":"createMessageToolCardSchema","importSpecifier":"openclaw/plugin-sdk/channel-actions","kind":"function","recordType":"export","sourceLine":29,"sourcePath":"src/plugin-sdk/channel-actions.ts"}
|
||||
{"declaration":"export function createMessageToolButtonsSchema(): TSchema;","entrypoint":"channel-actions","exportName":"createMessageToolButtonsSchema","importSpecifier":"openclaw/plugin-sdk/channel-actions","kind":"function","recordType":"export","sourceLine":12,"sourcePath":"src/plugin-sdk/channel-actions.ts"}
|
||||
{"declaration":"export function createMessageToolCardSchema(): TSchema;","entrypoint":"channel-actions","exportName":"createMessageToolCardSchema","importSpecifier":"openclaw/plugin-sdk/channel-actions","kind":"function","recordType":"export","sourceLine":30,"sourcePath":"src/plugin-sdk/channel-actions.ts"}
|
||||
{"declaration":"export function createUnionActionGate<TAccount, TKey extends string>(accounts: readonly TAccount[], createGate: (account: TAccount) => OptionalDefaultGate<TKey>): OptionalDefaultGate<TKey>;","entrypoint":"channel-actions","exportName":"createUnionActionGate","importSpecifier":"openclaw/plugin-sdk/channel-actions","kind":"function","recordType":"export","sourceLine":13,"sourcePath":"src/channels/plugins/actions/shared.ts"}
|
||||
{"declaration":"export function listTokenSourcedAccounts<TAccount extends TokenSourcedAccount>(accounts: readonly TAccount[]): TAccount[];","entrypoint":"channel-actions","exportName":"listTokenSourcedAccounts","importSpecifier":"openclaw/plugin-sdk/channel-actions","kind":"function","recordType":"export","sourceLine":7,"sourcePath":"src/channels/plugins/actions/shared.ts"}
|
||||
{"declaration":"export function optionalStringEnum<T extends readonly string[]>(values: T, options?: StringEnumOptions<T>): TOptional<TUnsafe<T[number]>>;","entrypoint":"channel-actions","exportName":"optionalStringEnum","importSpecifier":"openclaw/plugin-sdk/channel-actions","kind":"function","recordType":"export","sourceLine":31,"sourcePath":"src/agents/schema/typebox.ts"}
|
||||
{"declaration":"export function resolveReactionMessageId(params: { args: Record<string, unknown>; toolContext?: ReactionToolContext | undefined; }): string | number | undefined;","entrypoint":"channel-actions","exportName":"resolveReactionMessageId","importSpecifier":"openclaw/plugin-sdk/channel-actions","kind":"function","recordType":"export","sourceLine":7,"sourcePath":"src/channels/plugins/actions/reaction-message-id.ts"}
|
||||
{"declaration":"export function stringEnum<T extends readonly string[]>(values: T, options?: StringEnumOptions<T>): TUnsafe<T[number]>;","entrypoint":"channel-actions","exportName":"stringEnum","importSpecifier":"openclaw/plugin-sdk/channel-actions","kind":"function","recordType":"export","sourceLine":15,"sourcePath":"src/agents/schema/typebox.ts"}
|
||||
{"category":"channel","entrypoint":"channel-config-schema","importSpecifier":"openclaw/plugin-sdk/channel-config-schema","recordType":"module","sourceLine":1,"sourcePath":"src/plugin-sdk/channel-config-schema.ts"}
|
||||
{"declaration":"export function buildCatchallMultiAccountChannelSchema<T extends ExtendableZodObject>(accountSchema: T): T;","entrypoint":"channel-config-schema","exportName":"buildCatchallMultiAccountChannelSchema","importSpecifier":"openclaw/plugin-sdk/channel-config-schema","kind":"function","recordType":"export","sourceLine":26,"sourcePath":"src/channels/plugins/config-schema.ts"}
|
||||
{"declaration":"export function buildChannelConfigSchema(schema: ZodType<unknown, unknown, $ZodTypeInternals<unknown, unknown>>): ChannelConfigSchema;","entrypoint":"channel-config-schema","exportName":"buildChannelConfigSchema","importSpecifier":"openclaw/plugin-sdk/channel-config-schema","kind":"function","recordType":"export","sourceLine":35,"sourcePath":"src/channels/plugins/config-schema.ts"}
|
||||
|
||||
@@ -1,6 +1,7 @@
|
||||
{
|
||||
"id": "anthropic",
|
||||
"providers": ["anthropic"],
|
||||
"mediaUnderstandingProviders": ["anthropic"],
|
||||
"cliBackends": ["claude-cli"],
|
||||
"providerAuthEnvVars": {
|
||||
"anthropic": ["ANTHROPIC_OAUTH_TOKEN", "ANTHROPIC_API_KEY"]
|
||||
|
||||
@@ -7,7 +7,7 @@ import {
|
||||
normalizeBaseUrl,
|
||||
postTranscriptionRequest,
|
||||
requireTranscriptionText,
|
||||
} from "openclaw/plugin-sdk/media-understanding";
|
||||
} from "openclaw/plugin-sdk/provider-http";
|
||||
|
||||
export const DEFAULT_DEEPGRAM_AUDIO_BASE_URL = "https://api.deepgram.com/v1";
|
||||
export const DEFAULT_DEEPGRAM_AUDIO_MODEL = "nova-3";
|
||||
|
||||
@@ -1,5 +1,6 @@
|
||||
{
|
||||
"id": "deepgram",
|
||||
"mediaUnderstandingProviders": ["deepgram"],
|
||||
"configSchema": {
|
||||
"type": "object",
|
||||
"additionalProperties": false,
|
||||
|
||||
@@ -1,5 +1,6 @@
|
||||
{
|
||||
"id": "elevenlabs",
|
||||
"speechProviders": ["elevenlabs"],
|
||||
"configSchema": {
|
||||
"type": "object",
|
||||
"additionalProperties": false,
|
||||
|
||||
@@ -1,5 +1,6 @@
|
||||
import type { SpeechProviderPlugin } from "openclaw/plugin-sdk/core";
|
||||
import { elevenLabsTTS, type SpeechVoiceOption } from "openclaw/plugin-sdk/speech";
|
||||
import type { SpeechVoiceOption } from "openclaw/plugin-sdk/speech";
|
||||
import { elevenLabsTTS } from "./tts.js";
|
||||
|
||||
const ELEVENLABS_TTS_MODELS = [
|
||||
"eleven_multilingual_v2",
|
||||
|
||||
150
extensions/elevenlabs/tts.ts
Normal file
150
extensions/elevenlabs/tts.ts
Normal file
@@ -0,0 +1,150 @@
|
||||
/** ElevenLabs API origin used whenever no (or only a blank) base URL is supplied. */
const DEFAULT_ELEVENLABS_BASE_URL = "https://api.elevenlabs.io";
|
||||
|
||||
/**
 * Checks whether a voice id is made up solely of 10 to 40 ASCII letters or digits.
 *
 * @param voiceId - Candidate voice identifier.
 * @returns `true` when the whole string matches, `false` otherwise.
 */
function isValidVoiceId(voiceId: string): boolean {
  const voiceIdPattern = /^[a-zA-Z0-9]{10,40}$/;
  return voiceIdPattern.test(voiceId);
}
|
||||
|
||||
/**
 * Picks the base URL to use for ElevenLabs requests.
 *
 * @param baseUrl - Optional caller-supplied base URL.
 * @returns `DEFAULT_ELEVENLABS_BASE_URL` when `baseUrl` is missing or blank;
 *   otherwise the trimmed value with every trailing `/` removed.
 */
function normalizeElevenLabsBaseUrl(baseUrl?: string): string {
  const candidate = baseUrl?.trim();
  return candidate ? candidate.replace(/\/+$/, "") : DEFAULT_ELEVENLABS_BASE_URL;
}
|
||||
|
||||
/**
 * Normalizes an optional language code to a lower-case two-letter form.
 *
 * @param code - Optional language code; surrounding whitespace and letter case are ignored.
 * @returns `undefined` when `code` is missing or blank, otherwise the lower-cased code.
 * @throws Error when the trimmed, lower-cased value is not exactly two letters a-z.
 */
function normalizeLanguageCode(code?: string): string | undefined {
  const lowered = (code?.trim() ?? "").toLowerCase();
  if (lowered === "") {
    return undefined;
  }
  if (/^[a-z]{2}$/.test(lowered)) {
    return lowered;
  }
  throw new Error("languageCode must be a 2-letter ISO 639-1 code (e.g. en, de, fr)");
}
|
||||
|
||||
/**
 * Normalizes the optional text-normalization mode.
 *
 * @param mode - Optional mode string; surrounding whitespace and letter case are ignored.
 * @returns `undefined` when `mode` is missing or blank, otherwise one of `"auto"`, `"on"`, `"off"`.
 * @throws Error when a non-blank value is not one of the three accepted modes.
 */
function normalizeApplyTextNormalization(mode?: string): "auto" | "on" | "off" | undefined {
  const candidate = mode?.trim();
  if (!candidate) {
    return undefined;
  }
  switch (candidate.toLowerCase()) {
    case "auto":
      return "auto";
    case "on":
      return "on";
    case "off":
      return "off";
    default:
      throw new Error("applyTextNormalization must be one of: auto, on, off");
  }
}
|
||||
|
||||
/**
 * Normalizes an optional seed to a whole number in the unsigned 32-bit range.
 *
 * @param seed - Optional seed; fractional values are rounded down with `Math.floor`.
 * @returns `undefined` when `seed` is `null`/`undefined`, otherwise the floored seed.
 * @throws Error when the floored value is not finite or lies outside 0..4294967295.
 */
function normalizeSeed(seed?: number): number | undefined {
  if (seed == null) {
    return undefined;
  }
  const maxSeed = 4_294_967_295;
  const floored = Math.floor(seed);
  const accepted = Number.isFinite(floored) && floored >= 0 && floored <= maxSeed;
  if (!accepted) {
    throw new Error("seed must be between 0 and 4294967295");
  }
  return floored;
}
|
||||
|
||||
/**
 * Guard that rejects numbers outside an inclusive range.
 *
 * @param value - Number to check.
 * @param min - Inclusive lower bound.
 * @param max - Inclusive upper bound.
 * @param label - Name of the setting, used as the prefix of the error message.
 * @throws Error when `value` is not finite, is below `min`, or is above `max`.
 */
function requireInRange(value: number, min: number, max: number, label: string): void {
  const outOfBounds = value < min || value > max;
  if (outOfBounds || !Number.isFinite(value)) {
    throw new Error(`${label} must be between ${min} and ${max}`);
  }
}
|
||||
|
||||
/**
 * Validates the numeric ElevenLabs voice settings against their accepted ranges.
 *
 * Checks run in the order stability, similarityBoost, style, speed, and the
 * first failing one throws. `useSpeakerBoost` is not inspected.
 *
 * @param settings - Voice settings to validate.
 * @throws Error (from `requireInRange`) naming the first out-of-range setting.
 */
function assertElevenLabsVoiceSettings(settings: {
  stability: number;
  similarityBoost: number;
  style: number;
  useSpeakerBoost: boolean;
  speed: number;
}) {
  const rangeChecks: Array<[number, number, number, string]> = [
    [settings.stability, 0, 1, "stability"],
    [settings.similarityBoost, 0, 1, "similarityBoost"],
    [settings.style, 0, 1, "style"],
    [settings.speed, 0.5, 2, "speed"],
  ];
  for (const [value, min, max, label] of rangeChecks) {
    requireInRange(value, min, max, label);
  }
}
|
||||
|
||||
/**
 * Synthesizes speech via the ElevenLabs text-to-speech HTTP endpoint.
 *
 * All inputs are validated before any network activity. The request is a JSON
 * POST to `<baseUrl>/v1/text-to-speech/<voiceId>`; a non-empty `outputFormat`
 * is sent as the `output_format` query parameter. An `AbortController` aborts
 * the request once `timeoutMs` elapses, and the timer is always cleared.
 *
 * @param params - Request text, credentials, voice/model selection, optional
 *   seed / text-normalization / language settings, voice settings and timeout.
 * @returns The response body as a `Buffer`.
 * @throws Error when `voiceId` has an invalid format, when a voice setting,
 *   language code, normalization mode or seed is rejected by its validator,
 *   or when the HTTP response status is not OK.
 */
export async function elevenLabsTTS(params: {
  text: string;
  apiKey: string;
  baseUrl: string;
  voiceId: string;
  modelId: string;
  outputFormat: string;
  seed?: number;
  applyTextNormalization?: "auto" | "on" | "off";
  languageCode?: string;
  voiceSettings: {
    stability: number;
    similarityBoost: number;
    style: number;
    useSpeakerBoost: boolean;
    speed: number;
  };
  timeoutMs: number;
}): Promise<Buffer> {
  // Validate everything up front so bad input never reaches the network.
  if (!isValidVoiceId(params.voiceId)) {
    throw new Error("Invalid voiceId format");
  }
  assertElevenLabsVoiceSettings(params.voiceSettings);
  const languageCode = normalizeLanguageCode(params.languageCode);
  const textNormalization = normalizeApplyTextNormalization(params.applyTextNormalization);
  const seed = normalizeSeed(params.seed);

  const abortController = new AbortController();
  const timer = setTimeout(() => abortController.abort(), params.timeoutMs);

  try {
    const endpoint = new URL(
      `${normalizeElevenLabsBaseUrl(params.baseUrl)}/v1/text-to-speech/${params.voiceId}`,
    );
    if (params.outputFormat) {
      endpoint.searchParams.set("output_format", params.outputFormat);
    }

    // Fields left `undefined` are dropped by JSON.stringify.
    const payload = {
      text: params.text,
      model_id: params.modelId,
      seed,
      apply_text_normalization: textNormalization,
      language_code: languageCode,
      voice_settings: {
        stability: params.voiceSettings.stability,
        similarity_boost: params.voiceSettings.similarityBoost,
        style: params.voiceSettings.style,
        use_speaker_boost: params.voiceSettings.useSpeakerBoost,
        speed: params.voiceSettings.speed,
      },
    };

    const response = await fetch(endpoint.toString(), {
      method: "POST",
      headers: {
        "xi-api-key": params.apiKey,
        "Content-Type": "application/json",
        Accept: "audio/mpeg",
      },
      body: JSON.stringify(payload),
      signal: abortController.signal,
    });

    if (!response.ok) {
      throw new Error(`ElevenLabs API error (${response.status})`);
    }

    const audio = await response.arrayBuffer();
    return Buffer.from(audio);
  } finally {
    clearTimeout(timer);
  }
}
|
||||
@@ -1,6 +1,7 @@
|
||||
{
|
||||
"id": "fal",
|
||||
"providers": ["fal"],
|
||||
"imageGenerationProviders": ["fal"],
|
||||
"providerAuthEnvVars": {
|
||||
"fal": ["FAL_KEY"]
|
||||
},
|
||||
|
||||
@@ -1,9 +1,4 @@
|
||||
import type { ImageGenerationProvider } from "openclaw/plugin-sdk/image-generation";
|
||||
import {
|
||||
assertOkOrThrowHttpError,
|
||||
normalizeBaseUrl,
|
||||
postJsonRequest,
|
||||
} from "openclaw/plugin-sdk/media-understanding";
|
||||
import { resolveApiKeyForProvider } from "openclaw/plugin-sdk/provider-auth";
|
||||
import {
|
||||
DEFAULT_GOOGLE_API_BASE_URL,
|
||||
@@ -11,6 +6,11 @@ import {
|
||||
normalizeGoogleModelId,
|
||||
parseGeminiAuth,
|
||||
} from "openclaw/plugin-sdk/provider-google";
|
||||
import {
|
||||
assertOkOrThrowHttpError,
|
||||
normalizeBaseUrl,
|
||||
postJsonRequest,
|
||||
} from "openclaw/plugin-sdk/provider-http";
|
||||
|
||||
const DEFAULT_GOOGLE_IMAGE_MODEL = "gemini-3.1-flash-image-preview";
|
||||
const DEFAULT_OUTPUT_MIME = "image/png";
|
||||
|
||||
@@ -1,15 +1,17 @@
|
||||
import {
|
||||
assertOkOrThrowHttpError,
|
||||
describeImageWithModel,
|
||||
describeImagesWithModel,
|
||||
normalizeBaseUrl,
|
||||
postJsonRequest,
|
||||
type AudioTranscriptionRequest,
|
||||
type AudioTranscriptionResult,
|
||||
type MediaUnderstandingProvider,
|
||||
type VideoDescriptionRequest,
|
||||
type VideoDescriptionResult,
|
||||
} from "openclaw/plugin-sdk/media-understanding";
|
||||
import {
|
||||
assertOkOrThrowHttpError,
|
||||
normalizeBaseUrl,
|
||||
postJsonRequest,
|
||||
} from "openclaw/plugin-sdk/provider-http";
|
||||
import {
|
||||
DEFAULT_GOOGLE_API_BASE_URL,
|
||||
normalizeGoogleApiBaseUrl,
|
||||
|
||||
@@ -1,6 +1,8 @@
|
||||
{
|
||||
"id": "google",
|
||||
"providers": ["google", "google-gemini-cli"],
|
||||
"mediaUnderstandingProviders": ["google"],
|
||||
"imageGenerationProviders": ["google"],
|
||||
"cliBackends": ["google-gemini-cli"],
|
||||
"providerAuthEnvVars": {
|
||||
"google": ["GEMINI_API_KEY", "GOOGLE_API_KEY"]
|
||||
|
||||
@@ -1,5 +1,6 @@
|
||||
{
|
||||
"id": "groq",
|
||||
"mediaUnderstandingProviders": ["groq"],
|
||||
"configSchema": {
|
||||
"type": "object",
|
||||
"additionalProperties": false,
|
||||
|
||||
@@ -1,5 +1,6 @@
|
||||
{
|
||||
"id": "microsoft",
|
||||
"speechProviders": ["microsoft"],
|
||||
"configSchema": {
|
||||
"type": "object",
|
||||
"additionalProperties": false,
|
||||
|
||||
@@ -8,7 +8,8 @@ import {
|
||||
import type { SpeechProviderPlugin } from "openclaw/plugin-sdk/core";
|
||||
import { resolvePreferredOpenClawTmpDir } from "openclaw/plugin-sdk/llm-task";
|
||||
import { isVoiceCompatibleAudio } from "openclaw/plugin-sdk/media-runtime";
|
||||
import { edgeTTS, inferEdgeExtension, type SpeechVoiceOption } from "openclaw/plugin-sdk/speech";
|
||||
import type { SpeechVoiceOption } from "openclaw/plugin-sdk/speech";
|
||||
import { edgeTTS, inferEdgeExtension } from "./tts.js";
|
||||
|
||||
const DEFAULT_EDGE_OUTPUT_FORMAT = "audio-24khz-48kbitrate-mono-mp3";
|
||||
|
||||
|
||||
@@ -3,7 +3,7 @@ import { tmpdir } from "node:os";
|
||||
import path from "node:path";
|
||||
import { afterEach, beforeEach, describe, expect, it, vi } from "vitest";
|
||||
|
||||
let edgeTTS: typeof import("./tts-core.js").edgeTTS;
|
||||
let edgeTTS: typeof import("./tts.js").edgeTTS;
|
||||
|
||||
let mockTtsPromise = vi.fn<(text: string, filePath: string) => Promise<void>>();
|
||||
|
||||
@@ -16,15 +16,13 @@ vi.mock("node-edge-tts", () => ({
|
||||
}));
|
||||
|
||||
const baseEdgeConfig = {
|
||||
enabled: true,
|
||||
voice: "en-US-MichelleNeural",
|
||||
lang: "en-US",
|
||||
outputFormat: "audio-24khz-48kbitrate-mono-mp3",
|
||||
outputFormatConfigured: false,
|
||||
saveSubtitles: false,
|
||||
};
|
||||
|
||||
describe("edgeTTS – empty audio validation", () => {
|
||||
describe("edgeTTS empty audio validation", () => {
|
||||
let tempDir: string | undefined;
|
||||
|
||||
beforeEach(async () => {
|
||||
@@ -36,7 +34,7 @@ describe("edgeTTS – empty audio validation", () => {
|
||||
}
|
||||
},
|
||||
}));
|
||||
({ edgeTTS } = await import("./tts-core.js"));
|
||||
({ edgeTTS } = await import("./tts.js"));
|
||||
});
|
||||
|
||||
afterEach(() => {
|
||||
55
extensions/microsoft/tts.ts
Normal file
55
extensions/microsoft/tts.ts
Normal file
@@ -0,0 +1,55 @@
|
||||
import { statSync } from "node:fs";
|
||||
import { EdgeTTS } from "node-edge-tts";
|
||||
|
||||
/**
 * Derives a file extension from an Edge TTS output-format name.
 *
 * Matching is case-insensitive and by substring; rules are tried in order, so
 * a format mentioning both "ogg" and "opus" resolves to `.ogg`.
 *
 * @param outputFormat - Output format identifier.
 * @returns `.webm`, `.ogg`, `.opus`, `.wav` (for wav/riff/pcm formats), or
 *   `.mp3` when nothing else matches.
 */
export function inferEdgeExtension(outputFormat: string): string {
  const format = outputFormat.toLowerCase();
  const rules: Array<[string[], string]> = [
    [["webm"], ".webm"],
    [["ogg"], ".ogg"],
    [["opus"], ".opus"],
    [["wav", "riff", "pcm"], ".wav"],
  ];
  for (const [markers, extension] of rules) {
    if (markers.some((marker) => format.includes(marker))) {
      return extension;
    }
  }
  return ".mp3";
}
|
||||
|
||||
/**
 * Synthesizes `text` to `outputPath` with `node-edge-tts` and rejects empty output.
 *
 * @param params - Text to speak, destination path, Edge TTS configuration and
 *   a fallback timeout. `config.timeoutMs`, when set, takes precedence over
 *   `params.timeoutMs`.
 * @throws Error when the written file has a size of zero bytes. Errors from
 *   the synthesizer or from reading the file's stats propagate unchanged.
 */
export async function edgeTTS(params: {
  text: string;
  outputPath: string;
  config: {
    voice: string;
    lang: string;
    outputFormat: string;
    saveSubtitles: boolean;
    proxy?: string;
    rate?: string;
    pitch?: string;
    volume?: string;
    timeoutMs?: number;
  };
  timeoutMs: number;
}): Promise<void> {
  const { config } = params;
  const synthesizer = new EdgeTTS({
    voice: config.voice,
    lang: config.lang,
    outputFormat: config.outputFormat,
    saveSubtitles: config.saveSubtitles,
    proxy: config.proxy,
    rate: config.rate,
    pitch: config.pitch,
    volume: config.volume,
    timeout: config.timeoutMs ?? params.timeoutMs,
  });
  await synthesizer.ttsPromise(params.text, params.outputPath);

  // A zero-byte file means synthesis produced no audio even though it resolved.
  const outputStats = statSync(params.outputPath);
  if (outputStats.size === 0) {
    throw new Error("Edge TTS produced empty audio file");
  }
}
|
||||
@@ -1,6 +1,8 @@
|
||||
{
|
||||
"id": "minimax",
|
||||
"providers": ["minimax", "minimax-portal"],
|
||||
"mediaUnderstandingProviders": ["minimax", "minimax-portal"],
|
||||
"imageGenerationProviders": ["minimax", "minimax-portal"],
|
||||
"providerAuthEnvVars": {
|
||||
"minimax": ["MINIMAX_API_KEY"],
|
||||
"minimax-portal": ["MINIMAX_OAUTH_TOKEN", "MINIMAX_API_KEY"]
|
||||
|
||||
@@ -1,6 +1,7 @@
|
||||
{
|
||||
"id": "mistral",
|
||||
"providers": ["mistral"],
|
||||
"mediaUnderstandingProviders": ["mistral"],
|
||||
"providerAuthEnvVars": {
|
||||
"mistral": ["MISTRAL_API_KEY"]
|
||||
},
|
||||
|
||||
@@ -4,10 +4,12 @@ import {
|
||||
type MediaUnderstandingProvider,
|
||||
type VideoDescriptionRequest,
|
||||
type VideoDescriptionResult,
|
||||
} from "openclaw/plugin-sdk/media-understanding";
|
||||
import {
|
||||
assertOkOrThrowHttpError,
|
||||
normalizeBaseUrl,
|
||||
postJsonRequest,
|
||||
} from "openclaw/plugin-sdk/media-understanding";
|
||||
} from "openclaw/plugin-sdk/provider-http";
|
||||
|
||||
export const DEFAULT_MOONSHOT_VIDEO_BASE_URL = "https://api.moonshot.ai/v1";
|
||||
const DEFAULT_MOONSHOT_VIDEO_MODEL = "kimi-k2.5";
|
||||
|
||||
@@ -1,6 +1,7 @@
|
||||
{
|
||||
"id": "moonshot",
|
||||
"providers": ["moonshot"],
|
||||
"mediaUnderstandingProviders": ["moonshot"],
|
||||
"providerAuthEnvVars": {
|
||||
"moonshot": ["MOONSHOT_API_KEY"]
|
||||
},
|
||||
|
||||
@@ -1,6 +1,9 @@
|
||||
{
|
||||
"id": "openai",
|
||||
"providers": ["openai", "openai-codex"],
|
||||
"speechProviders": ["openai"],
|
||||
"mediaUnderstandingProviders": ["openai", "openai-codex"],
|
||||
"imageGenerationProviders": ["openai"],
|
||||
"cliBackends": ["codex-cli"],
|
||||
"providerAuthEnvVars": {
|
||||
"openai": ["OPENAI_API_KEY"]
|
||||
|
||||
@@ -1,5 +1,5 @@
|
||||
import type { SpeechProviderPlugin } from "openclaw/plugin-sdk/core";
|
||||
import { OPENAI_TTS_MODELS, OPENAI_TTS_VOICES, openaiTTS } from "openclaw/plugin-sdk/speech";
|
||||
import { OPENAI_TTS_MODELS, OPENAI_TTS_VOICES, openaiTTS } from "./tts.js";
|
||||
|
||||
export function buildOpenAISpeechProvider(): SpeechProviderPlugin {
|
||||
return {
|
||||
|
||||
109
extensions/openai/tts.ts
Normal file
109
extensions/openai/tts.ts
Normal file
@@ -0,0 +1,109 @@
|
||||
/** Base URL of the default OpenAI API; endpoints that normalize to anything else count as custom. */
const DEFAULT_OPENAI_BASE_URL = "https://api.openai.com/v1";

/** Model ids accepted when the request targets the default OpenAI endpoint. */
export const OPENAI_TTS_MODELS = ["gpt-4o-mini-tts", "tts-1", "tts-1-hd"] as const;

/** Voice ids accepted when the request targets the default OpenAI endpoint. */
export const OPENAI_TTS_VOICES = [
  "alloy", "ash", "ballad", "cedar", "coral", "echo", "fable",
  "juniper", "marin", "onyx", "nova", "sage", "shimmer", "verse",
] as const;

/** Union of the voice ids listed in `OPENAI_TTS_VOICES`. */
type OpenAiTtsVoice = (typeof OPENAI_TTS_VOICES)[number];
|
||||
|
||||
/**
 * Picks the base URL to use for OpenAI TTS requests.
 *
 * @param baseUrl - Optional base URL.
 * @returns `DEFAULT_OPENAI_BASE_URL` when `baseUrl` is missing or blank;
 *   otherwise the trimmed value with every trailing `/` removed.
 */
function normalizeOpenAITtsBaseUrl(baseUrl?: string): string {
  const candidate = baseUrl?.trim();
  if (candidate) {
    return candidate.replace(/\/+$/, "");
  }
  return DEFAULT_OPENAI_BASE_URL;
}
|
||||
|
||||
/**
 * Reports whether TTS requests target something other than the default OpenAI endpoint.
 *
 * @param baseUrl - Base URL to inspect. When `null`/`undefined`, the
 *   `OPENAI_TTS_BASE_URL` environment variable is inspected instead.
 * @returns `true` when the normalized URL differs from `DEFAULT_OPENAI_BASE_URL`.
 */
function isCustomOpenAIEndpoint(baseUrl?: string): boolean {
  const configured = baseUrl ?? process.env.OPENAI_TTS_BASE_URL;
  return normalizeOpenAITtsBaseUrl(configured) !== DEFAULT_OPENAI_BASE_URL;
}
|
||||
|
||||
/**
 * Decides whether a TTS model id is acceptable for the given endpoint.
 *
 * @param model - Model id to check.
 * @param baseUrl - Optional base URL the request will target.
 * @returns `true` for any model on a custom endpoint; on the default endpoint
 *   only when `model` is listed in `OPENAI_TTS_MODELS`.
 */
function isValidOpenAIModel(model: string, baseUrl?: string): boolean {
  return (
    isCustomOpenAIEndpoint(baseUrl) ||
    OPENAI_TTS_MODELS.includes(model as (typeof OPENAI_TTS_MODELS)[number])
  );
}
|
||||
|
||||
/**
 * Decides whether a voice id is acceptable for the given endpoint.
 *
 * NOTE(review): on a custom endpoint this returns `true` for any string, so the
 * `voice is OpenAiTtsVoice` narrowing is not a runtime guarantee in that case.
 *
 * @param voice - Voice id to check.
 * @param baseUrl - Optional base URL the request will target.
 * @returns `true` for any voice on a custom endpoint; on the default endpoint
 *   only when `voice` is listed in `OPENAI_TTS_VOICES`.
 */
function isValidOpenAIVoice(voice: string, baseUrl?: string): voice is OpenAiTtsVoice {
  return isCustomOpenAIEndpoint(baseUrl) || OPENAI_TTS_VOICES.includes(voice as OpenAiTtsVoice);
}
|
||||
|
||||
/**
 * Returns the instructions to send with a TTS request, if any.
 *
 * @param model - Model id of the request.
 * @param instructions - Optional free-form instructions.
 * @returns The trimmed instructions when they are non-blank and `model`
 *   contains `"gpt-4o-mini-tts"`; otherwise `undefined`.
 */
function resolveOpenAITtsInstructions(model: string, instructions?: string): string | undefined {
  const trimmed = instructions?.trim();
  if (!trimmed) {
    return undefined;
  }
  if (!model.includes("gpt-4o-mini-tts")) {
    return undefined;
  }
  return trimmed;
}
|
||||
|
||||
/**
 * Synthesizes speech via an OpenAI-compatible `/audio/speech` endpoint.
 *
 * The model and voice are validated first (custom endpoints accept any value).
 * The request is a JSON POST authenticated with a bearer token; `speed` and
 * `instructions` are only included when present, and instructions are only
 * forwarded for models containing `"gpt-4o-mini-tts"`. An `AbortController`
 * aborts the request once `timeoutMs` elapses, and the timer is always cleared.
 *
 * @param params - Request text, credentials, endpoint, model/voice selection,
 *   optional speed and instructions, response format and timeout.
 * @returns The response body as a `Buffer`.
 * @throws Error when the model or voice is rejected, or when the HTTP response
 *   status is not OK.
 */
export async function openaiTTS(params: {
  text: string;
  apiKey: string;
  baseUrl: string;
  model: string;
  voice: string;
  speed?: number;
  instructions?: string;
  responseFormat: "mp3" | "opus" | "pcm";
  timeoutMs: number;
}): Promise<Buffer> {
  const { text, apiKey, baseUrl, model, voice, speed, instructions, responseFormat, timeoutMs } =
    params;
  const effectiveInstructions = resolveOpenAITtsInstructions(model, instructions);

  if (!isValidOpenAIModel(model, baseUrl)) {
    throw new Error(`Invalid model: ${model}`);
  }
  if (!isValidOpenAIVoice(voice, baseUrl)) {
    throw new Error(`Invalid voice: ${voice}`);
  }

  // Validation above compares the *normalized* base URL, so build the request
  // from the same normalized value. Otherwise a trailing slash or stray
  // whitespace yields ".../v1//audio/speech", and a blank value yields a
  // relative URL, even though validation treated it as the default endpoint.
  const endpointBase = normalizeOpenAITtsBaseUrl(baseUrl);

  const controller = new AbortController();
  const timeout = setTimeout(() => controller.abort(), timeoutMs);

  try {
    const response = await fetch(`${endpointBase}/audio/speech`, {
      method: "POST",
      headers: {
        Authorization: `Bearer ${apiKey}`,
        "Content-Type": "application/json",
      },
      body: JSON.stringify({
        model,
        input: text,
        voice,
        response_format: responseFormat,
        // Optional fields are omitted entirely rather than sent as null.
        ...(speed != null && { speed }),
        ...(effectiveInstructions != null && { instructions: effectiveInstructions }),
      }),
      signal: controller.signal,
    });

    if (!response.ok) {
      throw new Error(`OpenAI TTS API error (${response.status})`);
    }

    return Buffer.from(await response.arrayBuffer());
  } finally {
    clearTimeout(timeout);
  }
}
|
||||
@@ -1,6 +1,7 @@
|
||||
{
|
||||
"id": "zai",
|
||||
"providers": ["zai"],
|
||||
"mediaUnderstandingProviders": ["zai"],
|
||||
"providerAuthEnvVars": {
|
||||
"zai": ["ZAI_API_KEY", "Z_AI_API_KEY"]
|
||||
},
|
||||
|
||||
@@ -453,6 +453,10 @@
|
||||
"types": "./dist/plugin-sdk/provider-env-vars.d.ts",
|
||||
"default": "./dist/plugin-sdk/provider-env-vars.js"
|
||||
},
|
||||
"./plugin-sdk/provider-http": {
|
||||
"types": "./dist/plugin-sdk/provider-http.d.ts",
|
||||
"default": "./dist/plugin-sdk/provider-http.js"
|
||||
},
|
||||
"./plugin-sdk/provider-google": {
|
||||
"types": "./dist/plugin-sdk/provider-google.d.ts",
|
||||
"default": "./dist/plugin-sdk/provider-google.js"
|
||||
@@ -529,6 +533,10 @@
|
||||
"types": "./dist/plugin-sdk/telegram-core.d.ts",
|
||||
"default": "./dist/plugin-sdk/telegram-core.js"
|
||||
},
|
||||
"./plugin-sdk/telegram-runtime": {
|
||||
"types": "./dist/plugin-sdk/telegram-runtime.d.ts",
|
||||
"default": "./dist/plugin-sdk/telegram-runtime.js"
|
||||
},
|
||||
"./plugin-sdk/thread-ownership": {
|
||||
"types": "./dist/plugin-sdk/thread-ownership.d.ts",
|
||||
"default": "./dist/plugin-sdk/thread-ownership.js"
|
||||
|
||||
@@ -103,6 +103,15 @@ function normalizePluginManifest(raw) {
|
||||
...(normalizeStringList(raw.providers)
|
||||
? { providers: normalizeStringList(raw.providers) }
|
||||
: {}),
|
||||
...(normalizeStringList(raw.speechProviders)
|
||||
? { speechProviders: normalizeStringList(raw.speechProviders) }
|
||||
: {}),
|
||||
...(normalizeStringList(raw.mediaUnderstandingProviders)
|
||||
? { mediaUnderstandingProviders: normalizeStringList(raw.mediaUnderstandingProviders) }
|
||||
: {}),
|
||||
...(normalizeStringList(raw.imageGenerationProviders)
|
||||
? { imageGenerationProviders: normalizeStringList(raw.imageGenerationProviders) }
|
||||
: {}),
|
||||
...(normalizeObject(raw.providerAuthEnvVars)
|
||||
? { providerAuthEnvVars: raw.providerAuthEnvVars }
|
||||
: {}),
|
||||
|
||||
@@ -103,6 +103,7 @@
|
||||
"provider-catalog",
|
||||
"provider-entry",
|
||||
"provider-env-vars",
|
||||
"provider-http",
|
||||
"provider-google",
|
||||
"provider-models",
|
||||
"provider-onboard",
|
||||
@@ -122,6 +123,7 @@
|
||||
"state-paths",
|
||||
"telegram",
|
||||
"telegram-core",
|
||||
"telegram-runtime",
|
||||
"thread-ownership",
|
||||
"tlon",
|
||||
"tool-send",
|
||||
|
||||
@@ -461,7 +461,7 @@ export async function applyMediaUnderstanding(params: {
|
||||
.find((value) => value && value.trim()) ?? undefined;
|
||||
|
||||
const attachments = normalizeMediaAttachments(ctx);
|
||||
const providerRegistry = buildProviderRegistry(params.providers);
|
||||
const providerRegistry = buildProviderRegistry(params.providers, cfg);
|
||||
const cache = createMediaAttachmentCache(attachments, {
|
||||
localPathRoots: resolveMediaAttachmentLocalRoots({ cfg, ctx }),
|
||||
});
|
||||
|
||||
@@ -23,7 +23,7 @@ export async function runAudioTranscription(params: {
|
||||
return { transcript: undefined, attachments };
|
||||
}
|
||||
|
||||
const providerRegistry = buildProviderRegistry(params.providers);
|
||||
const providerRegistry = buildProviderRegistry(params.providers, params.cfg);
|
||||
const cache = createMediaAttachmentCache(
|
||||
attachments,
|
||||
params.localPathRoots ? { localPathRoots: params.localPathRoots } : undefined,
|
||||
|
||||
@@ -11,15 +11,10 @@ describe("media-understanding provider registry", () => {
|
||||
setActivePluginRegistry(createEmptyPluginRegistry());
|
||||
});
|
||||
|
||||
it("keeps core-owned fallback providers registered by default", () => {
|
||||
it("returns no providers by default when no active registry is present", () => {
|
||||
const registry = buildMediaUnderstandingRegistry();
|
||||
const groqProvider = getMediaUnderstandingProvider("groq", registry);
|
||||
const deepgramProvider = getMediaUnderstandingProvider("deepgram", registry);
|
||||
|
||||
expect(groqProvider?.id).toBe("groq");
|
||||
expect(groqProvider?.capabilities).toEqual(["audio"]);
|
||||
expect(deepgramProvider?.id).toBe("deepgram");
|
||||
expect(deepgramProvider?.capabilities).toEqual(["audio"]);
|
||||
expect(getMediaUnderstandingProvider("groq", registry)).toBeUndefined();
|
||||
expect(getMediaUnderstandingProvider("deepgram", registry)).toBeUndefined();
|
||||
});
|
||||
|
||||
it("merges plugin-registered media providers into the active registry", async () => {
|
||||
|
||||
@@ -1,18 +1,9 @@
|
||||
import type { OpenClawConfig } from "../config/config.js";
|
||||
import {
|
||||
deepgramMediaUnderstandingProvider,
|
||||
groqMediaUnderstandingProvider,
|
||||
} from "../plugin-sdk/media-understanding.js";
|
||||
import { loadOpenClawPlugins } from "../plugins/loader.js";
|
||||
import { getActivePluginRegistry } from "../plugins/runtime.js";
|
||||
import { normalizeMediaProviderId } from "./provider-id.js";
|
||||
import type { MediaUnderstandingProvider } from "./types.js";
|
||||
|
||||
const PROVIDERS: MediaUnderstandingProvider[] = [
|
||||
groqMediaUnderstandingProvider,
|
||||
deepgramMediaUnderstandingProvider,
|
||||
];
|
||||
|
||||
function mergeProviderIntoRegistry(
|
||||
registry: Map<string, MediaUnderstandingProvider>,
|
||||
provider: MediaUnderstandingProvider,
|
||||
@@ -36,12 +27,9 @@ export function buildMediaUnderstandingRegistry(
|
||||
cfg?: OpenClawConfig,
|
||||
): Map<string, MediaUnderstandingProvider> {
|
||||
const registry = new Map<string, MediaUnderstandingProvider>();
|
||||
for (const provider of PROVIDERS) {
|
||||
mergeProviderIntoRegistry(registry, provider);
|
||||
}
|
||||
const active = getActivePluginRegistry();
|
||||
const pluginRegistry =
|
||||
(active?.mediaUnderstandingProviders?.length ?? 0) > 0
|
||||
(active?.mediaUnderstandingProviders?.length ?? 0) > 0 || !cfg
|
||||
? active
|
||||
: loadOpenClawPlugins({ config: cfg });
|
||||
for (const entry of pluginRegistry?.mediaUnderstandingProviders ?? []) {
|
||||
|
||||
@@ -494,7 +494,7 @@ export async function resolveAutoImageModel(params: {
|
||||
agentDir?: string;
|
||||
activeModel?: ActiveMediaModel;
|
||||
}): Promise<ActiveMediaModel | null> {
|
||||
const providerRegistry = buildProviderRegistry();
|
||||
const providerRegistry = buildProviderRegistry(undefined, params.cfg);
|
||||
const toActive = (entry: MediaUnderstandingModelConfig | null): ActiveMediaModel | null => {
|
||||
if (!entry || entry.type === "cli") {
|
||||
return null;
|
||||
|
||||
@@ -7,6 +7,7 @@ export { optionalStringEnum, stringEnum } from "../agents/schema/typebox.js";
|
||||
import { Type } from "@sinclair/typebox";
|
||||
import type { TSchema } from "@sinclair/typebox";
|
||||
import { stringEnum } from "../agents/schema/typebox.js";
|
||||
export { optionalStringEnum, stringEnum } from "../agents/schema/typebox.js";
|
||||
|
||||
/** Schema helper for channels that expose button rows on the shared `message` tool. */
|
||||
export function createMessageToolButtonsSchema(): TSchema {
|
||||
|
||||
@@ -8,7 +8,3 @@ export type {
|
||||
ImageGenerationResult,
|
||||
ImageGenerationSourceImage,
|
||||
} from "../image-generation/types.js";
|
||||
|
||||
export { buildFalImageGenerationProvider } from "../../extensions/fal/image-generation-provider.js";
|
||||
export { buildGoogleImageGenerationProvider } from "../../extensions/google/image-generation-provider.js";
|
||||
export { buildOpenAIImageGenerationProvider } from "../../extensions/openai/image-generation-provider.js";
|
||||
|
||||
@@ -89,9 +89,6 @@ describe("plugin-sdk exports", () => {
|
||||
it("keeps the root runtime surface intentionally small", async () => {
|
||||
const runtimeExports = await collectRuntimeExports(path.join(import.meta.dirname, "index.ts"));
|
||||
expect([...runtimeExports].toSorted()).toEqual([
|
||||
"buildFalImageGenerationProvider",
|
||||
"buildGoogleImageGenerationProvider",
|
||||
"buildOpenAIImageGenerationProvider",
|
||||
"delegateCompactionToRuntime",
|
||||
"emptyPluginConfigSchema",
|
||||
"onDiagnosticEvent",
|
||||
|
||||
@@ -18,12 +18,3 @@ export {
|
||||
describeImagesWithModel,
|
||||
} from "../media-understanding/image-runtime.js";
|
||||
export { transcribeOpenAiCompatibleAudio } from "../media-understanding/openai-compatible-audio.js";
|
||||
export {
|
||||
assertOkOrThrowHttpError,
|
||||
normalizeBaseUrl,
|
||||
postJsonRequest,
|
||||
postTranscriptionRequest,
|
||||
requireTranscriptionText,
|
||||
} from "../media-understanding/shared.js";
|
||||
export { deepgramMediaUnderstandingProvider } from "../../extensions/deepgram/media-understanding-provider.js";
|
||||
export { groqMediaUnderstandingProvider } from "../../extensions/groq/media-understanding-provider.js";
|
||||
|
||||
12
src/plugin-sdk/provider-http.ts
Normal file
12
src/plugin-sdk/provider-http.ts
Normal file
@@ -0,0 +1,12 @@
|
||||
// Shared provider-facing HTTP helpers. Keep generic transport utilities here so
|
||||
// capability SDKs do not depend on each other.
|
||||
|
||||
export {
|
||||
assertOkOrThrowHttpError,
|
||||
fetchWithTimeout,
|
||||
fetchWithTimeoutGuarded,
|
||||
normalizeBaseUrl,
|
||||
postJsonRequest,
|
||||
postTranscriptionRequest,
|
||||
requireTranscriptionText,
|
||||
} from "../media-understanding/shared.js";
|
||||
@@ -3,15 +3,4 @@
|
||||
export type { SpeechProviderPlugin } from "../plugins/types.js";
|
||||
export type { SpeechVoiceOption } from "../tts/provider-types.js";
|
||||
|
||||
export {
|
||||
edgeTTS,
|
||||
elevenLabsTTS,
|
||||
inferEdgeExtension,
|
||||
OPENAI_TTS_MODELS,
|
||||
OPENAI_TTS_VOICES,
|
||||
openaiTTS,
|
||||
parseTtsDirectives,
|
||||
} from "../tts/tts-core.js";
|
||||
|
||||
export { resolvePreferredOpenClawTmpDir } from "../infra/tmp-openclaw-dir.js";
|
||||
export { isVoiceCompatibleAudio } from "../media/audio.js";
|
||||
export { parseTtsDirectives } from "../tts/tts-core.js";
|
||||
|
||||
@@ -1,9 +1,4 @@
|
||||
// Public speech-provider builders for bundled or third-party plugins.
|
||||
// Public speech helpers for bundled or third-party plugins.
|
||||
|
||||
export { buildElevenLabsSpeechProvider } from "../../extensions/elevenlabs/speech-provider.js";
|
||||
export { buildMicrosoftSpeechProvider } from "../../extensions/microsoft/speech-provider.js";
|
||||
export { buildOpenAISpeechProvider } from "../../extensions/openai/speech-provider.js";
|
||||
export { edgeTTS, elevenLabsTTS, inferEdgeExtension, openaiTTS } from "../tts/tts-core.js";
|
||||
export { OPENAI_TTS_MODELS, OPENAI_TTS_VOICES } from "../tts/tts-core.js";
|
||||
export { parseTtsDirectives } from "../tts/tts-core.js";
|
||||
export type { SpeechVoiceOption } from "../tts/provider-types.js";
|
||||
|
||||
@@ -544,6 +544,36 @@ describe("plugin-sdk subpath exports", () => {
|
||||
"buildOptionalSecretInputSchema",
|
||||
"normalizeSecretInputString",
|
||||
]);
|
||||
expectSourceMentions("provider-http", [
|
||||
"assertOkOrThrowHttpError",
|
||||
"normalizeBaseUrl",
|
||||
"postJsonRequest",
|
||||
"postTranscriptionRequest",
|
||||
"requireTranscriptionText",
|
||||
]);
|
||||
expectSourceOmits("speech", [
|
||||
"buildElevenLabsSpeechProvider",
|
||||
"buildMicrosoftSpeechProvider",
|
||||
"buildOpenAISpeechProvider",
|
||||
"edgeTTS",
|
||||
"elevenLabsTTS",
|
||||
"inferEdgeExtension",
|
||||
"openaiTTS",
|
||||
"OPENAI_TTS_MODELS",
|
||||
"OPENAI_TTS_VOICES",
|
||||
]);
|
||||
expectSourceOmits("media-understanding", [
|
||||
"deepgramMediaUnderstandingProvider",
|
||||
"groqMediaUnderstandingProvider",
|
||||
"assertOkOrThrowHttpError",
|
||||
"postJsonRequest",
|
||||
"postTranscriptionRequest",
|
||||
]);
|
||||
expectSourceOmits("image-generation", [
|
||||
"buildFalImageGenerationProvider",
|
||||
"buildGoogleImageGenerationProvider",
|
||||
"buildOpenAIImageGenerationProvider",
|
||||
]);
|
||||
expectSourceOmits("config-runtime", [
|
||||
"hasConfiguredSecretInput",
|
||||
"normalizeResolvedSecretInputString",
|
||||
|
||||
@@ -169,6 +169,7 @@ export const GENERATED_BUNDLED_PLUGIN_METADATA = [
|
||||
properties: {},
|
||||
},
|
||||
providers: ["anthropic"],
|
||||
mediaUnderstandingProviders: ["anthropic"],
|
||||
providerAuthEnvVars: {
|
||||
anthropic: ["ANTHROPIC_OAUTH_TOKEN", "ANTHROPIC_API_KEY"],
|
||||
},
|
||||
@@ -488,6 +489,7 @@ export const GENERATED_BUNDLED_PLUGIN_METADATA = [
|
||||
additionalProperties: false,
|
||||
properties: {},
|
||||
},
|
||||
mediaUnderstandingProviders: ["deepgram"],
|
||||
},
|
||||
},
|
||||
{
|
||||
@@ -859,6 +861,7 @@ export const GENERATED_BUNDLED_PLUGIN_METADATA = [
|
||||
additionalProperties: false,
|
||||
properties: {},
|
||||
},
|
||||
speechProviders: ["elevenlabs"],
|
||||
},
|
||||
},
|
||||
{
|
||||
@@ -925,6 +928,7 @@ export const GENERATED_BUNDLED_PLUGIN_METADATA = [
|
||||
properties: {},
|
||||
},
|
||||
providers: ["fal"],
|
||||
imageGenerationProviders: ["fal"],
|
||||
providerAuthEnvVars: {
|
||||
fal: ["FAL_KEY"],
|
||||
},
|
||||
@@ -1114,6 +1118,8 @@ export const GENERATED_BUNDLED_PLUGIN_METADATA = [
|
||||
},
|
||||
},
|
||||
providers: ["google", "google-gemini-cli"],
|
||||
mediaUnderstandingProviders: ["google"],
|
||||
imageGenerationProviders: ["google"],
|
||||
providerAuthEnvVars: {
|
||||
google: ["GEMINI_API_KEY", "GOOGLE_API_KEY"],
|
||||
},
|
||||
@@ -1221,6 +1227,7 @@ export const GENERATED_BUNDLED_PLUGIN_METADATA = [
|
||||
additionalProperties: false,
|
||||
properties: {},
|
||||
},
|
||||
mediaUnderstandingProviders: ["groq"],
|
||||
},
|
||||
},
|
||||
{
|
||||
@@ -1782,6 +1789,7 @@ export const GENERATED_BUNDLED_PLUGIN_METADATA = [
|
||||
additionalProperties: false,
|
||||
properties: {},
|
||||
},
|
||||
speechProviders: ["microsoft"],
|
||||
},
|
||||
},
|
||||
{
|
||||
@@ -1854,6 +1862,8 @@ export const GENERATED_BUNDLED_PLUGIN_METADATA = [
|
||||
properties: {},
|
||||
},
|
||||
providers: ["minimax", "minimax-portal"],
|
||||
mediaUnderstandingProviders: ["minimax", "minimax-portal"],
|
||||
imageGenerationProviders: ["minimax", "minimax-portal"],
|
||||
providerAuthEnvVars: {
|
||||
minimax: ["MINIMAX_API_KEY"],
|
||||
"minimax-portal": ["MINIMAX_OAUTH_TOKEN", "MINIMAX_API_KEY"],
|
||||
@@ -1931,6 +1941,7 @@ export const GENERATED_BUNDLED_PLUGIN_METADATA = [
|
||||
properties: {},
|
||||
},
|
||||
providers: ["mistral"],
|
||||
mediaUnderstandingProviders: ["mistral"],
|
||||
providerAuthEnvVars: {
|
||||
mistral: ["MISTRAL_API_KEY"],
|
||||
},
|
||||
@@ -2072,6 +2083,7 @@ export const GENERATED_BUNDLED_PLUGIN_METADATA = [
|
||||
},
|
||||
},
|
||||
providers: ["moonshot"],
|
||||
mediaUnderstandingProviders: ["moonshot"],
|
||||
providerAuthEnvVars: {
|
||||
moonshot: ["MOONSHOT_API_KEY"],
|
||||
},
|
||||
@@ -2363,6 +2375,9 @@ export const GENERATED_BUNDLED_PLUGIN_METADATA = [
|
||||
properties: {},
|
||||
},
|
||||
providers: ["openai", "openai-codex"],
|
||||
speechProviders: ["openai"],
|
||||
mediaUnderstandingProviders: ["openai", "openai-codex"],
|
||||
imageGenerationProviders: ["openai"],
|
||||
providerAuthEnvVars: {
|
||||
openai: ["OPENAI_API_KEY"],
|
||||
},
|
||||
@@ -4101,6 +4116,7 @@ export const GENERATED_BUNDLED_PLUGIN_METADATA = [
|
||||
properties: {},
|
||||
},
|
||||
providers: ["zai"],
|
||||
mediaUnderstandingProviders: ["zai"],
|
||||
providerAuthEnvVars: {
|
||||
zai: ["ZAI_API_KEY", "Z_AI_API_KEY"],
|
||||
},
|
||||
|
||||
@@ -120,6 +120,53 @@ describe("plugin contract registry", () => {
|
||||
expect(providerContractPluginIds).toEqual(bundledProviderPluginIds);
|
||||
});
|
||||
|
||||
it("covers every bundled speech plugin discovered from manifests", () => {
|
||||
const bundledSpeechPluginIds = loadPluginManifestRegistry({})
|
||||
.plugins.filter(
|
||||
(plugin) => plugin.origin === "bundled" && (plugin.speechProviders?.length ?? 0) > 0,
|
||||
)
|
||||
.map((plugin) => plugin.id)
|
||||
.toSorted((left, right) => left.localeCompare(right));
|
||||
|
||||
expect(
|
||||
[...new Set(speechProviderContractRegistry.map((entry) => entry.pluginId))].toSorted(
|
||||
(left, right) => left.localeCompare(right),
|
||||
),
|
||||
).toEqual(bundledSpeechPluginIds);
|
||||
});
|
||||
|
||||
it("covers every bundled media-understanding plugin discovered from manifests", () => {
|
||||
const bundledMediaPluginIds = loadPluginManifestRegistry({})
|
||||
.plugins.filter(
|
||||
(plugin) =>
|
||||
plugin.origin === "bundled" && (plugin.mediaUnderstandingProviders?.length ?? 0) > 0,
|
||||
)
|
||||
.map((plugin) => plugin.id)
|
||||
.toSorted((left, right) => left.localeCompare(right));
|
||||
|
||||
expect(
|
||||
[
|
||||
...new Set(mediaUnderstandingProviderContractRegistry.map((entry) => entry.pluginId)),
|
||||
].toSorted((left, right) => left.localeCompare(right)),
|
||||
).toEqual(bundledMediaPluginIds);
|
||||
});
|
||||
|
||||
it("covers every bundled image-generation plugin discovered from manifests", () => {
|
||||
const bundledImagePluginIds = loadPluginManifestRegistry({})
|
||||
.plugins.filter(
|
||||
(plugin) =>
|
||||
plugin.origin === "bundled" && (plugin.imageGenerationProviders?.length ?? 0) > 0,
|
||||
)
|
||||
.map((plugin) => plugin.id)
|
||||
.toSorted((left, right) => left.localeCompare(right));
|
||||
|
||||
expect(
|
||||
[...new Set(imageGenerationProviderContractRegistry.map((entry) => entry.pluginId))].toSorted(
|
||||
(left, right) => left.localeCompare(right),
|
||||
),
|
||||
).toEqual(bundledImagePluginIds);
|
||||
});
|
||||
|
||||
it("covers every bundled web search plugin from the shared resolver", () => {
|
||||
const bundledWebSearchPluginIds = resolveBundledWebSearchPluginIds({});
|
||||
|
||||
|
||||
@@ -39,6 +39,7 @@ import xiaomiPlugin from "../../../extensions/xiaomi/index.js";
|
||||
import zaiPlugin from "../../../extensions/zai/index.js";
|
||||
import { bundledWebSearchPluginRegistrations } from "../../bundled-web-search-registry.js";
|
||||
import { createCapturedPluginRegistration } from "../captured-registration.js";
|
||||
import { loadPluginManifestRegistry } from "../manifest-registry.js";
|
||||
import { resolvePluginProviders } from "../provider-auth-choice.runtime.js";
|
||||
import type {
|
||||
ImageGenerationProviderPlugin,
|
||||
@@ -85,21 +86,6 @@ const bundledWebSearchPlugins: Array<RegistrablePlugin & { credentialValue: unkn
|
||||
...plugin,
|
||||
credentialValue,
|
||||
}));
|
||||
const bundledSpeechPlugins: RegistrablePlugin[] = [elevenLabsPlugin, microsoftPlugin, openAIPlugin];
|
||||
|
||||
const bundledMediaUnderstandingPlugins: RegistrablePlugin[] = [
|
||||
anthropicPlugin,
|
||||
deepgramPlugin,
|
||||
googlePlugin,
|
||||
groqPlugin,
|
||||
minimaxPlugin,
|
||||
mistralPlugin,
|
||||
moonshotPlugin,
|
||||
openAIPlugin,
|
||||
zaiPlugin,
|
||||
];
|
||||
|
||||
const bundledImageGenerationPlugins: RegistrablePlugin[] = [falPlugin, googlePlugin, openAIPlugin];
|
||||
|
||||
function captureRegistrations(plugin: RegistrablePlugin) {
|
||||
const captured = createCapturedPluginRegistration();
|
||||
@@ -390,6 +376,43 @@ const bundledProviderPlugins = dedupePlugins([
|
||||
zaiPlugin,
|
||||
]);
|
||||
|
||||
const bundledRegistrablePluginsById = new Map(
|
||||
dedupePlugins([
|
||||
...bundledProviderPlugins,
|
||||
elevenLabsPlugin,
|
||||
microsoftPlugin,
|
||||
deepgramPlugin,
|
||||
groqPlugin,
|
||||
...bundledWebSearchPlugins,
|
||||
]).map((plugin) => [plugin.id, plugin]),
|
||||
);
|
||||
|
||||
function resolveBundledCapabilityPluginIds(
|
||||
capability: "speechProviders" | "mediaUnderstandingProviders" | "imageGenerationProviders",
|
||||
): string[] {
|
||||
return loadPluginManifestRegistry({})
|
||||
.plugins.filter(
|
||||
(plugin) => plugin.origin === "bundled" && (plugin[capability]?.length ?? 0) > 0,
|
||||
)
|
||||
.map((plugin) => plugin.id)
|
||||
.toSorted((left, right) => left.localeCompare(right));
|
||||
}
|
||||
|
||||
function resolveBundledCapabilityPlugins(
|
||||
capability: "speechProviders" | "mediaUnderstandingProviders" | "imageGenerationProviders",
|
||||
): RegistrablePlugin[] {
|
||||
return resolveBundledCapabilityPluginIds(capability).flatMap((pluginId) => {
|
||||
const plugin = bundledRegistrablePluginsById.get(pluginId);
|
||||
return plugin ? [plugin] : [];
|
||||
});
|
||||
}
|
||||
|
||||
const bundledSpeechPlugins = resolveBundledCapabilityPlugins("speechProviders");
|
||||
const bundledMediaUnderstandingPlugins = resolveBundledCapabilityPlugins(
|
||||
"mediaUnderstandingProviders",
|
||||
);
|
||||
const bundledImageGenerationPlugins = resolveBundledCapabilityPlugins("imageGenerationProviders");
|
||||
|
||||
const bundledPluginRegistrationList = dedupePlugins([
|
||||
...bundledSpeechPlugins,
|
||||
...bundledMediaUnderstandingPlugins,
|
||||
|
||||
@@ -45,6 +45,9 @@ export type PluginManifestRecord = {
|
||||
kind?: PluginKind;
|
||||
channels: string[];
|
||||
providers: string[];
|
||||
speechProviders?: string[];
|
||||
mediaUnderstandingProviders?: string[];
|
||||
imageGenerationProviders?: string[];
|
||||
cliBackends: string[];
|
||||
providerAuthEnvVars?: Record<string, string[]>;
|
||||
providerAuthChoices?: PluginManifest["providerAuthChoices"];
|
||||
@@ -171,6 +174,9 @@ function buildRecord(params: {
|
||||
kind: params.manifest.kind,
|
||||
channels: params.manifest.channels ?? [],
|
||||
providers: params.manifest.providers ?? [],
|
||||
speechProviders: params.manifest.speechProviders ?? [],
|
||||
mediaUnderstandingProviders: params.manifest.mediaUnderstandingProviders ?? [],
|
||||
imageGenerationProviders: params.manifest.imageGenerationProviders ?? [],
|
||||
cliBackends: params.manifest.cliBackends ?? [],
|
||||
providerAuthEnvVars: params.manifest.providerAuthEnvVars,
|
||||
providerAuthChoices: params.manifest.providerAuthChoices,
|
||||
@@ -226,6 +232,9 @@ function buildBundleRecord(params: {
|
||||
bundleCapabilities: params.manifest.capabilities,
|
||||
channels: [],
|
||||
providers: [],
|
||||
speechProviders: [],
|
||||
mediaUnderstandingProviders: [],
|
||||
imageGenerationProviders: [],
|
||||
cliBackends: [],
|
||||
skills: params.manifest.skills ?? [],
|
||||
settingsFiles: params.manifest.settingsFiles ?? [],
|
||||
|
||||
@@ -15,6 +15,9 @@ export type PluginManifest = {
|
||||
kind?: PluginKind;
|
||||
channels?: string[];
|
||||
providers?: string[];
|
||||
speechProviders?: string[];
|
||||
mediaUnderstandingProviders?: string[];
|
||||
imageGenerationProviders?: string[];
|
||||
/** Cheap startup activation lookup for plugin-owned CLI inference backends. */
|
||||
cliBackends?: string[];
|
||||
/** Cheap provider-auth env lookup without booting plugin runtime. */
|
||||
@@ -205,6 +208,9 @@ export function loadPluginManifest(
|
||||
const version = typeof raw.version === "string" ? raw.version.trim() : undefined;
|
||||
const channels = normalizeStringList(raw.channels);
|
||||
const providers = normalizeStringList(raw.providers);
|
||||
const speechProviders = normalizeStringList(raw.speechProviders);
|
||||
const mediaUnderstandingProviders = normalizeStringList(raw.mediaUnderstandingProviders);
|
||||
const imageGenerationProviders = normalizeStringList(raw.imageGenerationProviders);
|
||||
const cliBackends = normalizeStringList(raw.cliBackends);
|
||||
const providerAuthEnvVars = normalizeStringListRecord(raw.providerAuthEnvVars);
|
||||
const providerAuthChoices = normalizeProviderAuthChoices(raw.providerAuthChoices);
|
||||
@@ -224,6 +230,9 @@ export function loadPluginManifest(
|
||||
kind,
|
||||
channels,
|
||||
providers,
|
||||
speechProviders,
|
||||
mediaUnderstandingProviders,
|
||||
imageGenerationProviders,
|
||||
cliBackends,
|
||||
providerAuthEnvVars,
|
||||
providerAuthChoices,
|
||||
|
||||
@@ -58,7 +58,7 @@ describe("speech provider registry", () => {
|
||||
|
||||
const providers = listSpeechProviders();
|
||||
|
||||
expect(providers.map((provider) => provider.id)).toEqual(["openai", "elevenlabs", "microsoft"]);
|
||||
expect(providers.map((provider) => provider.id)).toEqual(["openai"]);
|
||||
expect(loadOpenClawPluginsMock).not.toHaveBeenCalled();
|
||||
});
|
||||
|
||||
@@ -76,22 +76,14 @@ describe("speech provider registry", () => {
|
||||
|
||||
const cfg = {} as OpenClawConfig;
|
||||
|
||||
expect(listSpeechProviders(cfg).map((provider) => provider.id)).toEqual([
|
||||
"openai",
|
||||
"elevenlabs",
|
||||
"microsoft",
|
||||
]);
|
||||
expect(listSpeechProviders(cfg).map((provider) => provider.id)).toEqual(["microsoft"]);
|
||||
expect(getSpeechProvider("edge", cfg)?.id).toBe("microsoft");
|
||||
expect(loadOpenClawPluginsMock).toHaveBeenCalledWith({ config: cfg });
|
||||
});
|
||||
|
||||
it("returns builtin providers when neither plugins nor active registry provide speech support", () => {
|
||||
expect(listSpeechProviders().map((provider) => provider.id)).toEqual([
|
||||
"openai",
|
||||
"elevenlabs",
|
||||
"microsoft",
|
||||
]);
|
||||
expect(getSpeechProvider("openai")?.id).toBe("openai");
|
||||
it("returns no providers when neither plugins nor active registry provide speech support", () => {
|
||||
expect(listSpeechProviders()).toEqual([]);
|
||||
expect(getSpeechProvider("openai")).toBeUndefined();
|
||||
});
|
||||
|
||||
it("normalizes the legacy edge alias to microsoft", () => {
|
||||
|
||||
@@ -1,18 +1,9 @@
|
||||
import { buildElevenLabsSpeechProvider } from "../../extensions/elevenlabs/speech-provider.js";
|
||||
import { buildMicrosoftSpeechProvider } from "../../extensions/microsoft/speech-provider.js";
|
||||
import { buildOpenAISpeechProvider } from "../../extensions/openai/speech-provider.js";
|
||||
import type { OpenClawConfig } from "../config/config.js";
|
||||
import { loadOpenClawPlugins } from "../plugins/loader.js";
|
||||
import { getActivePluginRegistry } from "../plugins/runtime.js";
|
||||
import type { SpeechProviderPlugin } from "../plugins/types.js";
|
||||
import type { SpeechProviderId } from "./provider-types.js";
|
||||
|
||||
const BUILTIN_SPEECH_PROVIDER_BUILDERS = [
|
||||
buildOpenAISpeechProvider,
|
||||
buildElevenLabsSpeechProvider,
|
||||
buildMicrosoftSpeechProvider,
|
||||
] as const satisfies readonly (() => SpeechProviderPlugin)[];
|
||||
|
||||
function trimToUndefined(value: string | undefined): string | undefined {
|
||||
const trimmed = value?.trim().toLowerCase();
|
||||
return trimmed ? trimmed : undefined;
|
||||
@@ -58,9 +49,6 @@ function buildProviderMaps(cfg?: OpenClawConfig): {
|
||||
}
|
||||
};
|
||||
|
||||
for (const buildProvider of BUILTIN_SPEECH_PROVIDER_BUILDERS) {
|
||||
register(buildProvider());
|
||||
}
|
||||
for (const provider of resolveSpeechProviderPluginEntries(cfg)) {
|
||||
register(provider);
|
||||
}
|
||||
|
||||
@@ -1,6 +1,5 @@
|
||||
import { rmSync, statSync } from "node:fs";
|
||||
import { rmSync } from "node:fs";
|
||||
import { completeSimple, type TextContent } from "@mariozechner/pi-ai";
|
||||
import { EdgeTTS } from "node-edge-tts";
|
||||
import { getApiKeyForModel, requireApiKey } from "../agents/model-auth.js";
|
||||
import {
|
||||
buildModelAliasIndex,
|
||||
@@ -18,7 +17,6 @@ import type {
|
||||
TtsDirectiveParseResult,
|
||||
} from "./tts.js";
|
||||
|
||||
const DEFAULT_ELEVENLABS_BASE_URL = "https://api.elevenlabs.io";
|
||||
export const DEFAULT_OPENAI_BASE_URL = "https://api.openai.com/v1";
|
||||
const TEMP_FILE_CLEANUP_DELAY_MS = 5 * 60 * 1000; // 5 minutes
|
||||
|
||||
@@ -26,14 +24,6 @@ export function isValidVoiceId(voiceId: string): boolean {
|
||||
return /^[a-zA-Z0-9]{10,40}$/.test(voiceId);
|
||||
}
|
||||
|
||||
function normalizeElevenLabsBaseUrl(baseUrl: string): string {
|
||||
const trimmed = baseUrl.trim();
|
||||
if (!trimmed) {
|
||||
return DEFAULT_ELEVENLABS_BASE_URL;
|
||||
}
|
||||
return trimmed.replace(/\/+$/, "");
|
||||
}
|
||||
|
||||
function normalizeOpenAITtsBaseUrl(baseUrl?: string): string {
|
||||
const trimmed = baseUrl?.trim();
|
||||
if (!trimmed) {
|
||||
@@ -53,13 +43,6 @@ function requireInRange(value: number, min: number, max: number, label: string):
|
||||
}
|
||||
}
|
||||
|
||||
function assertElevenLabsVoiceSettings(settings: ResolvedTtsConfig["elevenlabs"]["voiceSettings"]) {
|
||||
requireInRange(settings.stability, 0, 1, "stability");
|
||||
requireInRange(settings.similarityBoost, 0, 1, "similarityBoost");
|
||||
requireInRange(settings.style, 0, 1, "style");
|
||||
requireInRange(settings.speed, 0.5, 2, "speed");
|
||||
}
|
||||
|
||||
function normalizeLanguageCode(code?: string): string | undefined {
|
||||
const trimmed = code?.trim();
|
||||
if (!trimmed) {
|
||||
@@ -538,177 +521,3 @@ export function scheduleCleanup(
|
||||
}, delayMs);
|
||||
timer.unref();
|
||||
}
|
||||
|
||||
export async function elevenLabsTTS(params: {
|
||||
text: string;
|
||||
apiKey: string;
|
||||
baseUrl: string;
|
||||
voiceId: string;
|
||||
modelId: string;
|
||||
outputFormat: string;
|
||||
seed?: number;
|
||||
applyTextNormalization?: "auto" | "on" | "off";
|
||||
languageCode?: string;
|
||||
voiceSettings: ResolvedTtsConfig["elevenlabs"]["voiceSettings"];
|
||||
timeoutMs: number;
|
||||
}): Promise<Buffer> {
|
||||
const {
|
||||
text,
|
||||
apiKey,
|
||||
baseUrl,
|
||||
voiceId,
|
||||
modelId,
|
||||
outputFormat,
|
||||
seed,
|
||||
applyTextNormalization,
|
||||
languageCode,
|
||||
voiceSettings,
|
||||
timeoutMs,
|
||||
} = params;
|
||||
if (!isValidVoiceId(voiceId)) {
|
||||
throw new Error("Invalid voiceId format");
|
||||
}
|
||||
assertElevenLabsVoiceSettings(voiceSettings);
|
||||
const normalizedLanguage = normalizeLanguageCode(languageCode);
|
||||
const normalizedNormalization = normalizeApplyTextNormalization(applyTextNormalization);
|
||||
const normalizedSeed = normalizeSeed(seed);
|
||||
|
||||
const controller = new AbortController();
|
||||
const timeout = setTimeout(() => controller.abort(), timeoutMs);
|
||||
|
||||
try {
|
||||
const url = new URL(`${normalizeElevenLabsBaseUrl(baseUrl)}/v1/text-to-speech/${voiceId}`);
|
||||
if (outputFormat) {
|
||||
url.searchParams.set("output_format", outputFormat);
|
||||
}
|
||||
|
||||
const response = await fetch(url.toString(), {
|
||||
method: "POST",
|
||||
headers: {
|
||||
"xi-api-key": apiKey,
|
||||
"Content-Type": "application/json",
|
||||
Accept: "audio/mpeg",
|
||||
},
|
||||
body: JSON.stringify({
|
||||
text,
|
||||
model_id: modelId,
|
||||
seed: normalizedSeed,
|
||||
apply_text_normalization: normalizedNormalization,
|
||||
language_code: normalizedLanguage,
|
||||
voice_settings: {
|
||||
stability: voiceSettings.stability,
|
||||
similarity_boost: voiceSettings.similarityBoost,
|
||||
style: voiceSettings.style,
|
||||
use_speaker_boost: voiceSettings.useSpeakerBoost,
|
||||
speed: voiceSettings.speed,
|
||||
},
|
||||
}),
|
||||
signal: controller.signal,
|
||||
});
|
||||
|
||||
if (!response.ok) {
|
||||
throw new Error(`ElevenLabs API error (${response.status})`);
|
||||
}
|
||||
|
||||
return Buffer.from(await response.arrayBuffer());
|
||||
} finally {
|
||||
clearTimeout(timeout);
|
||||
}
|
||||
}
|
||||
|
||||
export async function openaiTTS(params: {
|
||||
text: string;
|
||||
apiKey: string;
|
||||
baseUrl: string;
|
||||
model: string;
|
||||
voice: string;
|
||||
speed?: number;
|
||||
instructions?: string;
|
||||
responseFormat: "mp3" | "opus" | "pcm";
|
||||
timeoutMs: number;
|
||||
}): Promise<Buffer> {
|
||||
const { text, apiKey, baseUrl, model, voice, speed, instructions, responseFormat, timeoutMs } =
|
||||
params;
|
||||
const effectiveInstructions = resolveOpenAITtsInstructions(model, instructions);
|
||||
|
||||
if (!isValidOpenAIModel(model, baseUrl)) {
|
||||
throw new Error(`Invalid model: ${model}`);
|
||||
}
|
||||
if (!isValidOpenAIVoice(voice, baseUrl)) {
|
||||
throw new Error(`Invalid voice: ${voice}`);
|
||||
}
|
||||
|
||||
const controller = new AbortController();
|
||||
const timeout = setTimeout(() => controller.abort(), timeoutMs);
|
||||
|
||||
try {
|
||||
const response = await fetch(`${baseUrl}/audio/speech`, {
|
||||
method: "POST",
|
||||
headers: {
|
||||
Authorization: `Bearer ${apiKey}`,
|
||||
"Content-Type": "application/json",
|
||||
},
|
||||
body: JSON.stringify({
|
||||
model,
|
||||
input: text,
|
||||
voice,
|
||||
response_format: responseFormat,
|
||||
...(speed != null && { speed }),
|
||||
...(effectiveInstructions != null && { instructions: effectiveInstructions }),
|
||||
}),
|
||||
signal: controller.signal,
|
||||
});
|
||||
|
||||
if (!response.ok) {
|
||||
throw new Error(`OpenAI TTS API error (${response.status})`);
|
||||
}
|
||||
|
||||
return Buffer.from(await response.arrayBuffer());
|
||||
} finally {
|
||||
clearTimeout(timeout);
|
||||
}
|
||||
}
|
||||
|
||||
export function inferEdgeExtension(outputFormat: string): string {
|
||||
const normalized = outputFormat.toLowerCase();
|
||||
if (normalized.includes("webm")) {
|
||||
return ".webm";
|
||||
}
|
||||
if (normalized.includes("ogg")) {
|
||||
return ".ogg";
|
||||
}
|
||||
if (normalized.includes("opus")) {
|
||||
return ".opus";
|
||||
}
|
||||
if (normalized.includes("wav") || normalized.includes("riff") || normalized.includes("pcm")) {
|
||||
return ".wav";
|
||||
}
|
||||
return ".mp3";
|
||||
}
|
||||
|
||||
export async function edgeTTS(params: {
|
||||
text: string;
|
||||
outputPath: string;
|
||||
config: ResolvedTtsConfig["edge"];
|
||||
timeoutMs: number;
|
||||
}): Promise<void> {
|
||||
const { text, outputPath, config, timeoutMs } = params;
|
||||
const tts = new EdgeTTS({
|
||||
voice: config.voice,
|
||||
lang: config.lang,
|
||||
outputFormat: config.outputFormat,
|
||||
saveSubtitles: config.saveSubtitles,
|
||||
proxy: config.proxy,
|
||||
rate: config.rate,
|
||||
pitch: config.pitch,
|
||||
volume: config.volume,
|
||||
timeout: config.timeoutMs ?? timeoutMs,
|
||||
});
|
||||
await tts.ttsPromise(text, outputPath);
|
||||
|
||||
const { size } = statSync(outputPath);
|
||||
|
||||
if (size === 0) {
|
||||
throw new Error("Edge TTS produced empty audio file");
|
||||
}
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user