From 2a3a24ebdc8cc15e055dc37324369d28b0f057a6 Mon Sep 17 00:00:00 2001 From: Peter Steinberger Date: Tue, 28 Apr 2026 02:44:18 +0100 Subject: [PATCH] refactor: share media provider asset helpers (#73142) * refactor: share openai-compatible speech providers * refactor: tighten openai-compatible speech helper * refactor: share image generation asset helpers * fix: keep image helpers off root plugin sdk runtime --- .../.generated/plugin-sdk-api-baseline.sha256 | 4 +- docs/plugins/sdk-migration.md | 3 +- docs/plugins/sdk-subpaths.md | 4 +- .../deepinfra/image-generation-provider.ts | 84 +--- extensions/deepinfra/speech-provider.ts | 290 +------------ extensions/fal/image-generation-provider.ts | 29 +- .../litellm/image-generation-provider.ts | 27 +- .../openrouter/image-generation-provider.ts | 78 +--- extensions/openrouter/speech-provider.ts | 297 +------------ extensions/xai/image-generation-provider.ts | 34 +- src/image-generation/image-assets.test.ts | 86 ++++ src/image-generation/image-assets.ts | 200 +++++++++ src/plugin-sdk/image-generation.ts | 15 + src/plugin-sdk/index.ts | 2 +- src/plugin-sdk/speech.ts | 7 + .../openai-compatible-speech-provider.test.ts | 155 +++++++ src/tts/openai-compatible-speech-provider.ts | 395 ++++++++++++++++++ 17 files changed, 953 insertions(+), 757 deletions(-) create mode 100644 src/image-generation/image-assets.test.ts create mode 100644 src/image-generation/image-assets.ts create mode 100644 src/tts/openai-compatible-speech-provider.test.ts create mode 100644 src/tts/openai-compatible-speech-provider.ts diff --git a/docs/.generated/plugin-sdk-api-baseline.sha256 b/docs/.generated/plugin-sdk-api-baseline.sha256 index 7a2759ce79e..4aa0fef7c1a 100644 --- a/docs/.generated/plugin-sdk-api-baseline.sha256 +++ b/docs/.generated/plugin-sdk-api-baseline.sha256 @@ -1,2 +1,2 @@ -48cd91661f9fc65e8fb3a091f6deb726d8ccd37f7cec2aa765165f3992e7463f plugin-sdk-api-baseline.json -e8d7069b4d0d7a1a0431d92c845043bb39c3ba106ca0f85cc728a02ece9521bf 
plugin-sdk-api-baseline.jsonl +8f23f155251c05cab51ee8926e7a359bd64a0ba34e82a80d93d0ed96d07c8a04 plugin-sdk-api-baseline.json +181fea7f35c49032e6894605a06ca1419e5b6ccc1a3d8987d952a1d24a8154bc plugin-sdk-api-baseline.jsonl diff --git a/docs/plugins/sdk-migration.md b/docs/plugins/sdk-migration.md index 32749a857a5..db61c32c6fa 100644 --- a/docs/plugins/sdk-migration.md +++ b/docs/plugins/sdk-migration.md @@ -482,10 +482,11 @@ releases. | `plugin-sdk/media-understanding` | Media-understanding helpers | Media understanding provider types plus provider-facing image/audio helper exports | | `plugin-sdk/text-runtime` | Shared text helpers | Assistant-visible-text stripping, markdown render/chunking/table helpers, redaction helpers, directive-tag helpers, safe-text utilities, and related text/logging helpers | | `plugin-sdk/text-chunking` | Text chunking helpers | Outbound text chunking helper | - | `plugin-sdk/speech` | Speech helpers | Speech provider types plus provider-facing directive, registry, and validation helpers | + | `plugin-sdk/speech` | Speech helpers | Speech provider types plus provider-facing directive, registry, validation helpers, and OpenAI-compatible TTS builder | | `plugin-sdk/speech-core` | Shared speech core | Speech provider types, registry, directives, normalization | | `plugin-sdk/realtime-transcription` | Realtime transcription helpers | Provider types, registry helpers, and shared WebSocket session helper | | `plugin-sdk/realtime-voice` | Realtime voice helpers | Provider types, registry/resolution helpers, and bridge session helpers | + | `plugin-sdk/image-generation` | Image-generation helpers | Image generation provider types plus image asset/data URL helpers | | `plugin-sdk/image-generation-core` | Shared image-generation core | Image-generation types, failover, auth, and registry helpers | | `plugin-sdk/music-generation` | Music-generation helpers | Music-generation provider/request/result types | | `plugin-sdk/music-generation-core` | 
Shared music-generation core | Music-generation types, failover helpers, provider lookup, and model-ref parsing | diff --git a/docs/plugins/sdk-subpaths.md b/docs/plugins/sdk-subpaths.md index d1f530ddfa6..30234aff80a 100644 --- a/docs/plugins/sdk-subpaths.md +++ b/docs/plugins/sdk-subpaths.md @@ -255,11 +255,11 @@ For the plugin authoring guide, see [Plugin SDK overview](/plugins/sdk-overview) | `plugin-sdk/media-understanding` | Media understanding provider types plus provider-facing image/audio helper exports | | `plugin-sdk/text-runtime` | Shared text/markdown/logging helpers such as assistant-visible-text stripping, markdown render/chunking/table helpers, redaction helpers, directive-tag helpers, and safe-text utilities | | `plugin-sdk/text-chunking` | Outbound text chunking helper | - | `plugin-sdk/speech` | Speech provider types plus provider-facing directive, registry, validation, and speech helper exports | + | `plugin-sdk/speech` | Speech provider types plus provider-facing directive, registry, validation, OpenAI-compatible TTS builder, and speech helper exports | | `plugin-sdk/speech-core` | Shared speech provider types, registry, directive, normalization, and speech helper exports | | `plugin-sdk/realtime-transcription` | Realtime transcription provider types, registry helpers, and shared WebSocket session helper | | `plugin-sdk/realtime-voice` | Realtime voice provider types and registry helpers | - | `plugin-sdk/image-generation` | Image generation provider types | + | `plugin-sdk/image-generation` | Image generation provider types plus image asset/data URL helpers | | `plugin-sdk/image-generation-core` | Shared image-generation types, failover, auth, and registry helpers | | `plugin-sdk/music-generation` | Music generation provider/request/result types | | `plugin-sdk/music-generation-core` | Shared music-generation types, failover helpers, provider lookup, and model-ref parsing | diff --git a/extensions/deepinfra/image-generation-provider.ts 
b/extensions/deepinfra/image-generation-provider.ts index c5444c71863..7ac4de1c9de 100644 --- a/extensions/deepinfra/image-generation-provider.ts +++ b/extensions/deepinfra/image-generation-provider.ts @@ -1,8 +1,8 @@ import type { OpenClawConfig } from "openclaw/plugin-sdk/config-types"; -import type { - GeneratedImageAsset, - ImageGenerationProvider, - ImageGenerationSourceImage, +import type { ImageGenerationProvider } from "openclaw/plugin-sdk/image-generation"; +import { + imageSourceUploadFileName, + parseOpenAiCompatibleImageResponse, } from "openclaw/plugin-sdk/image-generation"; import { isProviderApiKeyConfigured } from "openclaw/plugin-sdk/provider-auth"; import { resolveApiKeyForProvider } from "openclaw/plugin-sdk/provider-auth-runtime"; @@ -44,75 +44,6 @@ function resolveDeepInfraProviderConfig( return cfg?.models?.providers?.deepinfra; } -function detectImageMimeType(buffer: Buffer): { - mimeType: string; - extension: "jpg" | "png" | "webp"; -} { - if (buffer.length >= 3 && buffer[0] === 0xff && buffer[1] === 0xd8 && buffer[2] === 0xff) { - return { mimeType: "image/jpeg", extension: "jpg" }; - } - if ( - buffer.length >= 8 && - buffer[0] === 0x89 && - buffer[1] === 0x50 && - buffer[2] === 0x4e && - buffer[3] === 0x47 - ) { - return { mimeType: "image/png", extension: "png" }; - } - if ( - buffer.length >= 12 && - buffer.toString("ascii", 0, 4) === "RIFF" && - buffer.toString("ascii", 8, 12) === "WEBP" - ) { - return { mimeType: "image/webp", extension: "webp" }; - } - return { mimeType: "image/jpeg", extension: "jpg" }; -} - -function imageToUploadName(image: ImageGenerationSourceImage, index: number): string { - const fileName = normalizeOptionalString(image.fileName); - if (fileName) { - return fileName; - } - const mimeType = normalizeOptionalString(image.mimeType) ?? "image/png"; - const ext = - mimeType === "image/jpeg" || mimeType === "image/jpg" - ? "jpg" - : mimeType === "image/webp" - ? 
"webp" - : "png"; - return `image-${index + 1}.${ext}`; -} - -function imageToAsset( - entry: NonNullable[number], - index: number, -): GeneratedImageAsset | null { - const b64 = normalizeOptionalString(entry.b64_json); - if (!b64) { - return null; - } - const buffer = Buffer.from(b64, "base64"); - const detected = detectImageMimeType(buffer); - const image: GeneratedImageAsset = { - buffer, - mimeType: detected.mimeType, - fileName: `image-${index + 1}.${detected.extension}`, - }; - const revisedPrompt = normalizeOptionalString(entry.revised_prompt); - if (revisedPrompt) { - image.revisedPrompt = revisedPrompt; - } - return image; -} - -function parseImageResponse(payload: DeepInfraImageApiResponse): GeneratedImageAsset[] { - return (payload.data ?? []) - .map(imageToAsset) - .filter((entry): entry is GeneratedImageAsset => entry !== null); -} - export function buildDeepInfraImageGenerationProvider(): ImageGenerationProvider { return { id: "deepinfra", @@ -198,7 +129,7 @@ export function buildDeepInfraImageGenerationProvider(): ImageGenerationProvider form.append( "image", new Blob([new Uint8Array(image.buffer)], { type: mimeType }), - imageToUploadName(image, 0), + imageSourceUploadFileName({ image, index: 0 }), ); const multipartHeaders = new Headers(headers); multipartHeaders.delete("Content-Type"); @@ -237,7 +168,10 @@ export function buildDeepInfraImageGenerationProvider(): ImageGenerationProvider response, isEdit ? 
"DeepInfra image edit failed" : "DeepInfra image generation failed", ); - const images = parseImageResponse((await response.json()) as DeepInfraImageApiResponse); + const images = parseOpenAiCompatibleImageResponse( + (await response.json()) as DeepInfraImageApiResponse, + { defaultMimeType: "image/jpeg", sniffMimeType: true }, + ); if (images.length === 0) { throw new Error("DeepInfra image response did not include generated image data"); } diff --git a/extensions/deepinfra/speech-provider.ts b/extensions/deepinfra/speech-provider.ts index 24aba7cafb0..43a48abd9b3 100644 --- a/extensions/deepinfra/speech-provider.ts +++ b/extensions/deepinfra/speech-provider.ts @@ -1,295 +1,41 @@ import { - assertOkOrThrowHttpError, - postJsonRequest, - resolveProviderHttpRequestConfig, -} from "openclaw/plugin-sdk/provider-http"; -import { normalizeResolvedSecretInputString } from "openclaw/plugin-sdk/secret-input"; -import { - asFiniteNumber, asObject, - trimToUndefined, - type SpeechDirectiveTokenParseContext, - type SpeechProviderConfig, - type SpeechProviderOverrides, + createOpenAiCompatibleSpeechProvider, type SpeechProviderPlugin, } from "openclaw/plugin-sdk/speech"; -import { normalizeOptionalLowercaseString } from "openclaw/plugin-sdk/text-runtime"; import { DEEPINFRA_BASE_URL, DEEPINFRA_TTS_MODELS, DEFAULT_DEEPINFRA_TTS_MODEL, DEFAULT_DEEPINFRA_TTS_VOICE, - normalizeDeepInfraBaseUrl, normalizeDeepInfraModelRef, } from "./media-models.js"; const DEEPINFRA_TTS_RESPONSE_FORMATS = ["mp3", "opus", "flac", "wav", "pcm"] as const; -type DeepInfraTtsResponseFormat = (typeof DEEPINFRA_TTS_RESPONSE_FORMATS)[number]; - -type DeepInfraTtsProviderConfig = { - apiKey?: string; - baseUrl?: string; - model: string; - voice: string; - speed?: number; - responseFormat?: DeepInfraTtsResponseFormat; +type DeepInfraTtsExtraConfig = { extraBody?: Record; }; -type DeepInfraTtsProviderOverrides = { - model?: string; - voice?: string; - speed?: number; -}; - -function 
normalizeDeepInfraTtsResponseFormat( - value: unknown, -): DeepInfraTtsResponseFormat | undefined { - const next = normalizeOptionalLowercaseString(value); - if (!next) { - return undefined; - } - if (DEEPINFRA_TTS_RESPONSE_FORMATS.some((format) => format === next)) { - return next as DeepInfraTtsResponseFormat; - } - throw new Error(`Invalid DeepInfra speech responseFormat: ${next}`); -} - -function resolveDeepInfraProviderConfigRecord( - rawConfig: Record, -): Record | undefined { - const providers = asObject(rawConfig.providers); - return asObject(providers?.deepinfra) ?? asObject(rawConfig.deepinfra); -} - -function normalizeDeepInfraTtsProviderConfig( - rawConfig: Record, -): DeepInfraTtsProviderConfig { - const raw = resolveDeepInfraProviderConfigRecord(rawConfig); - return { - apiKey: normalizeResolvedSecretInputString({ - value: raw?.apiKey, - path: "messages.tts.providers.deepinfra.apiKey", - }), - baseUrl: - trimToUndefined(raw?.baseUrl) == null ? undefined : normalizeDeepInfraBaseUrl(raw?.baseUrl), - model: normalizeDeepInfraModelRef( - trimToUndefined(raw?.model ?? raw?.modelId), - DEFAULT_DEEPINFRA_TTS_MODEL, - ), - voice: trimToUndefined(raw?.voice ?? raw?.voiceId) ?? DEFAULT_DEEPINFRA_TTS_VOICE, - speed: asFiniteNumber(raw?.speed), - responseFormat: normalizeDeepInfraTtsResponseFormat(raw?.responseFormat), - extraBody: asObject(raw?.extraBody), - }; -} - -function readDeepInfraTtsProviderConfig(config: SpeechProviderConfig): DeepInfraTtsProviderConfig { - const normalized = normalizeDeepInfraTtsProviderConfig({}); - return { - apiKey: trimToUndefined(config.apiKey) ?? normalized.apiKey, - baseUrl: - trimToUndefined(config.baseUrl) == null - ? normalized.baseUrl - : normalizeDeepInfraBaseUrl(config.baseUrl), - model: normalizeDeepInfraModelRef( - trimToUndefined(config.model ?? config.modelId), - normalized.model, - ), - voice: trimToUndefined(config.voice ?? config.voiceId) ?? normalized.voice, - speed: asFiniteNumber(config.speed) ?? 
normalized.speed, - responseFormat: - normalizeDeepInfraTtsResponseFormat(config.responseFormat) ?? normalized.responseFormat, - extraBody: asObject(config.extraBody) ?? normalized.extraBody, - }; -} - -function readDeepInfraTtsOverrides( - overrides: SpeechProviderOverrides | undefined, -): DeepInfraTtsProviderOverrides { - if (!overrides) { - return {}; - } - return { - model: trimToUndefined(overrides.model ?? overrides.modelId), - voice: trimToUndefined(overrides.voice ?? overrides.voiceId), - speed: asFiniteNumber(overrides.speed), - }; -} - -function resolveDeepInfraTtsApiKey(params: { - cfg?: { models?: { providers?: { deepinfra?: { apiKey?: unknown } } } }; - providerConfig: DeepInfraTtsProviderConfig; -}): string | undefined { - return ( - params.providerConfig.apiKey ?? - normalizeResolvedSecretInputString({ - value: params.cfg?.models?.providers?.deepinfra?.apiKey, - path: "models.providers.deepinfra.apiKey", - }) ?? - trimToUndefined(process.env.DEEPINFRA_API_KEY) - ); -} - -function resolveDeepInfraTtsBaseUrl(params: { - cfg?: { models?: { providers?: { deepinfra?: { baseUrl?: unknown } } } }; - providerConfig: DeepInfraTtsProviderConfig; -}): string { - return normalizeDeepInfraBaseUrl( - params.providerConfig.baseUrl ?? - trimToUndefined(params.cfg?.models?.providers?.deepinfra?.baseUrl) ?? 
- DEEPINFRA_BASE_URL, - ); -} - -function responseFormatToFileExtension( - format: DeepInfraTtsResponseFormat, -): ".mp3" | ".opus" | ".flac" | ".wav" | ".pcm" { - return `.${format}`; -} - -function parseDirectiveToken(ctx: SpeechDirectiveTokenParseContext): { - handled: boolean; - overrides?: SpeechProviderOverrides; -} { - switch (ctx.key) { - case "voice": - case "voice_id": - case "voiceid": - case "deepinfra_voice": - case "deepinfravoice": - if (!ctx.policy.allowVoice) { - return { handled: true }; - } - return { handled: true, overrides: { voice: ctx.value } }; - case "model": - case "model_id": - case "modelid": - case "deepinfra_model": - case "deepinframodel": - if (!ctx.policy.allowModelId) { - return { handled: true }; - } - return { handled: true, overrides: { model: ctx.value } }; - default: - return { handled: false }; - } -} - export function buildDeepInfraSpeechProvider(): SpeechProviderPlugin { - return { + return createOpenAiCompatibleSpeechProvider({ id: "deepinfra", label: "DeepInfra", autoSelectOrder: 45, - models: [...DEEPINFRA_TTS_MODELS], + models: DEEPINFRA_TTS_MODELS, voices: [DEFAULT_DEEPINFRA_TTS_VOICE], - resolveConfig: ({ rawConfig }) => normalizeDeepInfraTtsProviderConfig(rawConfig), - parseDirectiveToken, - resolveTalkConfig: ({ baseTtsConfig, talkProviderConfig }) => { - const base = normalizeDeepInfraTtsProviderConfig(baseTtsConfig); - const responseFormat = normalizeDeepInfraTtsResponseFormat(talkProviderConfig.responseFormat); - return { - ...base, - ...(talkProviderConfig.apiKey === undefined - ? {} - : { - apiKey: normalizeResolvedSecretInputString({ - value: talkProviderConfig.apiKey, - path: "talk.providers.deepinfra.apiKey", - }), - }), - ...(trimToUndefined(talkProviderConfig.baseUrl) == null - ? {} - : { baseUrl: normalizeDeepInfraBaseUrl(talkProviderConfig.baseUrl) }), - ...(trimToUndefined(talkProviderConfig.modelId) == null - ? 
{} - : { - model: normalizeDeepInfraModelRef( - trimToUndefined(talkProviderConfig.modelId), - DEFAULT_DEEPINFRA_TTS_MODEL, - ), - }), - ...(trimToUndefined(talkProviderConfig.voiceId) == null - ? {} - : { voice: trimToUndefined(talkProviderConfig.voiceId) }), - ...(asFiniteNumber(talkProviderConfig.speed) == null - ? {} - : { speed: asFiniteNumber(talkProviderConfig.speed) }), - ...(responseFormat == null ? {} : { responseFormat }), - }; - }, - resolveTalkOverrides: ({ params }) => ({ - ...(trimToUndefined(params.voiceId ?? params.voice) == null - ? {} - : { voice: trimToUndefined(params.voiceId ?? params.voice) }), - ...(trimToUndefined(params.modelId ?? params.model) == null - ? {} - : { model: trimToUndefined(params.modelId ?? params.model) }), - ...(asFiniteNumber(params.speed) == null ? {} : { speed: asFiniteNumber(params.speed) }), - }), - listVoices: async () => [ - { id: DEFAULT_DEEPINFRA_TTS_VOICE, name: DEFAULT_DEEPINFRA_TTS_VOICE }, - ], - isConfigured: ({ cfg, providerConfig }) => { - const config = readDeepInfraTtsProviderConfig(providerConfig); - return Boolean(resolveDeepInfraTtsApiKey({ cfg, providerConfig: config })); - }, - synthesize: async (req) => { - const config = readDeepInfraTtsProviderConfig(req.providerConfig); - const overrides = readDeepInfraTtsOverrides(req.providerOverrides); - const apiKey = resolveDeepInfraTtsApiKey({ cfg: req.cfg, providerConfig: config }); - if (!apiKey) { - throw new Error("DeepInfra API key missing"); - } - - const baseUrl = resolveDeepInfraTtsBaseUrl({ cfg: req.cfg, providerConfig: config }); - const responseFormat = config.responseFormat ?? "mp3"; - const speed = overrides.speed ?? 
config.speed; - const { allowPrivateNetwork, headers, dispatcherPolicy } = resolveProviderHttpRequestConfig({ - baseUrl, - defaultBaseUrl: DEEPINFRA_BASE_URL, - allowPrivateNetwork: false, - defaultHeaders: { - Authorization: `Bearer ${apiKey}`, - "Content-Type": "application/json", - }, - provider: "deepinfra", - capability: "audio", - transport: "http", - }); - - const { response, release } = await postJsonRequest({ - url: `${baseUrl}/audio/speech`, - headers, - body: { - model: normalizeDeepInfraModelRef( - overrides.model ?? config.model, - DEFAULT_DEEPINFRA_TTS_MODEL, - ), - input: req.text, - voice: overrides.voice ?? config.voice, - response_format: responseFormat, - ...(speed == null ? {} : { speed }), - ...(config.extraBody == null ? {} : { extra_body: config.extraBody }), - }, - timeoutMs: req.timeoutMs, - fetchFn: fetch, - allowPrivateNetwork, - dispatcherPolicy, - }); - - try { - await assertOkOrThrowHttpError(response, "DeepInfra TTS API error"); - return { - audioBuffer: Buffer.from(await response.arrayBuffer()), - outputFormat: responseFormat, - fileExtension: responseFormatToFileExtension(responseFormat), - voiceCompatible: responseFormat === "mp3" || responseFormat === "opus", - }; - } finally { - await release(); - } - }, - }; + defaultModel: DEFAULT_DEEPINFRA_TTS_MODEL, + defaultVoice: DEFAULT_DEEPINFRA_TTS_VOICE, + defaultBaseUrl: DEEPINFRA_BASE_URL, + envKey: "DEEPINFRA_API_KEY", + responseFormats: DEEPINFRA_TTS_RESPONSE_FORMATS, + defaultResponseFormat: "mp3", + voiceCompatibleResponseFormats: ["mp3", "opus"], + baseUrlPolicy: { kind: "trim-trailing-slash" }, + normalizeModel: normalizeDeepInfraModelRef, + apiErrorLabel: "DeepInfra TTS API error", + missingApiKeyError: "DeepInfra API key missing", + readExtraConfig: (raw) => ({ extraBody: asObject(raw?.extraBody) }), + extraJsonBodyFields: [{ configKey: "extraBody", requestKey: "extra_body" }], + }); } diff --git a/extensions/fal/image-generation-provider.ts 
b/extensions/fal/image-generation-provider.ts index 1d8732a95d5..2f318580257 100644 --- a/extensions/fal/image-generation-provider.ts +++ b/extensions/fal/image-generation-provider.ts @@ -2,6 +2,10 @@ import type { GeneratedImageAsset, ImageGenerationProvider, } from "openclaw/plugin-sdk/image-generation"; +import { + imageFileExtensionForMimeType, + toImageDataUrl, +} from "openclaw/plugin-sdk/image-generation"; import { isProviderApiKeyConfigured } from "openclaw/plugin-sdk/provider-auth"; import { resolveApiKeyForProvider } from "openclaw/plugin-sdk/provider-auth-runtime"; import { @@ -16,10 +20,7 @@ import { type SsrFPolicy, ssrfPolicyFromDangerouslyAllowPrivateNetwork, } from "openclaw/plugin-sdk/ssrf-runtime"; -import { - normalizeLowercaseStringOrEmpty, - normalizeOptionalLowercaseString, -} from "openclaw/plugin-sdk/text-runtime"; +import { normalizeLowercaseStringOrEmpty } from "openclaw/plugin-sdk/text-runtime"; const DEFAULT_FAL_BASE_URL = "https://fal.run"; const DEFAULT_FAL_IMAGE_MODEL = "fal-ai/flux/dev"; @@ -214,22 +215,6 @@ function resolveFalImageSize(params: { return undefined; } -function toDataUri(buffer: Buffer, mimeType: string): string { - return `data:${mimeType};base64,${buffer.toString("base64")}`; -} - -function fileExtensionForMimeType(mimeType: string | undefined): string { - const normalized = normalizeOptionalLowercaseString(mimeType); - if (!normalized) { - return "png"; - } - if (normalized.includes("jpeg")) { - return "jpg"; - } - const slashIndex = normalized.indexOf("/"); - return slashIndex >= 0 ? 
normalized.slice(slashIndex + 1) || "png" : "png"; -} - async function fetchImageBuffer( url: string, networkPolicy?: FalNetworkPolicy, @@ -348,7 +333,7 @@ export function buildFalImageGenerationProvider(): ImageGenerationProvider { if (!input) { throw new Error("fal image edit request missing reference image"); } - requestBody.image_url = toDataUri(input.buffer, input.mimeType); + requestBody.image_url = toImageDataUrl(input); } const { response, release } = await falFetchGuard({ url: `${baseUrl}/${model}`, @@ -378,7 +363,7 @@ export function buildFalImageGenerationProvider(): ImageGenerationProvider { images.push({ buffer: downloaded.buffer, mimeType: downloaded.mimeType, - fileName: `image-${imageIndex}.${fileExtensionForMimeType( + fileName: `image-${imageIndex}.${imageFileExtensionForMimeType( downloaded.mimeType || entry.content_type, )}`, }); diff --git a/extensions/litellm/image-generation-provider.ts b/extensions/litellm/image-generation-provider.ts index 5843edbcd46..c12685799b8 100644 --- a/extensions/litellm/image-generation-provider.ts +++ b/extensions/litellm/image-generation-provider.ts @@ -1,5 +1,9 @@ import type { OpenClawConfig } from "openclaw/plugin-sdk/config-types"; import type { ImageGenerationProvider } from "openclaw/plugin-sdk/image-generation"; +import { + parseOpenAiCompatibleImageResponse, + toImageDataUrl, +} from "openclaw/plugin-sdk/image-generation"; import { isProviderApiKeyConfigured } from "openclaw/plugin-sdk/provider-auth"; import { resolveApiKeyForProvider } from "openclaw/plugin-sdk/provider-auth-runtime"; import { @@ -11,7 +15,6 @@ import { import { normalizeOptionalString } from "openclaw/plugin-sdk/text-runtime"; import { LITELLM_BASE_URL } from "./onboard.js"; -const DEFAULT_OUTPUT_MIME = "image/png"; const DEFAULT_SIZE = "1024x1024"; const DEFAULT_LITELLM_IMAGE_MODEL = "gpt-image-2"; const LITELLM_SUPPORTED_SIZES = [ @@ -82,10 +85,6 @@ function shouldAutoAllowPrivateLitellmEndpoint(baseUrl: string): boolean { } } 
-function toDataUrl(buffer: Buffer, mimeType: string): string { - return `data:${mimeType};base64,${buffer.toString("base64")}`; -} - type LitellmImageApiResponse = { data?: Array<{ b64_json?: string; @@ -167,7 +166,7 @@ export function buildLitellmImageGenerationProvider(): ImageGenerationProvider { n: count, size, images: inputImages.map((image) => ({ - image_url: toDataUrl(image.buffer, image.mimeType?.trim() || DEFAULT_OUTPUT_MIME), + image_url: toImageDataUrl(image), })), } : { @@ -192,21 +191,7 @@ export function buildLitellmImageGenerationProvider(): ImageGenerationProvider { ); const data = (await response.json()) as LitellmImageApiResponse; - const images = (data.data ?? []) - .map((entry, index) => { - if (!entry.b64_json) { - return null; - } - return Object.assign( - { - buffer: Buffer.from(entry.b64_json, `base64`), - mimeType: DEFAULT_OUTPUT_MIME, - fileName: `image-${index + 1}.png`, - }, - entry.revised_prompt ? { revisedPrompt: entry.revised_prompt } : {}, - ); - }) - .filter((entry): entry is NonNullable => entry !== null); + const images = parseOpenAiCompatibleImageResponse(data); return { images, diff --git a/extensions/openrouter/image-generation-provider.ts b/extensions/openrouter/image-generation-provider.ts index 1edee5e7ca5..75fedb8ed11 100644 --- a/extensions/openrouter/image-generation-provider.ts +++ b/extensions/openrouter/image-generation-provider.ts @@ -3,6 +3,11 @@ import type { ImageGenerationProvider, ImageGenerationRequest, } from "openclaw/plugin-sdk/image-generation"; +import { + generatedImageAssetFromBase64, + generatedImageAssetFromDataUrl, + toImageDataUrl, +} from "openclaw/plugin-sdk/image-generation"; import { isProviderApiKeyConfigured } from "openclaw/plugin-sdk/provider-auth"; import { resolveApiKeyForProvider } from "openclaw/plugin-sdk/provider-auth-runtime"; import { @@ -14,7 +19,6 @@ import { normalizeOptionalString } from "openclaw/plugin-sdk/text-runtime"; import { OPENROUTER_BASE_URL } from 
"./provider-catalog.js"; const DEFAULT_MODEL = "google/gemini-3.1-flash-image-preview"; -const DEFAULT_OUTPUT_MIME = "image/png"; const DEFAULT_TIMEOUT_MS = 90_000; const MAX_IMAGE_RESULTS = 4; const SUPPORTED_MODELS = [ @@ -49,56 +53,12 @@ type OpenRouterChatCompletionResponse = { }>; }; -function parseDataUrl(dataUrl: string): { mimeType: string; data: string } | undefined { - const match = dataUrl.match(/^data:([^;]+);base64,(.+)$/s); - if (!match) { - return undefined; - } - const [, mimeType, data] = match; - if (!mimeType || !data) { - return undefined; - } - return { mimeType, data }; -} - -function fileExtensionForMimeType(mimeType: string): string { - if (mimeType.includes("jpeg") || mimeType.includes("jpg")) { - return "jpg"; - } - if (mimeType.includes("webp")) { - return "webp"; - } - if (mimeType.includes("gif")) { - return "gif"; - } - return mimeType.split("/")[1] ?? "png"; -} - -function toGeneratedImage(params: { - base64: string; - index: number; - mimeType?: string; -}): GeneratedImageAsset { - const mimeType = params.mimeType ?? DEFAULT_OUTPUT_MIME; - return { - buffer: Buffer.from(params.base64, "base64"), - mimeType, - fileName: `image-${params.index + 1}.${fileExtensionForMimeType(mimeType)}`, - }; -} - function pushDataUrlImage(images: GeneratedImageAsset[], dataUrl: string): void { - const parsed = parseDataUrl(dataUrl); - if (!parsed) { + const image = generatedImageAssetFromDataUrl({ dataUrl, index: images.length }); + if (!image) { return; } - images.push( - toGeneratedImage({ - base64: parsed.data, - index: images.length, - mimeType: parsed.mimeType, - }), - ); + images.push(image); } function extractImagesFromPart(images: GeneratedImageAsset[], part: unknown): void { @@ -117,7 +77,10 @@ function extractImagesFromPart(images: GeneratedImageAsset[], part: unknown): vo const rawBase64 = typeof value.b64_json === "string" ? 
value.b64_json : undefined; if (rawBase64) { - images.push(toGeneratedImage({ base64: rawBase64, index: images.length })); + const image = generatedImageAssetFromBase64({ base64: rawBase64, index: images.length }); + if (image) { + images.push(image); + } return; } @@ -129,8 +92,15 @@ function extractImagesFromPart(images: GeneratedImageAsset[], part: unknown): vo const mimeType = (typeof inlineData?.mimeType === "string" ? inlineData.mimeType : undefined) ?? (typeof inlineData?.mime_type === "string" ? inlineData.mime_type : undefined) ?? - DEFAULT_OUTPUT_MIME; - images.push(toGeneratedImage({ base64: data, index: images.length, mimeType })); + "image/png"; + const image = generatedImageAssetFromBase64({ + base64: data, + index: images.length, + mimeType, + }); + if (image) { + images.push(image); + } } export function extractOpenRouterImagesFromResponse( @@ -165,10 +135,6 @@ export function extractOpenRouterImagesFromResponse( return images; } -function toDataUrl(image: { buffer: Buffer; mimeType: string }): string { - return `data:${image.mimeType};base64,${image.buffer.toString("base64")}`; -} - function resolveImageCount(count: number | undefined): number { if (typeof count !== "number" || !Number.isFinite(count)) { return 1; @@ -193,7 +159,7 @@ function buildMessageContent( { type: "text", text: req.prompt }, ...inputImages.map((image) => ({ type: "image_url" as const, - image_url: { url: toDataUrl(image) }, + image_url: { url: toImageDataUrl(image) }, })), ]; } diff --git a/extensions/openrouter/speech-provider.ts b/extensions/openrouter/speech-provider.ts index e60102ea96f..2b2faee61e6 100644 --- a/extensions/openrouter/speech-provider.ts +++ b/extensions/openrouter/speech-provider.ts @@ -1,20 +1,9 @@ import { - assertOkOrThrowHttpError, - postJsonRequest, - resolveProviderHttpRequestConfig, -} from "openclaw/plugin-sdk/provider-http"; -import { normalizeResolvedSecretInputString } from "openclaw/plugin-sdk/secret-input"; -import { - asFiniteNumber, 
asObject, - trimToUndefined, - type SpeechDirectiveTokenParseContext, - type SpeechProviderConfig, - type SpeechProviderOverrides, + createOpenAiCompatibleSpeechProvider, type SpeechProviderPlugin, } from "openclaw/plugin-sdk/speech"; -import { normalizeOptionalLowercaseString } from "openclaw/plugin-sdk/text-runtime"; -import { normalizeOpenRouterBaseUrl, OPENROUTER_BASE_URL } from "./provider-catalog.js"; +import { OPENROUTER_BASE_URL } from "./provider-catalog.js"; const DEFAULT_OPENROUTER_TTS_MODEL = "hexgrad/kokoro-82m"; const DEFAULT_OPENROUTER_TTS_VOICE = "af_alloy"; @@ -26,278 +15,32 @@ const OPENROUTER_TTS_MODELS = [ ] as const; const OPENROUTER_TTS_RESPONSE_FORMATS = ["mp3", "pcm"] as const; -type OpenRouterTtsResponseFormat = (typeof OPENROUTER_TTS_RESPONSE_FORMATS)[number]; - -type OpenRouterTtsProviderConfig = { - apiKey?: string; - baseUrl?: string; - model: string; - voice: string; - speed?: number; - responseFormat?: OpenRouterTtsResponseFormat; +type OpenRouterTtsExtraConfig = { provider?: Record; }; -type OpenRouterTtsProviderOverrides = { - model?: string; - voice?: string; - speed?: number; -}; - -function normalizeOpenRouterTtsResponseFormat( - value: unknown, -): OpenRouterTtsResponseFormat | undefined { - const next = normalizeOptionalLowercaseString(value); - if (!next) { - return undefined; - } - if (OPENROUTER_TTS_RESPONSE_FORMATS.some((format) => format === next)) { - return next as OpenRouterTtsResponseFormat; - } - throw new Error(`Invalid OpenRouter speech responseFormat: ${next}`); -} - -function normalizeOpenRouterTtsBaseUrl(value: unknown): string { - return ( - normalizeOpenRouterBaseUrl(trimToUndefined(value) ?? OPENROUTER_BASE_URL) ?? OPENROUTER_BASE_URL - ); -} - -function resolveOpenRouterProviderConfigRecord( - rawConfig: Record, -): Record | undefined { - const providers = asObject(rawConfig.providers); - return asObject(providers?.openrouter) ?? 
asObject(rawConfig.openrouter); -} - -function normalizeOpenRouterTtsProviderConfig( - rawConfig: Record, -): OpenRouterTtsProviderConfig { - const raw = resolveOpenRouterProviderConfigRecord(rawConfig); - return { - apiKey: normalizeResolvedSecretInputString({ - value: raw?.apiKey, - path: "messages.tts.providers.openrouter.apiKey", - }), - baseUrl: - trimToUndefined(raw?.baseUrl) == null - ? undefined - : normalizeOpenRouterTtsBaseUrl(raw?.baseUrl), - model: trimToUndefined(raw?.model ?? raw?.modelId) ?? DEFAULT_OPENROUTER_TTS_MODEL, - voice: trimToUndefined(raw?.voice ?? raw?.voiceId) ?? DEFAULT_OPENROUTER_TTS_VOICE, - speed: asFiniteNumber(raw?.speed), - responseFormat: normalizeOpenRouterTtsResponseFormat(raw?.responseFormat), - provider: asObject(raw?.provider), - }; -} - -function readOpenRouterTtsProviderConfig( - config: SpeechProviderConfig, -): OpenRouterTtsProviderConfig { - const normalized = normalizeOpenRouterTtsProviderConfig({}); - return { - apiKey: trimToUndefined(config.apiKey) ?? normalized.apiKey, - baseUrl: - trimToUndefined(config.baseUrl) == null - ? normalized.baseUrl - : normalizeOpenRouterTtsBaseUrl(config.baseUrl), - model: trimToUndefined(config.model ?? config.modelId) ?? normalized.model, - voice: trimToUndefined(config.voice ?? config.voiceId) ?? normalized.voice, - speed: asFiniteNumber(config.speed) ?? normalized.speed, - responseFormat: - normalizeOpenRouterTtsResponseFormat(config.responseFormat) ?? normalized.responseFormat, - provider: asObject(config.provider) ?? normalized.provider, - }; -} - -function readOpenRouterTtsOverrides( - overrides: SpeechProviderOverrides | undefined, -): OpenRouterTtsProviderOverrides { - if (!overrides) { - return {}; - } - return { - model: trimToUndefined(overrides.model ?? overrides.modelId), - voice: trimToUndefined(overrides.voice ?? 
overrides.voiceId), - speed: asFiniteNumber(overrides.speed), - }; -} - -function resolveOpenRouterTtsApiKey(params: { - cfg?: { models?: { providers?: { openrouter?: { apiKey?: unknown } } } }; - providerConfig: OpenRouterTtsProviderConfig; -}): string | undefined { - return ( - params.providerConfig.apiKey ?? - normalizeResolvedSecretInputString({ - value: params.cfg?.models?.providers?.openrouter?.apiKey, - path: "models.providers.openrouter.apiKey", - }) ?? - trimToUndefined(process.env.OPENROUTER_API_KEY) - ); -} - -function resolveOpenRouterTtsBaseUrl(params: { - cfg?: { models?: { providers?: { openrouter?: { baseUrl?: unknown } } } }; - providerConfig: OpenRouterTtsProviderConfig; -}): string { - return normalizeOpenRouterTtsBaseUrl( - params.providerConfig.baseUrl ?? - trimToUndefined(params.cfg?.models?.providers?.openrouter?.baseUrl) ?? - OPENROUTER_BASE_URL, - ); -} - -function resolveOpenRouterTtsResponseFormat( - configuredFormat?: OpenRouterTtsResponseFormat, -): OpenRouterTtsResponseFormat { - if (configuredFormat) { - return configuredFormat; - } - return "mp3"; -} - -function responseFormatToFileExtension(format: OpenRouterTtsResponseFormat): ".mp3" | ".pcm" { - return format === "pcm" ? 
".pcm" : ".mp3"; -} - -function parseDirectiveToken(ctx: SpeechDirectiveTokenParseContext): { - handled: boolean; - overrides?: SpeechProviderOverrides; -} { - switch (ctx.key) { - case "voice": - case "voice_id": - case "voiceid": - case "openrouter_voice": - case "openroutervoice": - if (!ctx.policy.allowVoice) { - return { handled: true }; - } - return { handled: true, overrides: { voice: ctx.value } }; - case "model": - case "model_id": - case "modelid": - case "openrouter_model": - case "openroutermodel": - if (!ctx.policy.allowModelId) { - return { handled: true }; - } - return { handled: true, overrides: { model: ctx.value } }; - default: - return { handled: false }; - } -} - export function buildOpenRouterSpeechProvider(): SpeechProviderPlugin { - return { + return createOpenAiCompatibleSpeechProvider({ id: "openrouter", label: "OpenRouter", autoSelectOrder: 35, models: OPENROUTER_TTS_MODELS, voices: [DEFAULT_OPENROUTER_TTS_VOICE], - resolveConfig: ({ rawConfig }) => normalizeOpenRouterTtsProviderConfig(rawConfig), - parseDirectiveToken, - resolveTalkConfig: ({ baseTtsConfig, talkProviderConfig }) => { - const base = normalizeOpenRouterTtsProviderConfig(baseTtsConfig); - const responseFormat = normalizeOpenRouterTtsResponseFormat( - talkProviderConfig.responseFormat, - ); - return { - ...base, - ...(talkProviderConfig.apiKey === undefined - ? {} - : { - apiKey: normalizeResolvedSecretInputString({ - value: talkProviderConfig.apiKey, - path: "talk.providers.openrouter.apiKey", - }), - }), - ...(trimToUndefined(talkProviderConfig.baseUrl) == null - ? {} - : { baseUrl: normalizeOpenRouterTtsBaseUrl(talkProviderConfig.baseUrl) }), - ...(trimToUndefined(talkProviderConfig.modelId) == null - ? {} - : { model: trimToUndefined(talkProviderConfig.modelId) }), - ...(trimToUndefined(talkProviderConfig.voiceId) == null - ? {} - : { voice: trimToUndefined(talkProviderConfig.voiceId) }), - ...(asFiniteNumber(talkProviderConfig.speed) == null - ? 
{} - : { speed: asFiniteNumber(talkProviderConfig.speed) }), - ...(responseFormat == null ? {} : { responseFormat }), - }; + defaultModel: DEFAULT_OPENROUTER_TTS_MODEL, + defaultVoice: DEFAULT_OPENROUTER_TTS_VOICE, + defaultBaseUrl: OPENROUTER_BASE_URL, + envKey: "OPENROUTER_API_KEY", + responseFormats: OPENROUTER_TTS_RESPONSE_FORMATS, + defaultResponseFormat: "mp3", + voiceCompatibleResponseFormats: ["mp3"], + baseUrlPolicy: { kind: "canonical", aliases: ["https://openrouter.ai/v1"] }, + extraHeaders: { + "HTTP-Referer": "https://openclaw.ai", + "X-OpenRouter-Title": "OpenClaw", }, - resolveTalkOverrides: ({ params }) => ({ - ...(trimToUndefined(params.voiceId ?? params.voice) == null - ? {} - : { voice: trimToUndefined(params.voiceId ?? params.voice) }), - ...(trimToUndefined(params.modelId ?? params.model) == null - ? {} - : { model: trimToUndefined(params.modelId ?? params.model) }), - ...(asFiniteNumber(params.speed) == null ? {} : { speed: asFiniteNumber(params.speed) }), - }), - listVoices: async () => [ - { id: DEFAULT_OPENROUTER_TTS_VOICE, name: DEFAULT_OPENROUTER_TTS_VOICE }, - ], - isConfigured: ({ cfg, providerConfig }) => { - const config = readOpenRouterTtsProviderConfig(providerConfig); - return Boolean(resolveOpenRouterTtsApiKey({ cfg, providerConfig: config })); - }, - synthesize: async (req) => { - const config = readOpenRouterTtsProviderConfig(req.providerConfig); - const overrides = readOpenRouterTtsOverrides(req.providerOverrides); - const apiKey = resolveOpenRouterTtsApiKey({ cfg: req.cfg, providerConfig: config }); - if (!apiKey) { - throw new Error("OpenRouter API key missing"); - } - - const baseUrl = resolveOpenRouterTtsBaseUrl({ cfg: req.cfg, providerConfig: config }); - const responseFormat = resolveOpenRouterTtsResponseFormat(config.responseFormat); - const speed = overrides.speed ?? 
config.speed; - const { allowPrivateNetwork, headers, dispatcherPolicy } = resolveProviderHttpRequestConfig({ - baseUrl, - defaultBaseUrl: OPENROUTER_BASE_URL, - allowPrivateNetwork: false, - defaultHeaders: { - Authorization: `Bearer ${apiKey}`, - "Content-Type": "application/json", - "HTTP-Referer": "https://openclaw.ai", - "X-OpenRouter-Title": "OpenClaw", - }, - provider: "openrouter", - capability: "audio", - transport: "http", - }); - - const { response, release } = await postJsonRequest({ - url: `${baseUrl}/audio/speech`, - headers, - body: { - model: overrides.model ?? config.model, - input: req.text, - voice: overrides.voice ?? config.voice, - response_format: responseFormat, - ...(speed == null ? {} : { speed }), - ...(config.provider == null ? {} : { provider: config.provider }), - }, - timeoutMs: req.timeoutMs, - fetchFn: fetch, - allowPrivateNetwork, - dispatcherPolicy, - }); - - try { - await assertOkOrThrowHttpError(response, "OpenRouter TTS API error"); - return { - audioBuffer: Buffer.from(await response.arrayBuffer()), - outputFormat: responseFormat, - fileExtension: responseFormatToFileExtension(responseFormat), - voiceCompatible: responseFormat === "mp3", - }; - } finally { - await release(); - } - }, - }; + apiErrorLabel: "OpenRouter TTS API error", + missingApiKeyError: "OpenRouter API key missing", + readExtraConfig: (raw) => ({ provider: asObject(raw?.provider) }), + extraJsonBodyFields: [{ configKey: "provider" }], + }); } diff --git a/extensions/xai/image-generation-provider.ts b/extensions/xai/image-generation-provider.ts index a44f2914d9e..96969c4e40c 100644 --- a/extensions/xai/image-generation-provider.ts +++ b/extensions/xai/image-generation-provider.ts @@ -1,9 +1,12 @@ import type { - GeneratedImageAsset, ImageGenerationProvider, ImageGenerationRequest, ImageGenerationResult, } from "openclaw/plugin-sdk/image-generation"; +import { + parseOpenAiCompatibleImageResponse, + toImageDataUrl, +} from "openclaw/plugin-sdk/image-generation"; 
import { isProviderApiKeyConfigured } from "openclaw/plugin-sdk/provider-auth"; import { resolveApiKeyForProvider } from "openclaw/plugin-sdk/provider-auth-runtime"; import { @@ -19,7 +22,6 @@ import { } from "openclaw/plugin-sdk/text-runtime"; import { XAI_BASE_URL, XAI_DEFAULT_IMAGE_MODEL, XAI_IMAGE_MODELS } from "./model-definitions.js"; -const DEFAULT_OUTPUT_MIME = "image/png"; const DEFAULT_TIMEOUT_MS = 60_000; const XAI_SUPPORTED_ASPECT_RATIOS = ["1:1", "16:9", "9:16", "4:3", "3:4", "2:3", "3:2"] as const; @@ -32,10 +34,6 @@ type XaiImageApiResponse = { }>; }; -function toDataUrl(buffer: Buffer, mimeType: string): string { - return `data:${mimeType};base64,${buffer.toString("base64")}`; -} - function resolveImageForEdit( input: { url?: string; buffer?: Buffer; mimeType?: string } | undefined, ): string { @@ -49,8 +47,7 @@ function resolveImageForEdit( if (!input.buffer) { throw new Error("xAI image edit input is missing both URL and buffer data."); } - const mime = normalizeOptionalString(input.mimeType) ?? "image/png"; - return toDataUrl(input.buffer, mime); + return toImageDataUrl({ buffer: input.buffer, mimeType: input.mimeType }); } function isEdit(req: ImageGenerationRequest): boolean { @@ -187,26 +184,7 @@ export function buildXaiImageGenerationProvider(): ImageGenerationProvider { ); const payload = (await response.json()) as XaiImageApiResponse; - const images: GeneratedImageAsset[] = (payload.data ?? []).flatMap((item, idx) => { - if (!item) { - return []; - } - const b64 = normalizeOptionalString(item.b64_json); - if (!b64) { - return []; - } - const mimeType = normalizeOptionalString(item.mime_type) ?? DEFAULT_OUTPUT_MIME; - return [ - { - buffer: Buffer.from(b64, "base64"), - mimeType, - fileName: `image-${idx + 1}.${mimeType.split("/")[1] || "png"}`, - ...(item.revised_prompt - ? 
{ revisedPrompt: normalizeOptionalString(item.revised_prompt) } - : {}), - }, - ]; - }); + const images = parseOpenAiCompatibleImageResponse(payload); return { images, diff --git a/src/image-generation/image-assets.test.ts b/src/image-generation/image-assets.test.ts new file mode 100644 index 00000000000..b503cad8d1e --- /dev/null +++ b/src/image-generation/image-assets.test.ts @@ -0,0 +1,86 @@ +import { describe, expect, it } from "vitest"; +import { + generatedImageAssetFromDataUrl, + imageFileExtensionForMimeType, + imageSourceUploadFileName, + parseImageDataUrl, + parseOpenAiCompatibleImageResponse, + sniffImageMimeType, + toImageDataUrl, +} from "./image-assets.js"; + +describe("image asset helpers", () => { + it("converts buffers to image data URLs and parses them back", () => { + const buffer = Buffer.from("png-bytes"); + const dataUrl = toImageDataUrl({ buffer, mimeType: "image/png" }); + + expect(dataUrl).toBe(`data:image/png;base64,${buffer.toString("base64")}`); + expect(parseImageDataUrl(dataUrl)).toEqual({ + mimeType: "image/png", + base64: buffer.toString("base64"), + }); + expect(generatedImageAssetFromDataUrl({ dataUrl, index: 1 })).toMatchObject({ + buffer, + mimeType: "image/png", + fileName: "image-2.png", + }); + }); + + it("normalizes image file extensions", () => { + expect(imageFileExtensionForMimeType("image/jpeg")).toBe("jpg"); + expect(imageFileExtensionForMimeType("image/webp")).toBe("webp"); + expect(imageFileExtensionForMimeType("image/svg+xml")).toBe("svg"); + expect(imageFileExtensionForMimeType(undefined, "jpg")).toBe("jpg"); + }); + + it("sniffs common generated image types", () => { + expect(sniffImageMimeType(Buffer.from([0xff, 0xd8, 0xff]))).toEqual({ + mimeType: "image/jpeg", + extension: "jpg", + }); + expect(sniffImageMimeType(Buffer.from([0x89, 0x50, 0x4e, 0x47, 0, 0, 0, 0]))).toEqual({ + mimeType: "image/png", + extension: "png", + }); + }); + + it("parses OpenAI-compatible base64 image responses", () => { + const jpegBytes 
= Buffer.from([0xff, 0xd8, 0xff, 0xdb]); + const images = parseOpenAiCompatibleImageResponse( + { + data: [ + { + b64_json: jpegBytes.toString("base64"), + revised_prompt: "revised", + }, + { b64_json: "" }, + ], + }, + { defaultMimeType: "image/png", sniffMimeType: true }, + ); + + expect(images).toEqual([ + { + buffer: jpegBytes, + mimeType: "image/jpeg", + fileName: "image-1.jpg", + revisedPrompt: "revised", + }, + ]); + }); + + it("resolves source upload filenames from explicit names or MIME types", () => { + expect( + imageSourceUploadFileName({ + image: { buffer: Buffer.from("x"), mimeType: "image/webp" }, + index: 2, + }), + ).toBe("image-3.webp"); + expect( + imageSourceUploadFileName({ + image: { buffer: Buffer.from("x"), mimeType: "image/png", fileName: "source.png" }, + index: 0, + }), + ).toBe("source.png"); + }); +}); diff --git a/src/image-generation/image-assets.ts b/src/image-generation/image-assets.ts new file mode 100644 index 00000000000..71735c88851 --- /dev/null +++ b/src/image-generation/image-assets.ts @@ -0,0 +1,200 @@ +import { + normalizeOptionalLowercaseString, + normalizeOptionalString, +} from "../shared/string-coerce.js"; +import type { GeneratedImageAsset, ImageGenerationSourceImage } from "./types.js"; + +const DEFAULT_IMAGE_MIME_TYPE = "image/png"; +const DEFAULT_IMAGE_FILE_PREFIX = "image"; + +export type ImageMimeTypeDetection = { + mimeType: string; + extension: string; +}; + +export type OpenAiCompatibleImageResponseEntry = { + b64_json?: unknown; + mime_type?: unknown; + revised_prompt?: unknown; +}; + +export type OpenAiCompatibleImageResponsePayload = { + data?: OpenAiCompatibleImageResponseEntry[]; +}; + +export function imageFileExtensionForMimeType( + mimeType: string | undefined, + fallback = "png", +): string { + const normalized = normalizeOptionalLowercaseString(mimeType)?.split(";")[0]?.trim(); + if (!normalized) { + return fallback; + } + if (normalized.includes("jpeg") || normalized.includes("jpg")) { + return 
"jpg"; + } + if (normalized.includes("svg")) { + return "svg"; + } + const slashIndex = normalized.indexOf("/"); + return slashIndex >= 0 ? normalized.slice(slashIndex + 1) || fallback : fallback; +} + +export function sniffImageMimeType( + buffer: Buffer, + fallbackMimeType = DEFAULT_IMAGE_MIME_TYPE, +): ImageMimeTypeDetection { + if (buffer.length >= 3 && buffer[0] === 0xff && buffer[1] === 0xd8 && buffer[2] === 0xff) { + return { mimeType: "image/jpeg", extension: "jpg" }; + } + if ( + buffer.length >= 8 && + buffer[0] === 0x89 && + buffer[1] === 0x50 && + buffer[2] === 0x4e && + buffer[3] === 0x47 + ) { + return { mimeType: "image/png", extension: "png" }; + } + if ( + buffer.length >= 12 && + buffer.toString("ascii", 0, 4) === "RIFF" && + buffer.toString("ascii", 8, 12) === "WEBP" + ) { + return { mimeType: "image/webp", extension: "webp" }; + } + return { + mimeType: fallbackMimeType, + extension: imageFileExtensionForMimeType(fallbackMimeType), + }; +} + +export function toImageDataUrl(params: { + buffer: Buffer; + mimeType?: string; + defaultMimeType?: string; +}): string { + const mimeType = + normalizeOptionalString(params.mimeType) ?? + normalizeOptionalString(params.defaultMimeType) ?? 
+ DEFAULT_IMAGE_MIME_TYPE; + return `data:${mimeType};base64,${params.buffer.toString("base64")}`; +} + +export function parseImageDataUrl( + dataUrl: string, +): { mimeType: string; base64: string } | undefined { + const match = dataUrl.match(/^data:(image\/[^;,]+)(?:;[^,]*)?;base64,(.+)$/is); + if (!match) { + return undefined; + } + const mimeType = normalizeOptionalString(match[1]); + const base64 = normalizeOptionalString(match[2]); + if (!mimeType || !base64) { + return undefined; + } + return { mimeType, base64 }; +} + +export function generatedImageAssetFromBase64(params: { + base64: string | undefined; + index: number; + mimeType?: string; + revisedPrompt?: string; + defaultMimeType?: string; + fileNamePrefix?: string; + sniffMimeType?: boolean; +}): GeneratedImageAsset | undefined { + const base64 = normalizeOptionalString(params.base64); + if (!base64) { + return undefined; + } + const buffer = Buffer.from(base64, "base64"); + const explicitMimeType = normalizeOptionalString(params.mimeType); + const defaultMimeType = + normalizeOptionalString(params.defaultMimeType) ?? DEFAULT_IMAGE_MIME_TYPE; + const detected = + params.sniffMimeType && !explicitMimeType + ? sniffImageMimeType(buffer, defaultMimeType) + : undefined; + const mimeType = explicitMimeType ?? detected?.mimeType ?? defaultMimeType; + const prefix = normalizeOptionalString(params.fileNamePrefix) ?? DEFAULT_IMAGE_FILE_PREFIX; + const image: GeneratedImageAsset = { + buffer, + mimeType, + fileName: `${prefix}-${params.index + 1}.${detected?.extension ?? 
imageFileExtensionForMimeType(mimeType)}`, + }; + const revisedPrompt = normalizeOptionalString(params.revisedPrompt); + if (revisedPrompt) { + image.revisedPrompt = revisedPrompt; + } + return image; +} + +export function generatedImageAssetFromDataUrl(params: { + dataUrl: string; + index: number; + fileNamePrefix?: string; +}): GeneratedImageAsset | undefined { + const parsed = parseImageDataUrl(params.dataUrl); + if (!parsed) { + return undefined; + } + return generatedImageAssetFromBase64({ + base64: parsed.base64, + index: params.index, + mimeType: parsed.mimeType, + fileNamePrefix: params.fileNamePrefix, + }); +} + +export function generatedImageAssetFromOpenAiCompatibleEntry( + entry: OpenAiCompatibleImageResponseEntry, + index: number, + options: { + defaultMimeType?: string; + fileNamePrefix?: string; + sniffMimeType?: boolean; + } = {}, +): GeneratedImageAsset | undefined { + return generatedImageAssetFromBase64({ + base64: normalizeOptionalString(entry.b64_json), + index, + mimeType: normalizeOptionalString(entry.mime_type), + revisedPrompt: normalizeOptionalString(entry.revised_prompt), + defaultMimeType: options.defaultMimeType, + fileNamePrefix: options.fileNamePrefix, + sniffMimeType: options.sniffMimeType, + }); +} + +export function parseOpenAiCompatibleImageResponse( + payload: OpenAiCompatibleImageResponsePayload, + options: { + defaultMimeType?: string; + fileNamePrefix?: string; + sniffMimeType?: boolean; + } = {}, +): GeneratedImageAsset[] { + return (payload.data ?? 
[]) + .map((entry, index) => generatedImageAssetFromOpenAiCompatibleEntry(entry, index, options)) + .filter((entry): entry is GeneratedImageAsset => entry !== undefined); +} + +export function imageSourceUploadFileName(params: { + image: ImageGenerationSourceImage; + index: number; + defaultMimeType?: string; + fileNamePrefix?: string; +}): string { + const fileName = normalizeOptionalString(params.image.fileName); + if (fileName) { + return fileName; + } + const mimeType = + normalizeOptionalString(params.image.mimeType) ?? + normalizeOptionalString(params.defaultMimeType) ?? + DEFAULT_IMAGE_MIME_TYPE; + const prefix = normalizeOptionalString(params.fileNamePrefix) ?? DEFAULT_IMAGE_FILE_PREFIX; + return `${prefix}-${params.index + 1}.${imageFileExtensionForMimeType(mimeType)}`; +} diff --git a/src/plugin-sdk/image-generation.ts b/src/plugin-sdk/image-generation.ts index d12fc67f11b..0c1391e63d2 100644 --- a/src/plugin-sdk/image-generation.ts +++ b/src/plugin-sdk/image-generation.ts @@ -1,5 +1,20 @@ // Public image-generation helpers and types for provider plugins. 
+export { + generatedImageAssetFromBase64, + generatedImageAssetFromDataUrl, + generatedImageAssetFromOpenAiCompatibleEntry, + imageFileExtensionForMimeType, + imageSourceUploadFileName, + parseImageDataUrl, + parseOpenAiCompatibleImageResponse, + sniffImageMimeType, + toImageDataUrl, + type ImageMimeTypeDetection, + type OpenAiCompatibleImageResponseEntry, + type OpenAiCompatibleImageResponsePayload, +} from "../image-generation/image-assets.js"; + export type { GeneratedImageAsset, ImageGenerationBackground, diff --git a/src/plugin-sdk/index.ts b/src/plugin-sdk/index.ts index 4077cf6fcd8..1bbb3ebdd1c 100644 --- a/src/plugin-sdk/index.ts +++ b/src/plugin-sdk/index.ts @@ -88,7 +88,7 @@ export type { MemoryPluginPublicArtifactsProvider, } from "../plugins/memory-state.js"; export type { CliBackendConfig } from "../config/types.js"; -export * from "./image-generation.js"; +export type * from "./image-generation.js"; export * from "./music-generation.js"; export type { SecretInput, SecretRef } from "../config/types.secrets.js"; export type { RuntimeEnv } from "../runtime.js"; diff --git a/src/plugin-sdk/speech.ts b/src/plugin-sdk/speech.ts index 7b7774e50e5..72660896334 100644 --- a/src/plugin-sdk/speech.ts +++ b/src/plugin-sdk/speech.ts @@ -55,3 +55,10 @@ export { requireInRange, scheduleCleanup, } from "../tts/tts-provider-helpers.js"; +export { + createOpenAiCompatibleSpeechProvider, + type OpenAiCompatibleSpeechProviderBaseUrlPolicy, + type OpenAiCompatibleSpeechProviderConfig, + type OpenAiCompatibleSpeechProviderExtraJsonBodyField, + type OpenAiCompatibleSpeechProviderOptions, +} from "../tts/openai-compatible-speech-provider.js"; diff --git a/src/tts/openai-compatible-speech-provider.test.ts b/src/tts/openai-compatible-speech-provider.test.ts new file mode 100644 index 00000000000..ad3f6f2f25a --- /dev/null +++ b/src/tts/openai-compatible-speech-provider.test.ts @@ -0,0 +1,155 @@ +import { afterEach, describe, expect, it, vi } from "vitest"; +import { 
createOpenAiCompatibleSpeechProvider } from "./openai-compatible-speech-provider.js"; + +const { assertOkOrThrowHttpErrorMock, postJsonRequestMock, resolveProviderHttpRequestConfigMock } = + vi.hoisted(() => ({ + assertOkOrThrowHttpErrorMock: vi.fn(async () => {}), + postJsonRequestMock: vi.fn(), + resolveProviderHttpRequestConfigMock: vi.fn((params: Record) => ({ + baseUrl: params.baseUrl ?? params.defaultBaseUrl ?? "https://example.test/v1", + allowPrivateNetwork: false, + headers: new Headers(params.defaultHeaders as HeadersInit | undefined), + dispatcherPolicy: undefined, + })), + })); + +vi.mock("openclaw/plugin-sdk/provider-http", () => ({ + assertOkOrThrowHttpError: assertOkOrThrowHttpErrorMock, + postJsonRequest: postJsonRequestMock, + resolveProviderHttpRequestConfig: resolveProviderHttpRequestConfigMock, +})); + +describe("createOpenAiCompatibleSpeechProvider", () => { + afterEach(() => { + assertOkOrThrowHttpErrorMock.mockClear(); + postJsonRequestMock.mockReset(); + resolveProviderHttpRequestConfigMock.mockClear(); + vi.unstubAllEnvs(); + }); + + it("normalizes config with built-in base URL policies", () => { + const provider = createOpenAiCompatibleSpeechProvider({ + id: "demo", + label: "Demo", + autoSelectOrder: 40, + models: ["demo-tts"], + voices: ["alloy"], + defaultModel: "demo-tts", + defaultVoice: "alloy", + defaultBaseUrl: "https://example.test/api/v1", + envKey: "DEMO_API_KEY", + responseFormats: ["mp3", "pcm"], + defaultResponseFormat: "mp3", + voiceCompatibleResponseFormats: ["mp3"], + baseUrlPolicy: { + kind: "canonical", + aliases: ["https://example.test/v1"], + }, + }); + + expect( + provider.resolveConfig?.({ + cfg: {} as never, + timeoutMs: 30_000, + rawConfig: { + providers: { + demo: { + apiKey: "sk-demo", + baseUrl: "https://example.test/v1/", + modelId: "custom-tts", + voiceId: "nova", + speed: 1.25, + responseFormat: " PCM ", + }, + }, + }, + }), + ).toEqual({ + apiKey: "sk-demo", + baseUrl: "https://example.test/api/v1", + model: 
"custom-tts", + voice: "nova", + speed: 1.25, + responseFormat: "pcm", + }); + }); + + it("maps configured extra JSON body fields into synthesis requests", async () => { + const release = vi.fn(async () => {}); + postJsonRequestMock.mockResolvedValue({ + response: new Response(new Uint8Array([4, 5, 6]), { status: 200 }), + release, + }); + vi.stubEnv("DEMO_API_KEY", "sk-env"); + + const provider = createOpenAiCompatibleSpeechProvider<{ + routing?: Record; + }>({ + id: "demo", + label: "Demo", + autoSelectOrder: 40, + models: ["demo-tts"], + voices: ["alloy"], + defaultModel: "demo-tts", + defaultVoice: "alloy", + defaultBaseUrl: "https://example.test/v1", + envKey: "DEMO_API_KEY", + responseFormats: ["mp3", "opus"], + defaultResponseFormat: "mp3", + voiceCompatibleResponseFormats: ["opus"], + baseUrlPolicy: { kind: "trim-trailing-slash" }, + readExtraConfig: (raw) => + typeof raw?.routing === "object" && raw.routing !== null && !Array.isArray(raw.routing) + ? { routing: raw.routing as Record } + : {}, + extraJsonBodyFields: [{ configKey: "routing", requestKey: "provider" }], + }); + + const result = await provider.synthesize({ + text: "hello", + cfg: {} as never, + providerConfig: { + baseUrl: "https://example.test/v1/", + responseFormat: "opus", + routing: { order: ["openai"] }, + }, + providerOverrides: { + modelId: "override-tts", + voiceId: "verse", + speed: 1.1, + }, + target: "voice-note", + timeoutMs: 1234, + }); + + expect(resolveProviderHttpRequestConfigMock).toHaveBeenCalledWith( + expect.objectContaining({ + baseUrl: "https://example.test/v1", + defaultBaseUrl: "https://example.test/v1", + provider: "demo", + capability: "audio", + }), + ); + expect(postJsonRequestMock).toHaveBeenCalledWith( + expect.objectContaining({ + url: "https://example.test/v1/audio/speech", + timeoutMs: 1234, + body: { + model: "override-tts", + input: "hello", + voice: "verse", + response_format: "opus", + speed: 1.1, + provider: { order: ["openai"] }, + }, + }), + ); + 
expect(result).toMatchObject({ + audioBuffer: Buffer.from([4, 5, 6]), + outputFormat: "opus", + fileExtension: ".opus", + voiceCompatible: true, + }); + expect(release).toHaveBeenCalledOnce(); + }); +}); diff --git a/src/tts/openai-compatible-speech-provider.ts b/src/tts/openai-compatible-speech-provider.ts new file mode 100644 index 00000000000..af2d5a707f0 --- /dev/null +++ b/src/tts/openai-compatible-speech-provider.ts @@ -0,0 +1,395 @@ +import { + assertOkOrThrowHttpError, + postJsonRequest, + resolveProviderHttpRequestConfig, +} from "openclaw/plugin-sdk/provider-http"; +import { normalizeResolvedSecretInputString } from "openclaw/plugin-sdk/secret-input"; +import { asFiniteNumber, asObject, trimToUndefined } from "../agents/provider-http-errors.js"; +import type { SpeechProviderPlugin } from "../plugins/types.js"; +import { normalizeOptionalLowercaseString } from "../shared/string-coerce.js"; +import type { + SpeechDirectiveTokenParseContext, + SpeechProviderConfig, + SpeechProviderOverrides, +} from "./provider-types.js"; + +type OpenAiCompatibleSpeechProviderBaseConfig = { + apiKey?: string; + baseUrl?: string; + model: string; + voice: string; + speed?: number; + responseFormat?: string; +}; + +export type OpenAiCompatibleSpeechProviderConfig< + ExtraConfig extends Record = Record, +> = OpenAiCompatibleSpeechProviderBaseConfig & ExtraConfig; + +export type OpenAiCompatibleSpeechProviderBaseUrlPolicy = + | { kind: "trim-trailing-slash" } + | { kind: "canonical"; aliases?: readonly string[]; allowCustom?: boolean }; + +export type OpenAiCompatibleSpeechProviderExtraJsonBodyField< + ExtraConfig extends Record, +> = { + configKey: Extract; + requestKey?: string; +}; + +export type OpenAiCompatibleSpeechProviderOptions< + ExtraConfig extends Record = Record, +> = { + id: string; + label: string; + autoSelectOrder: number; + models: readonly string[]; + voices: readonly string[]; + defaultModel: string; + defaultVoice: string; + defaultBaseUrl: string; + envKey: 
string; + responseFormats: readonly string[]; + defaultResponseFormat: string; + voiceCompatibleResponseFormats: readonly string[]; + baseUrlPolicy?: OpenAiCompatibleSpeechProviderBaseUrlPolicy; + normalizeModel?: (value: string | undefined, fallback: string) => string; + configKey?: string; + extraHeaders?: Record; + readExtraConfig?: (raw: Record | undefined) => ExtraConfig; + extraJsonBodyFields?: readonly OpenAiCompatibleSpeechProviderExtraJsonBodyField[]; + apiErrorLabel?: string; + missingApiKeyError?: string; +}; + +type ModelProviderConfig = { + apiKey?: unknown; + baseUrl?: unknown; +}; + +function normalizeResponseFormat(params: { + providerLabel: string; + responseFormats: readonly string[]; + value: unknown; +}): string | undefined { + const next = normalizeOptionalLowercaseString(params.value); + if (!next) { + return undefined; + } + if (params.responseFormats.includes(next)) { + return next; + } + throw new Error(`Invalid ${params.providerLabel} speech responseFormat: ${next}`); +} + +function responseFormatToFileExtension(format: string): `.${string}` { + return `.${format}`; +} + +function trimTrailingBaseUrl(value: unknown, fallback: string): string { + return (trimToUndefined(value) ?? fallback).replace(/\/+$/u, ""); +} + +function normalizeBaseUrl(params: { + value: unknown; + fallback: string; + policy?: OpenAiCompatibleSpeechProviderBaseUrlPolicy; +}): string { + const normalized = trimTrailingBaseUrl(params.value, params.fallback); + if (params.policy?.kind !== "canonical") { + return normalized; + } + const canonical = trimTrailingBaseUrl(params.fallback, params.fallback); + const aliases = new Set( + [canonical, ...(params.policy.aliases ?? [])].map((entry) => + trimTrailingBaseUrl(entry, canonical), + ), + ); + return aliases.has(normalized) || !params.policy.allowCustom ? 
canonical : normalized; +} + +/** Looks up the provider config record: prefers rawConfig.providers[providerConfigKey] and falls back to the top-level rawConfig[providerConfigKey] when the first lookup is nullish. Both candidates pass through asObject (defined outside this chunk). */ function resolveProviderConfigRecord( + rawConfig: Record, + providerConfigKey: string, +): Record | undefined { + const providers = asObject(rawConfig.providers); + return asObject(providers?.[providerConfigKey]) ?? asObject(rawConfig[providerConfigKey]); +} + +/** Walks cfg, then models, then providers, then [providerConfigKey], passing every level through asObject and using optional chaining between levels. NOTE(review): presumably asObject yields undefined for non-objects so a missing level short-circuits - confirm against its definition. */ function readModelProviderConfig( + cfg: unknown, + providerConfigKey: string, +): ModelProviderConfig | undefined { + const root = asObject(cfg); + const models = asObject(root?.models); + const providers = asObject(models?.providers); + return asObject(providers?.[providerConfigKey]); +} + +/** Extracts model, voice and speed from per-request overrides. Returns an empty object when overrides is falsy. The short keys (model, voice) win over modelId and voiceId; the nullish fallback is applied before trimToUndefined, so a non-nullish short key is used even if trimToUndefined then returns undefined for it. */ function readSpeechOverrides(overrides: SpeechProviderOverrides | undefined): { + model?: string; + voice?: string; + speed?: number; +} { + if (!overrides) { + return {}; + } + return { + model: trimToUndefined(overrides.model ?? overrides.modelId), + voice: trimToUndefined(overrides.voice ?? overrides.voiceId), + speed: asFiniteNumber(overrides.speed), + }; +} + +/** Maps one directive token to a voice or model override. Recognised keys are voice, voice_id, voiceid, model, model_id, modelid, plus provider-prefixed forms built from providerConfigKey (with an underscore) and from compactProviderKey (no separator). Returns handled: true without overrides when ctx.policy forbids the change, and handled: false for any other key. ctx.value is passed through as-is. */ function parseDirectiveToken( + ctx: SpeechDirectiveTokenParseContext, + providerConfigKey: string, +): { handled: boolean; overrides?: SpeechProviderOverrides } { + /* Provider key with every run of characters outside a-z and 0-9 (case-insensitive) removed, then lowercased. */ const compactProviderKey = providerConfigKey.replace(/[^a-z0-9]+/giu, "").toLowerCase(); + switch (ctx.key) { + case "voice": + case "voice_id": + case "voiceid": + case `${providerConfigKey}_voice`: + case `${compactProviderKey}voice`: + if (!ctx.policy.allowVoice) { + return { handled: true }; + } + return { handled: true, overrides: { voice: ctx.value } }; + case "model": + case "model_id": + case "modelid": + case `${providerConfigKey}_model`: + case `${compactProviderKey}model`: + if (!ctx.policy.allowModelId) { + return { handled: true }; + } + return { handled: true, overrides: { model: ctx.value } }; + default: + return { handled: false }; + } +} + +/** Builds the extra JSON body fields for a request. For each entry in fields (treated as empty when undefined) it copies config[field.configKey] into the result under field.requestKey, or under field.configKey when requestKey is nullish. Values that are null or undefined are skipped. */ function buildExtraJsonBodyFields>( + config: OpenAiCompatibleSpeechProviderConfig, + fields: readonly OpenAiCompatibleSpeechProviderExtraJsonBodyField[] | undefined, +): Record { + const body: Record = {}; + for
(const field of fields ?? []) { + const value = config[field.configKey]; + if (value != null) { + body[field.requestKey ?? field.configKey] = value; + } + } + return body; +} + +/** Creates a SpeechProviderPlugin from the given options. The config key defaults to options.id, model normalisation defaults to trimToUndefined(value) with the fallback applied when that is nullish, and extra config defaults to an empty object. The returned synthesize posts JSON to the audio/speech path under the resolved base URL with a Bearer Authorization header. */ export function createOpenAiCompatibleSpeechProvider< + ExtraConfig extends Record = Record, +>(options: OpenAiCompatibleSpeechProviderOptions): SpeechProviderPlugin { + const providerConfigKey = options.configKey ?? options.id; + const normalizeModel = + options.normalizeModel ?? ((value, fallback) => trimToUndefined(value) ?? fallback); + const readExtraConfig = options.readExtraConfig ?? (() => ({}) as ExtraConfig); + + /** Normalises raw TTS config into provider config. model and voice fall back to options.defaultModel and options.defaultVoice. baseUrl stays undefined when trimToUndefined(raw?.baseUrl) is nullish. The result of readExtraConfig is spread last, so any key it returns takes precedence over the fields listed before it. */ function normalizeConfig( + rawConfig: Record, + ): OpenAiCompatibleSpeechProviderConfig { + const raw = resolveProviderConfigRecord(rawConfig, providerConfigKey); + return { + apiKey: normalizeResolvedSecretInputString({ + value: raw?.apiKey, + path: `messages.tts.providers.${providerConfigKey}.apiKey`, + }), + baseUrl: + trimToUndefined(raw?.baseUrl) == null + ? undefined + : normalizeBaseUrl({ + value: raw?.baseUrl, + fallback: options.defaultBaseUrl, + policy: options.baseUrlPolicy, + }), + model: normalizeModel(trimToUndefined(raw?.model ?? raw?.modelId), options.defaultModel), + voice: trimToUndefined(raw?.voice ?? raw?.voiceId) ?? options.defaultVoice, + speed: asFiniteNumber(raw?.speed), + responseFormat: normalizeResponseFormat({ + providerLabel: options.label, + responseFormats: options.responseFormats, + value: raw?.responseFormat, + }), + ...readExtraConfig(raw), + }; + } + + /** Reads an already-resolved provider config, filling each missing field from the defaults that normalizeConfig produces for an empty raw config. As in normalizeConfig, the readExtraConfig result is spread last. */ function readProviderConfig( + config: SpeechProviderConfig, + ): OpenAiCompatibleSpeechProviderConfig { + /* Defaults only: an empty raw config has no provider record to read from. */ const normalized = normalizeConfig({}); + return { + apiKey: trimToUndefined(config.apiKey) ?? normalized.apiKey, + baseUrl: + trimToUndefined(config.baseUrl) == null + ? normalized.baseUrl + : normalizeBaseUrl({ + value: config.baseUrl, + fallback: options.defaultBaseUrl, + policy: options.baseUrlPolicy, + }), + model: normalizeModel(trimToUndefined(config.model ??
config.modelId), normalized.model), + voice: trimToUndefined(config.voice ?? config.voiceId) ?? normalized.voice, + speed: asFiniteNumber(config.speed) ?? normalized.speed, + responseFormat: + normalizeResponseFormat({ + providerLabel: options.label, + responseFormats: options.responseFormats, + value: config.responseFormat, + }) ?? normalized.responseFormat, + ...readExtraConfig(config), + }; + } + + /** Resolves the API key. Precedence: the provider config apiKey, then models.providers[providerConfigKey].apiKey read from cfg, then the environment variable named by options.envKey. Returns undefined when none of them yields a value. */ function resolveApiKey(params: { + cfg?: unknown; + providerConfig: OpenAiCompatibleSpeechProviderConfig; + }): string | undefined { + return ( + params.providerConfig.apiKey ?? + normalizeResolvedSecretInputString({ + value: readModelProviderConfig(params.cfg, providerConfigKey)?.apiKey, + path: `models.providers.${providerConfigKey}.apiKey`, + }) ?? + trimToUndefined(process.env[options.envKey]) + ); + } + + /** Resolves the base URL. Precedence: the provider config baseUrl, then models.providers[providerConfigKey].baseUrl read from cfg. The chosen value is handed to normalizeBaseUrl together with options.defaultBaseUrl as fallback and options.baseUrlPolicy as policy. */ function resolveBaseUrl(params: { + cfg?: unknown; + providerConfig: OpenAiCompatibleSpeechProviderConfig; + }): string { + return normalizeBaseUrl({ + value: + params.providerConfig.baseUrl ??
+ trimToUndefined(readModelProviderConfig(params.cfg, providerConfigKey)?.baseUrl), + fallback: options.defaultBaseUrl, + policy: options.baseUrlPolicy, + }); + } + + return { + id: options.id, + label: options.label, + autoSelectOrder: options.autoSelectOrder, + /* models and voices are shallow copies, so the returned arrays are distinct from the ones on options. */ models: [...options.models], + voices: [...options.voices], + resolveConfig: ({ rawConfig }) => normalizeConfig(rawConfig), + parseDirectiveToken: (ctx) => parseDirectiveToken(ctx, providerConfigKey), + /* Starts from the normalised base TTS config and overwrites only the fields the talk provider config supplies. Only modelId and voiceId are read here, not model or voice. An apiKey that is present (not undefined) always replaces the base one with its normalised value. */ resolveTalkConfig: ({ baseTtsConfig, talkProviderConfig }) => { + const base = normalizeConfig(baseTtsConfig); + const responseFormat = normalizeResponseFormat({ + providerLabel: options.label, + responseFormats: options.responseFormats, + value: talkProviderConfig.responseFormat, + }); + const next: OpenAiCompatibleSpeechProviderConfig = { ...base }; + if (talkProviderConfig.apiKey !== undefined) { + next.apiKey = normalizeResolvedSecretInputString({ + value: talkProviderConfig.apiKey, + path: `talk.providers.${providerConfigKey}.apiKey`, + }); + } + const baseUrl = trimToUndefined(talkProviderConfig.baseUrl); + if (baseUrl !== undefined) { + next.baseUrl = normalizeBaseUrl({ + value: baseUrl, + fallback: options.defaultBaseUrl, + policy: options.baseUrlPolicy, + }); + } + const modelId = trimToUndefined(talkProviderConfig.modelId); + if (modelId !== undefined) { + next.model = normalizeModel(modelId, options.defaultModel); + } + const voiceId = trimToUndefined(talkProviderConfig.voiceId); + if (voiceId !== undefined) { + next.voice = voiceId; + } + const speed = asFiniteNumber(talkProviderConfig.speed); + if (speed !== undefined) { + next.speed = speed; + } + if (responseFormat !== undefined) { + next.responseFormat = responseFormat; + } + return next; + }, + /* Emits only the keys that resolve to a value. Here voiceId and modelId take precedence over voice and model, which is the opposite order from readSpeechOverrides. NOTE(review): confirm the differing precedence is intended. */ resolveTalkOverrides: ({ params }) => ({ + ...(trimToUndefined(params.voiceId ?? params.voice) == null + ? {} + : { voice: trimToUndefined(params.voiceId ?? params.voice) }), + ...(trimToUndefined(params.modelId ?? params.model) == null + ?
{} + : { model: trimToUndefined(params.modelId ?? params.model) }), + ...(asFiniteNumber(params.speed) == null ? {} : { speed: asFiniteNumber(params.speed) }), + }), + /* Lists the static options.voices, using each voice string as both id and name. */ listVoices: async () => options.voices.map((voice) => ({ id: voice, name: voice })), + /* Configured means resolveApiKey returns a truthy key for this provider config. */ isConfigured: ({ cfg, providerConfig }) => + Boolean(resolveApiKey({ cfg, providerConfig: readProviderConfig(providerConfig) })), + /* Synthesises req.text. Throws an Error when no API key resolves. Per-request overrides win over config for model, voice and speed. The extra JSON body fields are spread last in the request body, so they take precedence over the keys listed before them. */ synthesize: async (req) => { + const config = readProviderConfig(req.providerConfig); + const overrides = readSpeechOverrides(req.providerOverrides); + const apiKey = resolveApiKey({ cfg: req.cfg, providerConfig: config }); + if (!apiKey) { + throw new Error(options.missingApiKeyError ?? `${options.label} API key missing`); + } + + const baseUrl = resolveBaseUrl({ cfg: req.cfg, providerConfig: config }); + const responseFormat = config.responseFormat ?? options.defaultResponseFormat; + const speed = overrides.speed ?? config.speed; + /* false is passed in for allowPrivateNetwork; the value used for the request is whatever resolveProviderHttpRequestConfig returns. */ const { allowPrivateNetwork, headers, dispatcherPolicy } = resolveProviderHttpRequestConfig({ + baseUrl, + defaultBaseUrl: options.defaultBaseUrl, + allowPrivateNetwork: false, + defaultHeaders: { + Authorization: `Bearer ${apiKey}`, + "Content-Type": "application/json", + ...options.extraHeaders, + }, + provider: options.id, + capability: "audio", + transport: "http", + }); + + const { response, release } = await postJsonRequest({ + url: `${baseUrl}/audio/speech`, + headers, + body: { + model: normalizeModel(overrides.model ?? config.model, options.defaultModel), + input: req.text, + voice: overrides.voice ?? config.voice, + response_format: responseFormat, + ...(speed == null ? {} : { speed }), + ...buildExtraJsonBodyFields(config, options.extraJsonBodyFields), + }, + timeoutMs: req.timeoutMs, + fetchFn: fetch, + allowPrivateNetwork, + dispatcherPolicy, + }); + + /* release is awaited in finally, so it runs whether the status check or the body read throws or succeeds. */ try { + await assertOkOrThrowHttpError( + response, + options.apiErrorLabel ??
`${options.label} TTS API error`, + ); + return { + audioBuffer: Buffer.from(await response.arrayBuffer()), + outputFormat: responseFormat, + fileExtension: responseFormatToFileExtension(responseFormat), + voiceCompatible: options.voiceCompatibleResponseFormats.includes(responseFormat), + }; + } finally { + await release(); + } + }, + }; +}