refactor: share media provider asset helpers (#73142)

* refactor: share openai-compatible speech providers

* refactor: tighten openai-compatible speech helper

* refactor: share image generation asset helpers

* fix: keep image helpers off root plugin sdk runtime
This commit is contained in:
Peter Steinberger
2026-04-28 02:44:18 +01:00
committed by GitHub
parent 4949f23219
commit 2a3a24ebdc
17 changed files with 953 additions and 757 deletions

View File

@@ -1,2 +1,2 @@
48cd91661f9fc65e8fb3a091f6deb726d8ccd37f7cec2aa765165f3992e7463f plugin-sdk-api-baseline.json
e8d7069b4d0d7a1a0431d92c845043bb39c3ba106ca0f85cc728a02ece9521bf plugin-sdk-api-baseline.jsonl
8f23f155251c05cab51ee8926e7a359bd64a0ba34e82a80d93d0ed96d07c8a04 plugin-sdk-api-baseline.json
181fea7f35c49032e6894605a06ca1419e5b6ccc1a3d8987d952a1d24a8154bc plugin-sdk-api-baseline.jsonl

View File

@@ -482,10 +482,11 @@ releases.
| `plugin-sdk/media-understanding` | Media-understanding helpers | Media understanding provider types plus provider-facing image/audio helper exports |
| `plugin-sdk/text-runtime` | Shared text helpers | Assistant-visible-text stripping, markdown render/chunking/table helpers, redaction helpers, directive-tag helpers, safe-text utilities, and related text/logging helpers |
| `plugin-sdk/text-chunking` | Text chunking helpers | Outbound text chunking helper |
| `plugin-sdk/speech` | Speech helpers | Speech provider types plus provider-facing directive, registry, and validation helpers |
| `plugin-sdk/speech` | Speech helpers | Speech provider types plus provider-facing directive, registry, validation helpers, and OpenAI-compatible TTS builder |
| `plugin-sdk/speech-core` | Shared speech core | Speech provider types, registry, directives, normalization |
| `plugin-sdk/realtime-transcription` | Realtime transcription helpers | Provider types, registry helpers, and shared WebSocket session helper |
| `plugin-sdk/realtime-voice` | Realtime voice helpers | Provider types, registry/resolution helpers, and bridge session helpers |
| `plugin-sdk/image-generation` | Image-generation helpers | Image generation provider types plus image asset/data URL helpers |
| `plugin-sdk/image-generation-core` | Shared image-generation core | Image-generation types, failover, auth, and registry helpers |
| `plugin-sdk/music-generation` | Music-generation helpers | Music-generation provider/request/result types |
| `plugin-sdk/music-generation-core` | Shared music-generation core | Music-generation types, failover helpers, provider lookup, and model-ref parsing |

View File

@@ -255,11 +255,11 @@ For the plugin authoring guide, see [Plugin SDK overview](/plugins/sdk-overview)
| `plugin-sdk/media-understanding` | Media understanding provider types plus provider-facing image/audio helper exports |
| `plugin-sdk/text-runtime` | Shared text/markdown/logging helpers such as assistant-visible-text stripping, markdown render/chunking/table helpers, redaction helpers, directive-tag helpers, and safe-text utilities |
| `plugin-sdk/text-chunking` | Outbound text chunking helper |
| `plugin-sdk/speech` | Speech provider types plus provider-facing directive, registry, validation, and speech helper exports |
| `plugin-sdk/speech` | Speech provider types plus provider-facing directive, registry, validation, OpenAI-compatible TTS builder, and speech helper exports |
| `plugin-sdk/speech-core` | Shared speech provider types, registry, directive, normalization, and speech helper exports |
| `plugin-sdk/realtime-transcription` | Realtime transcription provider types, registry helpers, and shared WebSocket session helper |
| `plugin-sdk/realtime-voice` | Realtime voice provider types and registry helpers |
| `plugin-sdk/image-generation` | Image generation provider types |
| `plugin-sdk/image-generation` | Image generation provider types plus image asset/data URL helpers |
| `plugin-sdk/image-generation-core` | Shared image-generation types, failover, auth, and registry helpers |
| `plugin-sdk/music-generation` | Music generation provider/request/result types |
| `plugin-sdk/music-generation-core` | Shared music-generation types, failover helpers, provider lookup, and model-ref parsing |

View File

@@ -1,8 +1,8 @@
import type { OpenClawConfig } from "openclaw/plugin-sdk/config-types";
import type {
GeneratedImageAsset,
ImageGenerationProvider,
ImageGenerationSourceImage,
import type { ImageGenerationProvider } from "openclaw/plugin-sdk/image-generation";
import {
imageSourceUploadFileName,
parseOpenAiCompatibleImageResponse,
} from "openclaw/plugin-sdk/image-generation";
import { isProviderApiKeyConfigured } from "openclaw/plugin-sdk/provider-auth";
import { resolveApiKeyForProvider } from "openclaw/plugin-sdk/provider-auth-runtime";
@@ -44,75 +44,6 @@ function resolveDeepInfraProviderConfig(
return cfg?.models?.providers?.deepinfra;
}
// Sniff the mime type from the buffer's magic bytes (JPEG, PNG, WEBP);
// anything unrecognized is treated as JPEG.
function detectImageMimeType(buffer: Buffer): {
  mimeType: string;
  extension: "jpg" | "png" | "webp";
} {
  const looksJpeg =
    buffer.length >= 3 && buffer[0] === 0xff && buffer[1] === 0xd8 && buffer[2] === 0xff;
  if (looksJpeg) {
    return { mimeType: "image/jpeg", extension: "jpg" };
  }
  const looksPng =
    buffer.length >= 8 &&
    buffer[0] === 0x89 &&
    buffer[1] === 0x50 &&
    buffer[2] === 0x4e &&
    buffer[3] === 0x47;
  if (looksPng) {
    return { mimeType: "image/png", extension: "png" };
  }
  const looksWebp =
    buffer.length >= 12 &&
    buffer.toString("ascii", 0, 4) === "RIFF" &&
    buffer.toString("ascii", 8, 12) === "WEBP";
  if (looksWebp) {
    return { mimeType: "image/webp", extension: "webp" };
  }
  return { mimeType: "image/jpeg", extension: "jpg" };
}
// Choose an upload file name: an explicit fileName wins, otherwise a numbered
// name whose extension mirrors the declared mime type (png by default).
function imageToUploadName(image: ImageGenerationSourceImage, index: number): string {
  const explicitName = normalizeOptionalString(image.fileName);
  if (explicitName) {
    return explicitName;
  }
  const mime = normalizeOptionalString(image.mimeType) ?? "image/png";
  let extension: "jpg" | "webp" | "png";
  if (mime === "image/jpeg" || mime === "image/jpg") {
    extension = "jpg";
  } else if (mime === "image/webp") {
    extension = "webp";
  } else {
    extension = "png";
  }
  return `image-${index + 1}.${extension}`;
}
// Decode one API response entry (base64 payload) into a GeneratedImageAsset.
// Returns null when the entry carries no usable b64_json payload.
function imageToAsset(
  entry: NonNullable<DeepInfraImageApiResponse["data"]>[number],
  index: number,
): GeneratedImageAsset | null {
  const base64Payload = normalizeOptionalString(entry.b64_json);
  if (!base64Payload) {
    return null;
  }
  const decoded = Buffer.from(base64Payload, "base64");
  // Trust the sniffed signature over any declared type.
  const { mimeType, extension } = detectImageMimeType(decoded);
  const asset: GeneratedImageAsset = {
    buffer: decoded,
    mimeType,
    fileName: `image-${index + 1}.${extension}`,
  };
  const revised = normalizeOptionalString(entry.revised_prompt);
  if (revised) {
    asset.revisedPrompt = revised;
  }
  return asset;
}
// Convert every decodable response entry into an asset, dropping failures.
function parseImageResponse(payload: DeepInfraImageApiResponse): GeneratedImageAsset[] {
  const assets: GeneratedImageAsset[] = [];
  for (const [index, entry] of (payload.data ?? []).entries()) {
    const asset = imageToAsset(entry, index);
    if (asset !== null) {
      assets.push(asset);
    }
  }
  return assets;
}
export function buildDeepInfraImageGenerationProvider(): ImageGenerationProvider {
return {
id: "deepinfra",
@@ -198,7 +129,7 @@ export function buildDeepInfraImageGenerationProvider(): ImageGenerationProvider
form.append(
"image",
new Blob([new Uint8Array(image.buffer)], { type: mimeType }),
imageToUploadName(image, 0),
imageSourceUploadFileName({ image, index: 0 }),
);
const multipartHeaders = new Headers(headers);
multipartHeaders.delete("Content-Type");
@@ -237,7 +168,10 @@ export function buildDeepInfraImageGenerationProvider(): ImageGenerationProvider
response,
isEdit ? "DeepInfra image edit failed" : "DeepInfra image generation failed",
);
const images = parseImageResponse((await response.json()) as DeepInfraImageApiResponse);
const images = parseOpenAiCompatibleImageResponse(
(await response.json()) as DeepInfraImageApiResponse,
{ defaultMimeType: "image/jpeg", sniffMimeType: true },
);
if (images.length === 0) {
throw new Error("DeepInfra image response did not include generated image data");
}

View File

@@ -1,295 +1,41 @@
import {
assertOkOrThrowHttpError,
postJsonRequest,
resolveProviderHttpRequestConfig,
} from "openclaw/plugin-sdk/provider-http";
import { normalizeResolvedSecretInputString } from "openclaw/plugin-sdk/secret-input";
import {
asFiniteNumber,
asObject,
trimToUndefined,
type SpeechDirectiveTokenParseContext,
type SpeechProviderConfig,
type SpeechProviderOverrides,
createOpenAiCompatibleSpeechProvider,
type SpeechProviderPlugin,
} from "openclaw/plugin-sdk/speech";
import { normalizeOptionalLowercaseString } from "openclaw/plugin-sdk/text-runtime";
import {
DEEPINFRA_BASE_URL,
DEEPINFRA_TTS_MODELS,
DEFAULT_DEEPINFRA_TTS_MODEL,
DEFAULT_DEEPINFRA_TTS_VOICE,
normalizeDeepInfraBaseUrl,
normalizeDeepInfraModelRef,
} from "./media-models.js";
const DEEPINFRA_TTS_RESPONSE_FORMATS = ["mp3", "opus", "flac", "wav", "pcm"] as const;
type DeepInfraTtsResponseFormat = (typeof DEEPINFRA_TTS_RESPONSE_FORMATS)[number];
type DeepInfraTtsProviderConfig = {
apiKey?: string;
baseUrl?: string;
model: string;
voice: string;
speed?: number;
responseFormat?: DeepInfraTtsResponseFormat;
type DeepInfraTtsExtraConfig = {
extraBody?: Record<string, unknown>;
};
type DeepInfraTtsProviderOverrides = {
model?: string;
voice?: string;
speed?: number;
};
// Lowercase and validate a configured response format.
// Returns undefined when absent; throws on an unrecognized value.
function normalizeDeepInfraTtsResponseFormat(
  value: unknown,
): DeepInfraTtsResponseFormat | undefined {
  const candidate = normalizeOptionalLowercaseString(value);
  if (!candidate) {
    return undefined;
  }
  const match = DEEPINFRA_TTS_RESPONSE_FORMATS.find((format) => format === candidate);
  if (match === undefined) {
    throw new Error(`Invalid DeepInfra speech responseFormat: ${candidate}`);
  }
  return match;
}
// Locate the DeepInfra record: prefer providers.deepinfra, then a
// top-level deepinfra key.
function resolveDeepInfraProviderConfigRecord(
  rawConfig: Record<string, unknown>,
): Record<string, unknown> | undefined {
  const providerMap = asObject(rawConfig.providers);
  const nested = asObject(providerMap?.deepinfra);
  return nested ?? asObject(rawConfig.deepinfra);
}
// Normalize a raw speech config object into a DeepInfraTtsProviderConfig,
// filling provider defaults for model/voice and validating each field.
function normalizeDeepInfraTtsProviderConfig(
  rawConfig: Record<string, unknown>,
): DeepInfraTtsProviderConfig {
  const raw = resolveDeepInfraProviderConfigRecord(rawConfig);
  return {
    // Path string is used by the secret-input helper for error reporting.
    apiKey: normalizeResolvedSecretInputString({
      value: raw?.apiKey,
      path: "messages.tts.providers.deepinfra.apiKey",
    }),
    // Only normalize the base URL when one was actually provided.
    baseUrl:
      trimToUndefined(raw?.baseUrl) == null ? undefined : normalizeDeepInfraBaseUrl(raw?.baseUrl),
    // `model` wins over the legacy `modelId` key; falls back to the default model.
    model: normalizeDeepInfraModelRef(
      trimToUndefined(raw?.model ?? raw?.modelId),
      DEFAULT_DEEPINFRA_TTS_MODEL,
    ),
    voice: trimToUndefined(raw?.voice ?? raw?.voiceId) ?? DEFAULT_DEEPINFRA_TTS_VOICE,
    speed: asFiniteNumber(raw?.speed),
    // Throws if responseFormat is present but not a recognized value.
    responseFormat: normalizeDeepInfraTtsResponseFormat(raw?.responseFormat),
    extraBody: asObject(raw?.extraBody),
  };
}
// Merge an explicit SpeechProviderConfig over the defaults produced by
// normalizeDeepInfraTtsProviderConfig({}); explicit values win field by field.
function readDeepInfraTtsProviderConfig(config: SpeechProviderConfig): DeepInfraTtsProviderConfig {
  const normalized = normalizeDeepInfraTtsProviderConfig({});
  return {
    apiKey: trimToUndefined(config.apiKey) ?? normalized.apiKey,
    baseUrl:
      trimToUndefined(config.baseUrl) == null
        ? normalized.baseUrl
        : normalizeDeepInfraBaseUrl(config.baseUrl),
    // Re-normalize the model ref so explicit values pass the same validation.
    model: normalizeDeepInfraModelRef(
      trimToUndefined(config.model ?? config.modelId),
      normalized.model,
    ),
    voice: trimToUndefined(config.voice ?? config.voiceId) ?? normalized.voice,
    speed: asFiniteNumber(config.speed) ?? normalized.speed,
    responseFormat:
      normalizeDeepInfraTtsResponseFormat(config.responseFormat) ?? normalized.responseFormat,
    extraBody: asObject(config.extraBody) ?? normalized.extraBody,
  };
}
// Normalize per-request overrides; absent overrides yield an empty record.
function readDeepInfraTtsOverrides(
  overrides: SpeechProviderOverrides | undefined,
): DeepInfraTtsProviderOverrides {
  if (overrides == null) {
    return {};
  }
  const model = trimToUndefined(overrides.model ?? overrides.modelId);
  const voice = trimToUndefined(overrides.voice ?? overrides.voiceId);
  const speed = asFiniteNumber(overrides.speed);
  return { model, voice, speed };
}
// API key precedence: per-provider speech config, then models.providers.deepinfra
// from the app config, then the DEEPINFRA_API_KEY environment variable.
function resolveDeepInfraTtsApiKey(params: {
  cfg?: { models?: { providers?: { deepinfra?: { apiKey?: unknown } } } };
  providerConfig: DeepInfraTtsProviderConfig;
}): string | undefined {
  return (
    params.providerConfig.apiKey ??
    normalizeResolvedSecretInputString({
      value: params.cfg?.models?.providers?.deepinfra?.apiKey,
      path: "models.providers.deepinfra.apiKey",
    }) ??
    trimToUndefined(process.env.DEEPINFRA_API_KEY)
  );
}
// Base URL precedence: provider speech config, then models.providers.deepinfra,
// then the built-in default; the winner is always normalized.
function resolveDeepInfraTtsBaseUrl(params: {
  cfg?: { models?: { providers?: { deepinfra?: { baseUrl?: unknown } } } };
  providerConfig: DeepInfraTtsProviderConfig;
}): string {
  return normalizeDeepInfraBaseUrl(
    params.providerConfig.baseUrl ??
      trimToUndefined(params.cfg?.models?.providers?.deepinfra?.baseUrl) ??
      DEEPINFRA_BASE_URL,
  );
}
// Map a response format to its dot-prefixed file extension.
function responseFormatToFileExtension(
  format: DeepInfraTtsResponseFormat,
): ".mp3" | ".opus" | ".flac" | ".wav" | ".pcm" {
  switch (format) {
    case "mp3":
      return ".mp3";
    case "opus":
      return ".opus";
    case "flac":
      return ".flac";
    case "wav":
      return ".wav";
    default:
      return ".pcm";
  }
}
// Translate one key=value directive token into provider overrides.
// Policy flags gate each override; a recognized-but-disallowed key is still
// marked handled (so it is consumed), just without overrides.
function parseDirectiveToken(ctx: SpeechDirectiveTokenParseContext): {
  handled: boolean;
  overrides?: SpeechProviderOverrides;
} {
  const voiceKeys = ["voice", "voice_id", "voiceid", "deepinfra_voice", "deepinfravoice"];
  const modelKeys = ["model", "model_id", "modelid", "deepinfra_model", "deepinframodel"];
  if (voiceKeys.includes(ctx.key)) {
    return ctx.policy.allowVoice
      ? { handled: true, overrides: { voice: ctx.value } }
      : { handled: true };
  }
  if (modelKeys.includes(ctx.key)) {
    return ctx.policy.allowModelId
      ? { handled: true, overrides: { model: ctx.value } }
      : { handled: true };
  }
  return { handled: false };
}
export function buildDeepInfraSpeechProvider(): SpeechProviderPlugin {
return {
return createOpenAiCompatibleSpeechProvider<DeepInfraTtsExtraConfig>({
id: "deepinfra",
label: "DeepInfra",
autoSelectOrder: 45,
models: [...DEEPINFRA_TTS_MODELS],
models: DEEPINFRA_TTS_MODELS,
voices: [DEFAULT_DEEPINFRA_TTS_VOICE],
resolveConfig: ({ rawConfig }) => normalizeDeepInfraTtsProviderConfig(rawConfig),
parseDirectiveToken,
resolveTalkConfig: ({ baseTtsConfig, talkProviderConfig }) => {
const base = normalizeDeepInfraTtsProviderConfig(baseTtsConfig);
const responseFormat = normalizeDeepInfraTtsResponseFormat(talkProviderConfig.responseFormat);
return {
...base,
...(talkProviderConfig.apiKey === undefined
? {}
: {
apiKey: normalizeResolvedSecretInputString({
value: talkProviderConfig.apiKey,
path: "talk.providers.deepinfra.apiKey",
}),
}),
...(trimToUndefined(talkProviderConfig.baseUrl) == null
? {}
: { baseUrl: normalizeDeepInfraBaseUrl(talkProviderConfig.baseUrl) }),
...(trimToUndefined(talkProviderConfig.modelId) == null
? {}
: {
model: normalizeDeepInfraModelRef(
trimToUndefined(talkProviderConfig.modelId),
DEFAULT_DEEPINFRA_TTS_MODEL,
),
}),
...(trimToUndefined(talkProviderConfig.voiceId) == null
? {}
: { voice: trimToUndefined(talkProviderConfig.voiceId) }),
...(asFiniteNumber(talkProviderConfig.speed) == null
? {}
: { speed: asFiniteNumber(talkProviderConfig.speed) }),
...(responseFormat == null ? {} : { responseFormat }),
};
},
resolveTalkOverrides: ({ params }) => ({
...(trimToUndefined(params.voiceId ?? params.voice) == null
? {}
: { voice: trimToUndefined(params.voiceId ?? params.voice) }),
...(trimToUndefined(params.modelId ?? params.model) == null
? {}
: { model: trimToUndefined(params.modelId ?? params.model) }),
...(asFiniteNumber(params.speed) == null ? {} : { speed: asFiniteNumber(params.speed) }),
}),
listVoices: async () => [
{ id: DEFAULT_DEEPINFRA_TTS_VOICE, name: DEFAULT_DEEPINFRA_TTS_VOICE },
],
isConfigured: ({ cfg, providerConfig }) => {
const config = readDeepInfraTtsProviderConfig(providerConfig);
return Boolean(resolveDeepInfraTtsApiKey({ cfg, providerConfig: config }));
},
synthesize: async (req) => {
const config = readDeepInfraTtsProviderConfig(req.providerConfig);
const overrides = readDeepInfraTtsOverrides(req.providerOverrides);
const apiKey = resolveDeepInfraTtsApiKey({ cfg: req.cfg, providerConfig: config });
if (!apiKey) {
throw new Error("DeepInfra API key missing");
}
const baseUrl = resolveDeepInfraTtsBaseUrl({ cfg: req.cfg, providerConfig: config });
const responseFormat = config.responseFormat ?? "mp3";
const speed = overrides.speed ?? config.speed;
const { allowPrivateNetwork, headers, dispatcherPolicy } = resolveProviderHttpRequestConfig({
baseUrl,
defaultBaseUrl: DEEPINFRA_BASE_URL,
allowPrivateNetwork: false,
defaultHeaders: {
Authorization: `Bearer ${apiKey}`,
"Content-Type": "application/json",
},
provider: "deepinfra",
capability: "audio",
transport: "http",
});
const { response, release } = await postJsonRequest({
url: `${baseUrl}/audio/speech`,
headers,
body: {
model: normalizeDeepInfraModelRef(
overrides.model ?? config.model,
DEFAULT_DEEPINFRA_TTS_MODEL,
),
input: req.text,
voice: overrides.voice ?? config.voice,
response_format: responseFormat,
...(speed == null ? {} : { speed }),
...(config.extraBody == null ? {} : { extra_body: config.extraBody }),
},
timeoutMs: req.timeoutMs,
fetchFn: fetch,
allowPrivateNetwork,
dispatcherPolicy,
});
try {
await assertOkOrThrowHttpError(response, "DeepInfra TTS API error");
return {
audioBuffer: Buffer.from(await response.arrayBuffer()),
outputFormat: responseFormat,
fileExtension: responseFormatToFileExtension(responseFormat),
voiceCompatible: responseFormat === "mp3" || responseFormat === "opus",
};
} finally {
await release();
}
},
};
defaultModel: DEFAULT_DEEPINFRA_TTS_MODEL,
defaultVoice: DEFAULT_DEEPINFRA_TTS_VOICE,
defaultBaseUrl: DEEPINFRA_BASE_URL,
envKey: "DEEPINFRA_API_KEY",
responseFormats: DEEPINFRA_TTS_RESPONSE_FORMATS,
defaultResponseFormat: "mp3",
voiceCompatibleResponseFormats: ["mp3", "opus"],
baseUrlPolicy: { kind: "trim-trailing-slash" },
normalizeModel: normalizeDeepInfraModelRef,
apiErrorLabel: "DeepInfra TTS API error",
missingApiKeyError: "DeepInfra API key missing",
readExtraConfig: (raw) => ({ extraBody: asObject(raw?.extraBody) }),
extraJsonBodyFields: [{ configKey: "extraBody", requestKey: "extra_body" }],
});
}

View File

@@ -2,6 +2,10 @@ import type {
GeneratedImageAsset,
ImageGenerationProvider,
} from "openclaw/plugin-sdk/image-generation";
import {
imageFileExtensionForMimeType,
toImageDataUrl,
} from "openclaw/plugin-sdk/image-generation";
import { isProviderApiKeyConfigured } from "openclaw/plugin-sdk/provider-auth";
import { resolveApiKeyForProvider } from "openclaw/plugin-sdk/provider-auth-runtime";
import {
@@ -16,10 +20,7 @@ import {
type SsrFPolicy,
ssrfPolicyFromDangerouslyAllowPrivateNetwork,
} from "openclaw/plugin-sdk/ssrf-runtime";
import {
normalizeLowercaseStringOrEmpty,
normalizeOptionalLowercaseString,
} from "openclaw/plugin-sdk/text-runtime";
import { normalizeLowercaseStringOrEmpty } from "openclaw/plugin-sdk/text-runtime";
const DEFAULT_FAL_BASE_URL = "https://fal.run";
const DEFAULT_FAL_IMAGE_MODEL = "fal-ai/flux/dev";
@@ -214,22 +215,6 @@ function resolveFalImageSize(params: {
return undefined;
}
// Encode a binary image as a base64 data: URI.
function toDataUri(buffer: Buffer, mimeType: string): string {
  const encoded = buffer.toString("base64");
  return "data:" + mimeType + ";base64," + encoded;
}
// Derive a file extension from a mime type; missing or unparseable
// types fall back to "png", and jpeg maps to the conventional "jpg".
function fileExtensionForMimeType(mimeType: string | undefined): string {
  const normalized = normalizeOptionalLowercaseString(mimeType);
  if (!normalized) {
    return "png";
  }
  if (normalized.includes("jpeg")) {
    return "jpg";
  }
  const slash = normalized.indexOf("/");
  if (slash < 0) {
    return "png";
  }
  const subtype = normalized.slice(slash + 1);
  return subtype === "" ? "png" : subtype;
}
async function fetchImageBuffer(
url: string,
networkPolicy?: FalNetworkPolicy,
@@ -348,7 +333,7 @@ export function buildFalImageGenerationProvider(): ImageGenerationProvider {
if (!input) {
throw new Error("fal image edit request missing reference image");
}
requestBody.image_url = toDataUri(input.buffer, input.mimeType);
requestBody.image_url = toImageDataUrl(input);
}
const { response, release } = await falFetchGuard({
url: `${baseUrl}/${model}`,
@@ -378,7 +363,7 @@ export function buildFalImageGenerationProvider(): ImageGenerationProvider {
images.push({
buffer: downloaded.buffer,
mimeType: downloaded.mimeType,
fileName: `image-${imageIndex}.${fileExtensionForMimeType(
fileName: `image-${imageIndex}.${imageFileExtensionForMimeType(
downloaded.mimeType || entry.content_type,
)}`,
});

View File

@@ -1,5 +1,9 @@
import type { OpenClawConfig } from "openclaw/plugin-sdk/config-types";
import type { ImageGenerationProvider } from "openclaw/plugin-sdk/image-generation";
import {
parseOpenAiCompatibleImageResponse,
toImageDataUrl,
} from "openclaw/plugin-sdk/image-generation";
import { isProviderApiKeyConfigured } from "openclaw/plugin-sdk/provider-auth";
import { resolveApiKeyForProvider } from "openclaw/plugin-sdk/provider-auth-runtime";
import {
@@ -11,7 +15,6 @@ import {
import { normalizeOptionalString } from "openclaw/plugin-sdk/text-runtime";
import { LITELLM_BASE_URL } from "./onboard.js";
const DEFAULT_OUTPUT_MIME = "image/png";
const DEFAULT_SIZE = "1024x1024";
const DEFAULT_LITELLM_IMAGE_MODEL = "gpt-image-2";
const LITELLM_SUPPORTED_SIZES = [
@@ -82,10 +85,6 @@ function shouldAutoAllowPrivateLitellmEndpoint(baseUrl: string): boolean {
}
}
// Render a buffer as a base64 data URL with the given mime type.
function toDataUrl(buffer: Buffer, mimeType: string): string {
  const payload = buffer.toString("base64");
  return ["data:", mimeType, ";base64,", payload].join("");
}
type LitellmImageApiResponse = {
data?: Array<{
b64_json?: string;
@@ -167,7 +166,7 @@ export function buildLitellmImageGenerationProvider(): ImageGenerationProvider {
n: count,
size,
images: inputImages.map((image) => ({
image_url: toDataUrl(image.buffer, image.mimeType?.trim() || DEFAULT_OUTPUT_MIME),
image_url: toImageDataUrl(image),
})),
}
: {
@@ -192,21 +191,7 @@ export function buildLitellmImageGenerationProvider(): ImageGenerationProvider {
);
const data = (await response.json()) as LitellmImageApiResponse;
const images = (data.data ?? [])
.map((entry, index) => {
if (!entry.b64_json) {
return null;
}
return Object.assign(
{
buffer: Buffer.from(entry.b64_json, `base64`),
mimeType: DEFAULT_OUTPUT_MIME,
fileName: `image-${index + 1}.png`,
},
entry.revised_prompt ? { revisedPrompt: entry.revised_prompt } : {},
);
})
.filter((entry): entry is NonNullable<typeof entry> => entry !== null);
const images = parseOpenAiCompatibleImageResponse(data);
return {
images,

View File

@@ -3,6 +3,11 @@ import type {
ImageGenerationProvider,
ImageGenerationRequest,
} from "openclaw/plugin-sdk/image-generation";
import {
generatedImageAssetFromBase64,
generatedImageAssetFromDataUrl,
toImageDataUrl,
} from "openclaw/plugin-sdk/image-generation";
import { isProviderApiKeyConfigured } from "openclaw/plugin-sdk/provider-auth";
import { resolveApiKeyForProvider } from "openclaw/plugin-sdk/provider-auth-runtime";
import {
@@ -14,7 +19,6 @@ import { normalizeOptionalString } from "openclaw/plugin-sdk/text-runtime";
import { OPENROUTER_BASE_URL } from "./provider-catalog.js";
const DEFAULT_MODEL = "google/gemini-3.1-flash-image-preview";
const DEFAULT_OUTPUT_MIME = "image/png";
const DEFAULT_TIMEOUT_MS = 90_000;
const MAX_IMAGE_RESULTS = 4;
const SUPPORTED_MODELS = [
@@ -49,56 +53,12 @@ type OpenRouterChatCompletionResponse = {
}>;
};
// Split a base64 data URL into its mime type and payload.
// Returns undefined when the string does not match the expected shape.
function parseDataUrl(dataUrl: string): { mimeType: string; data: string } | undefined {
  const match = /^data:([^;]+);base64,(.+)$/s.exec(dataUrl);
  if (match === null) {
    return undefined;
  }
  const mimeType = match[1];
  const data = match[2];
  if (!mimeType || !data) {
    return undefined;
  }
  return { mimeType, data };
}
// Pick a file extension for a mime type; special-cases common image formats
// and otherwise uses the subtype, falling back to "png".
function fileExtensionForMimeType(mimeType: string): string {
  const specialCases: Array<[needle: string, ext: string]> = [
    ["jpeg", "jpg"],
    ["jpg", "jpg"],
    ["webp", "webp"],
    ["gif", "gif"],
  ];
  for (const [needle, ext] of specialCases) {
    if (mimeType.includes(needle)) {
      return ext;
    }
  }
  return mimeType.split("/")[1] ?? "png";
}
// Decode one base64 image payload into a GeneratedImageAsset with a
// numbered file name derived from the (possibly defaulted) mime type.
function toGeneratedImage(params: {
  base64: string;
  index: number;
  mimeType?: string;
}): GeneratedImageAsset {
  const resolvedMime = params.mimeType ?? DEFAULT_OUTPUT_MIME;
  const extension = fileExtensionForMimeType(resolvedMime);
  return {
    buffer: Buffer.from(params.base64, "base64"),
    mimeType: resolvedMime,
    fileName: `image-${params.index + 1}.${extension}`,
  };
}
function pushDataUrlImage(images: GeneratedImageAsset[], dataUrl: string): void {
const parsed = parseDataUrl(dataUrl);
if (!parsed) {
const image = generatedImageAssetFromDataUrl({ dataUrl, index: images.length });
if (!image) {
return;
}
images.push(
toGeneratedImage({
base64: parsed.data,
index: images.length,
mimeType: parsed.mimeType,
}),
);
images.push(image);
}
function extractImagesFromPart(images: GeneratedImageAsset[], part: unknown): void {
@@ -117,7 +77,10 @@ function extractImagesFromPart(images: GeneratedImageAsset[], part: unknown): vo
const rawBase64 = typeof value.b64_json === "string" ? value.b64_json : undefined;
if (rawBase64) {
images.push(toGeneratedImage({ base64: rawBase64, index: images.length }));
const image = generatedImageAssetFromBase64({ base64: rawBase64, index: images.length });
if (image) {
images.push(image);
}
return;
}
@@ -129,8 +92,15 @@ function extractImagesFromPart(images: GeneratedImageAsset[], part: unknown): vo
const mimeType =
(typeof inlineData?.mimeType === "string" ? inlineData.mimeType : undefined) ??
(typeof inlineData?.mime_type === "string" ? inlineData.mime_type : undefined) ??
DEFAULT_OUTPUT_MIME;
images.push(toGeneratedImage({ base64: data, index: images.length, mimeType }));
"image/png";
const image = generatedImageAssetFromBase64({
base64: data,
index: images.length,
mimeType,
});
if (image) {
images.push(image);
}
}
export function extractOpenRouterImagesFromResponse(
@@ -165,10 +135,6 @@ export function extractOpenRouterImagesFromResponse(
return images;
}
// Serialize an in-memory image into a data URL.
function toDataUrl(image: { buffer: Buffer; mimeType: string }): string {
  const encoded = image.buffer.toString("base64");
  return "data:" + image.mimeType + ";base64," + encoded;
}
function resolveImageCount(count: number | undefined): number {
if (typeof count !== "number" || !Number.isFinite(count)) {
return 1;
@@ -193,7 +159,7 @@ function buildMessageContent(
{ type: "text", text: req.prompt },
...inputImages.map((image) => ({
type: "image_url" as const,
image_url: { url: toDataUrl(image) },
image_url: { url: toImageDataUrl(image) },
})),
];
}

View File

@@ -1,20 +1,9 @@
import {
assertOkOrThrowHttpError,
postJsonRequest,
resolveProviderHttpRequestConfig,
} from "openclaw/plugin-sdk/provider-http";
import { normalizeResolvedSecretInputString } from "openclaw/plugin-sdk/secret-input";
import {
asFiniteNumber,
asObject,
trimToUndefined,
type SpeechDirectiveTokenParseContext,
type SpeechProviderConfig,
type SpeechProviderOverrides,
createOpenAiCompatibleSpeechProvider,
type SpeechProviderPlugin,
} from "openclaw/plugin-sdk/speech";
import { normalizeOptionalLowercaseString } from "openclaw/plugin-sdk/text-runtime";
import { normalizeOpenRouterBaseUrl, OPENROUTER_BASE_URL } from "./provider-catalog.js";
import { OPENROUTER_BASE_URL } from "./provider-catalog.js";
const DEFAULT_OPENROUTER_TTS_MODEL = "hexgrad/kokoro-82m";
const DEFAULT_OPENROUTER_TTS_VOICE = "af_alloy";
@@ -26,278 +15,32 @@ const OPENROUTER_TTS_MODELS = [
] as const;
const OPENROUTER_TTS_RESPONSE_FORMATS = ["mp3", "pcm"] as const;
type OpenRouterTtsResponseFormat = (typeof OPENROUTER_TTS_RESPONSE_FORMATS)[number];
type OpenRouterTtsProviderConfig = {
apiKey?: string;
baseUrl?: string;
model: string;
voice: string;
speed?: number;
responseFormat?: OpenRouterTtsResponseFormat;
type OpenRouterTtsExtraConfig = {
provider?: Record<string, unknown>;
};
type OpenRouterTtsProviderOverrides = {
model?: string;
voice?: string;
speed?: number;
};
// Lowercase and validate a configured response format.
// Returns undefined when absent; throws on an unrecognized value.
function normalizeOpenRouterTtsResponseFormat(
  value: unknown,
): OpenRouterTtsResponseFormat | undefined {
  const candidate = normalizeOptionalLowercaseString(value);
  if (!candidate) {
    return undefined;
  }
  const match = OPENROUTER_TTS_RESPONSE_FORMATS.find((format) => format === candidate);
  if (match === undefined) {
    throw new Error(`Invalid OpenRouter speech responseFormat: ${candidate}`);
  }
  return match;
}
// Normalize a configured base URL, falling back to the canonical
// OpenRouter URL both for missing input and a failed normalization.
function normalizeOpenRouterTtsBaseUrl(value: unknown): string {
  const candidate = trimToUndefined(value) ?? OPENROUTER_BASE_URL;
  const normalized = normalizeOpenRouterBaseUrl(candidate);
  return normalized ?? OPENROUTER_BASE_URL;
}
// Locate the OpenRouter record: prefer providers.openrouter, then a
// top-level openrouter key.
function resolveOpenRouterProviderConfigRecord(
  rawConfig: Record<string, unknown>,
): Record<string, unknown> | undefined {
  const providerMap = asObject(rawConfig.providers);
  const nested = asObject(providerMap?.openrouter);
  return nested ?? asObject(rawConfig.openrouter);
}
// Normalize a raw speech config object into an OpenRouterTtsProviderConfig,
// filling provider defaults for model/voice and validating each field.
function normalizeOpenRouterTtsProviderConfig(
  rawConfig: Record<string, unknown>,
): OpenRouterTtsProviderConfig {
  const raw = resolveOpenRouterProviderConfigRecord(rawConfig);
  return {
    // Path string is used by the secret-input helper for error reporting.
    apiKey: normalizeResolvedSecretInputString({
      value: raw?.apiKey,
      path: "messages.tts.providers.openrouter.apiKey",
    }),
    // Only normalize the base URL when one was actually provided.
    baseUrl:
      trimToUndefined(raw?.baseUrl) == null
        ? undefined
        : normalizeOpenRouterTtsBaseUrl(raw?.baseUrl),
    // `model` wins over the legacy `modelId` key; falls back to the default model.
    model: trimToUndefined(raw?.model ?? raw?.modelId) ?? DEFAULT_OPENROUTER_TTS_MODEL,
    voice: trimToUndefined(raw?.voice ?? raw?.voiceId) ?? DEFAULT_OPENROUTER_TTS_VOICE,
    speed: asFiniteNumber(raw?.speed),
    // Throws if responseFormat is present but not a recognized value.
    responseFormat: normalizeOpenRouterTtsResponseFormat(raw?.responseFormat),
    provider: asObject(raw?.provider),
  };
}
// Merge an explicit SpeechProviderConfig over the defaults produced by
// normalizeOpenRouterTtsProviderConfig({}); explicit values win field by field.
function readOpenRouterTtsProviderConfig(
  config: SpeechProviderConfig,
): OpenRouterTtsProviderConfig {
  const normalized = normalizeOpenRouterTtsProviderConfig({});
  return {
    apiKey: trimToUndefined(config.apiKey) ?? normalized.apiKey,
    baseUrl:
      trimToUndefined(config.baseUrl) == null
        ? normalized.baseUrl
        : normalizeOpenRouterTtsBaseUrl(config.baseUrl),
    model: trimToUndefined(config.model ?? config.modelId) ?? normalized.model,
    voice: trimToUndefined(config.voice ?? config.voiceId) ?? normalized.voice,
    speed: asFiniteNumber(config.speed) ?? normalized.speed,
    responseFormat:
      normalizeOpenRouterTtsResponseFormat(config.responseFormat) ?? normalized.responseFormat,
    provider: asObject(config.provider) ?? normalized.provider,
  };
}
// Normalize per-request overrides; absent overrides yield an empty record.
function readOpenRouterTtsOverrides(
  overrides: SpeechProviderOverrides | undefined,
): OpenRouterTtsProviderOverrides {
  if (overrides == null) {
    return {};
  }
  const model = trimToUndefined(overrides.model ?? overrides.modelId);
  const voice = trimToUndefined(overrides.voice ?? overrides.voiceId);
  const speed = asFiniteNumber(overrides.speed);
  return { model, voice, speed };
}
// API key precedence: per-provider speech config, then models.providers.openrouter
// from the app config, then the OPENROUTER_API_KEY environment variable.
function resolveOpenRouterTtsApiKey(params: {
  cfg?: { models?: { providers?: { openrouter?: { apiKey?: unknown } } } };
  providerConfig: OpenRouterTtsProviderConfig;
}): string | undefined {
  return (
    params.providerConfig.apiKey ??
    normalizeResolvedSecretInputString({
      value: params.cfg?.models?.providers?.openrouter?.apiKey,
      path: "models.providers.openrouter.apiKey",
    }) ??
    trimToUndefined(process.env.OPENROUTER_API_KEY)
  );
}
// Base URL precedence: provider speech config, then models.providers.openrouter,
// then the built-in default; the winner is always normalized.
function resolveOpenRouterTtsBaseUrl(params: {
  cfg?: { models?: { providers?: { openrouter?: { baseUrl?: unknown } } } };
  providerConfig: OpenRouterTtsProviderConfig;
}): string {
  return normalizeOpenRouterTtsBaseUrl(
    params.providerConfig.baseUrl ??
      trimToUndefined(params.cfg?.models?.providers?.openrouter?.baseUrl) ??
      OPENROUTER_BASE_URL,
  );
}
// Use the configured format when present; default to "mp3".
function resolveOpenRouterTtsResponseFormat(
  configuredFormat?: OpenRouterTtsResponseFormat,
): OpenRouterTtsResponseFormat {
  return configuredFormat ?? "mp3";
}
// Map a response format to its dot-prefixed file extension.
function responseFormatToFileExtension(format: OpenRouterTtsResponseFormat): ".mp3" | ".pcm" {
  if (format === "pcm") {
    return ".pcm";
  }
  return ".mp3";
}
// Translate one key=value directive token into provider overrides.
// Policy flags gate each override; a recognized-but-disallowed key is still
// marked handled (so it is consumed), just without overrides.
function parseDirectiveToken(ctx: SpeechDirectiveTokenParseContext): {
  handled: boolean;
  overrides?: SpeechProviderOverrides;
} {
  const voiceKeys = ["voice", "voice_id", "voiceid", "openrouter_voice", "openroutervoice"];
  const modelKeys = ["model", "model_id", "modelid", "openrouter_model", "openroutermodel"];
  if (voiceKeys.includes(ctx.key)) {
    return ctx.policy.allowVoice
      ? { handled: true, overrides: { voice: ctx.value } }
      : { handled: true };
  }
  if (modelKeys.includes(ctx.key)) {
    return ctx.policy.allowModelId
      ? { handled: true, overrides: { model: ctx.value } }
      : { handled: true };
  }
  return { handled: false };
}
export function buildOpenRouterSpeechProvider(): SpeechProviderPlugin {
return {
return createOpenAiCompatibleSpeechProvider<OpenRouterTtsExtraConfig>({
id: "openrouter",
label: "OpenRouter",
autoSelectOrder: 35,
models: OPENROUTER_TTS_MODELS,
voices: [DEFAULT_OPENROUTER_TTS_VOICE],
resolveConfig: ({ rawConfig }) => normalizeOpenRouterTtsProviderConfig(rawConfig),
parseDirectiveToken,
resolveTalkConfig: ({ baseTtsConfig, talkProviderConfig }) => {
const base = normalizeOpenRouterTtsProviderConfig(baseTtsConfig);
const responseFormat = normalizeOpenRouterTtsResponseFormat(
talkProviderConfig.responseFormat,
);
return {
...base,
...(talkProviderConfig.apiKey === undefined
? {}
: {
apiKey: normalizeResolvedSecretInputString({
value: talkProviderConfig.apiKey,
path: "talk.providers.openrouter.apiKey",
}),
}),
...(trimToUndefined(talkProviderConfig.baseUrl) == null
? {}
: { baseUrl: normalizeOpenRouterTtsBaseUrl(talkProviderConfig.baseUrl) }),
...(trimToUndefined(talkProviderConfig.modelId) == null
? {}
: { model: trimToUndefined(talkProviderConfig.modelId) }),
...(trimToUndefined(talkProviderConfig.voiceId) == null
? {}
: { voice: trimToUndefined(talkProviderConfig.voiceId) }),
...(asFiniteNumber(talkProviderConfig.speed) == null
? {}
: { speed: asFiniteNumber(talkProviderConfig.speed) }),
...(responseFormat == null ? {} : { responseFormat }),
};
defaultModel: DEFAULT_OPENROUTER_TTS_MODEL,
defaultVoice: DEFAULT_OPENROUTER_TTS_VOICE,
defaultBaseUrl: OPENROUTER_BASE_URL,
envKey: "OPENROUTER_API_KEY",
responseFormats: OPENROUTER_TTS_RESPONSE_FORMATS,
defaultResponseFormat: "mp3",
voiceCompatibleResponseFormats: ["mp3"],
baseUrlPolicy: { kind: "canonical", aliases: ["https://openrouter.ai/v1"] },
extraHeaders: {
"HTTP-Referer": "https://openclaw.ai",
"X-OpenRouter-Title": "OpenClaw",
},
resolveTalkOverrides: ({ params }) => ({
...(trimToUndefined(params.voiceId ?? params.voice) == null
? {}
: { voice: trimToUndefined(params.voiceId ?? params.voice) }),
...(trimToUndefined(params.modelId ?? params.model) == null
? {}
: { model: trimToUndefined(params.modelId ?? params.model) }),
...(asFiniteNumber(params.speed) == null ? {} : { speed: asFiniteNumber(params.speed) }),
}),
listVoices: async () => [
{ id: DEFAULT_OPENROUTER_TTS_VOICE, name: DEFAULT_OPENROUTER_TTS_VOICE },
],
isConfigured: ({ cfg, providerConfig }) => {
const config = readOpenRouterTtsProviderConfig(providerConfig);
return Boolean(resolveOpenRouterTtsApiKey({ cfg, providerConfig: config }));
},
synthesize: async (req) => {
const config = readOpenRouterTtsProviderConfig(req.providerConfig);
const overrides = readOpenRouterTtsOverrides(req.providerOverrides);
const apiKey = resolveOpenRouterTtsApiKey({ cfg: req.cfg, providerConfig: config });
if (!apiKey) {
throw new Error("OpenRouter API key missing");
}
const baseUrl = resolveOpenRouterTtsBaseUrl({ cfg: req.cfg, providerConfig: config });
const responseFormat = resolveOpenRouterTtsResponseFormat(config.responseFormat);
const speed = overrides.speed ?? config.speed;
const { allowPrivateNetwork, headers, dispatcherPolicy } = resolveProviderHttpRequestConfig({
baseUrl,
defaultBaseUrl: OPENROUTER_BASE_URL,
allowPrivateNetwork: false,
defaultHeaders: {
Authorization: `Bearer ${apiKey}`,
"Content-Type": "application/json",
"HTTP-Referer": "https://openclaw.ai",
"X-OpenRouter-Title": "OpenClaw",
},
provider: "openrouter",
capability: "audio",
transport: "http",
});
const { response, release } = await postJsonRequest({
url: `${baseUrl}/audio/speech`,
headers,
body: {
model: overrides.model ?? config.model,
input: req.text,
voice: overrides.voice ?? config.voice,
response_format: responseFormat,
...(speed == null ? {} : { speed }),
...(config.provider == null ? {} : { provider: config.provider }),
},
timeoutMs: req.timeoutMs,
fetchFn: fetch,
allowPrivateNetwork,
dispatcherPolicy,
});
try {
await assertOkOrThrowHttpError(response, "OpenRouter TTS API error");
return {
audioBuffer: Buffer.from(await response.arrayBuffer()),
outputFormat: responseFormat,
fileExtension: responseFormatToFileExtension(responseFormat),
voiceCompatible: responseFormat === "mp3",
};
} finally {
await release();
}
},
};
apiErrorLabel: "OpenRouter TTS API error",
missingApiKeyError: "OpenRouter API key missing",
readExtraConfig: (raw) => ({ provider: asObject(raw?.provider) }),
extraJsonBodyFields: [{ configKey: "provider" }],
});
}

View File

@@ -1,9 +1,12 @@
import type {
GeneratedImageAsset,
ImageGenerationProvider,
ImageGenerationRequest,
ImageGenerationResult,
} from "openclaw/plugin-sdk/image-generation";
import {
parseOpenAiCompatibleImageResponse,
toImageDataUrl,
} from "openclaw/plugin-sdk/image-generation";
import { isProviderApiKeyConfigured } from "openclaw/plugin-sdk/provider-auth";
import { resolveApiKeyForProvider } from "openclaw/plugin-sdk/provider-auth-runtime";
import {
@@ -19,7 +22,6 @@ import {
} from "openclaw/plugin-sdk/text-runtime";
import { XAI_BASE_URL, XAI_DEFAULT_IMAGE_MODEL, XAI_IMAGE_MODELS } from "./model-definitions.js";
const DEFAULT_OUTPUT_MIME = "image/png";
const DEFAULT_TIMEOUT_MS = 60_000;
const XAI_SUPPORTED_ASPECT_RATIOS = ["1:1", "16:9", "9:16", "4:3", "3:4", "2:3", "3:2"] as const;
@@ -32,10 +34,6 @@ type XaiImageApiResponse = {
}>;
};
// Encodes a binary image buffer as a base64 `data:` URL with the given MIME type.
function toDataUrl(buffer: Buffer, mimeType: string): string {
  const base64 = buffer.toString("base64");
  return ["data:", mimeType, ";base64,", base64].join("");
}
function resolveImageForEdit(
input: { url?: string; buffer?: Buffer; mimeType?: string } | undefined,
): string {
@@ -49,8 +47,7 @@ function resolveImageForEdit(
if (!input.buffer) {
throw new Error("xAI image edit input is missing both URL and buffer data.");
}
const mime = normalizeOptionalString(input.mimeType) ?? "image/png";
return toDataUrl(input.buffer, mime);
return toImageDataUrl({ buffer: input.buffer, mimeType: input.mimeType });
}
function isEdit(req: ImageGenerationRequest): boolean {
@@ -187,26 +184,7 @@ export function buildXaiImageGenerationProvider(): ImageGenerationProvider {
);
const payload = (await response.json()) as XaiImageApiResponse;
const images: GeneratedImageAsset[] = (payload.data ?? []).flatMap((item, idx) => {
if (!item) {
return [];
}
const b64 = normalizeOptionalString(item.b64_json);
if (!b64) {
return [];
}
const mimeType = normalizeOptionalString(item.mime_type) ?? DEFAULT_OUTPUT_MIME;
return [
{
buffer: Buffer.from(b64, "base64"),
mimeType,
fileName: `image-${idx + 1}.${mimeType.split("/")[1] || "png"}`,
...(item.revised_prompt
? { revisedPrompt: normalizeOptionalString(item.revised_prompt) }
: {}),
},
];
});
const images = parseOpenAiCompatibleImageResponse(payload);
return {
images,

View File

@@ -0,0 +1,86 @@
import { describe, expect, it } from "vitest";
import {
generatedImageAssetFromDataUrl,
imageFileExtensionForMimeType,
imageSourceUploadFileName,
parseImageDataUrl,
parseOpenAiCompatibleImageResponse,
sniffImageMimeType,
toImageDataUrl,
} from "./image-assets.js";
// Unit coverage for the shared image-asset helpers (data URLs, MIME sniffing,
// OpenAI-compatible response parsing, and upload filename resolution).
describe("image asset helpers", () => {
  it("converts buffers to image data URLs and parses them back", () => {
    const bytes = Buffer.from("png-bytes");
    const base64 = bytes.toString("base64");
    const url = toImageDataUrl({ buffer: bytes, mimeType: "image/png" });
    expect(url).toBe(`data:image/png;base64,${base64}`);
    expect(parseImageDataUrl(url)).toEqual({ mimeType: "image/png", base64 });
    // index is 0-based, file names are 1-based.
    expect(generatedImageAssetFromDataUrl({ dataUrl: url, index: 1 })).toMatchObject({
      buffer: bytes,
      mimeType: "image/png",
      fileName: "image-2.png",
    });
  });
  it("normalizes image file extensions", () => {
    const cases: Array<[string, string]> = [
      ["image/jpeg", "jpg"],
      ["image/webp", "webp"],
      ["image/svg+xml", "svg"],
    ];
    for (const [mime, extension] of cases) {
      expect(imageFileExtensionForMimeType(mime)).toBe(extension);
    }
    // Missing MIME types fall back to the provided default.
    expect(imageFileExtensionForMimeType(undefined, "jpg")).toBe("jpg");
  });
  it("sniffs common generated image types", () => {
    const jpegHeader = Buffer.from([0xff, 0xd8, 0xff]);
    expect(sniffImageMimeType(jpegHeader)).toEqual({ mimeType: "image/jpeg", extension: "jpg" });
    const pngHeader = Buffer.from([0x89, 0x50, 0x4e, 0x47, 0, 0, 0, 0]);
    expect(sniffImageMimeType(pngHeader)).toEqual({ mimeType: "image/png", extension: "png" });
  });
  it("parses OpenAI-compatible base64 image responses", () => {
    const jpegBytes = Buffer.from([0xff, 0xd8, 0xff, 0xdb]);
    const payload = {
      data: [
        { b64_json: jpegBytes.toString("base64"), revised_prompt: "revised" },
        // Empty payloads are dropped from the result.
        { b64_json: "" },
      ],
    };
    const images = parseOpenAiCompatibleImageResponse(payload, {
      defaultMimeType: "image/png",
      sniffMimeType: true,
    });
    expect(images).toEqual([
      {
        buffer: jpegBytes,
        mimeType: "image/jpeg",
        fileName: "image-1.jpg",
        revisedPrompt: "revised",
      },
    ]);
  });
  it("resolves source upload filenames from explicit names or MIME types", () => {
    const derived = imageSourceUploadFileName({
      image: { buffer: Buffer.from("x"), mimeType: "image/webp" },
      index: 2,
    });
    expect(derived).toBe("image-3.webp");
    // An explicit fileName always wins over MIME-derived naming.
    const explicit = imageSourceUploadFileName({
      image: { buffer: Buffer.from("x"), mimeType: "image/png", fileName: "source.png" },
      index: 0,
    });
    expect(explicit).toBe("source.png");
  });
});

View File

@@ -0,0 +1,200 @@
import {
normalizeOptionalLowercaseString,
normalizeOptionalString,
} from "../shared/string-coerce.js";
import type { GeneratedImageAsset, ImageGenerationSourceImage } from "./types.js";
// Fallback MIME type when neither an explicit nor sniffed type is available.
const DEFAULT_IMAGE_MIME_TYPE = "image/png";
// Prefix for generated file names: `<prefix>-<index + 1>.<ext>`.
const DEFAULT_IMAGE_FILE_PREFIX = "image";
/** Result of magic-number sniffing: a MIME type and matching file extension. */
export type ImageMimeTypeDetection = {
  mimeType: string;
  extension: string;
};
/**
 * One entry of an OpenAI-compatible image-generation response's `data` array.
 * Fields are `unknown` because upstream payloads are not validated.
 */
export type OpenAiCompatibleImageResponseEntry = {
  b64_json?: unknown;
  mime_type?: unknown;
  revised_prompt?: unknown;
};
/** Top-level shape of an OpenAI-compatible image-generation response. */
export type OpenAiCompatibleImageResponsePayload = {
  data?: OpenAiCompatibleImageResponseEntry[];
};
/**
 * Maps an image MIME type to a short file extension ("image/jpeg" → "jpg").
 * Returns `fallback` when the MIME type is missing or has no subtype.
 */
export function imageFileExtensionForMimeType(
  mimeType: string | undefined,
  fallback = "png",
): string {
  // Drop any ";charset=…"-style parameters before inspecting the subtype.
  const bare = normalizeOptionalLowercaseString(mimeType)?.split(";")[0]?.trim();
  if (!bare) {
    return fallback;
  }
  // Both "jpeg" and "jpg" spellings collapse to the conventional "jpg".
  if (bare.includes("jpeg") || bare.includes("jpg")) {
    return "jpg";
  }
  if (bare.includes("svg")) {
    return "svg";
  }
  const slash = bare.indexOf("/");
  if (slash < 0) {
    return fallback;
  }
  const subtype = bare.slice(slash + 1);
  return subtype === "" ? fallback : subtype;
}
/**
 * Best-effort magic-number sniff for common generated image formats
 * (JPEG, PNG, WebP). Unrecognized buffers fall back to `fallbackMimeType`.
 */
export function sniffImageMimeType(
  buffer: Buffer,
  fallbackMimeType = DEFAULT_IMAGE_MIME_TYPE,
): ImageMimeTypeDetection {
  const looksJpeg =
    buffer.length >= 3 && buffer[0] === 0xff && buffer[1] === 0xd8 && buffer[2] === 0xff;
  if (looksJpeg) {
    return { mimeType: "image/jpeg", extension: "jpg" };
  }
  // Only the first four PNG signature bytes are compared, but the buffer must
  // still be long enough to hold the full eight-byte signature.
  const looksPng =
    buffer.length >= 8 &&
    buffer[0] === 0x89 &&
    buffer[1] === 0x50 &&
    buffer[2] === 0x4e &&
    buffer[3] === 0x47;
  if (looksPng) {
    return { mimeType: "image/png", extension: "png" };
  }
  // WebP: RIFF container with a "WEBP" chunk tag at offset 8.
  const looksWebp =
    buffer.length >= 12 &&
    buffer.toString("ascii", 0, 4) === "RIFF" &&
    buffer.toString("ascii", 8, 12) === "WEBP";
  if (looksWebp) {
    return { mimeType: "image/webp", extension: "webp" };
  }
  const extension = imageFileExtensionForMimeType(fallbackMimeType);
  return { mimeType: fallbackMimeType, extension };
}
/**
 * Encodes a binary image buffer as a base64 `data:` URL. An explicit
 * `mimeType` wins over `defaultMimeType`; both fall back to image/png.
 */
export function toImageDataUrl(params: {
  buffer: Buffer;
  mimeType?: string;
  defaultMimeType?: string;
}): string {
  const explicit = normalizeOptionalString(params.mimeType);
  const fallback = normalizeOptionalString(params.defaultMimeType) ?? DEFAULT_IMAGE_MIME_TYPE;
  const mimeType = explicit ?? fallback;
  const base64 = params.buffer.toString("base64");
  return `data:${mimeType};base64,${base64}`;
}
/**
 * Parses an image `data:` URL back into its MIME type and base64 payload.
 * Returns undefined for anything that is not a base64-encoded image data URL.
 */
export function parseImageDataUrl(
  dataUrl: string,
): { mimeType: string; base64: string } | undefined {
  // Optional middle group tolerates parameters such as ";charset=utf-8".
  const pattern = /^data:(image\/[^;,]+)(?:;[^,]*)?;base64,(.+)$/is;
  const match = pattern.exec(dataUrl);
  if (match === null) {
    return undefined;
  }
  const mimeType = normalizeOptionalString(match[1]);
  const base64 = normalizeOptionalString(match[2]);
  return mimeType && base64 ? { mimeType, base64 } : undefined;
}
export function generatedImageAssetFromBase64(params: {
base64: string | undefined;
index: number;
mimeType?: string;
revisedPrompt?: string;
defaultMimeType?: string;
fileNamePrefix?: string;
sniffMimeType?: boolean;
}): GeneratedImageAsset | undefined {
const base64 = normalizeOptionalString(params.base64);
if (!base64) {
return undefined;
}
const buffer = Buffer.from(base64, "base64");
const explicitMimeType = normalizeOptionalString(params.mimeType);
const defaultMimeType =
normalizeOptionalString(params.defaultMimeType) ?? DEFAULT_IMAGE_MIME_TYPE;
const detected =
params.sniffMimeType && !explicitMimeType
? sniffImageMimeType(buffer, defaultMimeType)
: undefined;
const mimeType = explicitMimeType ?? detected?.mimeType ?? defaultMimeType;
const prefix = normalizeOptionalString(params.fileNamePrefix) ?? DEFAULT_IMAGE_FILE_PREFIX;
const image: GeneratedImageAsset = {
buffer,
mimeType,
fileName: `${prefix}-${params.index + 1}.${detected?.extension ?? imageFileExtensionForMimeType(mimeType)}`,
};
const revisedPrompt = normalizeOptionalString(params.revisedPrompt);
if (revisedPrompt) {
image.revisedPrompt = revisedPrompt;
}
return image;
}
/**
 * Builds a GeneratedImageAsset from an image `data:` URL, or undefined when
 * the URL cannot be parsed. The embedded MIME type is used as-is.
 */
export function generatedImageAssetFromDataUrl(params: {
  dataUrl: string;
  index: number;
  fileNamePrefix?: string;
}): GeneratedImageAsset | undefined {
  const parsed = parseImageDataUrl(params.dataUrl);
  if (parsed === undefined) {
    return undefined;
  }
  const { base64, mimeType } = parsed;
  return generatedImageAssetFromBase64({
    base64,
    mimeType,
    index: params.index,
    fileNamePrefix: params.fileNamePrefix,
  });
}
/**
 * Converts one OpenAI-compatible response entry into a GeneratedImageAsset,
 * or undefined when the entry carries no usable base64 payload.
 */
export function generatedImageAssetFromOpenAiCompatibleEntry(
  entry: OpenAiCompatibleImageResponseEntry,
  index: number,
  options: {
    defaultMimeType?: string;
    fileNamePrefix?: string;
    sniffMimeType?: boolean;
  } = {},
): GeneratedImageAsset | undefined {
  const { defaultMimeType, fileNamePrefix, sniffMimeType } = options;
  return generatedImageAssetFromBase64({
    index,
    base64: normalizeOptionalString(entry.b64_json),
    mimeType: normalizeOptionalString(entry.mime_type),
    revisedPrompt: normalizeOptionalString(entry.revised_prompt),
    defaultMimeType,
    fileNamePrefix,
    sniffMimeType,
  });
}
/**
 * Extracts all usable image assets from an OpenAI-compatible response.
 * Indices reflect positions in the original `data` array, so file names stay
 * stable even when empty entries are skipped.
 */
export function parseOpenAiCompatibleImageResponse(
  payload: OpenAiCompatibleImageResponsePayload,
  options: {
    defaultMimeType?: string;
    fileNamePrefix?: string;
    sniffMimeType?: boolean;
  } = {},
): GeneratedImageAsset[] {
  const entries = payload.data ?? [];
  const assets: GeneratedImageAsset[] = [];
  entries.forEach((entry, index) => {
    const asset = generatedImageAssetFromOpenAiCompatibleEntry(entry, index, options);
    if (asset !== undefined) {
      assets.push(asset);
    }
  });
  return assets;
}
/**
 * Resolves the filename for a source image upload: an explicit fileName wins;
 * otherwise a 1-based `<prefix>-<index + 1>.<ext>` name is derived from the
 * image's MIME type (falling back to the default MIME type).
 */
export function imageSourceUploadFileName(params: {
  image: ImageGenerationSourceImage;
  index: number;
  defaultMimeType?: string;
  fileNamePrefix?: string;
}): string {
  const explicitName = normalizeOptionalString(params.image.fileName);
  if (explicitName) {
    return explicitName;
  }
  const mimeType =
    normalizeOptionalString(params.image.mimeType) ??
    normalizeOptionalString(params.defaultMimeType) ??
    DEFAULT_IMAGE_MIME_TYPE;
  const prefix = normalizeOptionalString(params.fileNamePrefix) ?? DEFAULT_IMAGE_FILE_PREFIX;
  const extension = imageFileExtensionForMimeType(mimeType);
  return `${prefix}-${params.index + 1}.${extension}`;
}

View File

@@ -1,5 +1,20 @@
// Public image-generation helpers and types for provider plugins.
export {
generatedImageAssetFromBase64,
generatedImageAssetFromDataUrl,
generatedImageAssetFromOpenAiCompatibleEntry,
imageFileExtensionForMimeType,
imageSourceUploadFileName,
parseImageDataUrl,
parseOpenAiCompatibleImageResponse,
sniffImageMimeType,
toImageDataUrl,
type ImageMimeTypeDetection,
type OpenAiCompatibleImageResponseEntry,
type OpenAiCompatibleImageResponsePayload,
} from "../image-generation/image-assets.js";
export type {
GeneratedImageAsset,
ImageGenerationBackground,

View File

@@ -88,7 +88,7 @@ export type {
MemoryPluginPublicArtifactsProvider,
} from "../plugins/memory-state.js";
export type { CliBackendConfig } from "../config/types.js";
export * from "./image-generation.js";
export type * from "./image-generation.js";
export * from "./music-generation.js";
export type { SecretInput, SecretRef } from "../config/types.secrets.js";
export type { RuntimeEnv } from "../runtime.js";

View File

@@ -55,3 +55,10 @@ export {
requireInRange,
scheduleCleanup,
} from "../tts/tts-provider-helpers.js";
export {
createOpenAiCompatibleSpeechProvider,
type OpenAiCompatibleSpeechProviderBaseUrlPolicy,
type OpenAiCompatibleSpeechProviderConfig,
type OpenAiCompatibleSpeechProviderExtraJsonBodyField,
type OpenAiCompatibleSpeechProviderOptions,
} from "../tts/openai-compatible-speech-provider.js";

View File

@@ -0,0 +1,155 @@
import { afterEach, describe, expect, it, vi } from "vitest";
import { createOpenAiCompatibleSpeechProvider } from "./openai-compatible-speech-provider.js";
// Hoisted mocks for the provider-http SDK surface. vi.hoisted ensures the
// mock fns exist before the vi.mock factory below runs.
const { assertOkOrThrowHttpErrorMock, postJsonRequestMock, resolveProviderHttpRequestConfigMock } =
  vi.hoisted(() => ({
    assertOkOrThrowHttpErrorMock: vi.fn(async () => {}),
    postJsonRequestMock: vi.fn(),
    // Echoes back a minimal request config so the provider can proceed to
    // postJsonRequest without touching real network/security policy.
    resolveProviderHttpRequestConfigMock: vi.fn((params: Record<string, unknown>) => ({
      baseUrl: params.baseUrl ?? params.defaultBaseUrl ?? "https://example.test/v1",
      allowPrivateNetwork: false,
      headers: new Headers(params.defaultHeaders as HeadersInit | undefined),
      dispatcherPolicy: undefined,
    })),
  }));
// Replace the real HTTP helpers with the hoisted mocks for every test below.
vi.mock("openclaw/plugin-sdk/provider-http", () => ({
  assertOkOrThrowHttpError: assertOkOrThrowHttpErrorMock,
  postJsonRequest: postJsonRequestMock,
  resolveProviderHttpRequestConfig: resolveProviderHttpRequestConfigMock,
}));
// Behavioral tests for the shared OpenAI-compatible speech provider builder,
// exercising config normalization and the synthesize HTTP flow via mocks.
describe("createOpenAiCompatibleSpeechProvider", () => {
  afterEach(() => {
    // Reset mock state and stubbed env vars so tests stay independent.
    assertOkOrThrowHttpErrorMock.mockClear();
    postJsonRequestMock.mockReset();
    resolveProviderHttpRequestConfigMock.mockClear();
    vi.unstubAllEnvs();
  });
  it("normalizes config with built-in base URL policies", () => {
    const provider = createOpenAiCompatibleSpeechProvider({
      id: "demo",
      label: "Demo",
      autoSelectOrder: 40,
      models: ["demo-tts"],
      voices: ["alloy"],
      defaultModel: "demo-tts",
      defaultVoice: "alloy",
      defaultBaseUrl: "https://example.test/api/v1",
      envKey: "DEMO_API_KEY",
      responseFormats: ["mp3", "pcm"],
      defaultResponseFormat: "mp3",
      voiceCompatibleResponseFormats: ["mp3"],
      // Canonical policy: the alias below must resolve to the default URL.
      baseUrlPolicy: {
        kind: "canonical",
        aliases: ["https://example.test/v1"],
      },
    });
    expect(
      provider.resolveConfig?.({
        cfg: {} as never,
        timeoutMs: 30_000,
        rawConfig: {
          providers: {
            demo: {
              apiKey: "sk-demo",
              baseUrl: "https://example.test/v1/",
              modelId: "custom-tts",
              voiceId: "nova",
              speed: 1.25,
              // Mixed-case/padded input should normalize to "pcm".
              responseFormat: " PCM ",
            },
          },
        },
      }),
    ).toEqual({
      apiKey: "sk-demo",
      // Alias (with trailing slash) collapsed to the canonical base URL.
      baseUrl: "https://example.test/api/v1",
      model: "custom-tts",
      voice: "nova",
      speed: 1.25,
      responseFormat: "pcm",
    });
  });
  it("maps configured extra JSON body fields into synthesis requests", async () => {
    const release = vi.fn(async () => {});
    postJsonRequestMock.mockResolvedValue({
      response: new Response(new Uint8Array([4, 5, 6]), { status: 200 }),
      release,
    });
    // No apiKey in config: the provider should fall back to the env var.
    vi.stubEnv("DEMO_API_KEY", "sk-env");
    const provider = createOpenAiCompatibleSpeechProvider<{
      routing?: Record<string, unknown>;
    }>({
      id: "demo",
      label: "Demo",
      autoSelectOrder: 40,
      models: ["demo-tts"],
      voices: ["alloy"],
      defaultModel: "demo-tts",
      defaultVoice: "alloy",
      defaultBaseUrl: "https://example.test/v1",
      envKey: "DEMO_API_KEY",
      responseFormats: ["mp3", "opus"],
      defaultResponseFormat: "mp3",
      voiceCompatibleResponseFormats: ["opus"],
      baseUrlPolicy: { kind: "trim-trailing-slash" },
      // Extra config: only plain objects under `routing` are accepted.
      readExtraConfig: (raw) =>
        typeof raw?.routing === "object" && raw.routing !== null && !Array.isArray(raw.routing)
          ? { routing: raw.routing as Record<string, unknown> }
          : {},
      // `routing` from config is renamed to `provider` in the request body.
      extraJsonBodyFields: [{ configKey: "routing", requestKey: "provider" }],
    });
    const result = await provider.synthesize({
      text: "hello",
      cfg: {} as never,
      providerConfig: {
        baseUrl: "https://example.test/v1/",
        responseFormat: "opus",
        routing: { order: ["openai"] },
      },
      providerOverrides: {
        modelId: "override-tts",
        voiceId: "verse",
        speed: 1.1,
      },
      target: "voice-note",
      timeoutMs: 1234,
    });
    expect(resolveProviderHttpRequestConfigMock).toHaveBeenCalledWith(
      expect.objectContaining({
        baseUrl: "https://example.test/v1",
        defaultBaseUrl: "https://example.test/v1",
        provider: "demo",
        capability: "audio",
      }),
    );
    // Overrides win over config; the extra body field appears as `provider`.
    expect(postJsonRequestMock).toHaveBeenCalledWith(
      expect.objectContaining({
        url: "https://example.test/v1/audio/speech",
        timeoutMs: 1234,
        body: {
          model: "override-tts",
          input: "hello",
          voice: "verse",
          response_format: "opus",
          speed: 1.1,
          provider: { order: ["openai"] },
        },
      }),
    );
    expect(result).toMatchObject({
      audioBuffer: Buffer.from([4, 5, 6]),
      outputFormat: "opus",
      fileExtension: ".opus",
      voiceCompatible: true,
    });
    // The pooled HTTP connection must be released after a successful call.
    expect(release).toHaveBeenCalledOnce();
  });
});

View File

@@ -0,0 +1,395 @@
import {
assertOkOrThrowHttpError,
postJsonRequest,
resolveProviderHttpRequestConfig,
} from "openclaw/plugin-sdk/provider-http";
import { normalizeResolvedSecretInputString } from "openclaw/plugin-sdk/secret-input";
import { asFiniteNumber, asObject, trimToUndefined } from "../agents/provider-http-errors.js";
import type { SpeechProviderPlugin } from "../plugins/types.js";
import { normalizeOptionalLowercaseString } from "../shared/string-coerce.js";
import type {
SpeechDirectiveTokenParseContext,
SpeechProviderConfig,
SpeechProviderOverrides,
} from "./provider-types.js";
/** Fields every OpenAI-compatible speech provider config carries. */
type OpenAiCompatibleSpeechProviderBaseConfig = {
  apiKey?: string;
  baseUrl?: string;
  model: string;
  voice: string;
  speed?: number;
  responseFormat?: string;
};
/** Base config merged with provider-specific extra fields. */
export type OpenAiCompatibleSpeechProviderConfig<
  ExtraConfig extends Record<string, unknown> = Record<string, never>,
> = OpenAiCompatibleSpeechProviderBaseConfig & ExtraConfig;
/**
 * How configured base URLs are normalized: either just trim trailing slashes,
 * or collapse known aliases to the canonical default URL (optionally allowing
 * unrecognized custom URLs through).
 */
export type OpenAiCompatibleSpeechProviderBaseUrlPolicy =
  | { kind: "trim-trailing-slash" }
  | { kind: "canonical"; aliases?: readonly string[]; allowCustom?: boolean };
/**
 * Declares one extra config field to copy into the JSON request body;
 * `requestKey` renames it (defaults to the config key).
 */
export type OpenAiCompatibleSpeechProviderExtraJsonBodyField<
  ExtraConfig extends Record<string, unknown>,
> = {
  configKey: Extract<keyof ExtraConfig, string>;
  requestKey?: string;
};
/** Options accepted by createOpenAiCompatibleSpeechProvider. */
export type OpenAiCompatibleSpeechProviderOptions<
  ExtraConfig extends Record<string, unknown> = Record<string, never>,
> = {
  id: string;
  label: string;
  autoSelectOrder: number;
  models: readonly string[];
  voices: readonly string[];
  defaultModel: string;
  defaultVoice: string;
  defaultBaseUrl: string;
  // Environment variable consulted when no API key is configured.
  envKey: string;
  responseFormats: readonly string[];
  defaultResponseFormat: string;
  // Formats whose output can feed the voice-note pipeline.
  voiceCompatibleResponseFormats: readonly string[];
  baseUrlPolicy?: OpenAiCompatibleSpeechProviderBaseUrlPolicy;
  normalizeModel?: (value: string | undefined, fallback: string) => string;
  // Config record key when it differs from the provider id.
  configKey?: string;
  extraHeaders?: Record<string, string>;
  readExtraConfig?: (raw: Record<string, unknown> | undefined) => ExtraConfig;
  extraJsonBodyFields?: readonly OpenAiCompatibleSpeechProviderExtraJsonBodyField<ExtraConfig>[];
  apiErrorLabel?: string;
  missingApiKeyError?: string;
};
/** Shape of the models.providers.<key> record read for key/URL fallbacks. */
type ModelProviderConfig = {
  apiKey?: unknown;
  baseUrl?: unknown;
};
/**
 * Lower-cases and validates a configured response format against the
 * provider's supported list. Empty/absent input yields undefined; an
 * unsupported value throws.
 */
function normalizeResponseFormat(params: {
  providerLabel: string;
  responseFormats: readonly string[];
  value: unknown;
}): string | undefined {
  const candidate = normalizeOptionalLowercaseString(params.value);
  if (!candidate) {
    return undefined;
  }
  if (!params.responseFormats.includes(candidate)) {
    throw new Error(`Invalid ${params.providerLabel} speech responseFormat: ${candidate}`);
  }
  return candidate;
}
// Turns a response format name into a file extension ("mp3" → ".mp3").
function responseFormatToFileExtension(format: string): `.${string}` {
  const extension: `.${string}` = `.${format}`;
  return extension;
}
// Resolves to the trimmed value (or fallback) with trailing slashes removed.
function trimTrailingBaseUrl(value: unknown, fallback: string): string {
  const resolved = trimToUndefined(value) ?? fallback;
  return resolved.replace(/\/+$/u, "");
}
/**
 * Applies the provider's base-URL policy. Without a canonical policy the URL
 * is only slash-trimmed; with one, known aliases collapse to the canonical
 * URL and unrecognized URLs survive only when allowCustom is set.
 */
function normalizeBaseUrl(params: {
  value: unknown;
  fallback: string;
  policy?: OpenAiCompatibleSpeechProviderBaseUrlPolicy;
}): string {
  const normalized = trimTrailingBaseUrl(params.value, params.fallback);
  const policy = params.policy;
  if (policy?.kind !== "canonical") {
    return normalized;
  }
  const canonical = trimTrailingBaseUrl(params.fallback, params.fallback);
  const knownAliases = new Set(
    [canonical, ...(policy.aliases ?? [])].map((entry) => trimTrailingBaseUrl(entry, canonical)),
  );
  if (knownAliases.has(normalized)) {
    return canonical;
  }
  return policy.allowCustom ? normalized : canonical;
}
/**
 * Finds the provider's config record: prefer the nested `providers.<key>`
 * object, then a top-level `<key>` object.
 */
function resolveProviderConfigRecord(
  rawConfig: Record<string, unknown>,
  providerConfigKey: string,
): Record<string, unknown> | undefined {
  const providers = asObject(rawConfig.providers);
  const nested = asObject(providers?.[providerConfigKey]);
  const topLevel = asObject(rawConfig[providerConfigKey]);
  return nested ?? topLevel;
}
/**
 * Walks cfg.models.providers.<key>, treating any non-object hop as missing,
 * and returns that provider record (if present).
 */
function readModelProviderConfig(
  cfg: unknown,
  providerConfigKey: string,
): ModelProviderConfig | undefined {
  const providers = asObject(asObject(asObject(cfg)?.models)?.providers);
  return asObject(providers?.[providerConfigKey]);
}
/**
 * Normalizes per-request overrides; `model`/`voice` win over their
 * `modelId`/`voiceId` aliases, and non-finite speeds are dropped.
 */
function readSpeechOverrides(overrides: SpeechProviderOverrides | undefined): {
  model?: string;
  voice?: string;
  speed?: number;
} {
  if (!overrides) {
    return {};
  }
  const model = trimToUndefined(overrides.model ?? overrides.modelId);
  const voice = trimToUndefined(overrides.voice ?? overrides.voiceId);
  const speed = asFiniteNumber(overrides.speed);
  return { model, voice, speed };
}
/**
 * Interprets a single speech-directive token for an OpenAI-compatible
 * provider. Recognizes generic keys plus `<key>_voice`/`<compactKey>voice`
 * (and model equivalents); keys blocked by policy are consumed without
 * producing overrides.
 */
function parseDirectiveToken(
  ctx: SpeechDirectiveTokenParseContext,
  providerConfigKey: string,
): { handled: boolean; overrides?: SpeechProviderOverrides } {
  // Compact form strips separators, e.g. "open-router" → "openrouter".
  const compactProviderKey = providerConfigKey.replace(/[^a-z0-9]+/giu, "").toLowerCase();
  const voiceKeys = [
    "voice",
    "voice_id",
    "voiceid",
    `${providerConfigKey}_voice`,
    `${compactProviderKey}voice`,
  ];
  if (voiceKeys.includes(ctx.key)) {
    if (!ctx.policy.allowVoice) {
      return { handled: true };
    }
    return { handled: true, overrides: { voice: ctx.value } };
  }
  const modelKeys = [
    "model",
    "model_id",
    "modelid",
    `${providerConfigKey}_model`,
    `${compactProviderKey}model`,
  ];
  if (modelKeys.includes(ctx.key)) {
    if (!ctx.policy.allowModelId) {
      return { handled: true };
    }
    return { handled: true, overrides: { model: ctx.value } };
  }
  return { handled: false };
}
/**
 * Collects configured extra fields into a partial JSON request body.
 * Null/undefined config values are skipped; `requestKey` renames the field.
 */
function buildExtraJsonBodyFields<ExtraConfig extends Record<string, unknown>>(
  config: OpenAiCompatibleSpeechProviderConfig<ExtraConfig>,
  fields: readonly OpenAiCompatibleSpeechProviderExtraJsonBodyField<ExtraConfig>[] | undefined,
): Record<string, unknown> {
  const entries: Array<[string, unknown]> = [];
  for (const field of fields ?? []) {
    const value = config[field.configKey];
    if (value == null) {
      continue;
    }
    entries.push([field.requestKey ?? field.configKey, value]);
  }
  return Object.fromEntries(entries);
}
/**
 * Builds a SpeechProviderPlugin for any OpenAI-compatible `/audio/speech`
 * endpoint. Callers supply identity, model/voice defaults, response-format
 * policy, and optional extras (headers, extra JSON body fields, base-URL
 * policy); the returned plugin implements config normalization, directive
 * parsing, talk-mode overrides, and HTTP synthesis uniformly.
 */
export function createOpenAiCompatibleSpeechProvider<
  ExtraConfig extends Record<string, unknown> = Record<string, never>,
>(options: OpenAiCompatibleSpeechProviderOptions<ExtraConfig>): SpeechProviderPlugin {
  // Config records may live under a key that differs from the provider id.
  const providerConfigKey = options.configKey ?? options.id;
  const normalizeModel =
    options.normalizeModel ?? ((value, fallback) => trimToUndefined(value) ?? fallback);
  const readExtraConfig = options.readExtraConfig ?? (() => ({}) as ExtraConfig);
  // Normalizes a raw messages.tts config record into the provider config
  // shape, applying defaults, base-URL policy, and format validation.
  function normalizeConfig(
    rawConfig: Record<string, unknown>,
  ): OpenAiCompatibleSpeechProviderConfig<ExtraConfig> {
    const raw = resolveProviderConfigRecord(rawConfig, providerConfigKey);
    return {
      apiKey: normalizeResolvedSecretInputString({
        value: raw?.apiKey,
        path: `messages.tts.providers.${providerConfigKey}.apiKey`,
      }),
      baseUrl:
        trimToUndefined(raw?.baseUrl) == null
          ? undefined
          : normalizeBaseUrl({
              value: raw?.baseUrl,
              fallback: options.defaultBaseUrl,
              policy: options.baseUrlPolicy,
            }),
      model: normalizeModel(trimToUndefined(raw?.model ?? raw?.modelId), options.defaultModel),
      voice: trimToUndefined(raw?.voice ?? raw?.voiceId) ?? options.defaultVoice,
      speed: asFiniteNumber(raw?.speed),
      responseFormat: normalizeResponseFormat({
        providerLabel: options.label,
        responseFormats: options.responseFormats,
        value: raw?.responseFormat,
      }),
      ...readExtraConfig(raw),
    };
  }
  // Merges an already-resolved SpeechProviderConfig over the normalized
  // defaults (normalizeConfig({}) yields the provider's baseline values).
  function readProviderConfig(
    config: SpeechProviderConfig,
  ): OpenAiCompatibleSpeechProviderConfig<ExtraConfig> {
    const normalized = normalizeConfig({});
    return {
      apiKey: trimToUndefined(config.apiKey) ?? normalized.apiKey,
      baseUrl:
        trimToUndefined(config.baseUrl) == null
          ? normalized.baseUrl
          : normalizeBaseUrl({
              value: config.baseUrl,
              fallback: options.defaultBaseUrl,
              policy: options.baseUrlPolicy,
            }),
      model: normalizeModel(trimToUndefined(config.model ?? config.modelId), normalized.model),
      voice: trimToUndefined(config.voice ?? config.voiceId) ?? normalized.voice,
      speed: asFiniteNumber(config.speed) ?? normalized.speed,
      responseFormat:
        normalizeResponseFormat({
          providerLabel: options.label,
          responseFormats: options.responseFormats,
          value: config.responseFormat,
        }) ?? normalized.responseFormat,
      ...readExtraConfig(config),
    };
  }
  // API-key precedence: provider config, models.providers.<key>.apiKey,
  // then the provider's environment variable.
  function resolveApiKey(params: {
    cfg?: unknown;
    providerConfig: OpenAiCompatibleSpeechProviderConfig<ExtraConfig>;
  }): string | undefined {
    return (
      params.providerConfig.apiKey ??
      normalizeResolvedSecretInputString({
        value: readModelProviderConfig(params.cfg, providerConfigKey)?.apiKey,
        path: `models.providers.${providerConfigKey}.apiKey`,
      }) ??
      trimToUndefined(process.env[options.envKey])
    );
  }
  // Base-URL precedence: provider config, models.providers.<key>.baseUrl,
  // then the default — always run through the base-URL policy.
  function resolveBaseUrl(params: {
    cfg?: unknown;
    providerConfig: OpenAiCompatibleSpeechProviderConfig<ExtraConfig>;
  }): string {
    return normalizeBaseUrl({
      value:
        params.providerConfig.baseUrl ??
        trimToUndefined(readModelProviderConfig(params.cfg, providerConfigKey)?.baseUrl),
      fallback: options.defaultBaseUrl,
      policy: options.baseUrlPolicy,
    });
  }
  return {
    id: options.id,
    label: options.label,
    autoSelectOrder: options.autoSelectOrder,
    models: [...options.models],
    voices: [...options.voices],
    resolveConfig: ({ rawConfig }) => normalizeConfig(rawConfig),
    parseDirectiveToken: (ctx) => parseDirectiveToken(ctx, providerConfigKey),
    // Layers talk.providers.<key> settings over the base TTS config; only
    // fields explicitly present in the talk config override the base.
    resolveTalkConfig: ({ baseTtsConfig, talkProviderConfig }) => {
      const base = normalizeConfig(baseTtsConfig);
      const responseFormat = normalizeResponseFormat({
        providerLabel: options.label,
        responseFormats: options.responseFormats,
        value: talkProviderConfig.responseFormat,
      });
      const next: OpenAiCompatibleSpeechProviderConfig<ExtraConfig> = { ...base };
      if (talkProviderConfig.apiKey !== undefined) {
        next.apiKey = normalizeResolvedSecretInputString({
          value: talkProviderConfig.apiKey,
          path: `talk.providers.${providerConfigKey}.apiKey`,
        });
      }
      const baseUrl = trimToUndefined(talkProviderConfig.baseUrl);
      if (baseUrl !== undefined) {
        next.baseUrl = normalizeBaseUrl({
          value: baseUrl,
          fallback: options.defaultBaseUrl,
          policy: options.baseUrlPolicy,
        });
      }
      const modelId = trimToUndefined(talkProviderConfig.modelId);
      if (modelId !== undefined) {
        next.model = normalizeModel(modelId, options.defaultModel);
      }
      const voiceId = trimToUndefined(talkProviderConfig.voiceId);
      if (voiceId !== undefined) {
        next.voice = voiceId;
      }
      const speed = asFiniteNumber(talkProviderConfig.speed);
      if (speed !== undefined) {
        next.speed = speed;
      }
      if (responseFormat !== undefined) {
        next.responseFormat = responseFormat;
      }
      return next;
    },
    // Per-call overrides: voiceId/modelId take precedence over voice/model.
    resolveTalkOverrides: ({ params }) => ({
      ...(trimToUndefined(params.voiceId ?? params.voice) == null
        ? {}
        : { voice: trimToUndefined(params.voiceId ?? params.voice) }),
      ...(trimToUndefined(params.modelId ?? params.model) == null
        ? {}
        : { model: trimToUndefined(params.modelId ?? params.model) }),
      ...(asFiniteNumber(params.speed) == null ? {} : { speed: asFiniteNumber(params.speed) }),
    }),
    listVoices: async () => options.voices.map((voice) => ({ id: voice, name: voice })),
    isConfigured: ({ cfg, providerConfig }) =>
      Boolean(resolveApiKey({ cfg, providerConfig: readProviderConfig(providerConfig) })),
    // POSTs to <baseUrl>/audio/speech with the OpenAI-compatible JSON body
    // and returns the raw audio bytes.
    synthesize: async (req) => {
      const config = readProviderConfig(req.providerConfig);
      const overrides = readSpeechOverrides(req.providerOverrides);
      const apiKey = resolveApiKey({ cfg: req.cfg, providerConfig: config });
      if (!apiKey) {
        throw new Error(options.missingApiKeyError ?? `${options.label} API key missing`);
      }
      const baseUrl = resolveBaseUrl({ cfg: req.cfg, providerConfig: config });
      const responseFormat = config.responseFormat ?? options.defaultResponseFormat;
      const speed = overrides.speed ?? config.speed;
      const { allowPrivateNetwork, headers, dispatcherPolicy } = resolveProviderHttpRequestConfig({
        baseUrl,
        defaultBaseUrl: options.defaultBaseUrl,
        allowPrivateNetwork: false,
        defaultHeaders: {
          Authorization: `Bearer ${apiKey}`,
          "Content-Type": "application/json",
          ...options.extraHeaders,
        },
        provider: options.id,
        capability: "audio",
        transport: "http",
      });
      const { response, release } = await postJsonRequest({
        url: `${baseUrl}/audio/speech`,
        headers,
        body: {
          model: normalizeModel(overrides.model ?? config.model, options.defaultModel),
          input: req.text,
          voice: overrides.voice ?? config.voice,
          response_format: responseFormat,
          ...(speed == null ? {} : { speed }),
          ...buildExtraJsonBodyFields(config, options.extraJsonBodyFields),
        },
        timeoutMs: req.timeoutMs,
        fetchFn: fetch,
        allowPrivateNetwork,
        dispatcherPolicy,
      });
      try {
        await assertOkOrThrowHttpError(
          response,
          options.apiErrorLabel ?? `${options.label} TTS API error`,
        );
        return {
          audioBuffer: Buffer.from(await response.arrayBuffer()),
          outputFormat: responseFormat,
          fileExtension: responseFormatToFileExtension(responseFormat),
          // Flags formats that the voice-note pipeline can consume directly.
          voiceCompatible: options.voiceCompatibleResponseFormats.includes(responseFormat),
        };
      } finally {
        // Release pooled HTTP resources even when the status check throws.
        await release();
      }
    },
  };
}