mirror of
https://github.com/openclaw/openclaw.git
synced 2026-05-06 05:30:42 +00:00
refactor: share media provider asset helpers (#73142)
* refactor: share openai-compatible speech providers * refactor: tighten openai-compatible speech helper * refactor: share image generation asset helpers * fix: keep image helpers off root plugin sdk runtime
This commit is contained in:
committed by
GitHub
parent
4949f23219
commit
2a3a24ebdc
@@ -1,2 +1,2 @@
|
||||
48cd91661f9fc65e8fb3a091f6deb726d8ccd37f7cec2aa765165f3992e7463f plugin-sdk-api-baseline.json
|
||||
e8d7069b4d0d7a1a0431d92c845043bb39c3ba106ca0f85cc728a02ece9521bf plugin-sdk-api-baseline.jsonl
|
||||
8f23f155251c05cab51ee8926e7a359bd64a0ba34e82a80d93d0ed96d07c8a04 plugin-sdk-api-baseline.json
|
||||
181fea7f35c49032e6894605a06ca1419e5b6ccc1a3d8987d952a1d24a8154bc plugin-sdk-api-baseline.jsonl
|
||||
|
||||
@@ -482,10 +482,11 @@ releases.
|
||||
| `plugin-sdk/media-understanding` | Media-understanding helpers | Media understanding provider types plus provider-facing image/audio helper exports |
|
||||
| `plugin-sdk/text-runtime` | Shared text helpers | Assistant-visible-text stripping, markdown render/chunking/table helpers, redaction helpers, directive-tag helpers, safe-text utilities, and related text/logging helpers |
|
||||
| `plugin-sdk/text-chunking` | Text chunking helpers | Outbound text chunking helper |
|
||||
| `plugin-sdk/speech` | Speech helpers | Speech provider types plus provider-facing directive, registry, and validation helpers |
|
||||
| `plugin-sdk/speech` | Speech helpers | Speech provider types plus provider-facing directive, registry, validation helpers, and OpenAI-compatible TTS builder |
|
||||
| `plugin-sdk/speech-core` | Shared speech core | Speech provider types, registry, directives, normalization |
|
||||
| `plugin-sdk/realtime-transcription` | Realtime transcription helpers | Provider types, registry helpers, and shared WebSocket session helper |
|
||||
| `plugin-sdk/realtime-voice` | Realtime voice helpers | Provider types, registry/resolution helpers, and bridge session helpers |
|
||||
| `plugin-sdk/image-generation` | Image-generation helpers | Image generation provider types plus image asset/data URL helpers |
|
||||
| `plugin-sdk/image-generation-core` | Shared image-generation core | Image-generation types, failover, auth, and registry helpers |
|
||||
| `plugin-sdk/music-generation` | Music-generation helpers | Music-generation provider/request/result types |
|
||||
| `plugin-sdk/music-generation-core` | Shared music-generation core | Music-generation types, failover helpers, provider lookup, and model-ref parsing |
|
||||
|
||||
@@ -255,11 +255,11 @@ For the plugin authoring guide, see [Plugin SDK overview](/plugins/sdk-overview)
|
||||
| `plugin-sdk/media-understanding` | Media understanding provider types plus provider-facing image/audio helper exports |
|
||||
| `plugin-sdk/text-runtime` | Shared text/markdown/logging helpers such as assistant-visible-text stripping, markdown render/chunking/table helpers, redaction helpers, directive-tag helpers, and safe-text utilities |
|
||||
| `plugin-sdk/text-chunking` | Outbound text chunking helper |
|
||||
| `plugin-sdk/speech` | Speech provider types plus provider-facing directive, registry, validation, and speech helper exports |
|
||||
| `plugin-sdk/speech` | Speech provider types plus provider-facing directive, registry, validation, OpenAI-compatible TTS builder, and speech helper exports |
|
||||
| `plugin-sdk/speech-core` | Shared speech provider types, registry, directive, normalization, and speech helper exports |
|
||||
| `plugin-sdk/realtime-transcription` | Realtime transcription provider types, registry helpers, and shared WebSocket session helper |
|
||||
| `plugin-sdk/realtime-voice` | Realtime voice provider types and registry helpers |
|
||||
| `plugin-sdk/image-generation` | Image generation provider types |
|
||||
| `plugin-sdk/image-generation` | Image generation provider types plus image asset/data URL helpers |
|
||||
| `plugin-sdk/image-generation-core` | Shared image-generation types, failover, auth, and registry helpers |
|
||||
| `plugin-sdk/music-generation` | Music generation provider/request/result types |
|
||||
| `plugin-sdk/music-generation-core` | Shared music-generation types, failover helpers, provider lookup, and model-ref parsing |
|
||||
|
||||
@@ -1,8 +1,8 @@
|
||||
import type { OpenClawConfig } from "openclaw/plugin-sdk/config-types";
|
||||
import type {
|
||||
GeneratedImageAsset,
|
||||
ImageGenerationProvider,
|
||||
ImageGenerationSourceImage,
|
||||
import type { ImageGenerationProvider } from "openclaw/plugin-sdk/image-generation";
|
||||
import {
|
||||
imageSourceUploadFileName,
|
||||
parseOpenAiCompatibleImageResponse,
|
||||
} from "openclaw/plugin-sdk/image-generation";
|
||||
import { isProviderApiKeyConfigured } from "openclaw/plugin-sdk/provider-auth";
|
||||
import { resolveApiKeyForProvider } from "openclaw/plugin-sdk/provider-auth-runtime";
|
||||
@@ -44,75 +44,6 @@ function resolveDeepInfraProviderConfig(
|
||||
return cfg?.models?.providers?.deepinfra;
|
||||
}
|
||||
|
||||
function detectImageMimeType(buffer: Buffer): {
|
||||
mimeType: string;
|
||||
extension: "jpg" | "png" | "webp";
|
||||
} {
|
||||
if (buffer.length >= 3 && buffer[0] === 0xff && buffer[1] === 0xd8 && buffer[2] === 0xff) {
|
||||
return { mimeType: "image/jpeg", extension: "jpg" };
|
||||
}
|
||||
if (
|
||||
buffer.length >= 8 &&
|
||||
buffer[0] === 0x89 &&
|
||||
buffer[1] === 0x50 &&
|
||||
buffer[2] === 0x4e &&
|
||||
buffer[3] === 0x47
|
||||
) {
|
||||
return { mimeType: "image/png", extension: "png" };
|
||||
}
|
||||
if (
|
||||
buffer.length >= 12 &&
|
||||
buffer.toString("ascii", 0, 4) === "RIFF" &&
|
||||
buffer.toString("ascii", 8, 12) === "WEBP"
|
||||
) {
|
||||
return { mimeType: "image/webp", extension: "webp" };
|
||||
}
|
||||
return { mimeType: "image/jpeg", extension: "jpg" };
|
||||
}
|
||||
|
||||
function imageToUploadName(image: ImageGenerationSourceImage, index: number): string {
|
||||
const fileName = normalizeOptionalString(image.fileName);
|
||||
if (fileName) {
|
||||
return fileName;
|
||||
}
|
||||
const mimeType = normalizeOptionalString(image.mimeType) ?? "image/png";
|
||||
const ext =
|
||||
mimeType === "image/jpeg" || mimeType === "image/jpg"
|
||||
? "jpg"
|
||||
: mimeType === "image/webp"
|
||||
? "webp"
|
||||
: "png";
|
||||
return `image-${index + 1}.${ext}`;
|
||||
}
|
||||
|
||||
function imageToAsset(
|
||||
entry: NonNullable<DeepInfraImageApiResponse["data"]>[number],
|
||||
index: number,
|
||||
): GeneratedImageAsset | null {
|
||||
const b64 = normalizeOptionalString(entry.b64_json);
|
||||
if (!b64) {
|
||||
return null;
|
||||
}
|
||||
const buffer = Buffer.from(b64, "base64");
|
||||
const detected = detectImageMimeType(buffer);
|
||||
const image: GeneratedImageAsset = {
|
||||
buffer,
|
||||
mimeType: detected.mimeType,
|
||||
fileName: `image-${index + 1}.${detected.extension}`,
|
||||
};
|
||||
const revisedPrompt = normalizeOptionalString(entry.revised_prompt);
|
||||
if (revisedPrompt) {
|
||||
image.revisedPrompt = revisedPrompt;
|
||||
}
|
||||
return image;
|
||||
}
|
||||
|
||||
function parseImageResponse(payload: DeepInfraImageApiResponse): GeneratedImageAsset[] {
|
||||
return (payload.data ?? [])
|
||||
.map(imageToAsset)
|
||||
.filter((entry): entry is GeneratedImageAsset => entry !== null);
|
||||
}
|
||||
|
||||
export function buildDeepInfraImageGenerationProvider(): ImageGenerationProvider {
|
||||
return {
|
||||
id: "deepinfra",
|
||||
@@ -198,7 +129,7 @@ export function buildDeepInfraImageGenerationProvider(): ImageGenerationProvider
|
||||
form.append(
|
||||
"image",
|
||||
new Blob([new Uint8Array(image.buffer)], { type: mimeType }),
|
||||
imageToUploadName(image, 0),
|
||||
imageSourceUploadFileName({ image, index: 0 }),
|
||||
);
|
||||
const multipartHeaders = new Headers(headers);
|
||||
multipartHeaders.delete("Content-Type");
|
||||
@@ -237,7 +168,10 @@ export function buildDeepInfraImageGenerationProvider(): ImageGenerationProvider
|
||||
response,
|
||||
isEdit ? "DeepInfra image edit failed" : "DeepInfra image generation failed",
|
||||
);
|
||||
const images = parseImageResponse((await response.json()) as DeepInfraImageApiResponse);
|
||||
const images = parseOpenAiCompatibleImageResponse(
|
||||
(await response.json()) as DeepInfraImageApiResponse,
|
||||
{ defaultMimeType: "image/jpeg", sniffMimeType: true },
|
||||
);
|
||||
if (images.length === 0) {
|
||||
throw new Error("DeepInfra image response did not include generated image data");
|
||||
}
|
||||
|
||||
@@ -1,295 +1,41 @@
|
||||
import {
|
||||
assertOkOrThrowHttpError,
|
||||
postJsonRequest,
|
||||
resolveProviderHttpRequestConfig,
|
||||
} from "openclaw/plugin-sdk/provider-http";
|
||||
import { normalizeResolvedSecretInputString } from "openclaw/plugin-sdk/secret-input";
|
||||
import {
|
||||
asFiniteNumber,
|
||||
asObject,
|
||||
trimToUndefined,
|
||||
type SpeechDirectiveTokenParseContext,
|
||||
type SpeechProviderConfig,
|
||||
type SpeechProviderOverrides,
|
||||
createOpenAiCompatibleSpeechProvider,
|
||||
type SpeechProviderPlugin,
|
||||
} from "openclaw/plugin-sdk/speech";
|
||||
import { normalizeOptionalLowercaseString } from "openclaw/plugin-sdk/text-runtime";
|
||||
import {
|
||||
DEEPINFRA_BASE_URL,
|
||||
DEEPINFRA_TTS_MODELS,
|
||||
DEFAULT_DEEPINFRA_TTS_MODEL,
|
||||
DEFAULT_DEEPINFRA_TTS_VOICE,
|
||||
normalizeDeepInfraBaseUrl,
|
||||
normalizeDeepInfraModelRef,
|
||||
} from "./media-models.js";
|
||||
|
||||
const DEEPINFRA_TTS_RESPONSE_FORMATS = ["mp3", "opus", "flac", "wav", "pcm"] as const;
|
||||
|
||||
type DeepInfraTtsResponseFormat = (typeof DEEPINFRA_TTS_RESPONSE_FORMATS)[number];
|
||||
|
||||
type DeepInfraTtsProviderConfig = {
|
||||
apiKey?: string;
|
||||
baseUrl?: string;
|
||||
model: string;
|
||||
voice: string;
|
||||
speed?: number;
|
||||
responseFormat?: DeepInfraTtsResponseFormat;
|
||||
type DeepInfraTtsExtraConfig = {
|
||||
extraBody?: Record<string, unknown>;
|
||||
};
|
||||
|
||||
type DeepInfraTtsProviderOverrides = {
|
||||
model?: string;
|
||||
voice?: string;
|
||||
speed?: number;
|
||||
};
|
||||
|
||||
function normalizeDeepInfraTtsResponseFormat(
|
||||
value: unknown,
|
||||
): DeepInfraTtsResponseFormat | undefined {
|
||||
const next = normalizeOptionalLowercaseString(value);
|
||||
if (!next) {
|
||||
return undefined;
|
||||
}
|
||||
if (DEEPINFRA_TTS_RESPONSE_FORMATS.some((format) => format === next)) {
|
||||
return next as DeepInfraTtsResponseFormat;
|
||||
}
|
||||
throw new Error(`Invalid DeepInfra speech responseFormat: ${next}`);
|
||||
}
|
||||
|
||||
function resolveDeepInfraProviderConfigRecord(
|
||||
rawConfig: Record<string, unknown>,
|
||||
): Record<string, unknown> | undefined {
|
||||
const providers = asObject(rawConfig.providers);
|
||||
return asObject(providers?.deepinfra) ?? asObject(rawConfig.deepinfra);
|
||||
}
|
||||
|
||||
function normalizeDeepInfraTtsProviderConfig(
|
||||
rawConfig: Record<string, unknown>,
|
||||
): DeepInfraTtsProviderConfig {
|
||||
const raw = resolveDeepInfraProviderConfigRecord(rawConfig);
|
||||
return {
|
||||
apiKey: normalizeResolvedSecretInputString({
|
||||
value: raw?.apiKey,
|
||||
path: "messages.tts.providers.deepinfra.apiKey",
|
||||
}),
|
||||
baseUrl:
|
||||
trimToUndefined(raw?.baseUrl) == null ? undefined : normalizeDeepInfraBaseUrl(raw?.baseUrl),
|
||||
model: normalizeDeepInfraModelRef(
|
||||
trimToUndefined(raw?.model ?? raw?.modelId),
|
||||
DEFAULT_DEEPINFRA_TTS_MODEL,
|
||||
),
|
||||
voice: trimToUndefined(raw?.voice ?? raw?.voiceId) ?? DEFAULT_DEEPINFRA_TTS_VOICE,
|
||||
speed: asFiniteNumber(raw?.speed),
|
||||
responseFormat: normalizeDeepInfraTtsResponseFormat(raw?.responseFormat),
|
||||
extraBody: asObject(raw?.extraBody),
|
||||
};
|
||||
}
|
||||
|
||||
function readDeepInfraTtsProviderConfig(config: SpeechProviderConfig): DeepInfraTtsProviderConfig {
|
||||
const normalized = normalizeDeepInfraTtsProviderConfig({});
|
||||
return {
|
||||
apiKey: trimToUndefined(config.apiKey) ?? normalized.apiKey,
|
||||
baseUrl:
|
||||
trimToUndefined(config.baseUrl) == null
|
||||
? normalized.baseUrl
|
||||
: normalizeDeepInfraBaseUrl(config.baseUrl),
|
||||
model: normalizeDeepInfraModelRef(
|
||||
trimToUndefined(config.model ?? config.modelId),
|
||||
normalized.model,
|
||||
),
|
||||
voice: trimToUndefined(config.voice ?? config.voiceId) ?? normalized.voice,
|
||||
speed: asFiniteNumber(config.speed) ?? normalized.speed,
|
||||
responseFormat:
|
||||
normalizeDeepInfraTtsResponseFormat(config.responseFormat) ?? normalized.responseFormat,
|
||||
extraBody: asObject(config.extraBody) ?? normalized.extraBody,
|
||||
};
|
||||
}
|
||||
|
||||
function readDeepInfraTtsOverrides(
|
||||
overrides: SpeechProviderOverrides | undefined,
|
||||
): DeepInfraTtsProviderOverrides {
|
||||
if (!overrides) {
|
||||
return {};
|
||||
}
|
||||
return {
|
||||
model: trimToUndefined(overrides.model ?? overrides.modelId),
|
||||
voice: trimToUndefined(overrides.voice ?? overrides.voiceId),
|
||||
speed: asFiniteNumber(overrides.speed),
|
||||
};
|
||||
}
|
||||
|
||||
function resolveDeepInfraTtsApiKey(params: {
|
||||
cfg?: { models?: { providers?: { deepinfra?: { apiKey?: unknown } } } };
|
||||
providerConfig: DeepInfraTtsProviderConfig;
|
||||
}): string | undefined {
|
||||
return (
|
||||
params.providerConfig.apiKey ??
|
||||
normalizeResolvedSecretInputString({
|
||||
value: params.cfg?.models?.providers?.deepinfra?.apiKey,
|
||||
path: "models.providers.deepinfra.apiKey",
|
||||
}) ??
|
||||
trimToUndefined(process.env.DEEPINFRA_API_KEY)
|
||||
);
|
||||
}
|
||||
|
||||
function resolveDeepInfraTtsBaseUrl(params: {
|
||||
cfg?: { models?: { providers?: { deepinfra?: { baseUrl?: unknown } } } };
|
||||
providerConfig: DeepInfraTtsProviderConfig;
|
||||
}): string {
|
||||
return normalizeDeepInfraBaseUrl(
|
||||
params.providerConfig.baseUrl ??
|
||||
trimToUndefined(params.cfg?.models?.providers?.deepinfra?.baseUrl) ??
|
||||
DEEPINFRA_BASE_URL,
|
||||
);
|
||||
}
|
||||
|
||||
function responseFormatToFileExtension(
|
||||
format: DeepInfraTtsResponseFormat,
|
||||
): ".mp3" | ".opus" | ".flac" | ".wav" | ".pcm" {
|
||||
return `.${format}`;
|
||||
}
|
||||
|
||||
function parseDirectiveToken(ctx: SpeechDirectiveTokenParseContext): {
|
||||
handled: boolean;
|
||||
overrides?: SpeechProviderOverrides;
|
||||
} {
|
||||
switch (ctx.key) {
|
||||
case "voice":
|
||||
case "voice_id":
|
||||
case "voiceid":
|
||||
case "deepinfra_voice":
|
||||
case "deepinfravoice":
|
||||
if (!ctx.policy.allowVoice) {
|
||||
return { handled: true };
|
||||
}
|
||||
return { handled: true, overrides: { voice: ctx.value } };
|
||||
case "model":
|
||||
case "model_id":
|
||||
case "modelid":
|
||||
case "deepinfra_model":
|
||||
case "deepinframodel":
|
||||
if (!ctx.policy.allowModelId) {
|
||||
return { handled: true };
|
||||
}
|
||||
return { handled: true, overrides: { model: ctx.value } };
|
||||
default:
|
||||
return { handled: false };
|
||||
}
|
||||
}
|
||||
|
||||
export function buildDeepInfraSpeechProvider(): SpeechProviderPlugin {
|
||||
return {
|
||||
return createOpenAiCompatibleSpeechProvider<DeepInfraTtsExtraConfig>({
|
||||
id: "deepinfra",
|
||||
label: "DeepInfra",
|
||||
autoSelectOrder: 45,
|
||||
models: [...DEEPINFRA_TTS_MODELS],
|
||||
models: DEEPINFRA_TTS_MODELS,
|
||||
voices: [DEFAULT_DEEPINFRA_TTS_VOICE],
|
||||
resolveConfig: ({ rawConfig }) => normalizeDeepInfraTtsProviderConfig(rawConfig),
|
||||
parseDirectiveToken,
|
||||
resolveTalkConfig: ({ baseTtsConfig, talkProviderConfig }) => {
|
||||
const base = normalizeDeepInfraTtsProviderConfig(baseTtsConfig);
|
||||
const responseFormat = normalizeDeepInfraTtsResponseFormat(talkProviderConfig.responseFormat);
|
||||
return {
|
||||
...base,
|
||||
...(talkProviderConfig.apiKey === undefined
|
||||
? {}
|
||||
: {
|
||||
apiKey: normalizeResolvedSecretInputString({
|
||||
value: talkProviderConfig.apiKey,
|
||||
path: "talk.providers.deepinfra.apiKey",
|
||||
}),
|
||||
}),
|
||||
...(trimToUndefined(talkProviderConfig.baseUrl) == null
|
||||
? {}
|
||||
: { baseUrl: normalizeDeepInfraBaseUrl(talkProviderConfig.baseUrl) }),
|
||||
...(trimToUndefined(talkProviderConfig.modelId) == null
|
||||
? {}
|
||||
: {
|
||||
model: normalizeDeepInfraModelRef(
|
||||
trimToUndefined(talkProviderConfig.modelId),
|
||||
DEFAULT_DEEPINFRA_TTS_MODEL,
|
||||
),
|
||||
}),
|
||||
...(trimToUndefined(talkProviderConfig.voiceId) == null
|
||||
? {}
|
||||
: { voice: trimToUndefined(talkProviderConfig.voiceId) }),
|
||||
...(asFiniteNumber(talkProviderConfig.speed) == null
|
||||
? {}
|
||||
: { speed: asFiniteNumber(talkProviderConfig.speed) }),
|
||||
...(responseFormat == null ? {} : { responseFormat }),
|
||||
};
|
||||
},
|
||||
resolveTalkOverrides: ({ params }) => ({
|
||||
...(trimToUndefined(params.voiceId ?? params.voice) == null
|
||||
? {}
|
||||
: { voice: trimToUndefined(params.voiceId ?? params.voice) }),
|
||||
...(trimToUndefined(params.modelId ?? params.model) == null
|
||||
? {}
|
||||
: { model: trimToUndefined(params.modelId ?? params.model) }),
|
||||
...(asFiniteNumber(params.speed) == null ? {} : { speed: asFiniteNumber(params.speed) }),
|
||||
}),
|
||||
listVoices: async () => [
|
||||
{ id: DEFAULT_DEEPINFRA_TTS_VOICE, name: DEFAULT_DEEPINFRA_TTS_VOICE },
|
||||
],
|
||||
isConfigured: ({ cfg, providerConfig }) => {
|
||||
const config = readDeepInfraTtsProviderConfig(providerConfig);
|
||||
return Boolean(resolveDeepInfraTtsApiKey({ cfg, providerConfig: config }));
|
||||
},
|
||||
synthesize: async (req) => {
|
||||
const config = readDeepInfraTtsProviderConfig(req.providerConfig);
|
||||
const overrides = readDeepInfraTtsOverrides(req.providerOverrides);
|
||||
const apiKey = resolveDeepInfraTtsApiKey({ cfg: req.cfg, providerConfig: config });
|
||||
if (!apiKey) {
|
||||
throw new Error("DeepInfra API key missing");
|
||||
}
|
||||
|
||||
const baseUrl = resolveDeepInfraTtsBaseUrl({ cfg: req.cfg, providerConfig: config });
|
||||
const responseFormat = config.responseFormat ?? "mp3";
|
||||
const speed = overrides.speed ?? config.speed;
|
||||
const { allowPrivateNetwork, headers, dispatcherPolicy } = resolveProviderHttpRequestConfig({
|
||||
baseUrl,
|
||||
defaultBaseUrl: DEEPINFRA_BASE_URL,
|
||||
allowPrivateNetwork: false,
|
||||
defaultHeaders: {
|
||||
Authorization: `Bearer ${apiKey}`,
|
||||
"Content-Type": "application/json",
|
||||
},
|
||||
provider: "deepinfra",
|
||||
capability: "audio",
|
||||
transport: "http",
|
||||
});
|
||||
|
||||
const { response, release } = await postJsonRequest({
|
||||
url: `${baseUrl}/audio/speech`,
|
||||
headers,
|
||||
body: {
|
||||
model: normalizeDeepInfraModelRef(
|
||||
overrides.model ?? config.model,
|
||||
DEFAULT_DEEPINFRA_TTS_MODEL,
|
||||
),
|
||||
input: req.text,
|
||||
voice: overrides.voice ?? config.voice,
|
||||
response_format: responseFormat,
|
||||
...(speed == null ? {} : { speed }),
|
||||
...(config.extraBody == null ? {} : { extra_body: config.extraBody }),
|
||||
},
|
||||
timeoutMs: req.timeoutMs,
|
||||
fetchFn: fetch,
|
||||
allowPrivateNetwork,
|
||||
dispatcherPolicy,
|
||||
});
|
||||
|
||||
try {
|
||||
await assertOkOrThrowHttpError(response, "DeepInfra TTS API error");
|
||||
return {
|
||||
audioBuffer: Buffer.from(await response.arrayBuffer()),
|
||||
outputFormat: responseFormat,
|
||||
fileExtension: responseFormatToFileExtension(responseFormat),
|
||||
voiceCompatible: responseFormat === "mp3" || responseFormat === "opus",
|
||||
};
|
||||
} finally {
|
||||
await release();
|
||||
}
|
||||
},
|
||||
};
|
||||
defaultModel: DEFAULT_DEEPINFRA_TTS_MODEL,
|
||||
defaultVoice: DEFAULT_DEEPINFRA_TTS_VOICE,
|
||||
defaultBaseUrl: DEEPINFRA_BASE_URL,
|
||||
envKey: "DEEPINFRA_API_KEY",
|
||||
responseFormats: DEEPINFRA_TTS_RESPONSE_FORMATS,
|
||||
defaultResponseFormat: "mp3",
|
||||
voiceCompatibleResponseFormats: ["mp3", "opus"],
|
||||
baseUrlPolicy: { kind: "trim-trailing-slash" },
|
||||
normalizeModel: normalizeDeepInfraModelRef,
|
||||
apiErrorLabel: "DeepInfra TTS API error",
|
||||
missingApiKeyError: "DeepInfra API key missing",
|
||||
readExtraConfig: (raw) => ({ extraBody: asObject(raw?.extraBody) }),
|
||||
extraJsonBodyFields: [{ configKey: "extraBody", requestKey: "extra_body" }],
|
||||
});
|
||||
}
|
||||
|
||||
@@ -2,6 +2,10 @@ import type {
|
||||
GeneratedImageAsset,
|
||||
ImageGenerationProvider,
|
||||
} from "openclaw/plugin-sdk/image-generation";
|
||||
import {
|
||||
imageFileExtensionForMimeType,
|
||||
toImageDataUrl,
|
||||
} from "openclaw/plugin-sdk/image-generation";
|
||||
import { isProviderApiKeyConfigured } from "openclaw/plugin-sdk/provider-auth";
|
||||
import { resolveApiKeyForProvider } from "openclaw/plugin-sdk/provider-auth-runtime";
|
||||
import {
|
||||
@@ -16,10 +20,7 @@ import {
|
||||
type SsrFPolicy,
|
||||
ssrfPolicyFromDangerouslyAllowPrivateNetwork,
|
||||
} from "openclaw/plugin-sdk/ssrf-runtime";
|
||||
import {
|
||||
normalizeLowercaseStringOrEmpty,
|
||||
normalizeOptionalLowercaseString,
|
||||
} from "openclaw/plugin-sdk/text-runtime";
|
||||
import { normalizeLowercaseStringOrEmpty } from "openclaw/plugin-sdk/text-runtime";
|
||||
|
||||
const DEFAULT_FAL_BASE_URL = "https://fal.run";
|
||||
const DEFAULT_FAL_IMAGE_MODEL = "fal-ai/flux/dev";
|
||||
@@ -214,22 +215,6 @@ function resolveFalImageSize(params: {
|
||||
return undefined;
|
||||
}
|
||||
|
||||
function toDataUri(buffer: Buffer, mimeType: string): string {
|
||||
return `data:${mimeType};base64,${buffer.toString("base64")}`;
|
||||
}
|
||||
|
||||
function fileExtensionForMimeType(mimeType: string | undefined): string {
|
||||
const normalized = normalizeOptionalLowercaseString(mimeType);
|
||||
if (!normalized) {
|
||||
return "png";
|
||||
}
|
||||
if (normalized.includes("jpeg")) {
|
||||
return "jpg";
|
||||
}
|
||||
const slashIndex = normalized.indexOf("/");
|
||||
return slashIndex >= 0 ? normalized.slice(slashIndex + 1) || "png" : "png";
|
||||
}
|
||||
|
||||
async function fetchImageBuffer(
|
||||
url: string,
|
||||
networkPolicy?: FalNetworkPolicy,
|
||||
@@ -348,7 +333,7 @@ export function buildFalImageGenerationProvider(): ImageGenerationProvider {
|
||||
if (!input) {
|
||||
throw new Error("fal image edit request missing reference image");
|
||||
}
|
||||
requestBody.image_url = toDataUri(input.buffer, input.mimeType);
|
||||
requestBody.image_url = toImageDataUrl(input);
|
||||
}
|
||||
const { response, release } = await falFetchGuard({
|
||||
url: `${baseUrl}/${model}`,
|
||||
@@ -378,7 +363,7 @@ export function buildFalImageGenerationProvider(): ImageGenerationProvider {
|
||||
images.push({
|
||||
buffer: downloaded.buffer,
|
||||
mimeType: downloaded.mimeType,
|
||||
fileName: `image-${imageIndex}.${fileExtensionForMimeType(
|
||||
fileName: `image-${imageIndex}.${imageFileExtensionForMimeType(
|
||||
downloaded.mimeType || entry.content_type,
|
||||
)}`,
|
||||
});
|
||||
|
||||
@@ -1,5 +1,9 @@
|
||||
import type { OpenClawConfig } from "openclaw/plugin-sdk/config-types";
|
||||
import type { ImageGenerationProvider } from "openclaw/plugin-sdk/image-generation";
|
||||
import {
|
||||
parseOpenAiCompatibleImageResponse,
|
||||
toImageDataUrl,
|
||||
} from "openclaw/plugin-sdk/image-generation";
|
||||
import { isProviderApiKeyConfigured } from "openclaw/plugin-sdk/provider-auth";
|
||||
import { resolveApiKeyForProvider } from "openclaw/plugin-sdk/provider-auth-runtime";
|
||||
import {
|
||||
@@ -11,7 +15,6 @@ import {
|
||||
import { normalizeOptionalString } from "openclaw/plugin-sdk/text-runtime";
|
||||
import { LITELLM_BASE_URL } from "./onboard.js";
|
||||
|
||||
const DEFAULT_OUTPUT_MIME = "image/png";
|
||||
const DEFAULT_SIZE = "1024x1024";
|
||||
const DEFAULT_LITELLM_IMAGE_MODEL = "gpt-image-2";
|
||||
const LITELLM_SUPPORTED_SIZES = [
|
||||
@@ -82,10 +85,6 @@ function shouldAutoAllowPrivateLitellmEndpoint(baseUrl: string): boolean {
|
||||
}
|
||||
}
|
||||
|
||||
function toDataUrl(buffer: Buffer, mimeType: string): string {
|
||||
return `data:${mimeType};base64,${buffer.toString("base64")}`;
|
||||
}
|
||||
|
||||
type LitellmImageApiResponse = {
|
||||
data?: Array<{
|
||||
b64_json?: string;
|
||||
@@ -167,7 +166,7 @@ export function buildLitellmImageGenerationProvider(): ImageGenerationProvider {
|
||||
n: count,
|
||||
size,
|
||||
images: inputImages.map((image) => ({
|
||||
image_url: toDataUrl(image.buffer, image.mimeType?.trim() || DEFAULT_OUTPUT_MIME),
|
||||
image_url: toImageDataUrl(image),
|
||||
})),
|
||||
}
|
||||
: {
|
||||
@@ -192,21 +191,7 @@ export function buildLitellmImageGenerationProvider(): ImageGenerationProvider {
|
||||
);
|
||||
|
||||
const data = (await response.json()) as LitellmImageApiResponse;
|
||||
const images = (data.data ?? [])
|
||||
.map((entry, index) => {
|
||||
if (!entry.b64_json) {
|
||||
return null;
|
||||
}
|
||||
return Object.assign(
|
||||
{
|
||||
buffer: Buffer.from(entry.b64_json, `base64`),
|
||||
mimeType: DEFAULT_OUTPUT_MIME,
|
||||
fileName: `image-${index + 1}.png`,
|
||||
},
|
||||
entry.revised_prompt ? { revisedPrompt: entry.revised_prompt } : {},
|
||||
);
|
||||
})
|
||||
.filter((entry): entry is NonNullable<typeof entry> => entry !== null);
|
||||
const images = parseOpenAiCompatibleImageResponse(data);
|
||||
|
||||
return {
|
||||
images,
|
||||
|
||||
@@ -3,6 +3,11 @@ import type {
|
||||
ImageGenerationProvider,
|
||||
ImageGenerationRequest,
|
||||
} from "openclaw/plugin-sdk/image-generation";
|
||||
import {
|
||||
generatedImageAssetFromBase64,
|
||||
generatedImageAssetFromDataUrl,
|
||||
toImageDataUrl,
|
||||
} from "openclaw/plugin-sdk/image-generation";
|
||||
import { isProviderApiKeyConfigured } from "openclaw/plugin-sdk/provider-auth";
|
||||
import { resolveApiKeyForProvider } from "openclaw/plugin-sdk/provider-auth-runtime";
|
||||
import {
|
||||
@@ -14,7 +19,6 @@ import { normalizeOptionalString } from "openclaw/plugin-sdk/text-runtime";
|
||||
import { OPENROUTER_BASE_URL } from "./provider-catalog.js";
|
||||
|
||||
const DEFAULT_MODEL = "google/gemini-3.1-flash-image-preview";
|
||||
const DEFAULT_OUTPUT_MIME = "image/png";
|
||||
const DEFAULT_TIMEOUT_MS = 90_000;
|
||||
const MAX_IMAGE_RESULTS = 4;
|
||||
const SUPPORTED_MODELS = [
|
||||
@@ -49,56 +53,12 @@ type OpenRouterChatCompletionResponse = {
|
||||
}>;
|
||||
};
|
||||
|
||||
function parseDataUrl(dataUrl: string): { mimeType: string; data: string } | undefined {
|
||||
const match = dataUrl.match(/^data:([^;]+);base64,(.+)$/s);
|
||||
if (!match) {
|
||||
return undefined;
|
||||
}
|
||||
const [, mimeType, data] = match;
|
||||
if (!mimeType || !data) {
|
||||
return undefined;
|
||||
}
|
||||
return { mimeType, data };
|
||||
}
|
||||
|
||||
function fileExtensionForMimeType(mimeType: string): string {
|
||||
if (mimeType.includes("jpeg") || mimeType.includes("jpg")) {
|
||||
return "jpg";
|
||||
}
|
||||
if (mimeType.includes("webp")) {
|
||||
return "webp";
|
||||
}
|
||||
if (mimeType.includes("gif")) {
|
||||
return "gif";
|
||||
}
|
||||
return mimeType.split("/")[1] ?? "png";
|
||||
}
|
||||
|
||||
function toGeneratedImage(params: {
|
||||
base64: string;
|
||||
index: number;
|
||||
mimeType?: string;
|
||||
}): GeneratedImageAsset {
|
||||
const mimeType = params.mimeType ?? DEFAULT_OUTPUT_MIME;
|
||||
return {
|
||||
buffer: Buffer.from(params.base64, "base64"),
|
||||
mimeType,
|
||||
fileName: `image-${params.index + 1}.${fileExtensionForMimeType(mimeType)}`,
|
||||
};
|
||||
}
|
||||
|
||||
function pushDataUrlImage(images: GeneratedImageAsset[], dataUrl: string): void {
|
||||
const parsed = parseDataUrl(dataUrl);
|
||||
if (!parsed) {
|
||||
const image = generatedImageAssetFromDataUrl({ dataUrl, index: images.length });
|
||||
if (!image) {
|
||||
return;
|
||||
}
|
||||
images.push(
|
||||
toGeneratedImage({
|
||||
base64: parsed.data,
|
||||
index: images.length,
|
||||
mimeType: parsed.mimeType,
|
||||
}),
|
||||
);
|
||||
images.push(image);
|
||||
}
|
||||
|
||||
function extractImagesFromPart(images: GeneratedImageAsset[], part: unknown): void {
|
||||
@@ -117,7 +77,10 @@ function extractImagesFromPart(images: GeneratedImageAsset[], part: unknown): vo
|
||||
|
||||
const rawBase64 = typeof value.b64_json === "string" ? value.b64_json : undefined;
|
||||
if (rawBase64) {
|
||||
images.push(toGeneratedImage({ base64: rawBase64, index: images.length }));
|
||||
const image = generatedImageAssetFromBase64({ base64: rawBase64, index: images.length });
|
||||
if (image) {
|
||||
images.push(image);
|
||||
}
|
||||
return;
|
||||
}
|
||||
|
||||
@@ -129,8 +92,15 @@ function extractImagesFromPart(images: GeneratedImageAsset[], part: unknown): vo
|
||||
const mimeType =
|
||||
(typeof inlineData?.mimeType === "string" ? inlineData.mimeType : undefined) ??
|
||||
(typeof inlineData?.mime_type === "string" ? inlineData.mime_type : undefined) ??
|
||||
DEFAULT_OUTPUT_MIME;
|
||||
images.push(toGeneratedImage({ base64: data, index: images.length, mimeType }));
|
||||
"image/png";
|
||||
const image = generatedImageAssetFromBase64({
|
||||
base64: data,
|
||||
index: images.length,
|
||||
mimeType,
|
||||
});
|
||||
if (image) {
|
||||
images.push(image);
|
||||
}
|
||||
}
|
||||
|
||||
export function extractOpenRouterImagesFromResponse(
|
||||
@@ -165,10 +135,6 @@ export function extractOpenRouterImagesFromResponse(
|
||||
return images;
|
||||
}
|
||||
|
||||
function toDataUrl(image: { buffer: Buffer; mimeType: string }): string {
|
||||
return `data:${image.mimeType};base64,${image.buffer.toString("base64")}`;
|
||||
}
|
||||
|
||||
function resolveImageCount(count: number | undefined): number {
|
||||
if (typeof count !== "number" || !Number.isFinite(count)) {
|
||||
return 1;
|
||||
@@ -193,7 +159,7 @@ function buildMessageContent(
|
||||
{ type: "text", text: req.prompt },
|
||||
...inputImages.map((image) => ({
|
||||
type: "image_url" as const,
|
||||
image_url: { url: toDataUrl(image) },
|
||||
image_url: { url: toImageDataUrl(image) },
|
||||
})),
|
||||
];
|
||||
}
|
||||
|
||||
@@ -1,20 +1,9 @@
|
||||
import {
|
||||
assertOkOrThrowHttpError,
|
||||
postJsonRequest,
|
||||
resolveProviderHttpRequestConfig,
|
||||
} from "openclaw/plugin-sdk/provider-http";
|
||||
import { normalizeResolvedSecretInputString } from "openclaw/plugin-sdk/secret-input";
|
||||
import {
|
||||
asFiniteNumber,
|
||||
asObject,
|
||||
trimToUndefined,
|
||||
type SpeechDirectiveTokenParseContext,
|
||||
type SpeechProviderConfig,
|
||||
type SpeechProviderOverrides,
|
||||
createOpenAiCompatibleSpeechProvider,
|
||||
type SpeechProviderPlugin,
|
||||
} from "openclaw/plugin-sdk/speech";
|
||||
import { normalizeOptionalLowercaseString } from "openclaw/plugin-sdk/text-runtime";
|
||||
import { normalizeOpenRouterBaseUrl, OPENROUTER_BASE_URL } from "./provider-catalog.js";
|
||||
import { OPENROUTER_BASE_URL } from "./provider-catalog.js";
|
||||
|
||||
const DEFAULT_OPENROUTER_TTS_MODEL = "hexgrad/kokoro-82m";
|
||||
const DEFAULT_OPENROUTER_TTS_VOICE = "af_alloy";
|
||||
@@ -26,278 +15,32 @@ const OPENROUTER_TTS_MODELS = [
|
||||
] as const;
|
||||
const OPENROUTER_TTS_RESPONSE_FORMATS = ["mp3", "pcm"] as const;
|
||||
|
||||
type OpenRouterTtsResponseFormat = (typeof OPENROUTER_TTS_RESPONSE_FORMATS)[number];
|
||||
|
||||
type OpenRouterTtsProviderConfig = {
|
||||
apiKey?: string;
|
||||
baseUrl?: string;
|
||||
model: string;
|
||||
voice: string;
|
||||
speed?: number;
|
||||
responseFormat?: OpenRouterTtsResponseFormat;
|
||||
type OpenRouterTtsExtraConfig = {
|
||||
provider?: Record<string, unknown>;
|
||||
};
|
||||
|
||||
type OpenRouterTtsProviderOverrides = {
|
||||
model?: string;
|
||||
voice?: string;
|
||||
speed?: number;
|
||||
};
|
||||
|
||||
function normalizeOpenRouterTtsResponseFormat(
|
||||
value: unknown,
|
||||
): OpenRouterTtsResponseFormat | undefined {
|
||||
const next = normalizeOptionalLowercaseString(value);
|
||||
if (!next) {
|
||||
return undefined;
|
||||
}
|
||||
if (OPENROUTER_TTS_RESPONSE_FORMATS.some((format) => format === next)) {
|
||||
return next as OpenRouterTtsResponseFormat;
|
||||
}
|
||||
throw new Error(`Invalid OpenRouter speech responseFormat: ${next}`);
|
||||
}
|
||||
|
||||
function normalizeOpenRouterTtsBaseUrl(value: unknown): string {
|
||||
return (
|
||||
normalizeOpenRouterBaseUrl(trimToUndefined(value) ?? OPENROUTER_BASE_URL) ?? OPENROUTER_BASE_URL
|
||||
);
|
||||
}
|
||||
|
||||
function resolveOpenRouterProviderConfigRecord(
|
||||
rawConfig: Record<string, unknown>,
|
||||
): Record<string, unknown> | undefined {
|
||||
const providers = asObject(rawConfig.providers);
|
||||
return asObject(providers?.openrouter) ?? asObject(rawConfig.openrouter);
|
||||
}
|
||||
|
||||
function normalizeOpenRouterTtsProviderConfig(
|
||||
rawConfig: Record<string, unknown>,
|
||||
): OpenRouterTtsProviderConfig {
|
||||
const raw = resolveOpenRouterProviderConfigRecord(rawConfig);
|
||||
return {
|
||||
apiKey: normalizeResolvedSecretInputString({
|
||||
value: raw?.apiKey,
|
||||
path: "messages.tts.providers.openrouter.apiKey",
|
||||
}),
|
||||
baseUrl:
|
||||
trimToUndefined(raw?.baseUrl) == null
|
||||
? undefined
|
||||
: normalizeOpenRouterTtsBaseUrl(raw?.baseUrl),
|
||||
model: trimToUndefined(raw?.model ?? raw?.modelId) ?? DEFAULT_OPENROUTER_TTS_MODEL,
|
||||
voice: trimToUndefined(raw?.voice ?? raw?.voiceId) ?? DEFAULT_OPENROUTER_TTS_VOICE,
|
||||
speed: asFiniteNumber(raw?.speed),
|
||||
responseFormat: normalizeOpenRouterTtsResponseFormat(raw?.responseFormat),
|
||||
provider: asObject(raw?.provider),
|
||||
};
|
||||
}
|
||||
|
||||
function readOpenRouterTtsProviderConfig(
|
||||
config: SpeechProviderConfig,
|
||||
): OpenRouterTtsProviderConfig {
|
||||
const normalized = normalizeOpenRouterTtsProviderConfig({});
|
||||
return {
|
||||
apiKey: trimToUndefined(config.apiKey) ?? normalized.apiKey,
|
||||
baseUrl:
|
||||
trimToUndefined(config.baseUrl) == null
|
||||
? normalized.baseUrl
|
||||
: normalizeOpenRouterTtsBaseUrl(config.baseUrl),
|
||||
model: trimToUndefined(config.model ?? config.modelId) ?? normalized.model,
|
||||
voice: trimToUndefined(config.voice ?? config.voiceId) ?? normalized.voice,
|
||||
speed: asFiniteNumber(config.speed) ?? normalized.speed,
|
||||
responseFormat:
|
||||
normalizeOpenRouterTtsResponseFormat(config.responseFormat) ?? normalized.responseFormat,
|
||||
provider: asObject(config.provider) ?? normalized.provider,
|
||||
};
|
||||
}
|
||||
|
||||
function readOpenRouterTtsOverrides(
|
||||
overrides: SpeechProviderOverrides | undefined,
|
||||
): OpenRouterTtsProviderOverrides {
|
||||
if (!overrides) {
|
||||
return {};
|
||||
}
|
||||
return {
|
||||
model: trimToUndefined(overrides.model ?? overrides.modelId),
|
||||
voice: trimToUndefined(overrides.voice ?? overrides.voiceId),
|
||||
speed: asFiniteNumber(overrides.speed),
|
||||
};
|
||||
}
|
||||
|
||||
function resolveOpenRouterTtsApiKey(params: {
|
||||
cfg?: { models?: { providers?: { openrouter?: { apiKey?: unknown } } } };
|
||||
providerConfig: OpenRouterTtsProviderConfig;
|
||||
}): string | undefined {
|
||||
return (
|
||||
params.providerConfig.apiKey ??
|
||||
normalizeResolvedSecretInputString({
|
||||
value: params.cfg?.models?.providers?.openrouter?.apiKey,
|
||||
path: "models.providers.openrouter.apiKey",
|
||||
}) ??
|
||||
trimToUndefined(process.env.OPENROUTER_API_KEY)
|
||||
);
|
||||
}
|
||||
|
||||
function resolveOpenRouterTtsBaseUrl(params: {
|
||||
cfg?: { models?: { providers?: { openrouter?: { baseUrl?: unknown } } } };
|
||||
providerConfig: OpenRouterTtsProviderConfig;
|
||||
}): string {
|
||||
return normalizeOpenRouterTtsBaseUrl(
|
||||
params.providerConfig.baseUrl ??
|
||||
trimToUndefined(params.cfg?.models?.providers?.openrouter?.baseUrl) ??
|
||||
OPENROUTER_BASE_URL,
|
||||
);
|
||||
}
|
||||
|
||||
function resolveOpenRouterTtsResponseFormat(
|
||||
configuredFormat?: OpenRouterTtsResponseFormat,
|
||||
): OpenRouterTtsResponseFormat {
|
||||
if (configuredFormat) {
|
||||
return configuredFormat;
|
||||
}
|
||||
return "mp3";
|
||||
}
|
||||
|
||||
function responseFormatToFileExtension(format: OpenRouterTtsResponseFormat): ".mp3" | ".pcm" {
|
||||
return format === "pcm" ? ".pcm" : ".mp3";
|
||||
}
|
||||
|
||||
function parseDirectiveToken(ctx: SpeechDirectiveTokenParseContext): {
|
||||
handled: boolean;
|
||||
overrides?: SpeechProviderOverrides;
|
||||
} {
|
||||
switch (ctx.key) {
|
||||
case "voice":
|
||||
case "voice_id":
|
||||
case "voiceid":
|
||||
case "openrouter_voice":
|
||||
case "openroutervoice":
|
||||
if (!ctx.policy.allowVoice) {
|
||||
return { handled: true };
|
||||
}
|
||||
return { handled: true, overrides: { voice: ctx.value } };
|
||||
case "model":
|
||||
case "model_id":
|
||||
case "modelid":
|
||||
case "openrouter_model":
|
||||
case "openroutermodel":
|
||||
if (!ctx.policy.allowModelId) {
|
||||
return { handled: true };
|
||||
}
|
||||
return { handled: true, overrides: { model: ctx.value } };
|
||||
default:
|
||||
return { handled: false };
|
||||
}
|
||||
}
|
||||
|
||||
export function buildOpenRouterSpeechProvider(): SpeechProviderPlugin {
|
||||
return {
|
||||
return createOpenAiCompatibleSpeechProvider<OpenRouterTtsExtraConfig>({
|
||||
id: "openrouter",
|
||||
label: "OpenRouter",
|
||||
autoSelectOrder: 35,
|
||||
models: OPENROUTER_TTS_MODELS,
|
||||
voices: [DEFAULT_OPENROUTER_TTS_VOICE],
|
||||
resolveConfig: ({ rawConfig }) => normalizeOpenRouterTtsProviderConfig(rawConfig),
|
||||
parseDirectiveToken,
|
||||
resolveTalkConfig: ({ baseTtsConfig, talkProviderConfig }) => {
|
||||
const base = normalizeOpenRouterTtsProviderConfig(baseTtsConfig);
|
||||
const responseFormat = normalizeOpenRouterTtsResponseFormat(
|
||||
talkProviderConfig.responseFormat,
|
||||
);
|
||||
return {
|
||||
...base,
|
||||
...(talkProviderConfig.apiKey === undefined
|
||||
? {}
|
||||
: {
|
||||
apiKey: normalizeResolvedSecretInputString({
|
||||
value: talkProviderConfig.apiKey,
|
||||
path: "talk.providers.openrouter.apiKey",
|
||||
}),
|
||||
}),
|
||||
...(trimToUndefined(talkProviderConfig.baseUrl) == null
|
||||
? {}
|
||||
: { baseUrl: normalizeOpenRouterTtsBaseUrl(talkProviderConfig.baseUrl) }),
|
||||
...(trimToUndefined(talkProviderConfig.modelId) == null
|
||||
? {}
|
||||
: { model: trimToUndefined(talkProviderConfig.modelId) }),
|
||||
...(trimToUndefined(talkProviderConfig.voiceId) == null
|
||||
? {}
|
||||
: { voice: trimToUndefined(talkProviderConfig.voiceId) }),
|
||||
...(asFiniteNumber(talkProviderConfig.speed) == null
|
||||
? {}
|
||||
: { speed: asFiniteNumber(talkProviderConfig.speed) }),
|
||||
...(responseFormat == null ? {} : { responseFormat }),
|
||||
};
|
||||
defaultModel: DEFAULT_OPENROUTER_TTS_MODEL,
|
||||
defaultVoice: DEFAULT_OPENROUTER_TTS_VOICE,
|
||||
defaultBaseUrl: OPENROUTER_BASE_URL,
|
||||
envKey: "OPENROUTER_API_KEY",
|
||||
responseFormats: OPENROUTER_TTS_RESPONSE_FORMATS,
|
||||
defaultResponseFormat: "mp3",
|
||||
voiceCompatibleResponseFormats: ["mp3"],
|
||||
baseUrlPolicy: { kind: "canonical", aliases: ["https://openrouter.ai/v1"] },
|
||||
extraHeaders: {
|
||||
"HTTP-Referer": "https://openclaw.ai",
|
||||
"X-OpenRouter-Title": "OpenClaw",
|
||||
},
|
||||
resolveTalkOverrides: ({ params }) => ({
|
||||
...(trimToUndefined(params.voiceId ?? params.voice) == null
|
||||
? {}
|
||||
: { voice: trimToUndefined(params.voiceId ?? params.voice) }),
|
||||
...(trimToUndefined(params.modelId ?? params.model) == null
|
||||
? {}
|
||||
: { model: trimToUndefined(params.modelId ?? params.model) }),
|
||||
...(asFiniteNumber(params.speed) == null ? {} : { speed: asFiniteNumber(params.speed) }),
|
||||
}),
|
||||
listVoices: async () => [
|
||||
{ id: DEFAULT_OPENROUTER_TTS_VOICE, name: DEFAULT_OPENROUTER_TTS_VOICE },
|
||||
],
|
||||
isConfigured: ({ cfg, providerConfig }) => {
|
||||
const config = readOpenRouterTtsProviderConfig(providerConfig);
|
||||
return Boolean(resolveOpenRouterTtsApiKey({ cfg, providerConfig: config }));
|
||||
},
|
||||
synthesize: async (req) => {
|
||||
const config = readOpenRouterTtsProviderConfig(req.providerConfig);
|
||||
const overrides = readOpenRouterTtsOverrides(req.providerOverrides);
|
||||
const apiKey = resolveOpenRouterTtsApiKey({ cfg: req.cfg, providerConfig: config });
|
||||
if (!apiKey) {
|
||||
throw new Error("OpenRouter API key missing");
|
||||
}
|
||||
|
||||
const baseUrl = resolveOpenRouterTtsBaseUrl({ cfg: req.cfg, providerConfig: config });
|
||||
const responseFormat = resolveOpenRouterTtsResponseFormat(config.responseFormat);
|
||||
const speed = overrides.speed ?? config.speed;
|
||||
const { allowPrivateNetwork, headers, dispatcherPolicy } = resolveProviderHttpRequestConfig({
|
||||
baseUrl,
|
||||
defaultBaseUrl: OPENROUTER_BASE_URL,
|
||||
allowPrivateNetwork: false,
|
||||
defaultHeaders: {
|
||||
Authorization: `Bearer ${apiKey}`,
|
||||
"Content-Type": "application/json",
|
||||
"HTTP-Referer": "https://openclaw.ai",
|
||||
"X-OpenRouter-Title": "OpenClaw",
|
||||
},
|
||||
provider: "openrouter",
|
||||
capability: "audio",
|
||||
transport: "http",
|
||||
});
|
||||
|
||||
const { response, release } = await postJsonRequest({
|
||||
url: `${baseUrl}/audio/speech`,
|
||||
headers,
|
||||
body: {
|
||||
model: overrides.model ?? config.model,
|
||||
input: req.text,
|
||||
voice: overrides.voice ?? config.voice,
|
||||
response_format: responseFormat,
|
||||
...(speed == null ? {} : { speed }),
|
||||
...(config.provider == null ? {} : { provider: config.provider }),
|
||||
},
|
||||
timeoutMs: req.timeoutMs,
|
||||
fetchFn: fetch,
|
||||
allowPrivateNetwork,
|
||||
dispatcherPolicy,
|
||||
});
|
||||
|
||||
try {
|
||||
await assertOkOrThrowHttpError(response, "OpenRouter TTS API error");
|
||||
return {
|
||||
audioBuffer: Buffer.from(await response.arrayBuffer()),
|
||||
outputFormat: responseFormat,
|
||||
fileExtension: responseFormatToFileExtension(responseFormat),
|
||||
voiceCompatible: responseFormat === "mp3",
|
||||
};
|
||||
} finally {
|
||||
await release();
|
||||
}
|
||||
},
|
||||
};
|
||||
apiErrorLabel: "OpenRouter TTS API error",
|
||||
missingApiKeyError: "OpenRouter API key missing",
|
||||
readExtraConfig: (raw) => ({ provider: asObject(raw?.provider) }),
|
||||
extraJsonBodyFields: [{ configKey: "provider" }],
|
||||
});
|
||||
}
|
||||
|
||||
@@ -1,9 +1,12 @@
|
||||
import type {
|
||||
GeneratedImageAsset,
|
||||
ImageGenerationProvider,
|
||||
ImageGenerationRequest,
|
||||
ImageGenerationResult,
|
||||
} from "openclaw/plugin-sdk/image-generation";
|
||||
import {
|
||||
parseOpenAiCompatibleImageResponse,
|
||||
toImageDataUrl,
|
||||
} from "openclaw/plugin-sdk/image-generation";
|
||||
import { isProviderApiKeyConfigured } from "openclaw/plugin-sdk/provider-auth";
|
||||
import { resolveApiKeyForProvider } from "openclaw/plugin-sdk/provider-auth-runtime";
|
||||
import {
|
||||
@@ -19,7 +22,6 @@ import {
|
||||
} from "openclaw/plugin-sdk/text-runtime";
|
||||
import { XAI_BASE_URL, XAI_DEFAULT_IMAGE_MODEL, XAI_IMAGE_MODELS } from "./model-definitions.js";
|
||||
|
||||
const DEFAULT_OUTPUT_MIME = "image/png";
|
||||
const DEFAULT_TIMEOUT_MS = 60_000;
|
||||
|
||||
const XAI_SUPPORTED_ASPECT_RATIOS = ["1:1", "16:9", "9:16", "4:3", "3:4", "2:3", "3:2"] as const;
|
||||
@@ -32,10 +34,6 @@ type XaiImageApiResponse = {
|
||||
}>;
|
||||
};
|
||||
|
||||
function toDataUrl(buffer: Buffer, mimeType: string): string {
|
||||
return `data:${mimeType};base64,${buffer.toString("base64")}`;
|
||||
}
|
||||
|
||||
function resolveImageForEdit(
|
||||
input: { url?: string; buffer?: Buffer; mimeType?: string } | undefined,
|
||||
): string {
|
||||
@@ -49,8 +47,7 @@ function resolveImageForEdit(
|
||||
if (!input.buffer) {
|
||||
throw new Error("xAI image edit input is missing both URL and buffer data.");
|
||||
}
|
||||
const mime = normalizeOptionalString(input.mimeType) ?? "image/png";
|
||||
return toDataUrl(input.buffer, mime);
|
||||
return toImageDataUrl({ buffer: input.buffer, mimeType: input.mimeType });
|
||||
}
|
||||
|
||||
function isEdit(req: ImageGenerationRequest): boolean {
|
||||
@@ -187,26 +184,7 @@ export function buildXaiImageGenerationProvider(): ImageGenerationProvider {
|
||||
);
|
||||
|
||||
const payload = (await response.json()) as XaiImageApiResponse;
|
||||
const images: GeneratedImageAsset[] = (payload.data ?? []).flatMap((item, idx) => {
|
||||
if (!item) {
|
||||
return [];
|
||||
}
|
||||
const b64 = normalizeOptionalString(item.b64_json);
|
||||
if (!b64) {
|
||||
return [];
|
||||
}
|
||||
const mimeType = normalizeOptionalString(item.mime_type) ?? DEFAULT_OUTPUT_MIME;
|
||||
return [
|
||||
{
|
||||
buffer: Buffer.from(b64, "base64"),
|
||||
mimeType,
|
||||
fileName: `image-${idx + 1}.${mimeType.split("/")[1] || "png"}`,
|
||||
...(item.revised_prompt
|
||||
? { revisedPrompt: normalizeOptionalString(item.revised_prompt) }
|
||||
: {}),
|
||||
},
|
||||
];
|
||||
});
|
||||
const images = parseOpenAiCompatibleImageResponse(payload);
|
||||
|
||||
return {
|
||||
images,
|
||||
|
||||
86
src/image-generation/image-assets.test.ts
Normal file
86
src/image-generation/image-assets.test.ts
Normal file
@@ -0,0 +1,86 @@
|
||||
import { describe, expect, it } from "vitest";
|
||||
import {
|
||||
generatedImageAssetFromDataUrl,
|
||||
imageFileExtensionForMimeType,
|
||||
imageSourceUploadFileName,
|
||||
parseImageDataUrl,
|
||||
parseOpenAiCompatibleImageResponse,
|
||||
sniffImageMimeType,
|
||||
toImageDataUrl,
|
||||
} from "./image-assets.js";
|
||||
|
||||
describe("image asset helpers", () => {
|
||||
it("converts buffers to image data URLs and parses them back", () => {
|
||||
const buffer = Buffer.from("png-bytes");
|
||||
const dataUrl = toImageDataUrl({ buffer, mimeType: "image/png" });
|
||||
|
||||
expect(dataUrl).toBe(`data:image/png;base64,${buffer.toString("base64")}`);
|
||||
expect(parseImageDataUrl(dataUrl)).toEqual({
|
||||
mimeType: "image/png",
|
||||
base64: buffer.toString("base64"),
|
||||
});
|
||||
expect(generatedImageAssetFromDataUrl({ dataUrl, index: 1 })).toMatchObject({
|
||||
buffer,
|
||||
mimeType: "image/png",
|
||||
fileName: "image-2.png",
|
||||
});
|
||||
});
|
||||
|
||||
it("normalizes image file extensions", () => {
|
||||
expect(imageFileExtensionForMimeType("image/jpeg")).toBe("jpg");
|
||||
expect(imageFileExtensionForMimeType("image/webp")).toBe("webp");
|
||||
expect(imageFileExtensionForMimeType("image/svg+xml")).toBe("svg");
|
||||
expect(imageFileExtensionForMimeType(undefined, "jpg")).toBe("jpg");
|
||||
});
|
||||
|
||||
it("sniffs common generated image types", () => {
|
||||
expect(sniffImageMimeType(Buffer.from([0xff, 0xd8, 0xff]))).toEqual({
|
||||
mimeType: "image/jpeg",
|
||||
extension: "jpg",
|
||||
});
|
||||
expect(sniffImageMimeType(Buffer.from([0x89, 0x50, 0x4e, 0x47, 0, 0, 0, 0]))).toEqual({
|
||||
mimeType: "image/png",
|
||||
extension: "png",
|
||||
});
|
||||
});
|
||||
|
||||
it("parses OpenAI-compatible base64 image responses", () => {
|
||||
const jpegBytes = Buffer.from([0xff, 0xd8, 0xff, 0xdb]);
|
||||
const images = parseOpenAiCompatibleImageResponse(
|
||||
{
|
||||
data: [
|
||||
{
|
||||
b64_json: jpegBytes.toString("base64"),
|
||||
revised_prompt: "revised",
|
||||
},
|
||||
{ b64_json: "" },
|
||||
],
|
||||
},
|
||||
{ defaultMimeType: "image/png", sniffMimeType: true },
|
||||
);
|
||||
|
||||
expect(images).toEqual([
|
||||
{
|
||||
buffer: jpegBytes,
|
||||
mimeType: "image/jpeg",
|
||||
fileName: "image-1.jpg",
|
||||
revisedPrompt: "revised",
|
||||
},
|
||||
]);
|
||||
});
|
||||
|
||||
it("resolves source upload filenames from explicit names or MIME types", () => {
|
||||
expect(
|
||||
imageSourceUploadFileName({
|
||||
image: { buffer: Buffer.from("x"), mimeType: "image/webp" },
|
||||
index: 2,
|
||||
}),
|
||||
).toBe("image-3.webp");
|
||||
expect(
|
||||
imageSourceUploadFileName({
|
||||
image: { buffer: Buffer.from("x"), mimeType: "image/png", fileName: "source.png" },
|
||||
index: 0,
|
||||
}),
|
||||
).toBe("source.png");
|
||||
});
|
||||
});
|
||||
200
src/image-generation/image-assets.ts
Normal file
200
src/image-generation/image-assets.ts
Normal file
@@ -0,0 +1,200 @@
|
||||
import {
|
||||
normalizeOptionalLowercaseString,
|
||||
normalizeOptionalString,
|
||||
} from "../shared/string-coerce.js";
|
||||
import type { GeneratedImageAsset, ImageGenerationSourceImage } from "./types.js";
|
||||
|
||||
const DEFAULT_IMAGE_MIME_TYPE = "image/png";
|
||||
const DEFAULT_IMAGE_FILE_PREFIX = "image";
|
||||
|
||||
export type ImageMimeTypeDetection = {
|
||||
mimeType: string;
|
||||
extension: string;
|
||||
};
|
||||
|
||||
export type OpenAiCompatibleImageResponseEntry = {
|
||||
b64_json?: unknown;
|
||||
mime_type?: unknown;
|
||||
revised_prompt?: unknown;
|
||||
};
|
||||
|
||||
export type OpenAiCompatibleImageResponsePayload = {
|
||||
data?: OpenAiCompatibleImageResponseEntry[];
|
||||
};
|
||||
|
||||
export function imageFileExtensionForMimeType(
|
||||
mimeType: string | undefined,
|
||||
fallback = "png",
|
||||
): string {
|
||||
const normalized = normalizeOptionalLowercaseString(mimeType)?.split(";")[0]?.trim();
|
||||
if (!normalized) {
|
||||
return fallback;
|
||||
}
|
||||
if (normalized.includes("jpeg") || normalized.includes("jpg")) {
|
||||
return "jpg";
|
||||
}
|
||||
if (normalized.includes("svg")) {
|
||||
return "svg";
|
||||
}
|
||||
const slashIndex = normalized.indexOf("/");
|
||||
return slashIndex >= 0 ? normalized.slice(slashIndex + 1) || fallback : fallback;
|
||||
}
|
||||
|
||||
export function sniffImageMimeType(
|
||||
buffer: Buffer,
|
||||
fallbackMimeType = DEFAULT_IMAGE_MIME_TYPE,
|
||||
): ImageMimeTypeDetection {
|
||||
if (buffer.length >= 3 && buffer[0] === 0xff && buffer[1] === 0xd8 && buffer[2] === 0xff) {
|
||||
return { mimeType: "image/jpeg", extension: "jpg" };
|
||||
}
|
||||
if (
|
||||
buffer.length >= 8 &&
|
||||
buffer[0] === 0x89 &&
|
||||
buffer[1] === 0x50 &&
|
||||
buffer[2] === 0x4e &&
|
||||
buffer[3] === 0x47
|
||||
) {
|
||||
return { mimeType: "image/png", extension: "png" };
|
||||
}
|
||||
if (
|
||||
buffer.length >= 12 &&
|
||||
buffer.toString("ascii", 0, 4) === "RIFF" &&
|
||||
buffer.toString("ascii", 8, 12) === "WEBP"
|
||||
) {
|
||||
return { mimeType: "image/webp", extension: "webp" };
|
||||
}
|
||||
return {
|
||||
mimeType: fallbackMimeType,
|
||||
extension: imageFileExtensionForMimeType(fallbackMimeType),
|
||||
};
|
||||
}
|
||||
|
||||
export function toImageDataUrl(params: {
|
||||
buffer: Buffer;
|
||||
mimeType?: string;
|
||||
defaultMimeType?: string;
|
||||
}): string {
|
||||
const mimeType =
|
||||
normalizeOptionalString(params.mimeType) ??
|
||||
normalizeOptionalString(params.defaultMimeType) ??
|
||||
DEFAULT_IMAGE_MIME_TYPE;
|
||||
return `data:${mimeType};base64,${params.buffer.toString("base64")}`;
|
||||
}
|
||||
|
||||
export function parseImageDataUrl(
|
||||
dataUrl: string,
|
||||
): { mimeType: string; base64: string } | undefined {
|
||||
const match = dataUrl.match(/^data:(image\/[^;,]+)(?:;[^,]*)?;base64,(.+)$/is);
|
||||
if (!match) {
|
||||
return undefined;
|
||||
}
|
||||
const mimeType = normalizeOptionalString(match[1]);
|
||||
const base64 = normalizeOptionalString(match[2]);
|
||||
if (!mimeType || !base64) {
|
||||
return undefined;
|
||||
}
|
||||
return { mimeType, base64 };
|
||||
}
|
||||
|
||||
export function generatedImageAssetFromBase64(params: {
|
||||
base64: string | undefined;
|
||||
index: number;
|
||||
mimeType?: string;
|
||||
revisedPrompt?: string;
|
||||
defaultMimeType?: string;
|
||||
fileNamePrefix?: string;
|
||||
sniffMimeType?: boolean;
|
||||
}): GeneratedImageAsset | undefined {
|
||||
const base64 = normalizeOptionalString(params.base64);
|
||||
if (!base64) {
|
||||
return undefined;
|
||||
}
|
||||
const buffer = Buffer.from(base64, "base64");
|
||||
const explicitMimeType = normalizeOptionalString(params.mimeType);
|
||||
const defaultMimeType =
|
||||
normalizeOptionalString(params.defaultMimeType) ?? DEFAULT_IMAGE_MIME_TYPE;
|
||||
const detected =
|
||||
params.sniffMimeType && !explicitMimeType
|
||||
? sniffImageMimeType(buffer, defaultMimeType)
|
||||
: undefined;
|
||||
const mimeType = explicitMimeType ?? detected?.mimeType ?? defaultMimeType;
|
||||
const prefix = normalizeOptionalString(params.fileNamePrefix) ?? DEFAULT_IMAGE_FILE_PREFIX;
|
||||
const image: GeneratedImageAsset = {
|
||||
buffer,
|
||||
mimeType,
|
||||
fileName: `${prefix}-${params.index + 1}.${detected?.extension ?? imageFileExtensionForMimeType(mimeType)}`,
|
||||
};
|
||||
const revisedPrompt = normalizeOptionalString(params.revisedPrompt);
|
||||
if (revisedPrompt) {
|
||||
image.revisedPrompt = revisedPrompt;
|
||||
}
|
||||
return image;
|
||||
}
|
||||
|
||||
export function generatedImageAssetFromDataUrl(params: {
|
||||
dataUrl: string;
|
||||
index: number;
|
||||
fileNamePrefix?: string;
|
||||
}): GeneratedImageAsset | undefined {
|
||||
const parsed = parseImageDataUrl(params.dataUrl);
|
||||
if (!parsed) {
|
||||
return undefined;
|
||||
}
|
||||
return generatedImageAssetFromBase64({
|
||||
base64: parsed.base64,
|
||||
index: params.index,
|
||||
mimeType: parsed.mimeType,
|
||||
fileNamePrefix: params.fileNamePrefix,
|
||||
});
|
||||
}
|
||||
|
||||
export function generatedImageAssetFromOpenAiCompatibleEntry(
|
||||
entry: OpenAiCompatibleImageResponseEntry,
|
||||
index: number,
|
||||
options: {
|
||||
defaultMimeType?: string;
|
||||
fileNamePrefix?: string;
|
||||
sniffMimeType?: boolean;
|
||||
} = {},
|
||||
): GeneratedImageAsset | undefined {
|
||||
return generatedImageAssetFromBase64({
|
||||
base64: normalizeOptionalString(entry.b64_json),
|
||||
index,
|
||||
mimeType: normalizeOptionalString(entry.mime_type),
|
||||
revisedPrompt: normalizeOptionalString(entry.revised_prompt),
|
||||
defaultMimeType: options.defaultMimeType,
|
||||
fileNamePrefix: options.fileNamePrefix,
|
||||
sniffMimeType: options.sniffMimeType,
|
||||
});
|
||||
}
|
||||
|
||||
export function parseOpenAiCompatibleImageResponse(
|
||||
payload: OpenAiCompatibleImageResponsePayload,
|
||||
options: {
|
||||
defaultMimeType?: string;
|
||||
fileNamePrefix?: string;
|
||||
sniffMimeType?: boolean;
|
||||
} = {},
|
||||
): GeneratedImageAsset[] {
|
||||
return (payload.data ?? [])
|
||||
.map((entry, index) => generatedImageAssetFromOpenAiCompatibleEntry(entry, index, options))
|
||||
.filter((entry): entry is GeneratedImageAsset => entry !== undefined);
|
||||
}
|
||||
|
||||
export function imageSourceUploadFileName(params: {
|
||||
image: ImageGenerationSourceImage;
|
||||
index: number;
|
||||
defaultMimeType?: string;
|
||||
fileNamePrefix?: string;
|
||||
}): string {
|
||||
const fileName = normalizeOptionalString(params.image.fileName);
|
||||
if (fileName) {
|
||||
return fileName;
|
||||
}
|
||||
const mimeType =
|
||||
normalizeOptionalString(params.image.mimeType) ??
|
||||
normalizeOptionalString(params.defaultMimeType) ??
|
||||
DEFAULT_IMAGE_MIME_TYPE;
|
||||
const prefix = normalizeOptionalString(params.fileNamePrefix) ?? DEFAULT_IMAGE_FILE_PREFIX;
|
||||
return `${prefix}-${params.index + 1}.${imageFileExtensionForMimeType(mimeType)}`;
|
||||
}
|
||||
@@ -1,5 +1,20 @@
|
||||
// Public image-generation helpers and types for provider plugins.
|
||||
|
||||
export {
|
||||
generatedImageAssetFromBase64,
|
||||
generatedImageAssetFromDataUrl,
|
||||
generatedImageAssetFromOpenAiCompatibleEntry,
|
||||
imageFileExtensionForMimeType,
|
||||
imageSourceUploadFileName,
|
||||
parseImageDataUrl,
|
||||
parseOpenAiCompatibleImageResponse,
|
||||
sniffImageMimeType,
|
||||
toImageDataUrl,
|
||||
type ImageMimeTypeDetection,
|
||||
type OpenAiCompatibleImageResponseEntry,
|
||||
type OpenAiCompatibleImageResponsePayload,
|
||||
} from "../image-generation/image-assets.js";
|
||||
|
||||
export type {
|
||||
GeneratedImageAsset,
|
||||
ImageGenerationBackground,
|
||||
|
||||
@@ -88,7 +88,7 @@ export type {
|
||||
MemoryPluginPublicArtifactsProvider,
|
||||
} from "../plugins/memory-state.js";
|
||||
export type { CliBackendConfig } from "../config/types.js";
|
||||
export * from "./image-generation.js";
|
||||
export type * from "./image-generation.js";
|
||||
export * from "./music-generation.js";
|
||||
export type { SecretInput, SecretRef } from "../config/types.secrets.js";
|
||||
export type { RuntimeEnv } from "../runtime.js";
|
||||
|
||||
@@ -55,3 +55,10 @@ export {
|
||||
requireInRange,
|
||||
scheduleCleanup,
|
||||
} from "../tts/tts-provider-helpers.js";
|
||||
export {
|
||||
createOpenAiCompatibleSpeechProvider,
|
||||
type OpenAiCompatibleSpeechProviderBaseUrlPolicy,
|
||||
type OpenAiCompatibleSpeechProviderConfig,
|
||||
type OpenAiCompatibleSpeechProviderExtraJsonBodyField,
|
||||
type OpenAiCompatibleSpeechProviderOptions,
|
||||
} from "../tts/openai-compatible-speech-provider.js";
|
||||
|
||||
155
src/tts/openai-compatible-speech-provider.test.ts
Normal file
155
src/tts/openai-compatible-speech-provider.test.ts
Normal file
@@ -0,0 +1,155 @@
|
||||
import { afterEach, describe, expect, it, vi } from "vitest";
|
||||
import { createOpenAiCompatibleSpeechProvider } from "./openai-compatible-speech-provider.js";
|
||||
|
||||
const { assertOkOrThrowHttpErrorMock, postJsonRequestMock, resolveProviderHttpRequestConfigMock } =
|
||||
vi.hoisted(() => ({
|
||||
assertOkOrThrowHttpErrorMock: vi.fn(async () => {}),
|
||||
postJsonRequestMock: vi.fn(),
|
||||
resolveProviderHttpRequestConfigMock: vi.fn((params: Record<string, unknown>) => ({
|
||||
baseUrl: params.baseUrl ?? params.defaultBaseUrl ?? "https://example.test/v1",
|
||||
allowPrivateNetwork: false,
|
||||
headers: new Headers(params.defaultHeaders as HeadersInit | undefined),
|
||||
dispatcherPolicy: undefined,
|
||||
})),
|
||||
}));
|
||||
|
||||
vi.mock("openclaw/plugin-sdk/provider-http", () => ({
|
||||
assertOkOrThrowHttpError: assertOkOrThrowHttpErrorMock,
|
||||
postJsonRequest: postJsonRequestMock,
|
||||
resolveProviderHttpRequestConfig: resolveProviderHttpRequestConfigMock,
|
||||
}));
|
||||
|
||||
describe("createOpenAiCompatibleSpeechProvider", () => {
|
||||
afterEach(() => {
|
||||
assertOkOrThrowHttpErrorMock.mockClear();
|
||||
postJsonRequestMock.mockReset();
|
||||
resolveProviderHttpRequestConfigMock.mockClear();
|
||||
vi.unstubAllEnvs();
|
||||
});
|
||||
|
||||
it("normalizes config with built-in base URL policies", () => {
|
||||
const provider = createOpenAiCompatibleSpeechProvider({
|
||||
id: "demo",
|
||||
label: "Demo",
|
||||
autoSelectOrder: 40,
|
||||
models: ["demo-tts"],
|
||||
voices: ["alloy"],
|
||||
defaultModel: "demo-tts",
|
||||
defaultVoice: "alloy",
|
||||
defaultBaseUrl: "https://example.test/api/v1",
|
||||
envKey: "DEMO_API_KEY",
|
||||
responseFormats: ["mp3", "pcm"],
|
||||
defaultResponseFormat: "mp3",
|
||||
voiceCompatibleResponseFormats: ["mp3"],
|
||||
baseUrlPolicy: {
|
||||
kind: "canonical",
|
||||
aliases: ["https://example.test/v1"],
|
||||
},
|
||||
});
|
||||
|
||||
expect(
|
||||
provider.resolveConfig?.({
|
||||
cfg: {} as never,
|
||||
timeoutMs: 30_000,
|
||||
rawConfig: {
|
||||
providers: {
|
||||
demo: {
|
||||
apiKey: "sk-demo",
|
||||
baseUrl: "https://example.test/v1/",
|
||||
modelId: "custom-tts",
|
||||
voiceId: "nova",
|
||||
speed: 1.25,
|
||||
responseFormat: " PCM ",
|
||||
},
|
||||
},
|
||||
},
|
||||
}),
|
||||
).toEqual({
|
||||
apiKey: "sk-demo",
|
||||
baseUrl: "https://example.test/api/v1",
|
||||
model: "custom-tts",
|
||||
voice: "nova",
|
||||
speed: 1.25,
|
||||
responseFormat: "pcm",
|
||||
});
|
||||
});
|
||||
|
||||
it("maps configured extra JSON body fields into synthesis requests", async () => {
|
||||
const release = vi.fn(async () => {});
|
||||
postJsonRequestMock.mockResolvedValue({
|
||||
response: new Response(new Uint8Array([4, 5, 6]), { status: 200 }),
|
||||
release,
|
||||
});
|
||||
vi.stubEnv("DEMO_API_KEY", "sk-env");
|
||||
|
||||
const provider = createOpenAiCompatibleSpeechProvider<{
|
||||
routing?: Record<string, unknown>;
|
||||
}>({
|
||||
id: "demo",
|
||||
label: "Demo",
|
||||
autoSelectOrder: 40,
|
||||
models: ["demo-tts"],
|
||||
voices: ["alloy"],
|
||||
defaultModel: "demo-tts",
|
||||
defaultVoice: "alloy",
|
||||
defaultBaseUrl: "https://example.test/v1",
|
||||
envKey: "DEMO_API_KEY",
|
||||
responseFormats: ["mp3", "opus"],
|
||||
defaultResponseFormat: "mp3",
|
||||
voiceCompatibleResponseFormats: ["opus"],
|
||||
baseUrlPolicy: { kind: "trim-trailing-slash" },
|
||||
readExtraConfig: (raw) =>
|
||||
typeof raw?.routing === "object" && raw.routing !== null && !Array.isArray(raw.routing)
|
||||
? { routing: raw.routing as Record<string, unknown> }
|
||||
: {},
|
||||
extraJsonBodyFields: [{ configKey: "routing", requestKey: "provider" }],
|
||||
});
|
||||
|
||||
const result = await provider.synthesize({
|
||||
text: "hello",
|
||||
cfg: {} as never,
|
||||
providerConfig: {
|
||||
baseUrl: "https://example.test/v1/",
|
||||
responseFormat: "opus",
|
||||
routing: { order: ["openai"] },
|
||||
},
|
||||
providerOverrides: {
|
||||
modelId: "override-tts",
|
||||
voiceId: "verse",
|
||||
speed: 1.1,
|
||||
},
|
||||
target: "voice-note",
|
||||
timeoutMs: 1234,
|
||||
});
|
||||
|
||||
expect(resolveProviderHttpRequestConfigMock).toHaveBeenCalledWith(
|
||||
expect.objectContaining({
|
||||
baseUrl: "https://example.test/v1",
|
||||
defaultBaseUrl: "https://example.test/v1",
|
||||
provider: "demo",
|
||||
capability: "audio",
|
||||
}),
|
||||
);
|
||||
expect(postJsonRequestMock).toHaveBeenCalledWith(
|
||||
expect.objectContaining({
|
||||
url: "https://example.test/v1/audio/speech",
|
||||
timeoutMs: 1234,
|
||||
body: {
|
||||
model: "override-tts",
|
||||
input: "hello",
|
||||
voice: "verse",
|
||||
response_format: "opus",
|
||||
speed: 1.1,
|
||||
provider: { order: ["openai"] },
|
||||
},
|
||||
}),
|
||||
);
|
||||
expect(result).toMatchObject({
|
||||
audioBuffer: Buffer.from([4, 5, 6]),
|
||||
outputFormat: "opus",
|
||||
fileExtension: ".opus",
|
||||
voiceCompatible: true,
|
||||
});
|
||||
expect(release).toHaveBeenCalledOnce();
|
||||
});
|
||||
});
|
||||
395
src/tts/openai-compatible-speech-provider.ts
Normal file
395
src/tts/openai-compatible-speech-provider.ts
Normal file
@@ -0,0 +1,395 @@
|
||||
import {
|
||||
assertOkOrThrowHttpError,
|
||||
postJsonRequest,
|
||||
resolveProviderHttpRequestConfig,
|
||||
} from "openclaw/plugin-sdk/provider-http";
|
||||
import { normalizeResolvedSecretInputString } from "openclaw/plugin-sdk/secret-input";
|
||||
import { asFiniteNumber, asObject, trimToUndefined } from "../agents/provider-http-errors.js";
|
||||
import type { SpeechProviderPlugin } from "../plugins/types.js";
|
||||
import { normalizeOptionalLowercaseString } from "../shared/string-coerce.js";
|
||||
import type {
|
||||
SpeechDirectiveTokenParseContext,
|
||||
SpeechProviderConfig,
|
||||
SpeechProviderOverrides,
|
||||
} from "./provider-types.js";
|
||||
|
||||
type OpenAiCompatibleSpeechProviderBaseConfig = {
|
||||
apiKey?: string;
|
||||
baseUrl?: string;
|
||||
model: string;
|
||||
voice: string;
|
||||
speed?: number;
|
||||
responseFormat?: string;
|
||||
};
|
||||
|
||||
export type OpenAiCompatibleSpeechProviderConfig<
|
||||
ExtraConfig extends Record<string, unknown> = Record<string, never>,
|
||||
> = OpenAiCompatibleSpeechProviderBaseConfig & ExtraConfig;
|
||||
|
||||
export type OpenAiCompatibleSpeechProviderBaseUrlPolicy =
|
||||
| { kind: "trim-trailing-slash" }
|
||||
| { kind: "canonical"; aliases?: readonly string[]; allowCustom?: boolean };
|
||||
|
||||
/**
 * Maps one provider-specific config field into the JSON request body sent to
 * the speech endpoint. `requestKey` renames the field on the wire; when
 * omitted the config key is used verbatim.
 */
export type OpenAiCompatibleSpeechProviderExtraJsonBodyField<
  ExtraConfig extends Record<string, unknown>,
> = {
  configKey: Extract<keyof ExtraConfig, string>;
  requestKey?: string;
};
|
||||
|
||||
/**
 * Declarative description of one OpenAI-compatible TTS backend, consumed by
 * `createOpenAiCompatibleSpeechProvider` to build a `SpeechProviderPlugin`.
 */
export type OpenAiCompatibleSpeechProviderOptions<
  ExtraConfig extends Record<string, unknown> = Record<string, never>,
> = {
  /** Stable provider id; also the default config key and HTTP `provider` tag. */
  id: string;
  /** Human-readable name; used in error messages and as the plugin label. */
  label: string;
  /** Copied onto the plugin; presumably an auto-selection priority — confirm with registry. */
  autoSelectOrder: number;
  /** Models advertised by the plugin. */
  models: readonly string[];
  /** Voices advertised by the plugin and returned from `listVoices`. */
  voices: readonly string[];
  defaultModel: string;
  defaultVoice: string;
  defaultBaseUrl: string;
  /** Environment variable consulted for the API key when config has none. */
  envKey: string;
  /** Allowed `responseFormat` values; anything else throws on normalization. */
  responseFormats: readonly string[];
  defaultResponseFormat: string;
  /** Formats for which the synthesized result is flagged `voiceCompatible`. */
  voiceCompatibleResponseFormats: readonly string[];
  /** Base-URL normalization policy; default behavior only trims trailing slashes. */
  baseUrlPolicy?: OpenAiCompatibleSpeechProviderBaseUrlPolicy;
  /** Model-id normalizer; defaults to trim-or-fallback. */
  normalizeModel?: (value: string | undefined, fallback: string) => string;
  /** Config key when it differs from `id`. */
  configKey?: string;
  /** Extra HTTP headers merged over the default Authorization/Content-Type headers. */
  extraHeaders?: Record<string, string>;
  /** Reads provider-specific fields out of a raw config record. */
  readExtraConfig?: (raw: Record<string, unknown> | undefined) => ExtraConfig;
  /** Provider-specific config fields copied into the synthesis request body. */
  extraJsonBodyFields?: readonly OpenAiCompatibleSpeechProviderExtraJsonBodyField<ExtraConfig>[];
  /** Label used when raising HTTP errors; defaults to "<label> TTS API error". */
  apiErrorLabel?: string;
  /** Message thrown when no API key can be resolved; defaults to "<label> API key missing". */
  missingApiKeyError?: string;
};
|
||||
|
||||
/**
 * Loosely-typed view of a `models.providers.<key>` record; values are
 * `unknown` because they come straight from untyped config and are
 * normalized/trimmed at the point of use.
 */
type ModelProviderConfig = {
  apiKey?: unknown;
  baseUrl?: unknown;
};
|
||||
|
||||
function normalizeResponseFormat(params: {
|
||||
providerLabel: string;
|
||||
responseFormats: readonly string[];
|
||||
value: unknown;
|
||||
}): string | undefined {
|
||||
const next = normalizeOptionalLowercaseString(params.value);
|
||||
if (!next) {
|
||||
return undefined;
|
||||
}
|
||||
if (params.responseFormats.includes(next)) {
|
||||
return next;
|
||||
}
|
||||
throw new Error(`Invalid ${params.providerLabel} speech responseFormat: ${next}`);
|
||||
}
|
||||
|
||||
function responseFormatToFileExtension(format: string): `.${string}` {
|
||||
return `.${format}`;
|
||||
}
|
||||
|
||||
function trimTrailingBaseUrl(value: unknown, fallback: string): string {
|
||||
return (trimToUndefined(value) ?? fallback).replace(/\/+$/u, "");
|
||||
}
|
||||
|
||||
function normalizeBaseUrl(params: {
|
||||
value: unknown;
|
||||
fallback: string;
|
||||
policy?: OpenAiCompatibleSpeechProviderBaseUrlPolicy;
|
||||
}): string {
|
||||
const normalized = trimTrailingBaseUrl(params.value, params.fallback);
|
||||
if (params.policy?.kind !== "canonical") {
|
||||
return normalized;
|
||||
}
|
||||
const canonical = trimTrailingBaseUrl(params.fallback, params.fallback);
|
||||
const aliases = new Set(
|
||||
[canonical, ...(params.policy.aliases ?? [])].map((entry) =>
|
||||
trimTrailingBaseUrl(entry, canonical),
|
||||
),
|
||||
);
|
||||
return aliases.has(normalized) || !params.policy.allowCustom ? canonical : normalized;
|
||||
}
|
||||
|
||||
function resolveProviderConfigRecord(
|
||||
rawConfig: Record<string, unknown>,
|
||||
providerConfigKey: string,
|
||||
): Record<string, unknown> | undefined {
|
||||
const providers = asObject(rawConfig.providers);
|
||||
return asObject(providers?.[providerConfigKey]) ?? asObject(rawConfig[providerConfigKey]);
|
||||
}
|
||||
|
||||
function readModelProviderConfig(
|
||||
cfg: unknown,
|
||||
providerConfigKey: string,
|
||||
): ModelProviderConfig | undefined {
|
||||
const root = asObject(cfg);
|
||||
const models = asObject(root?.models);
|
||||
const providers = asObject(models?.providers);
|
||||
return asObject(providers?.[providerConfigKey]);
|
||||
}
|
||||
|
||||
function readSpeechOverrides(overrides: SpeechProviderOverrides | undefined): {
|
||||
model?: string;
|
||||
voice?: string;
|
||||
speed?: number;
|
||||
} {
|
||||
if (!overrides) {
|
||||
return {};
|
||||
}
|
||||
return {
|
||||
model: trimToUndefined(overrides.model ?? overrides.modelId),
|
||||
voice: trimToUndefined(overrides.voice ?? overrides.voiceId),
|
||||
speed: asFiniteNumber(overrides.speed),
|
||||
};
|
||||
}
|
||||
|
||||
function parseDirectiveToken(
|
||||
ctx: SpeechDirectiveTokenParseContext,
|
||||
providerConfigKey: string,
|
||||
): { handled: boolean; overrides?: SpeechProviderOverrides } {
|
||||
const compactProviderKey = providerConfigKey.replace(/[^a-z0-9]+/giu, "").toLowerCase();
|
||||
switch (ctx.key) {
|
||||
case "voice":
|
||||
case "voice_id":
|
||||
case "voiceid":
|
||||
case `${providerConfigKey}_voice`:
|
||||
case `${compactProviderKey}voice`:
|
||||
if (!ctx.policy.allowVoice) {
|
||||
return { handled: true };
|
||||
}
|
||||
return { handled: true, overrides: { voice: ctx.value } };
|
||||
case "model":
|
||||
case "model_id":
|
||||
case "modelid":
|
||||
case `${providerConfigKey}_model`:
|
||||
case `${compactProviderKey}model`:
|
||||
if (!ctx.policy.allowModelId) {
|
||||
return { handled: true };
|
||||
}
|
||||
return { handled: true, overrides: { model: ctx.value } };
|
||||
default:
|
||||
return { handled: false };
|
||||
}
|
||||
}
|
||||
|
||||
function buildExtraJsonBodyFields<ExtraConfig extends Record<string, unknown>>(
|
||||
config: OpenAiCompatibleSpeechProviderConfig<ExtraConfig>,
|
||||
fields: readonly OpenAiCompatibleSpeechProviderExtraJsonBodyField<ExtraConfig>[] | undefined,
|
||||
): Record<string, unknown> {
|
||||
const body: Record<string, unknown> = {};
|
||||
for (const field of fields ?? []) {
|
||||
const value = config[field.configKey];
|
||||
if (value != null) {
|
||||
body[field.requestKey ?? field.configKey] = value;
|
||||
}
|
||||
}
|
||||
return body;
|
||||
}
|
||||
|
||||
/**
 * Build a `SpeechProviderPlugin` for any TTS backend that speaks the
 * OpenAI-compatible `POST {baseUrl}/audio/speech` protocol with a
 * `Bearer` API key. All provider-specific behavior — models, voices,
 * defaults, base-URL policy, extra headers/body fields — comes from
 * `options`, so concrete providers are pure configuration.
 *
 * API-key precedence at synthesis time: provider config, then
 * `models.providers.<key>.apiKey`, then `process.env[options.envKey]`.
 */
export function createOpenAiCompatibleSpeechProvider<
  ExtraConfig extends Record<string, unknown> = Record<string, never>,
>(options: OpenAiCompatibleSpeechProviderOptions<ExtraConfig>): SpeechProviderPlugin {
  // Config records may live under a key that differs from the provider id.
  const providerConfigKey = options.configKey ?? options.id;
  const normalizeModel =
    options.normalizeModel ?? ((value, fallback) => trimToUndefined(value) ?? fallback);
  const readExtraConfig = options.readExtraConfig ?? (() => ({}) as ExtraConfig);

  // Turn a raw `messages.tts` config record into a typed provider config,
  // applying option-level defaults and validating the response format.
  function normalizeConfig(
    rawConfig: Record<string, unknown>,
  ): OpenAiCompatibleSpeechProviderConfig<ExtraConfig> {
    const raw = resolveProviderConfigRecord(rawConfig, providerConfigKey);
    return {
      apiKey: normalizeResolvedSecretInputString({
        value: raw?.apiKey,
        path: `messages.tts.providers.${providerConfigKey}.apiKey`,
      }),
      // Only normalize an explicitly-configured base URL; leave it undefined
      // otherwise so later resolution can consult models.providers / defaults.
      baseUrl:
        trimToUndefined(raw?.baseUrl) == null
          ? undefined
          : normalizeBaseUrl({
              value: raw?.baseUrl,
              fallback: options.defaultBaseUrl,
              policy: options.baseUrlPolicy,
            }),
      model: normalizeModel(trimToUndefined(raw?.model ?? raw?.modelId), options.defaultModel),
      voice: trimToUndefined(raw?.voice ?? raw?.voiceId) ?? options.defaultVoice,
      speed: asFiniteNumber(raw?.speed),
      responseFormat: normalizeResponseFormat({
        providerLabel: options.label,
        responseFormats: options.responseFormats,
        value: raw?.responseFormat,
      }),
      ...readExtraConfig(raw),
    };
  }

  // Layer an already-resolved SpeechProviderConfig over the option-level
  // defaults (normalizeConfig({}) yields pure defaults), field by field.
  function readProviderConfig(
    config: SpeechProviderConfig,
  ): OpenAiCompatibleSpeechProviderConfig<ExtraConfig> {
    const normalized = normalizeConfig({});
    return {
      apiKey: trimToUndefined(config.apiKey) ?? normalized.apiKey,
      baseUrl:
        trimToUndefined(config.baseUrl) == null
          ? normalized.baseUrl
          : normalizeBaseUrl({
              value: config.baseUrl,
              fallback: options.defaultBaseUrl,
              policy: options.baseUrlPolicy,
            }),
      model: normalizeModel(trimToUndefined(config.model ?? config.modelId), normalized.model),
      voice: trimToUndefined(config.voice ?? config.voiceId) ?? normalized.voice,
      speed: asFiniteNumber(config.speed) ?? normalized.speed,
      responseFormat:
        normalizeResponseFormat({
          providerLabel: options.label,
          responseFormats: options.responseFormats,
          value: config.responseFormat,
        }) ?? normalized.responseFormat,
      ...readExtraConfig(config),
    };
  }

  // Precedence: provider config -> models.providers.<key> -> env variable.
  function resolveApiKey(params: {
    cfg?: unknown;
    providerConfig: OpenAiCompatibleSpeechProviderConfig<ExtraConfig>;
  }): string | undefined {
    return (
      params.providerConfig.apiKey ??
      normalizeResolvedSecretInputString({
        value: readModelProviderConfig(params.cfg, providerConfigKey)?.apiKey,
        path: `models.providers.${providerConfigKey}.apiKey`,
      }) ??
      trimToUndefined(process.env[options.envKey])
    );
  }

  // Precedence: provider config -> models.providers.<key> -> default URL,
  // with the configured base-URL policy applied to the winner.
  function resolveBaseUrl(params: {
    cfg?: unknown;
    providerConfig: OpenAiCompatibleSpeechProviderConfig<ExtraConfig>;
  }): string {
    return normalizeBaseUrl({
      value:
        params.providerConfig.baseUrl ??
        trimToUndefined(readModelProviderConfig(params.cfg, providerConfigKey)?.baseUrl),
      fallback: options.defaultBaseUrl,
      policy: options.baseUrlPolicy,
    });
  }

  return {
    id: options.id,
    label: options.label,
    autoSelectOrder: options.autoSelectOrder,
    // Copy so callers cannot mutate the option arrays through the plugin.
    models: [...options.models],
    voices: [...options.voices],
    resolveConfig: ({ rawConfig }) => normalizeConfig(rawConfig),
    parseDirectiveToken: (ctx) => parseDirectiveToken(ctx, providerConfigKey),
    // Merge talk-mode overrides onto the base TTS config; each field is only
    // touched when the talk config explicitly provides a usable value.
    resolveTalkConfig: ({ baseTtsConfig, talkProviderConfig }) => {
      const base = normalizeConfig(baseTtsConfig);
      // Validate up front so a bad talk-level format fails loudly.
      const responseFormat = normalizeResponseFormat({
        providerLabel: options.label,
        responseFormats: options.responseFormats,
        value: talkProviderConfig.responseFormat,
      });
      const next: OpenAiCompatibleSpeechProviderConfig<ExtraConfig> = { ...base };
      if (talkProviderConfig.apiKey !== undefined) {
        next.apiKey = normalizeResolvedSecretInputString({
          value: talkProviderConfig.apiKey,
          path: `talk.providers.${providerConfigKey}.apiKey`,
        });
      }
      const baseUrl = trimToUndefined(talkProviderConfig.baseUrl);
      if (baseUrl !== undefined) {
        next.baseUrl = normalizeBaseUrl({
          value: baseUrl,
          fallback: options.defaultBaseUrl,
          policy: options.baseUrlPolicy,
        });
      }
      const modelId = trimToUndefined(talkProviderConfig.modelId);
      if (modelId !== undefined) {
        next.model = normalizeModel(modelId, options.defaultModel);
      }
      const voiceId = trimToUndefined(talkProviderConfig.voiceId);
      if (voiceId !== undefined) {
        next.voice = voiceId;
      }
      const speed = asFiniteNumber(talkProviderConfig.speed);
      if (speed !== undefined) {
        next.speed = speed;
      }
      if (responseFormat !== undefined) {
        next.responseFormat = responseFormat;
      }
      return next;
    },
    // Only include keys for values that survived trimming/number coercion,
    // so absent params do not clobber config-level settings downstream.
    resolveTalkOverrides: ({ params }) => ({
      ...(trimToUndefined(params.voiceId ?? params.voice) == null
        ? {}
        : { voice: trimToUndefined(params.voiceId ?? params.voice) }),
      ...(trimToUndefined(params.modelId ?? params.model) == null
        ? {}
        : { model: trimToUndefined(params.modelId ?? params.model) }),
      ...(asFiniteNumber(params.speed) == null ? {} : { speed: asFiniteNumber(params.speed) }),
    }),
    // Static voice list; no network call is made here.
    listVoices: async () => options.voices.map((voice) => ({ id: voice, name: voice })),
    isConfigured: ({ cfg, providerConfig }) =>
      Boolean(resolveApiKey({ cfg, providerConfig: readProviderConfig(providerConfig) })),
    synthesize: async (req) => {
      const config = readProviderConfig(req.providerConfig);
      const overrides = readSpeechOverrides(req.providerOverrides);
      const apiKey = resolveApiKey({ cfg: req.cfg, providerConfig: config });
      if (!apiKey) {
        throw new Error(options.missingApiKeyError ?? `${options.label} API key missing`);
      }

      const baseUrl = resolveBaseUrl({ cfg: req.cfg, providerConfig: config });
      const responseFormat = config.responseFormat ?? options.defaultResponseFormat;
      // Per-request override wins over configured speed.
      const speed = overrides.speed ?? config.speed;
      // Private-network access is requested off here; the helper computes the
      // effective policy/headers/dispatcher from the base URL — confirm there.
      const { allowPrivateNetwork, headers, dispatcherPolicy } = resolveProviderHttpRequestConfig({
        baseUrl,
        defaultBaseUrl: options.defaultBaseUrl,
        allowPrivateNetwork: false,
        defaultHeaders: {
          Authorization: `Bearer ${apiKey}`,
          "Content-Type": "application/json",
          ...options.extraHeaders,
        },
        provider: options.id,
        capability: "audio",
        transport: "http",
      });

      const { response, release } = await postJsonRequest({
        url: `${baseUrl}/audio/speech`,
        headers,
        body: {
          model: normalizeModel(overrides.model ?? config.model, options.defaultModel),
          input: req.text,
          voice: overrides.voice ?? config.voice,
          response_format: responseFormat,
          // Omit optional wire fields entirely rather than sending null.
          ...(speed == null ? {} : { speed }),
          ...buildExtraJsonBodyFields(config, options.extraJsonBodyFields),
        },
        timeoutMs: req.timeoutMs,
        fetchFn: fetch,
        allowPrivateNetwork,
        dispatcherPolicy,
      });

      try {
        await assertOkOrThrowHttpError(
          response,
          options.apiErrorLabel ?? `${options.label} TTS API error`,
        );
        return {
          audioBuffer: Buffer.from(await response.arrayBuffer()),
          outputFormat: responseFormat,
          fileExtension: responseFormatToFileExtension(responseFormat),
          voiceCompatible: options.voiceCompatibleResponseFormats.includes(responseFormat),
        };
      } finally {
        // Always release the request's connection/resources, even on error.
        await release();
      }
    },
  };
}
|
||||
Reference in New Issue
Block a user