Files
openclaw/src/gateway/server-methods/talk.ts
2026-03-26 22:48:57 +00:00

316 lines
9.3 KiB
TypeScript

import { readConfigFileSnapshot } from "../../config/config.js";
import { redactConfigObject } from "../../config/redact-snapshot.js";
import { buildTalkConfigResponse, resolveActiveTalkProviderConfig } from "../../config/talk.js";
import type { TalkProviderConfig } from "../../config/types.gateway.js";
import type { OpenClawConfig, TtsConfig, TtsProviderConfigMap } from "../../config/types.js";
import { canonicalizeSpeechProviderId, getSpeechProvider } from "../../tts/provider-registry.js";
import { synthesizeSpeech, type TtsDirectiveOverrides } from "../../tts/tts.js";
import {
ErrorCodes,
errorShape,
formatValidationErrors,
validateTalkConfigParams,
validateTalkModeParams,
validateTalkSpeakParams,
} from "../protocol/index.js";
import { formatForLog } from "../ws-log.js";
import type { GatewayRequestHandlers } from "./types.js";
/** Scope that grants full operator access, including Talk secrets. */
const ADMIN_SCOPE = "operator.admin";
/** Scope that grants read access to unredacted Talk configuration. */
const TALK_SECRETS_SCOPE = "operator.talk.secrets";

/**
 * Reports whether a client is allowed to read unredacted Talk secrets.
 *
 * @param client - The requesting client, or `null` when there is none.
 * @returns `true` when the client's connect scopes contain either the admin
 *   scope or the Talk secrets scope; `false` when the client is `null`, has no
 *   scope array, or holds neither scope.
 */
function canReadTalkSecrets(client: { connect?: { scopes?: string[] } } | null): boolean {
  const granted = client?.connect?.scopes;
  if (!Array.isArray(granted)) {
    return false;
  }
  return granted.some((scope) => scope === ADMIN_SCOPE || scope === TALK_SECRETS_SCOPE);
}
/**
 * Normalizes an arbitrary value into a non-empty trimmed string.
 *
 * @param value - Any value; only strings are considered.
 * @returns The trimmed string, or `undefined` when `value` is not a string or
 *   is empty/whitespace-only.
 */
function trimString(value: unknown): string | undefined {
  const text = typeof value === "string" ? value.trim() : "";
  // An empty result covers both "not a string" and "blank string".
  return text.length === 0 ? undefined : text;
}
/**
 * Views a value as a string-keyed record when it is a non-null, non-array object.
 *
 * @param value - Any value.
 * @returns The same object reference typed as a record, or `undefined` for
 *   primitives, `null`, and arrays.
 */
function asRecord(value: unknown): Record<string, unknown> | undefined {
  if (value === null || typeof value !== "object" || Array.isArray(value)) {
    return undefined;
  }
  return value as Record<string, unknown>;
}
/**
 * Produces the comparison key for a voice alias: surrounding whitespace is
 * removed and the result is lower-cased.
 *
 * @param value - Raw alias or requested voice name.
 * @returns The trimmed, lower-cased key.
 */
function normalizeAliasKey(value: string): string {
  const trimmed = value.trim();
  return trimmed.toLowerCase();
}
/**
 * Maps a requested voice name to a concrete voice id using the provider's
 * configured aliases.
 *
 * Alias lookup ignores case and surrounding whitespace; the first alias (in
 * object entry order) whose normalized key equals the normalized request wins.
 *
 * @param providerConfig - Talk provider config whose `voiceAliases` are consulted.
 * @param requested - Voice name or id asked for by the caller.
 * @returns `undefined` when nothing was requested, the aliased voice id when an
 *   alias matches, and otherwise the request unchanged.
 */
function resolveTalkVoiceId(
  providerConfig: TalkProviderConfig,
  requested: string | undefined,
): string | undefined {
  if (!requested) {
    return undefined;
  }
  const aliasTable = providerConfig.voiceAliases;
  if (!aliasTable) {
    return requested;
  }
  const wantedKey = normalizeAliasKey(requested);
  const match = Object.entries(aliasTable).find(
    ([alias]) => normalizeAliasKey(alias) === wantedKey,
  );
  // Fall back to the literal request so raw voice ids pass through untouched.
  return match ? match[1] : requested;
}
/**
 * Derives the configuration used to synthesize speech for Talk mode.
 *
 * Resolves the active Talk provider from `config.talk`, looks up the matching
 * speech provider, and returns a copy of `config` whose `messages.tts` section
 * forces that provider with `auto: "always"`. The provider's entry under
 * `messages.tts.providers` is replaced by the result of the speech provider's
 * optional `resolveTalkConfig` hook, or by the raw Talk provider config when
 * the hook is absent or returns a nullish value.
 *
 * @param config - The full application config.
 * @returns On success, the canonical provider id, the raw Talk provider config
 *   (not the hook-resolved one), and the derived config. Otherwise an object
 *   carrying a human-readable `error` message.
 */
function buildTalkTtsConfig(
  config: OpenClawConfig,
):
  | { cfg: OpenClawConfig; provider: string; providerConfig: TalkProviderConfig }
  | { error: string } {
  const activeTalk = resolveActiveTalkProviderConfig(config.talk);
  const provider = canonicalizeSpeechProviderId(activeTalk?.provider, config);
  if (!activeTalk || !provider) {
    return { error: "talk.speak unavailable: talk provider not configured" };
  }

  const speechProvider = getSpeechProvider(provider, config);
  if (!speechProvider) {
    return {
      error: `talk.speak unavailable: speech provider "${provider}" does not support Talk mode`,
    };
  }

  const baseTts = config.messages?.tts ?? {};
  const providerConfig = activeTalk.config;
  // Let the speech provider translate Talk settings into its own TTS shape;
  // without a hook (or a result) the Talk provider config is used verbatim.
  const effectiveProviderConfig =
    speechProvider.resolveTalkConfig?.({
      cfg: config,
      baseTtsConfig: baseTts as Record<string, unknown>,
      talkProviderConfig: providerConfig,
      timeoutMs: baseTts.timeoutMs ?? 30_000,
    }) ?? providerConfig;

  const talkTts: TtsConfig = {
    ...baseTts,
    auto: "always",
    provider,
    providers: {
      ...((asRecord(baseTts.providers) ?? {}) as TtsProviderConfigMap),
      [provider]: effectiveProviderConfig,
    },
  };
  const cfg: OpenClawConfig = {
    ...config,
    messages: { ...config.messages, tts: talkTts },
  };

  return { provider, providerConfig, cfg };
}
/**
 * Builds the TTS directive overrides for a single `talk.speak` request.
 *
 * The speech provider's optional `resolveTalkOverrides` hook receives a copy of
 * the request params in which `voiceId` has been replaced by its alias-resolved
 * value (when one resolves). Whatever the hook returns is nested under the
 * provider id.
 *
 * @param provider - Canonical speech provider id.
 * @param providerConfig - Talk provider config (source of voice aliases).
 * @param config - Full application config, used to look up the speech provider.
 * @param params - Request params of the `talk.speak` call.
 * @returns Overrides that always pin `provider`; `providerOverrides` is present
 *   only when the hook exists and returns a non-empty object.
 */
function buildTalkSpeakOverrides(
  provider: string,
  providerConfig: TalkProviderConfig,
  config: OpenClawConfig,
  params: Record<string, unknown>,
): TtsDirectiveOverrides {
  const speechProvider = getSpeechProvider(provider, config);
  if (!speechProvider?.resolveTalkOverrides) {
    return { provider };
  }

  // Resolve the voice once and reuse it for both the presence check and the
  // value handed to the provider.
  const resolvedVoiceId = resolveTalkVoiceId(providerConfig, trimString(params.voiceId));
  const providerOverrides = speechProvider.resolveTalkOverrides({
    talkProviderConfig: providerConfig,
    params: {
      ...params,
      // Only override `voiceId` when something resolved; otherwise the
      // caller's original value (if any) is passed through as-is.
      ...(resolvedVoiceId == null ? {} : { voiceId: resolvedVoiceId }),
    },
  });
  if (!providerOverrides || Object.keys(providerOverrides).length === 0) {
    return { provider };
  }

  return {
    provider,
    providerOverrides: {
      [provider]: providerOverrides,
    },
  };
}
/**
 * Guesses the audio MIME type from a synthesis output format and/or file
 * extension. Both inputs are compared after trimming and lower-casing.
 *
 * Rules are checked in order and the first match wins:
 * - `audio/mpeg`: format `mp3`, `mp3_*`, or `*-mp3`; extension `.mp3`
 * - `audio/ogg`:  format `opus` or `opus_*`; extension `.opus` or `.ogg`
 * - `audio/wav`:  format `*-wav`; extension `.wav`
 * - `audio/webm`: format `*-webm`; extension `.webm`
 *
 * @param outputFormat - Provider-specific output format identifier, if known.
 * @param fileExtension - File extension including the leading dot, if known.
 * @returns The matching MIME type, or `undefined` when no rule applies.
 */
function inferMimeType(
  outputFormat: string | undefined,
  fileExtension: string | undefined,
): string | undefined {
  // Missing inputs collapse to "", which cannot satisfy any rule below.
  const format = outputFormat?.trim().toLowerCase() ?? "";
  const extension = fileExtension?.trim().toLowerCase() ?? "";

  const rules: ReadonlyArray<{ mimeType: string; matches: boolean }> = [
    {
      mimeType: "audio/mpeg",
      matches:
        format === "mp3" ||
        format.startsWith("mp3_") ||
        format.endsWith("-mp3") ||
        extension === ".mp3",
    },
    {
      mimeType: "audio/ogg",
      matches:
        format === "opus" ||
        format.startsWith("opus_") ||
        extension === ".opus" ||
        extension === ".ogg",
    },
    {
      mimeType: "audio/wav",
      matches: format.endsWith("-wav") || extension === ".wav",
    },
    {
      mimeType: "audio/webm",
      matches: format.endsWith("-webm") || extension === ".webm",
    },
  ];

  return rules.find((rule) => rule.matches)?.mimeType;
}
/**
 * Gateway request handlers for the Talk feature.
 *
 * - `talk.config` responds with the talk, `session.mainKey`, and
 *   `ui.seamColor` parts of the config file snapshot. The talk section is
 *   redacted unless secrets are requested by a client holding a suitable scope.
 * - `talk.speak` synthesizes the given text with the configured Talk provider
 *   (fallback disabled) and responds with base64-encoded audio plus metadata.
 * - `talk.mode` broadcasts the requested Talk mode state and echoes it back.
 */
export const talkHandlers: GatewayRequestHandlers = {
  "talk.config": async ({ params, respond, client }) => {
    if (!validateTalkConfigParams(params)) {
      const details = formatValidationErrors(validateTalkConfigParams.errors);
      respond(
        false,
        undefined,
        errorShape(ErrorCodes.INVALID_REQUEST, `invalid talk.config params: ${details}`),
      );
      return;
    }

    const wantsSecrets = Boolean((params as { includeSecrets?: boolean }).includeSecrets);
    if (wantsSecrets && !canReadTalkSecrets(client)) {
      const denied = errorShape(ErrorCodes.INVALID_REQUEST, `missing scope: ${TALK_SECRETS_SCOPE}`);
      respond(false, undefined, denied);
      return;
    }

    const { config } = await readConfigFileSnapshot();
    // Secrets are only passed through verbatim when explicitly requested and
    // authorized above; otherwise the talk section is redacted first.
    const talk = buildTalkConfigResponse(
      wantsSecrets ? config.talk : redactConfigObject(config.talk),
    );
    const mainKey = config.session?.mainKey;
    const seamColor = config.ui?.seamColor;

    // Each section is included only when it has a usable value.
    const payload: Record<string, unknown> = {
      ...(talk ? { talk } : {}),
      ...(typeof mainKey === "string" ? { session: { mainKey } } : {}),
      ...(typeof seamColor === "string" ? { ui: { seamColor } } : {}),
    };
    respond(true, { config: payload }, undefined);
  },

  "talk.speak": async ({ params, respond }) => {
    if (!validateTalkSpeakParams(params)) {
      const details = formatValidationErrors(validateTalkSpeakParams.errors);
      respond(
        false,
        undefined,
        errorShape(ErrorCodes.INVALID_REQUEST, `invalid talk.speak params: ${details}`),
      );
      return;
    }

    const text = trimString((params as { text?: unknown }).text);
    if (!text) {
      respond(false, undefined, errorShape(ErrorCodes.INVALID_REQUEST, "talk.speak requires text"));
      return;
    }

    try {
      const { config } = await readConfigFileSnapshot();
      const setup = buildTalkTtsConfig(config);
      if ("error" in setup) {
        respond(false, undefined, errorShape(ErrorCodes.UNAVAILABLE, setup.error));
        return;
      }

      const synthesis = await synthesizeSpeech({
        text,
        cfg: setup.cfg,
        overrides: buildTalkSpeakOverrides(setup.provider, setup.providerConfig, config, params),
        disableFallback: true,
      });
      if (!synthesis.success || !synthesis.audioBuffer) {
        const reason = synthesis.error ?? "talk synthesis failed";
        respond(false, undefined, errorShape(ErrorCodes.UNAVAILABLE, reason));
        return;
      }

      const { outputFormat, fileExtension } = synthesis;
      respond(
        true,
        {
          audioBase64: synthesis.audioBuffer.toString("base64"),
          provider: synthesis.provider ?? setup.provider,
          outputFormat,
          voiceCompatible: synthesis.voiceCompatible,
          mimeType: inferMimeType(outputFormat, fileExtension),
          fileExtension,
        },
        undefined,
      );
    } catch (err) {
      // Anything thrown while loading config or synthesizing is reported to
      // the caller as UNAVAILABLE.
      respond(false, undefined, errorShape(ErrorCodes.UNAVAILABLE, formatForLog(err)));
    }
  },

  "talk.mode": ({ params, respond, context, client, isWebchatConnect }) => {
    // Webchat clients are refused while no mobile node is connected; this
    // check runs before parameter validation.
    const webchatWithoutMobileNode =
      client && isWebchatConnect(client.connect) && !context.hasConnectedMobileNode();
    if (webchatWithoutMobileNode) {
      respond(
        false,
        undefined,
        errorShape(ErrorCodes.UNAVAILABLE, "talk disabled: no connected iOS/Android nodes"),
      );
      return;
    }

    if (!validateTalkModeParams(params)) {
      const details = formatValidationErrors(validateTalkModeParams.errors);
      respond(
        false,
        undefined,
        errorShape(ErrorCodes.INVALID_REQUEST, `invalid talk.mode params: ${details}`),
      );
      return;
    }

    const { enabled } = params as { enabled: boolean };
    const { phase } = params as { phase?: string };
    const payload = { enabled, phase: phase ?? null, ts: Date.now() };
    context.broadcast("talk.mode", payload, { dropIfSlow: true });
    respond(true, payload, undefined);
  },
};