feat(gateway): add talk speak rpc

This commit is contained in:
Ayaan Zaidi
2026-03-20 10:27:05 +05:30
parent 84ee6fbb76
commit 4ac355babb
8 changed files with 447 additions and 2 deletions

View File

@@ -98,6 +98,7 @@ const METHOD_SCOPE_GROUPS: Record<OperatorScope, readonly string[]> = {
"agent.wait",
"wake",
"talk.mode",
"talk.speak",
"tts.enable",
"tts.disable",
"tts.convert",

View File

@@ -48,6 +48,10 @@ import {
TalkConfigParamsSchema,
type TalkConfigResult,
TalkConfigResultSchema,
type TalkSpeakParams,
TalkSpeakParamsSchema,
type TalkSpeakResult,
TalkSpeakResultSchema,
type ChannelsStatusParams,
ChannelsStatusParamsSchema,
type ChannelsStatusResult,
@@ -375,6 +379,8 @@ export const validateWizardStatusParams = ajv.compile<WizardStatusParams>(Wizard
export const validateTalkModeParams = ajv.compile<TalkModeParams>(TalkModeParamsSchema);
export const validateTalkConfigParams = ajv.compile<TalkConfigParams>(TalkConfigParamsSchema);
export const validateTalkConfigResult = ajv.compile<TalkConfigResult>(TalkConfigResultSchema);
export const validateTalkSpeakParams = ajv.compile<TalkSpeakParams>(TalkSpeakParamsSchema);
export const validateTalkSpeakResult = ajv.compile<TalkSpeakResult>(TalkSpeakResultSchema);
export const validateChannelsStatusParams = ajv.compile<ChannelsStatusParams>(
ChannelsStatusParamsSchema,
);
@@ -540,6 +546,8 @@ export {
WizardStatusResultSchema,
TalkConfigParamsSchema,
TalkConfigResultSchema,
TalkSpeakParamsSchema,
TalkSpeakResultSchema,
ChannelsStatusParamsSchema,
ChannelsStatusResultSchema,
ChannelsLogoutParamsSchema,
@@ -629,6 +637,8 @@ export type {
WizardStatusResult,
TalkConfigParams,
TalkConfigResult,
TalkSpeakParams,
TalkSpeakResult,
TalkModeParams,
ChannelsStatusParams,
ChannelsStatusResult,

View File

@@ -16,6 +16,23 @@ export const TalkConfigParamsSchema = Type.Object(
{ additionalProperties: false },
);
/**
 * Request schema for the `talk.speak` RPC.
 * `text` is the only required field; every other property is an optional
 * per-request override layered on top of the configured talk provider.
 * `additionalProperties: false` rejects unknown keys at validation time.
 */
export const TalkSpeakParamsSchema = Type.Object(
  {
    text: NonEmptyString, // utterance to synthesize (must be non-empty)
    voiceId: Type.Optional(Type.String()), // provider voice id or alias
    modelId: Type.Optional(Type.String()), // provider model override
    speed: Type.Optional(Type.Number()), // playback speed multiplier
    stability: Type.Optional(Type.Number()), // ElevenLabs voice-settings field
    similarity: Type.Optional(Type.Number()), // maps to ElevenLabs similarityBoost
    style: Type.Optional(Type.Number()), // ElevenLabs style exaggeration
    speakerBoost: Type.Optional(Type.Boolean()), // maps to ElevenLabs useSpeakerBoost
    seed: Type.Optional(Type.Integer({ minimum: 0 })), // deterministic-generation seed
    normalize: Type.Optional(Type.String()), // text normalization mode ("auto"|"on"|"off")
    language: Type.Optional(Type.String()), // language code override
  },
  { additionalProperties: false },
);
const talkProviderFieldSchemas = {
voiceId: Type.Optional(Type.String()),
voiceAliases: Type.Optional(Type.Record(Type.String(), Type.String())),
@@ -85,6 +102,18 @@ export const TalkConfigResultSchema = Type.Object(
{ additionalProperties: false },
);
/**
 * Response schema for the `talk.speak` RPC: base64-encoded audio plus
 * provider/format metadata. Optional fields are omitted when the provider
 * did not report them.
 */
export const TalkSpeakResultSchema = Type.Object(
  {
    audioBase64: NonEmptyString, // synthesized audio, base64-encoded
    provider: NonEmptyString, // provider that actually produced the audio
    outputFormat: Type.Optional(Type.String()), // provider-specific format id (e.g. "mp3")
    voiceCompatible: Type.Optional(Type.Boolean()), // whether output suits voice channels — semantics defined by synthesizeSpeech
    mimeType: Type.Optional(Type.String()), // best-effort MIME type inferred server-side
    fileExtension: Type.Optional(Type.String()), // e.g. ".mp3"
  },
  { additionalProperties: false },
);
export const ChannelsStatusParamsSchema = Type.Object(
{
probe: Type.Optional(Type.Boolean()),

View File

@@ -44,6 +44,8 @@ import {
ChannelsLogoutParamsSchema,
TalkConfigParamsSchema,
TalkConfigResultSchema,
TalkSpeakParamsSchema,
TalkSpeakResultSchema,
ChannelsStatusParamsSchema,
ChannelsStatusResultSchema,
TalkModeParamsSchema,
@@ -238,6 +240,8 @@ export const ProtocolSchemas = {
TalkModeParams: TalkModeParamsSchema,
TalkConfigParams: TalkConfigParamsSchema,
TalkConfigResult: TalkConfigResultSchema,
TalkSpeakParams: TalkSpeakParamsSchema,
TalkSpeakResult: TalkSpeakResultSchema,
ChannelsStatusParams: ChannelsStatusParamsSchema,
ChannelsStatusResult: ChannelsStatusResultSchema,
ChannelsLogoutParams: ChannelsLogoutParamsSchema,

View File

@@ -70,6 +70,8 @@ export type WizardStatusResult = SchemaType<"WizardStatusResult">;
export type TalkModeParams = SchemaType<"TalkModeParams">;
export type TalkConfigParams = SchemaType<"TalkConfigParams">;
export type TalkConfigResult = SchemaType<"TalkConfigResult">;
export type TalkSpeakParams = SchemaType<"TalkSpeakParams">;
export type TalkSpeakResult = SchemaType<"TalkSpeakResult">;
export type ChannelsStatusParams = SchemaType<"ChannelsStatusParams">;
export type ChannelsStatusResult = SchemaType<"ChannelsStatusResult">;
export type ChannelsLogoutParams = SchemaType<"ChannelsLogoutParams">;

View File

@@ -34,6 +34,7 @@ const BASE_METHODS = [
"wizard.cancel",
"wizard.status",
"talk.config",
"talk.speak",
"talk.mode",
"models.list",
"tools.catalog",

View File

@@ -1,23 +1,297 @@
import { readConfigFileSnapshot } from "../../config/config.js";
import { redactConfigObject } from "../../config/redact-snapshot.js";
import { buildTalkConfigResponse } from "../../config/talk.js";
import { buildTalkConfigResponse, resolveActiveTalkProviderConfig } from "../../config/talk.js";
import type { TalkProviderConfig } from "../../config/types.gateway.js";
import type { OpenClawConfig, TtsConfig } from "../../config/types.js";
import { normalizeSpeechProviderId } from "../../tts/provider-registry.js";
import { synthesizeSpeech, type TtsDirectiveOverrides } from "../../tts/tts.js";
import {
ErrorCodes,
errorShape,
formatValidationErrors,
validateTalkConfigParams,
validateTalkModeParams,
validateTalkSpeakParams,
} from "../protocol/index.js";
import { formatForLog } from "../ws-log.js";
import type { GatewayRequestHandlers } from "./types.js";
const ADMIN_SCOPE = "operator.admin";
const TALK_SECRETS_SCOPE = "operator.talk.secrets";
type ElevenLabsVoiceSettings = NonNullable<NonNullable<TtsConfig["elevenlabs"]>["voiceSettings"]>;

/**
 * True when the connected client holds either the admin scope or the
 * dedicated talk-secrets scope. Missing/non-array scope lists read as "no scopes".
 */
function canReadTalkSecrets(client: { connect?: { scopes?: string[] } } | null): boolean {
  const grantedScopes = client?.connect?.scopes;
  if (!Array.isArray(grantedScopes)) {
    return false;
  }
  return [ADMIN_SCOPE, TALK_SECRETS_SCOPE].some((scope) => grantedScopes.includes(scope));
}
/** Returns the trimmed string when the input is a non-blank string; otherwise undefined. */
function trimString(value: unknown): string | undefined {
  if (typeof value !== "string") {
    return undefined;
  }
  const compact = value.trim();
  return compact === "" ? undefined : compact;
}
/** Passes finite numbers through; rejects NaN, ±Infinity, and non-numbers. */
function finiteNumber(value: unknown): number | undefined {
  if (typeof value !== "number" || !Number.isFinite(value)) {
    return undefined;
  }
  return value;
}
/** Passes real booleans through unchanged; any other input becomes undefined. */
function optionalBoolean(value: unknown): boolean | undefined {
  if (typeof value === "boolean") {
    return value;
  }
  return undefined;
}
/** Narrows to a plain object (non-null, non-array); everything else yields undefined. */
function plainObject(value: unknown): Record<string, unknown> | undefined {
  if (value === null || typeof value !== "object" || Array.isArray(value)) {
    return undefined;
  }
  return value as Record<string, unknown>;
}
/**
 * Accepts only the text-normalization modes "auto", "on", or "off",
 * tolerating surrounding whitespace and any letter case. Anything else → undefined.
 */
function normalizeTextNormalization(value: unknown): "auto" | "on" | "off" | undefined {
  if (typeof value !== "string") {
    return undefined;
  }
  const mode = value.trim().toLowerCase();
  if (mode === "auto" || mode === "on" || mode === "off") {
    return mode;
  }
  return undefined;
}
/** Canonical voice-alias lookup key: lower-cased and trimmed. */
function normalizeAliasKey(value: string): string {
  return value.toLowerCase().trim();
}
/**
 * Maps a requested voice id through the provider's alias table.
 * No request → undefined; no alias table or no matching alias → the requested id as-is.
 * Alias keys are matched case-insensitively after trimming.
 */
function resolveTalkVoiceId(
  providerConfig: TalkProviderConfig,
  requested: string | undefined,
): string | undefined {
  if (!requested) {
    return undefined;
  }
  const aliasTable = providerConfig.voiceAliases;
  if (!aliasTable) {
    return requested;
  }
  const mapped = aliasTable[requested.trim().toLowerCase()];
  return mapped ?? requested;
}
/**
 * Extracts ElevenLabs voice-settings overrides from the talk provider config.
 * Only finite numbers and real booleans survive; returns undefined when the
 * config has no usable settings (or `voiceSettings` is not a plain object).
 */
function readTalkVoiceSettings(
  providerConfig: TalkProviderConfig,
): ElevenLabsVoiceSettings | undefined {
  const raw = providerConfig.voiceSettings;
  if (raw === null || typeof raw !== "object" || Array.isArray(raw)) {
    return undefined;
  }
  const source = raw as Record<string, unknown>;
  const settings: Record<string, number | boolean> = {};
  // Numeric fields first (same insertion order as before), then the boolean, then speed.
  for (const key of ["stability", "similarityBoost", "style"] as const) {
    const candidate = source[key];
    if (typeof candidate === "number" && Number.isFinite(candidate)) {
      settings[key] = candidate;
    }
  }
  if (typeof source.useSpeakerBoost === "boolean") {
    settings.useSpeakerBoost = source.useSpeakerBoost;
  }
  const speedCandidate = source.speed;
  if (typeof speedCandidate === "number" && Number.isFinite(speedCandidate)) {
    settings.speed = speedCandidate;
  }
  return Object.keys(settings).length > 0 ? (settings as ElevenLabsVoiceSettings) : undefined;
}
/** Builds `{ [key]: value }` when value is defined, else `{}` — for optional-field spreads. */
function definedEntry<K extends string, V>(key: K, value: V | undefined): { [P in K]?: V } {
  return value == null ? {} : ({ [key]: value } as { [P in K]?: V });
}

/**
 * Derives a complete TTS config for talk.speak from the active talk provider.
 *
 * Resolves the configured talk provider, then layers its settings over the
 * base `messages.tts` config with `auto: "always"` so synthesis runs for
 * every request. Supports "elevenlabs", "openai", and "microsoft".
 *
 * @param config full gateway config snapshot
 * @returns `{ cfg, provider, providerConfig }` on success, or `{ error }`
 *   when no provider is configured or the provider id is unsupported.
 *
 * Fix: each optional field's transform (trimString/finiteNumber/…) was
 * previously evaluated twice — once in the null check and once in the spread.
 * Values are now computed exactly once via `definedEntry`.
 */
function buildTalkTtsConfig(
  config: OpenClawConfig,
):
  | { cfg: OpenClawConfig; provider: string; providerConfig: TalkProviderConfig }
  | { error: string } {
  const resolved = resolveActiveTalkProviderConfig(config.talk);
  const provider = normalizeSpeechProviderId(resolved?.provider);
  if (!resolved || !provider) {
    return { error: "talk.speak unavailable: talk provider not configured" };
  }
  const baseTts = config.messages?.tts ?? {};
  const providerConfig = resolved.config;
  // Force synthesis on; the provider-specific block below fills in the rest.
  const talkTts: TtsConfig = {
    ...baseTts,
    auto: "always",
    provider,
  };
  if (provider === "elevenlabs") {
    talkTts.elevenlabs = {
      ...baseTts.elevenlabs,
      // Only `undefined` is skipped here (not null) — preserves an explicit null apiKey.
      ...(providerConfig.apiKey === undefined ? {} : { apiKey: providerConfig.apiKey }),
      ...definedEntry("baseUrl", trimString(providerConfig.baseUrl)),
      ...definedEntry("voiceId", trimString(providerConfig.voiceId)),
      ...definedEntry("modelId", trimString(providerConfig.modelId)),
      ...definedEntry("seed", finiteNumber(providerConfig.seed)),
      ...definedEntry(
        "applyTextNormalization",
        normalizeTextNormalization(providerConfig.applyTextNormalization),
      ),
      ...definedEntry("languageCode", trimString(providerConfig.languageCode)),
      ...definedEntry("voiceSettings", readTalkVoiceSettings(providerConfig)),
    };
  } else if (provider === "openai") {
    talkTts.openai = {
      ...baseTts.openai,
      ...(providerConfig.apiKey === undefined ? {} : { apiKey: providerConfig.apiKey }),
      ...definedEntry("baseUrl", trimString(providerConfig.baseUrl)),
      // Talk config uses modelId/voiceId; the OpenAI TTS config uses model/voice.
      ...definedEntry("model", trimString(providerConfig.modelId)),
      ...definedEntry("voice", trimString(providerConfig.voiceId)),
      ...definedEntry("speed", finiteNumber(providerConfig.speed)),
      ...definedEntry("instructions", trimString(providerConfig.instructions)),
    };
  } else if (provider === "microsoft") {
    talkTts.microsoft = {
      ...baseTts.microsoft,
      enabled: true,
      ...definedEntry("voice", trimString(providerConfig.voiceId)),
      ...definedEntry("lang", trimString(providerConfig.languageCode)),
      ...definedEntry("outputFormat", trimString(providerConfig.outputFormat)),
      ...definedEntry("pitch", trimString(providerConfig.pitch)),
      ...definedEntry("rate", trimString(providerConfig.rate)),
      ...definedEntry("volume", trimString(providerConfig.volume)),
      ...definedEntry("proxy", trimString(providerConfig.proxy)),
      ...definedEntry("timeoutMs", finiteNumber(providerConfig.timeoutMs)),
    };
  } else {
    return { error: `talk.speak unavailable: unsupported talk provider '${resolved.provider}'` };
  }
  return {
    provider,
    providerConfig,
    cfg: {
      ...config,
      messages: {
        ...config.messages,
        tts: talkTts,
      },
    },
  };
}
/**
 * Translates validated talk.speak RPC params into per-provider TTS overrides.
 * Invalid or absent values are simply dropped; the requested voice id is
 * resolved through the provider's alias table first.
 */
function buildTalkSpeakOverrides(
  provider: string,
  providerConfig: TalkProviderConfig,
  params: Record<string, unknown>,
): TtsDirectiveOverrides {
  const voiceId = resolveTalkVoiceId(providerConfig, trimString(params.voiceId));
  const modelId = trimString(params.modelId);
  const speed = finiteNumber(params.speed);
  const overrides: TtsDirectiveOverrides = { provider };
  if (provider === "elevenlabs") {
    const seed = finiteNumber(params.seed);
    const normalize = normalizeTextNormalization(params.normalize);
    const language = trimString(params.language)?.toLowerCase();
    // Assemble voice settings imperatively; key insertion order matches the prior shape.
    const voiceSettings: Record<string, number | boolean> = {};
    if (speed != null) voiceSettings.speed = speed;
    const stability = finiteNumber(params.stability);
    if (stability != null) voiceSettings.stability = stability;
    const similarityBoost = finiteNumber(params.similarity);
    if (similarityBoost != null) voiceSettings.similarityBoost = similarityBoost;
    const styleValue = finiteNumber(params.style);
    if (styleValue != null) voiceSettings.style = styleValue;
    const useSpeakerBoost = optionalBoolean(params.speakerBoost);
    if (useSpeakerBoost != null) voiceSettings.useSpeakerBoost = useSpeakerBoost;
    const elevenlabs: Record<string, unknown> = {};
    if (voiceId != null) elevenlabs.voiceId = voiceId;
    if (modelId != null) elevenlabs.modelId = modelId;
    if (seed != null) elevenlabs.seed = seed;
    if (normalize != null) elevenlabs.applyTextNormalization = normalize;
    if (language != null) elevenlabs.languageCode = language;
    if (Object.keys(voiceSettings).length > 0) elevenlabs.voiceSettings = voiceSettings;
    overrides.elevenlabs = elevenlabs as TtsDirectiveOverrides["elevenlabs"];
    return overrides;
  }
  if (provider === "openai") {
    const openai: Record<string, unknown> = {};
    if (voiceId != null) openai.voice = voiceId;
    if (modelId != null) openai.model = modelId;
    if (speed != null) openai.speed = speed;
    overrides.openai = openai as TtsDirectiveOverrides["openai"];
    return overrides;
  }
  if (provider === "microsoft") {
    // NOTE(review): when no voice resolves, `microsoft` is assigned an explicit
    // `undefined` (key present) rather than omitted — preserved as in the original;
    // confirm downstream treats the two identically.
    overrides.microsoft = voiceId == null ? undefined : { voice: voiceId };
  }
  return overrides;
}
/**
 * Best-effort MIME type from a provider output-format id and/or file extension.
 * Recognizes mp3, opus/ogg, wav, and webm shapes; anything else → undefined.
 */
function inferMimeType(
  outputFormat: string | undefined,
  fileExtension: string | undefined,
): string | undefined {
  const format = outputFormat?.trim().toLowerCase();
  const ext = fileExtension?.trim().toLowerCase();
  // Matches an exact format id, an optional prefix, and an optional suffix.
  const formatMatches = (exact: string, prefix?: string, suffix?: string): boolean => {
    if (format == null) {
      return false;
    }
    if (format === exact) {
      return true;
    }
    if (prefix !== undefined && format.startsWith(prefix)) {
      return true;
    }
    return suffix !== undefined && format.endsWith(suffix);
  };
  if (formatMatches("mp3", "mp3_", "-mp3") || ext === ".mp3") {
    return "audio/mpeg";
  }
  if (formatMatches("opus", "opus_") || ext === ".opus" || ext === ".ogg") {
    return "audio/ogg";
  }
  if (format?.endsWith("-wav") || ext === ".wav") {
    return "audio/wav";
  }
  if (format?.endsWith("-webm") || ext === ".webm") {
    return "audio/webm";
  }
  return undefined;
}
export const talkHandlers: GatewayRequestHandlers = {
"talk.config": async ({ params, respond, client }) => {
if (!validateTalkConfigParams(params)) {
@@ -65,6 +339,65 @@ export const talkHandlers: GatewayRequestHandlers = {
respond(true, { config: configPayload }, undefined);
},
"talk.speak": async ({ params, respond }) => {
if (!validateTalkSpeakParams(params)) {
respond(
false,
undefined,
errorShape(
ErrorCodes.INVALID_REQUEST,
`invalid talk.speak params: ${formatValidationErrors(validateTalkSpeakParams.errors)}`,
),
);
return;
}
const text = trimString((params as { text?: unknown }).text);
if (!text) {
respond(false, undefined, errorShape(ErrorCodes.INVALID_REQUEST, "talk.speak requires text"));
return;
}
try {
const snapshot = await readConfigFileSnapshot();
const setup = buildTalkTtsConfig(snapshot.config);
if ("error" in setup) {
respond(false, undefined, errorShape(ErrorCodes.UNAVAILABLE, setup.error));
return;
}
const overrides = buildTalkSpeakOverrides(setup.provider, setup.providerConfig, params);
const result = await synthesizeSpeech({
text,
cfg: setup.cfg,
overrides,
disableFallback: true,
});
if (!result.success || !result.audioBuffer) {
respond(
false,
undefined,
errorShape(ErrorCodes.UNAVAILABLE, result.error ?? "talk synthesis failed"),
);
return;
}
respond(
true,
{
audioBase64: result.audioBuffer.toString("base64"),
provider: result.provider ?? setup.provider,
outputFormat: result.outputFormat,
voiceCompatible: result.voiceCompatible,
mimeType: inferMimeType(result.outputFormat, result.fileExtension),
fileExtension: result.fileExtension,
},
undefined,
);
} catch (err) {
respond(false, undefined, errorShape(ErrorCodes.UNAVAILABLE, formatForLog(err)));
}
},
"talk.mode": ({ params, respond, context, client, isWebchatConnect }) => {
if (client && isWebchatConnect(client.connect) && !context.hasConnectedMobileNode()) {
respond(

View File

@@ -1,6 +1,6 @@
import os from "node:os";
import path from "node:path";
import { describe, expect, it } from "vitest";
import { describe, expect, it, vi } from "vitest";
import {
loadOrCreateDeviceIdentity,
publicKeyRawBase64UrlFromPem,
@@ -41,6 +41,13 @@ type TalkConfigPayload = {
};
};
type TalkConfig = NonNullable<NonNullable<TalkConfigPayload["config"]>["talk"]>;
// Shape of a successful talk.speak RPC payload as seen by the tests.
// All fields optional so partial/error responses can still be inspected.
type TalkSpeakPayload = {
  audioBase64?: string; // synthesized audio, base64-encoded
  provider?: string; // provider that produced the audio
  outputFormat?: string; // provider-specific format id
  mimeType?: string; // server-inferred MIME type
  fileExtension?: string; // e.g. ".mp3"
};
const TALK_CONFIG_DEVICE_PATH = path.join(
os.tmpdir(),
`openclaw-talk-config-device-${process.pid}.json`,
@@ -95,6 +102,10 @@ async function fetchTalkConfig(
return rpcReq<TalkConfigPayload>(ws, "talk.config", params ?? {});
}
// Issues a talk.speak RPC over the gateway socket and returns the typed payload.
async function fetchTalkSpeak(ws: GatewaySocket, params: Record<string, unknown>) {
  return rpcReq<TalkSpeakPayload>(ws, "talk.speak", params);
}
function expectElevenLabsTalkConfig(
talk: TalkConfig | undefined,
expected: {
@@ -236,4 +247,58 @@ describe("gateway talk.config", () => {
});
});
});
it("synthesizes talk audio via the active talk provider", async () => {
const { writeConfigFile } = await import("../config/config.js");
await writeConfigFile({
talk: {
provider: "openai",
providers: {
openai: {
apiKey: "openai-talk-key", // pragma: allowlist secret
voiceId: "alloy",
modelId: "gpt-4o-mini-tts",
},
},
},
});
const originalFetch = globalThis.fetch;
const requestInits: RequestInit[] = [];
const fetchMock = vi.fn(async (_input: RequestInfo | URL, init?: RequestInit) => {
if (init) {
requestInits.push(init);
}
return new Response(new Uint8Array([1, 2, 3]), { status: 200 });
});
globalThis.fetch = fetchMock as typeof fetch;
try {
await withServer(async (ws) => {
await connectOperator(ws, ["operator.read", "operator.write"]);
const res = await fetchTalkSpeak(ws, {
text: "Hello from talk mode.",
voiceId: "nova",
modelId: "tts-1",
speed: 1.25,
});
expect(res.ok).toBe(true);
expect(res.payload?.provider).toBe("openai");
expect(res.payload?.outputFormat).toBe("mp3");
expect(res.payload?.mimeType).toBe("audio/mpeg");
expect(res.payload?.fileExtension).toBe(".mp3");
expect(res.payload?.audioBase64).toBe(Buffer.from([1, 2, 3]).toString("base64"));
});
expect(fetchMock).toHaveBeenCalled();
const requestInit = requestInits.find((init) => typeof init.body === "string");
expect(requestInit).toBeDefined();
const body = JSON.parse(requestInit?.body as string) as Record<string, unknown>;
expect(body.model).toBe("tts-1");
expect(body.voice).toBe("nova");
expect(body.speed).toBe(1.25);
} finally {
globalThis.fetch = originalFetch;
}
});
});