fix(tts): honor per-agent config in tts commands

2026-05-06 08:30:42 +00:00 · 2026-04-26 03:11:52 +01:00
parent 6a688e33f6
commit 9b4f0779ce
8 changed files with 97 additions and 2 deletions
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -10,6 +10,7 @@ Docs: https://docs.openclaw.ai

 - Plugins/tokenjuice: bump the bundled tokenjuice runtime to 0.6.3. Thanks @vincentkoc.
 - TTS/agents: allow `agents.list[].tts` to override global `messages.tts` for per-agent voices while keeping shared provider credentials and preferences in the existing TTS config surface.
+- TTS/agents: make `/tts audio`, `/tts status`, and the `tts` agent tool honor the active `agents.list[].tts` voice/provider override.
 - Providers/Azure Speech: add Azure Speech as a bundled TTS provider with Speech-resource auth, voice listing, SSML escaping, native Ogg/Opus voice-note output, and telephony output. (#51776) Thanks @leonchui.
 - CLI/image generation: expose generic `--background` on `openclaw infer image generate` and `openclaw infer image edit`, keep `--openai-background` as an OpenAI alias, and let fal image generation honor `--output-format png|jpeg`. Thanks @steipete.
 - Browser/config: allow local managed Chrome launch discovery and post-launch CDP readiness timeouts to be raised for slower hosts such as Raspberry Pi. Fixes #66803. Thanks @beat843796.
--- a/docs/tools/tts.md
+++ b/docs/tools/tts.md
@@ -146,7 +146,8 @@ voice, model, style, or auto-TTS mode. The agent block deep-merges over
 }
 ```

-Precedence for automatic replies is:
+Precedence (later entries override earlier ones) for automatic replies,
+`/tts audio`, `/tts status`, and the `tts` agent tool is:

 1. `messages.tts`
 2. active `agents.list[].tts`
--- a/src/agents/openclaw-tools.ts
+++ b/src/agents/openclaw-tools.ts
@@ -253,6 +253,7 @@ export function createOpenClawTools(
    createTtsTool({
      agentChannel: options?.agentChannel,
      config: resolvedConfig,
+      agentId: sessionAgentId,
    }),
    ...collectPresentOpenClawTools([imageGenerateTool, musicGenerateTool, videoGenerateTool]),
    ...(embedded
--- a/src/agents/openclaw-tools.tts-config.test.ts
+++ b/src/agents/openclaw-tools.tts-config.test.ts
@@ -167,6 +167,40 @@ describe("createOpenClawTools TTS config wiring", () => {
      __testing.setDepsForTest();
    }
  });
+
+  // The session key names the "reader" agent; the tts tool must forward that id to textToSpeech.
+  it("passes the resolved session agent id into the tts tool", async () => {
+    const injectedConfig = {
+      agents: { list: [{ id: "reader" }, { id: "main" }] },
+    } satisfies OpenClawConfig;
+
+    const { __testing, createOpenClawTools } = await import("./openclaw-tools.js");
+    __testing.setDepsForTest({ config: injectedConfig });
+
+    try {
+      const tools = createOpenClawTools({
+        agentSessionKey: "agent:reader:telegram:chat:123",
+        disableMessageTool: true,
+        disablePluginTools: true,
+      });
+      const ttsTool = tools.find((candidate) => candidate.name === "tts");
+      if (!ttsTool) {
+        throw new Error("missing tts tool");
+      }
+
+      const spokenText = "hello from reader";
+      await ttsTool.execute("call-1", { text: spokenText });
+
+      expect(mocks.textToSpeech).toHaveBeenCalledWith(
+        expect.objectContaining({
+          text: spokenText,
+          agentId: "reader",
+        }),
+      );
+    } finally {
+      __testing.setDepsForTest();
+    }
+  });
 });

 describe("createOpenClawTools cron context wiring", () => {
--- a/src/agents/tools/tts-tool.test.ts
+++ b/src/agents/tools/tts-tool.test.ts
@@ -85,6 +85,25 @@ describe("createTtsTool", () => {
    expect(result.details).toMatchObject({ timeoutMs: 12_345 });
  });

+  // The agentId given to createTtsTool must be forwarded to textToSpeech on execute().
+  it("passes the active agent id to speech generation", async () => {
+    // Stub a successful synthesis result; the spy stands in for speech generation.
+    textToSpeechSpy.mockResolvedValue({
+      success: true,
+      audioPath: "/tmp/reply.opus",
+      provider: "test",
+      voiceCompatible: true,
+    });
+
+    const ttsTool = createTtsTool({ agentId: "voice-agent" });
+    await ttsTool.execute("call-1", { text: "hello" });
+
+    // Only the text and agent id are pinned; other call fields are left unconstrained.
+    expect(textToSpeechSpy).toHaveBeenCalledWith(
+      expect.objectContaining({ text: "hello", agentId: "voice-agent" }),
+    );
+  });
+
  it("echoes longer utterances verbatim into the tool-result content", async () => {
    textToSpeechSpy.mockResolvedValue({
      success: true,
--- a/src/agents/tools/tts-tool.ts
+++ b/src/agents/tools/tts-tool.ts
@@ -57,6 +57,7 @@ function sanitizeTranscriptForToolContent(text: string): string {
 export function createTtsTool(opts?: {
  config?: OpenClawConfig;
  agentChannel?: GatewayMessageChannel;
+  agentId?: string;
 }): AnyAgentTool {
  return {
    label: "TTS",
@@ -75,6 +76,7 @@ export function createTtsTool(opts?: {
        cfg,
        channel: channel ?? opts?.agentChannel,
        timeoutMs,
+        agentId: opts?.agentId,
      });

      if (result.success && result.audioPath) {
--- a/src/auto-reply/reply/commands-tts.test.ts
+++ b/src/auto-reply/reply/commands-tts.test.ts
@@ -38,9 +38,11 @@ const FALLBACK_TTS_PROVIDER = "backup-speech";
 function buildTtsParams(
  commandBodyNormalized: string,
  cfg: OpenClawConfig = {},
+  agentId?: string,
 ): Parameters<typeof handleTtsCommands>[0] {
  return {
    cfg,
+    agentId,
    command: {
      commandBodyNormalized,
      isAuthorizedSender: true,
@@ -189,4 +191,38 @@ describe("handleTtsCommands status fallback reporting", () => {
    expect(result?.shouldContinue).toBe(false);
    expect(result?.reply?.text).toContain("TTS status");
  });
+
+  it("resolves status config for the active agent", async () => {
+    const cfg = {
+      agents: { list: [{ id: "reader", tts: { provider: "elevenlabs" } }] },
+    } as OpenClawConfig;
+    const statusParams = buildTtsParams("/tts status", cfg, "reader");
+    const outcome = await handleTtsCommands(statusParams, true);
+    // The handler must pass the agent id to resolveTtsConfig alongside the config.
+    expect(outcome?.shouldContinue).toBe(false);
+    expect(ttsMocks.resolveTtsConfig).toHaveBeenCalledWith(cfg, "reader");
+  });
+
+  it("passes the active agent id to /tts audio synthesis", async () => {
+    const cfg = {
+      agents: { list: [{ id: "reader", tts: { provider: PRIMARY_TTS_PROVIDER } }] },
+    } as OpenClawConfig;
+
+    // Stub a successful synthesis result for the command handler to consume.
+    ttsMocks.textToSpeech.mockResolvedValue({
+      success: true,
+      audioPath: "/tmp/reader.ogg",
+      provider: PRIMARY_TTS_PROVIDER,
+      voiceCompatible: true,
+    });
+    const audioParams = buildTtsParams("/tts audio hello", cfg, "reader");
+
+    const outcome = await handleTtsCommands(audioParams, true);
+
+    expect(outcome?.shouldContinue).toBe(false);
+    // Synthesis must receive the caller's config together with the agent id.
+    expect(ttsMocks.textToSpeech).toHaveBeenCalledWith(
+      expect.objectContaining({ text: "hello", cfg, agentId: "reader" }),
+    );
+  });
 });
--- a/src/auto-reply/reply/commands-tts.ts
+++ b/src/auto-reply/reply/commands-tts.ts
@@ -111,7 +111,7 @@ export const handleTtsCommands: CommandHandler = async (params, allowTextCommand
    return { shouldContinue: false };
  }

-  const config = resolveTtsConfig(params.cfg);
+  const config = resolveTtsConfig(params.cfg, params.agentId);
  const prefsPath = resolveTtsPrefsPath(config);
  const action = parsed.action;
  const args = parsed.args;
@@ -149,6 +149,7 @@ export const handleTtsCommands: CommandHandler = async (params, allowTextCommand
      cfg: params.cfg,
      channel: params.command.channel,
      prefsPath,
+      agentId: params.agentId,
    });

    if (result.success && result.audioPath) {