diff --git a/CHANGELOG.md b/CHANGELOG.md index ab9a3f36f06..93bc9570301 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -8,6 +8,7 @@ Docs: https://docs.openclaw.ai ### Changes +- TTS/WhatsApp: add `/tts latest` read-aloud support with duplicate suppression and `/tts chat on|off|default` session-scoped auto-TTS overrides, completing the on-demand voice-note UX for current-chat replies. Fixes #66032. - Plugins/tokenjuice: bump the bundled tokenjuice runtime to 0.6.3. Thanks @vincentkoc. - TTS/agents: allow `agents.list[].tts` to override global `messages.tts` for per-agent voices while keeping shared provider credentials and preferences in the existing TTS config surface. - TTS/agents: make `/tts audio`, `/tts status`, and the `tts` agent tool honor the active `agents.list[].tts` voice/provider override. diff --git a/docs/channels/whatsapp.md b/docs/channels/whatsapp.md index fe7a3db7865..1c75496f9ab 100644 --- a/docs/channels/whatsapp.md +++ b/docs/channels/whatsapp.md @@ -365,6 +365,7 @@ When the linked self number is also present in `allowFrom`, WhatsApp self-chat s - reply payloads preserve `audioAsVoice`; TTS voice-note output for WhatsApp stays on this PTT path even when the provider returns MP3 or WebM - native Ogg/Opus audio is sent as `audio/ogg; codecs=opus` for voice-note compatibility - non-Ogg audio, including Microsoft Edge TTS MP3/WebM output, is transcoded with `ffmpeg` to 48 kHz mono Ogg/Opus before PTT delivery + - `/tts latest` sends the latest assistant reply as one voice note and suppresses repeat sends for the same reply; `/tts chat on|off|default` controls auto-TTS for the current WhatsApp chat - animated GIF playback is supported via `gifPlayback: true` on video sends - captions are applied to the first media item when sending multi-media reply payloads, except PTT voice notes send the audio first and visible text separately because WhatsApp clients do not render voice-note captions consistently - media source can be HTTP(S), `file://`, or local paths diff --git a/docs/tools/slash-commands.md b/docs/tools/slash-commands.md index eeaf42f858b..8a1c350c0a8 100644 --- a/docs/tools/slash-commands.md +++ b/docs/tools/slash-commands.md @@ -134,7 +134,7 @@ Built-in commands available today: - `/plugins list|inspect|show|get|install|enable|disable` inspects or mutates plugin state. `/plugin` is an alias. Owner-only for writes. Requires `commands.plugins: true`. - `/debug show|set|unset|reset` manages runtime-only config overrides. Owner-only. Requires `commands.debug: true`. - `/usage off|tokens|full|cost` controls the per-response usage footer or prints a local cost summary. -- `/tts on|off|status|provider|limit|summary|audio|help` controls TTS. See [/tools/tts](/tools/tts). +- `/tts on|off|status|chat|latest|provider|limit|summary|audio|help` controls TTS. See [/tools/tts](/tools/tts). - `/restart` restarts OpenClaw when enabled. Default: enabled; set `commands.restart: false` to disable it. - `/activation mention|always` sets group activation mode. - `/send on|off|inherit` sets send policy. Owner-only. diff --git a/docs/tools/tts.md b/docs/tools/tts.md index 742ed8971d3..7c8e879c5dc 100644 --- a/docs/tools/tts.md +++ b/docs/tools/tts.md @@ -821,6 +821,10 @@ Discord note: `/tts` is a built-in Discord command, so OpenClaw registers /tts off /tts on /tts status +/tts chat on +/tts chat off +/tts chat default +/tts latest /tts provider openai /tts limit 2000 /tts summary off @@ -833,9 +837,11 @@ Notes: - `commands.text` or native command registration must be enabled. - Config `messages.tts.auto` accepts `off|always|inbound|tagged`. - `/tts on` writes the local TTS preference to `always`; `/tts off` writes it to `off`. +- `/tts chat on|off|default` writes a session-scoped auto-TTS override for the current chat. - Use config when you want `inbound` or `tagged` defaults. - `limit` and `summary` are stored in local prefs, not the main config. - `/tts audio` generates a one-off audio reply (does not toggle TTS on). +- `/tts latest` reads the latest assistant reply from the current session transcript and sends it as audio once. It stores only a hash of that reply on the session entry to suppress duplicate voice sends. - `/tts status` includes fallback visibility for the latest attempt: - success fallback: `Fallback: -> ` plus `Attempts: ...` - failure: `Error: ...` plus `Attempts: ...` diff --git a/src/auto-reply/reply/commands-tts.test.ts b/src/auto-reply/reply/commands-tts.test.ts index 25e672c5098..0fb6df3ffa4 100644 --- a/src/auto-reply/reply/commands-tts.test.ts +++ b/src/auto-reply/reply/commands-tts.test.ts @@ -1,5 +1,9 @@ +import fs from "node:fs"; +import os from "node:os"; +import path from "node:path"; import { beforeEach, describe, expect, it, vi } from "vitest"; import type { OpenClawConfig } from "../../config/config.js"; +import type { SessionEntry } from "../../config/sessions.js"; const ttsMocks = vi.hoisted(() => ({ getResolvedSpeechProviderConfig: vi.fn(), @@ -39,6 +43,7 @@ function buildTtsParams( commandBodyNormalized: string, cfg: OpenClawConfig = {}, agentId?: string, + overrides: Partial[0]> = {}, ): Parameters[0] { return { cfg, @@ -49,11 +54,14 @@ function buildTtsParams( senderId: "owner", channel: "forum", }, + sessionKey: "session-key", + ...overrides, } as unknown as Parameters[0]; } describe("handleTtsCommands status fallback reporting", () => { beforeEach(() => { + vi.clearAllMocks(); ttsMocks.resolveTtsConfig.mockReturnValue({}); ttsMocks.resolveTtsPrefsPath.mockReturnValue("/tmp/tts-prefs.json"); ttsMocks.isTtsEnabled.mockReturnValue(true); @@ -225,4 +233,131 @@ describe("handleTtsCommands status fallback reporting", () => { }), ); }); + + it("reads the latest assistant transcript reply once", async () => { + const tempDir = fs.mkdtempSync(path.join(os.tmpdir(), "openclaw-tts-latest-")); + const sessionFile = path.join(tempDir, "session.jsonl"); + fs.writeFileSync( + sessionFile, + [ + JSON.stringify({ type: "session", id: "s1" }), + JSON.stringify({ + type: "message", + message: { role: "assistant", content: [{ type: "text", text: "older reply" }] }, + }), + JSON.stringify({ + type: "message", + message: { + role: "assistant", + content: [ + { + type: "text", + text: "internal note", + textSignature: JSON.stringify({ + v: 1, + id: "item_commentary", + phase: "commentary", + }), + }, + { + type: "text", + text: "latest visible reply", + textSignature: JSON.stringify({ + v: 1, + id: "item_final", + phase: "final_answer", + }), + }, + ], + }, + }), + ].join("\n") + "\n", + "utf-8", + ); + ttsMocks.textToSpeech.mockResolvedValue({ + success: true, + audioPath: "/tmp/latest.ogg", + provider: PRIMARY_TTS_PROVIDER, + voiceCompatible: true, + }); + const sessionEntry: SessionEntry = { sessionId: "s1", updatedAt: 1, sessionFile }; + const sessionStore = { "session-key": sessionEntry }; + + const result = await handleTtsCommands( + buildTtsParams("/tts latest", {}, undefined, { sessionEntry, sessionStore }), + true, + ); + + expect(result?.shouldContinue).toBe(false); + expect(result?.reply).toMatchObject({ + mediaUrl: "/tmp/latest.ogg", + audioAsVoice: true, + spokenText: "latest visible reply", + }); + expect(ttsMocks.textToSpeech).toHaveBeenCalledWith( + expect.objectContaining({ text: "latest visible reply" }), + ); + expect(sessionEntry.lastTtsReadLatestHash).toMatch(/^[a-f0-9]{64}$/); + expect(sessionEntry.lastTtsReadLatestAt).toEqual(expect.any(Number)); + }); + + it("does not resend /tts latest for the same assistant reply", async () => { + const tempDir = fs.mkdtempSync(path.join(os.tmpdir(), "openclaw-tts-latest-")); + const sessionFile = path.join(tempDir, "session.jsonl"); + fs.writeFileSync( + sessionFile, + [ + JSON.stringify({ type: "session", id: "s1" }), + JSON.stringify({ + type: "message", + message: { role: "assistant", content: [{ type: "text", text: "read me once" }] }, + }), + ].join("\n") + "\n", + "utf-8", + ); + ttsMocks.textToSpeech.mockResolvedValue({ + success: true, + audioPath: "/tmp/latest.ogg", + provider: PRIMARY_TTS_PROVIDER, + voiceCompatible: true, + }); + const sessionEntry: SessionEntry = { sessionId: "s1", updatedAt: 1, sessionFile }; + const sessionStore = { "session-key": sessionEntry }; + const params = buildTtsParams("/tts latest", {}, undefined, { sessionEntry, sessionStore }); + + const first = await handleTtsCommands(params, true); + expect(first?.reply?.mediaUrl).toBe("/tmp/latest.ogg"); + ttsMocks.textToSpeech.mockClear(); + + const second = await handleTtsCommands(params, true); + + expect(second?.reply?.text).toContain("already sent"); + expect(ttsMocks.textToSpeech).not.toHaveBeenCalled(); + }); + + it("stores chat-scoped TTS overrides on the session entry", async () => { + const sessionEntry: SessionEntry = { sessionId: "s1", updatedAt: 1 }; + const sessionStore = { "session-key": sessionEntry }; + + const onResult = await handleTtsCommands( + buildTtsParams("/tts chat on", {}, undefined, { sessionEntry, sessionStore }), + true, + ); + expect(onResult?.reply?.text).toContain("enabled for this chat"); + expect(sessionEntry.ttsAuto).toBe("always"); + + const offResult = await handleTtsCommands( + buildTtsParams("/tts chat off", {}, undefined, { sessionEntry, sessionStore }), + true, + ); + expect(offResult?.reply?.text).toContain("disabled for this chat"); + expect(sessionEntry.ttsAuto).toBe("off"); + + const clearResult = await handleTtsCommands( + buildTtsParams("/tts chat default", {}, undefined, { sessionEntry, sessionStore }), + true, + ); + expect(clearResult?.reply?.text).toContain("override cleared"); + expect(sessionEntry.ttsAuto).toBeUndefined(); + }); }); diff --git a/src/auto-reply/reply/commands-tts.ts b/src/auto-reply/reply/commands-tts.ts index b211776120d..397a902aaba 100644 --- a/src/auto-reply/reply/commands-tts.ts +++ b/src/auto-reply/reply/commands-tts.ts @@ -1,3 +1,5 @@ +import crypto from "node:crypto"; +import { readLatestAssistantTextFromSessionTranscript } from "../../config/sessions.js"; import { logVerbose } from "../../globals.js"; import { normalizeOptionalLowercaseString, @@ -25,7 +27,9 @@ import { setTtsProvider, textToSpeech, } from "../../tts/tts.js"; +import { isSilentReplyPayloadText } from "../tokens.js"; import type { ReplyPayload } from "../types.js"; +import { persistSessionEntry } from "./commands-session-store.js"; import type { CommandHandler } from "./commands-types.js"; type ParsedTtsCommand = { @@ -81,7 +85,9 @@ function ttsUsage(): ReplyPayload { `• /tts provider [name] — View/change provider\n` + `• /tts limit [number] — View/change text limit\n` + `• /tts summary [on|off] — View/change auto-summary\n` + - `• /tts audio — Generate audio from text\n\n` + + `• /tts audio — Generate audio from text\n` + + `• /tts latest — Read the latest assistant reply once\n` + + `• /tts chat on|off|default — Override auto-TTS for this chat\n\n` + `**Providers:**\n` + `Use /tts provider to list the registered speech providers and their status.\n\n` + `**Text Limit (default: 1500, max: 4096):**\n` + @@ -91,10 +97,67 @@ function ttsUsage(): ReplyPayload { `**Examples:**\n` + `/tts provider \n` + `/tts limit 2000\n` + + `/tts latest\n` + `/tts audio Hello, this is a test!`, }; } +function hashTtsReadLatestText(text: string): string { + return crypto.createHash("sha256").update(text).digest("hex"); +} + +async function buildTtsAudioReply(params: { + text: string; + cfg: Parameters[0]["cfg"]; + channel: string; + prefsPath: string; + agentId?: string; +}): Promise<{ reply: ReplyPayload; provider?: string; hash?: string } | { error: string }> { + const start = Date.now(); + const result = await textToSpeech({ + text: params.text, + cfg: params.cfg, + channel: params.channel, + prefsPath: params.prefsPath, + agentId: params.agentId, + }); + + if (result.success && result.audioPath) { + setLastTtsAttempt({ + timestamp: Date.now(), + success: true, + textLength: params.text.length, + summarized: false, + provider: result.provider, + fallbackFrom: result.fallbackFrom, + attemptedProviders: result.attemptedProviders, + attempts: result.attempts, + latencyMs: result.latencyMs, + }); + return { + provider: result.provider, + reply: { + mediaUrl: result.audioPath, + audioAsVoice: result.voiceCompatible === true, + trustedLocalMedia: true, + spokenText: params.text, + }, + }; + } + + setLastTtsAttempt({ + timestamp: Date.now(), + success: false, + textLength: params.text.length, + summarized: false, + attemptedProviders: result.attemptedProviders, + attempts: result.attempts, + error: result.error, + latencyMs: Date.now() - start, + }); + return { error: result.error ?? "unknown error" }; +} + export const handleTtsCommands: CommandHandler = async (params, allowTextCommands) => { if (!allowTextCommands) { return null; @@ -130,6 +193,86 @@ export const handleTtsCommands: CommandHandler = async (params, allowTextCommand return { shouldContinue: false, reply: { text: "🔇 TTS disabled." } }; } + if (action === "chat") { + const requested = normalizeOptionalLowercaseString(args) ?? ""; + if (!params.sessionEntry || !params.sessionStore || !params.sessionKey) { + return { + shouldContinue: false, + reply: { text: "🔇 No active chat session is available for a chat-scoped TTS override." }, + }; + } + if (!requested || requested === "status") { + return { + shouldContinue: false, + reply: { text: `🔊 Chat TTS override: ${params.sessionEntry.ttsAuto ?? "default"}.` }, + }; + } + if (requested === "on") { + params.sessionEntry.ttsAuto = "always"; + await persistSessionEntry(params); + return { shouldContinue: false, reply: { text: "🔊 TTS enabled for this chat." } }; + } + if (requested === "off") { + params.sessionEntry.ttsAuto = "off"; + await persistSessionEntry(params); + return { shouldContinue: false, reply: { text: "🔇 TTS disabled for this chat." } }; + } + if (requested === "default" || requested === "inherit" || requested === "clear") { + delete params.sessionEntry.ttsAuto; + await persistSessionEntry(params); + return { shouldContinue: false, reply: { text: "🔊 TTS chat override cleared." } }; + } + return { shouldContinue: false, reply: ttsUsage() }; + } + + if ( + action === "latest" || + (action === "read" && normalizeOptionalLowercaseString(args) === "latest") + ) { + if (!params.sessionEntry || !params.sessionStore || !params.sessionKey) { + return { + shouldContinue: false, + reply: { text: "🎤 No active chat session is available for `/tts latest`." }, + }; + } + const latest = await readLatestAssistantTextFromSessionTranscript( + params.sessionEntry.sessionFile, + ); + const latestText = latest?.text.trim(); + if (!latestText || isSilentReplyPayloadText(latestText)) { + return { + shouldContinue: false, + reply: { text: "🎤 No readable assistant reply was found in this chat yet." }, + }; + } + const hash = hashTtsReadLatestText(latestText); + if (params.sessionEntry.lastTtsReadLatestHash === hash) { + return { + shouldContinue: false, + reply: { text: "🔊 Latest assistant reply was already sent as audio." }, + }; + } + + const audio = await buildTtsAudioReply({ + text: latestText, + cfg: params.cfg, + channel: params.command.channel, + prefsPath, + agentId: params.agentId, + }); + if ("error" in audio) { + return { + shouldContinue: false, + reply: { text: `❌ Error generating audio: ${audio.error}` }, + }; + } + + params.sessionEntry.lastTtsReadLatestHash = hash; + params.sessionEntry.lastTtsReadLatestAt = Date.now(); + await persistSessionEntry(params); + return { shouldContinue: false, reply: audio.reply }; + } + if (action === "audio") { if (!args.trim()) { return { @@ -143,51 +286,19 @@ export const handleTtsCommands: CommandHandler = async (params, allowTextCommand }; } - const start = Date.now(); - const result = await textToSpeech({ + const audio = await buildTtsAudioReply({ text: args, cfg: params.cfg, channel: params.command.channel, prefsPath, agentId: params.agentId, }); - - if (result.success && result.audioPath) { - // Store last attempt for `/tts status`. - setLastTtsAttempt({ - timestamp: Date.now(), - success: true, - textLength: args.length, - summarized: false, - provider: result.provider, - fallbackFrom: result.fallbackFrom, - attemptedProviders: result.attemptedProviders, - attempts: result.attempts, - latencyMs: result.latencyMs, - }); - const payload: ReplyPayload = { - mediaUrl: result.audioPath, - audioAsVoice: result.voiceCompatible === true, - trustedLocalMedia: true, - spokenText: args, - }; - return { shouldContinue: false, reply: payload }; + if (!("error" in audio)) { + return { shouldContinue: false, reply: audio.reply }; } - - // Store failure details for `/tts status`. - setLastTtsAttempt({ - timestamp: Date.now(), - success: false, - textLength: args.length, - summarized: false, - attemptedProviders: result.attemptedProviders, - attempts: result.attempts, - error: result.error, - latencyMs: Date.now() - start, - }); return { shouldContinue: false, - reply: { text: `❌ Error generating audio: ${result.error ?? "unknown error"}` }, + reply: { text: `❌ Error generating audio: ${audio.error}` }, }; } @@ -306,6 +417,7 @@ export const handleTtsCommands: CommandHandler = async (params, allowTextCommand const lines = [ "📊 TTS status", `State: ${enabled ? "✅ enabled" : "❌ disabled"}`, + `Chat override: ${params.sessionEntry?.ttsAuto ?? "default"}`, `Provider: ${provider} (${hasKey ? "✅ configured" : "❌ not configured"})`, `Text limit: ${maxLength} chars`, `Auto-summary: ${summarize ? "on" : "off"}`, diff --git a/src/config/sessions/transcript.ts b/src/config/sessions/transcript.ts index b87f2e2fcd5..f875431e10a 100644 --- a/src/config/sessions/transcript.ts +++ b/src/config/sessions/transcript.ts @@ -3,6 +3,7 @@ import path from "node:path"; import { CURRENT_SESSION_VERSION, SessionManager } from "@mariozechner/pi-coding-agent"; import { formatErrorMessage } from "../../infra/errors.js"; import { emitSessionTranscriptUpdate } from "../../sessions/transcript-events.js"; +import { extractAssistantVisibleText } from "../../shared/chat-message-content.js"; import { resolveDefaultSessionStorePath, resolveSessionFilePath, @@ -46,6 +47,12 @@ export type SessionTranscriptAssistantMessage = Parameters { + if (!sessionFile?.trim()) { + return undefined; + } + + let raw: string; + try { + raw = await fs.promises.readFile(sessionFile, "utf-8"); + } catch { + return undefined; + } + + const lines = raw.split(/\r?\n/); + for (let index = lines.length - 1; index >= 0; index -= 1) { + const line = lines[index]; + if (!line.trim()) { + continue; + } + try { + const parsed = JSON.parse(line) as { + id?: unknown; + message?: unknown; + }; + const message = parsed.message as { role?: unknown; timestamp?: unknown } | undefined; + if (!message || message.role !== "assistant") { + continue; + } + const text = extractAssistantVisibleText(message)?.trim(); + if (!text) { + continue; + } + return { + ...(typeof parsed.id === "string" && parsed.id ? { id: parsed.id } : {}), + text, + ...(typeof message.timestamp === "number" && Number.isFinite(message.timestamp) + ? { timestamp: message.timestamp } + : {}), + }; + } catch { + continue; + } + } + return undefined; +} + export async function appendAssistantMessageToSessionTranscript(params: { agentId?: string; sessionKey: string; diff --git a/src/config/sessions/types.ts b/src/config/sessions/types.ts index 17226fbeef6..60f13c63dac 100644 --- a/src/config/sessions/types.ts +++ b/src/config/sessions/types.ts @@ -175,6 +175,10 @@ export type SessionEntry = { reasoningLevel?: string; elevatedLevel?: string; ttsAuto?: TtsAutoMode; + /** Hash of the latest assistant reply that was sent through `/tts latest`. */ + lastTtsReadLatestHash?: string; + /** Timestamp (ms) when `/tts latest` last sent audio for this session. */ + lastTtsReadLatestAt?: number; execHost?: string; execSecurity?: string; execAsk?: string;