diff --git a/CHANGELOG.md b/CHANGELOG.md index 549269edf5e..99e9d186980 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -53,6 +53,7 @@ Docs: https://docs.openclaw.ai - Agents/Pi: suppress persistence for synthetic mid-turn overflow continuation prompts, so transcript-retry recovery does not write the "continue from transcript" prompt as a new user turn. Thanks @vincentkoc. - Exec approvals: detect `env -S` split-string command-carrier risks when `-S`/`-s` is combined with other env short options, so approval explanations do not miss split payloads hidden behind `env -iS...`. Thanks @vincentkoc. +- Google Meet: log the concrete agent-mode TTS provider, model, voice, output format, and sample rate after speech synthesis, so Meet logs show which voice backend spoke each reply. - Voice Call: mark realtime calls completed when the realtime provider closes normally, so Twilio/OpenAI/Google realtime stop events do not leave active call records behind. Thanks @vincentkoc. - Exec approvals: treat POSIX `exec` as a command carrier for inline eval, shell-wrapper, and eval/source detection, so approval explanations and command-risk checks do not miss payloads hidden behind `exec`. Thanks @vincentkoc. - Google Meet: log the resolved audio provider model when starting Chrome and paired-node Meet talk-back bridges, so agent-mode joins show the STT model and bidi joins show the realtime voice model. diff --git a/docs/plugins/google-meet.md b/docs/plugins/google-meet.md index 2e30e2eb480..e5364f83ab5 100644 --- a/docs/plugins/google-meet.md +++ b/docs/plugins/google-meet.md @@ -1175,6 +1175,9 @@ agent produces the answer, and regular OpenClaw TTS speaks it into Meet. Use `mode: "bidi"` when you want the realtime voice model to answer directly. Raw `mode: "realtime"` remains accepted as a legacy compatibility alias for `mode: "agent"`, but it is no longer advertised in the agent tool schema. 
+Agent-mode logs include the resolved transcription provider/model at bridge +startup and the TTS provider, model, voice, output format, and sample rate after +each synthesized reply. Use `action: "status"` to list active sessions or inspect a session ID. Use `action: "speak"` with `sessionId` and `message` to make the realtime agent diff --git a/extensions/google-meet/index.test.ts b/extensions/google-meet/index.test.ts index 6513af119da..ddc876e2245 100644 --- a/extensions/google-meet/index.test.ts +++ b/extensions/google-meet/index.test.ts @@ -3848,6 +3848,10 @@ describe("google-meet plugin", () => { success: true, audioBuffer: Buffer.from([1, 0, 2, 0]), sampleRate: 24_000, + provider: "elevenlabs", + providerModel: "eleven_multilingual_v2", + providerVoice: "pMsXgVXv3BLzUgSXRplE", + outputFormat: "pcm16", })), }, agent: { @@ -3896,6 +3900,9 @@ describe("google-meet plugin", () => { text: "Use the Portugal launch data.", cfg: {}, }); + expect(noopLogger.info).toHaveBeenCalledWith( + "[google-meet] agent TTS: provider=elevenlabs model=eleven_multilingual_v2 voice=pMsXgVXv3BLzUgSXRplE outputFormat=pcm16 sampleRate=24000", + ); expect(Buffer.concat(outputStdinWrites)).toEqual(Buffer.from([1, 0, 2, 0])); expect(handle.getHealth()).toMatchObject({ providerConnected: true, diff --git a/extensions/google-meet/src/realtime-node.ts b/extensions/google-meet/src/realtime-node.ts index 9ef1622ed98..fac11676efa 100644 --- a/extensions/google-meet/src/realtime-node.ts +++ b/extensions/google-meet/src/realtime-node.ts @@ -32,6 +32,7 @@ import { convertGoogleMeetBridgeAudioForStt, convertGoogleMeetTtsAudioForBridge, formatGoogleMeetAgentAudioModelLog, + formatGoogleMeetAgentTtsResultLog, formatGoogleMeetRealtimeVoiceModelLog, type GoogleMeetRealtimeEventEntry, type GoogleMeetRealtimeTranscriptEntry, @@ -184,6 +185,7 @@ export async function startNodeAgentAudioBridge(params: { if (!result.success || !result.audioBuffer || !result.sampleRate) { throw new Error(result.error ?? 
/** Fields of a TTS synthesis result that are written to the Google Meet log line. */
type GoogleMeetTtsResultLogFields = {
  provider?: string;
  providerModel?: string;
  providerVoice?: string;
  outputFormat?: string;
  sampleRate?: number;
  fallbackFrom?: string;
};

/**
 * Builds the one-line log message emitted after a reply has been synthesized.
 *
 * @param prefix - Label placed between `[google-meet]` and `TTS:` in the message.
 * @param result - TTS result fields to report; each string field is passed
 *   through `formatLogValue` before being embedded.
 * @returns Space-separated `key=value` pairs. `sampleRate` falls back to the
 *   literal `unknown` when absent, and `fallbackFrom` is appended only when it
 *   is a non-empty string.
 */
export function formatGoogleMeetAgentTtsResultLog(
  prefix: string,
  result: GoogleMeetTtsResultLogFields,
): string {
  const { provider, providerModel, providerVoice, outputFormat, sampleRate, fallbackFrom } = result;
  const segments: string[] = [
    `[google-meet] ${prefix} TTS: provider=${formatLogValue(provider)}`,
    `model=${formatLogValue(providerModel)}`,
    `voice=${formatLogValue(providerVoice)}`,
    `outputFormat=${formatLogValue(outputFormat)}`,
    `sampleRate=${sampleRate ?? "unknown"}`,
  ];
  // The fallback origin is optional detail: omit the pair entirely when unset.
  if (fallbackFrom) {
    segments.push(`fallbackFrom=${formatLogValue(fallbackFrom)}`);
  }
  return segments.join(" ");
}
/**
 * Normalizes an arbitrary value into a non-blank, trimmed string.
 *
 * @param value - Candidate value of any type.
 * @returns The trimmed text when `value` is a string with non-whitespace
 *   content; otherwise `undefined`.
 */
function readTtsResultString(value: unknown): string | undefined {
  if (typeof value !== "string") {
    return undefined;
  }
  const trimmed = value.trim();
  return trimmed ? trimmed : undefined;
}

/**
 * Picks the model identifier to report on a TTS result.
 *
 * Overrides win over the provider config; within each source `modelId` is
 * consulted before `model`.
 *
 * @param providerConfig - Provider configuration to fall back to.
 * @param providerOverrides - Optional overrides checked first.
 * @returns The first non-blank candidate, or `undefined` when none is set.
 */
function resolveTtsResultModel(
  providerConfig: SpeechProviderConfig,
  providerOverrides?: SpeechProviderOverrides,
): string | undefined {
  const overrideModel =
    readTtsResultString(providerOverrides?.modelId) ??
    readTtsResultString(providerOverrides?.model);
  if (overrideModel !== undefined) {
    return overrideModel;
  }
  return readTtsResultString(providerConfig.modelId) ?? readTtsResultString(providerConfig.model);
}

/**
 * Picks the voice identifier to report on a TTS result.
 *
 * Overrides win over the provider config; within each source the lookup order
 * is `voiceId`, then `voiceName`, then `voice`.
 *
 * @param providerConfig - Provider configuration to fall back to.
 * @param providerOverrides - Optional overrides checked first.
 * @returns The first non-blank candidate, or `undefined` when none is set.
 */
function resolveTtsResultVoice(
  providerConfig: SpeechProviderConfig,
  providerOverrides?: SpeechProviderOverrides,
): string | undefined {
  const overrideVoice =
    readTtsResultString(providerOverrides?.voiceId) ??
    readTtsResultString(providerOverrides?.voiceName) ??
    readTtsResultString(providerOverrides?.voice);
  if (overrideVoice !== undefined) {
    return overrideVoice;
  }
  return (
    readTtsResultString(providerConfig.voiceId) ??
    readTtsResultString(providerConfig.voiceName) ??
    readTtsResultString(providerConfig.voice)
  );
}
primaryProvider : undefined, attemptedProviders, diff --git a/src/plugin-sdk/tts-runtime.types.ts b/src/plugin-sdk/tts-runtime.types.ts index 259548cda26..26fd82e01e4 100644 --- a/src/plugin-sdk/tts-runtime.types.ts +++ b/src/plugin-sdk/tts-runtime.types.ts @@ -154,6 +154,8 @@ export type TtsSynthesisResult = { error?: string; latencyMs?: number; provider?: string; + providerModel?: string; + providerVoice?: string; persona?: string; fallbackFrom?: string; attemptedProviders?: string[]; @@ -170,6 +172,8 @@ export type TtsTelephonyResult = { error?: string; latencyMs?: number; provider?: string; + providerModel?: string; + providerVoice?: string; persona?: string; fallbackFrom?: string; attemptedProviders?: string[];