mirror of
https://github.com/openclaw/openclaw.git
synced 2026-05-06 07:50:43 +00:00
fix: log google meet agent tts backend
This commit is contained in:
@@ -53,6 +53,7 @@ Docs: https://docs.openclaw.ai
|
||||
|
||||
- Agents/Pi: suppress persistence for synthetic mid-turn overflow continuation prompts, so transcript-retry recovery does not write the "continue from transcript" prompt as a new user turn. Thanks @vincentkoc.
|
||||
- Exec approvals: detect `env -S` split-string command-carrier risks when `-S`/`-s` is combined with other env short options, so approval explanations do not miss split payloads hidden behind `env -iS...`. Thanks @vincentkoc.
|
||||
- Google Meet: log the concrete agent-mode TTS provider, model, voice, output format, and sample rate after speech synthesis, so Meet logs show which voice backend spoke each reply.
|
||||
- Voice Call: mark realtime calls completed when the realtime provider closes normally, so Twilio/OpenAI/Google realtime stop events do not leave active call records behind. Thanks @vincentkoc.
|
||||
- Exec approvals: treat POSIX `exec` as a command carrier for inline eval, shell-wrapper, and eval/source detection, so approval explanations and command-risk checks do not miss payloads hidden behind `exec`. Thanks @vincentkoc.
|
||||
- Google Meet: log the resolved audio provider model when starting Chrome and paired-node Meet talk-back bridges, so agent-mode joins show the STT model and bidi joins show the realtime voice model.
|
||||
|
||||
@@ -1175,6 +1175,9 @@ agent produces the answer, and regular OpenClaw TTS speaks it into Meet. Use
|
||||
`mode: "bidi"` when you want the realtime voice model to answer directly.
Raw `mode: "realtime"` remains accepted as a legacy compatibility alias for
`mode: "agent"`, but it is no longer advertised in the agent tool schema.
Agent-mode logs include the resolved transcription provider/model at bridge
startup, and the TTS provider, model, voice, output format, and sample rate after
each synthesized reply.
|
||||
|
||||
Use `action: "status"` to list active sessions or inspect a session ID. Use
|
||||
`action: "speak"` with `sessionId` and `message` to make the realtime agent
|
||||
|
||||
@@ -3848,6 +3848,10 @@ describe("google-meet plugin", () => {
|
||||
success: true,
|
||||
audioBuffer: Buffer.from([1, 0, 2, 0]),
|
||||
sampleRate: 24_000,
|
||||
provider: "elevenlabs",
|
||||
providerModel: "eleven_multilingual_v2",
|
||||
providerVoice: "pMsXgVXv3BLzUgSXRplE",
|
||||
outputFormat: "pcm16",
|
||||
})),
|
||||
},
|
||||
agent: {
|
||||
@@ -3896,6 +3900,9 @@ describe("google-meet plugin", () => {
|
||||
text: "Use the Portugal launch data.",
|
||||
cfg: {},
|
||||
});
|
||||
expect(noopLogger.info).toHaveBeenCalledWith(
|
||||
"[google-meet] agent TTS: provider=elevenlabs model=eleven_multilingual_v2 voice=pMsXgVXv3BLzUgSXRplE outputFormat=pcm16 sampleRate=24000",
|
||||
);
|
||||
expect(Buffer.concat(outputStdinWrites)).toEqual(Buffer.from([1, 0, 2, 0]));
|
||||
expect(handle.getHealth()).toMatchObject({
|
||||
providerConnected: true,
|
||||
|
||||
@@ -32,6 +32,7 @@ import {
|
||||
convertGoogleMeetBridgeAudioForStt,
|
||||
convertGoogleMeetTtsAudioForBridge,
|
||||
formatGoogleMeetAgentAudioModelLog,
|
||||
formatGoogleMeetAgentTtsResultLog,
|
||||
formatGoogleMeetRealtimeVoiceModelLog,
|
||||
type GoogleMeetRealtimeEventEntry,
|
||||
type GoogleMeetRealtimeTranscriptEntry,
|
||||
@@ -184,6 +185,7 @@ export async function startNodeAgentAudioBridge(params: {
|
||||
if (!result.success || !result.audioBuffer || !result.sampleRate) {
|
||||
throw new Error(result.error ?? "TTS conversion failed");
|
||||
}
|
||||
params.logger.info(formatGoogleMeetAgentTtsResultLog("node agent", result));
|
||||
await pushOutputAudio(
|
||||
convertGoogleMeetTtsAudioForBridge(
|
||||
result.audioBuffer,
|
||||
|
||||
@@ -473,6 +473,29 @@ export function formatGoogleMeetAgentAudioModelLog(params: {
|
||||
].join(" ");
|
||||
}
|
||||
|
||||
type GoogleMeetTtsResultLogFields = {
|
||||
provider?: string;
|
||||
providerModel?: string;
|
||||
providerVoice?: string;
|
||||
outputFormat?: string;
|
||||
sampleRate?: number;
|
||||
fallbackFrom?: string;
|
||||
};
|
||||
|
||||
export function formatGoogleMeetAgentTtsResultLog(
|
||||
prefix: string,
|
||||
result: GoogleMeetTtsResultLogFields,
|
||||
): string {
|
||||
return [
|
||||
`[google-meet] ${prefix} TTS: provider=${formatLogValue(result.provider)}`,
|
||||
`model=${formatLogValue(result.providerModel)}`,
|
||||
`voice=${formatLogValue(result.providerVoice)}`,
|
||||
`outputFormat=${formatLogValue(result.outputFormat)}`,
|
||||
`sampleRate=${result.sampleRate ?? "unknown"}`,
|
||||
...(result.fallbackFrom ? [`fallbackFrom=${formatLogValue(result.fallbackFrom)}`] : []),
|
||||
].join(" ");
|
||||
}
|
||||
|
||||
function normalizeGoogleMeetTtsPromptText(text: string | undefined): string | undefined {
|
||||
const trimmed = text?.trim();
|
||||
if (!trimmed) {
|
||||
@@ -648,6 +671,7 @@ export async function startCommandAgentAudioBridge(params: {
|
||||
if (!result.success || !result.audioBuffer || !result.sampleRate) {
|
||||
throw new Error(result.error ?? "TTS conversion failed");
|
||||
}
|
||||
params.logger.info(formatGoogleMeetAgentTtsResultLog("agent", result));
|
||||
writeOutputAudio(
|
||||
convertGoogleMeetTtsAudioForBridge(
|
||||
result.audioBuffer,
|
||||
|
||||
@@ -625,6 +625,12 @@ describe("speech-core native voice-note routing", () => {
|
||||
tts: {
|
||||
enabled: true,
|
||||
provider: "mock",
|
||||
providers: {
|
||||
mock: {
|
||||
modelId: "telephony-model",
|
||||
voiceId: "default-voice",
|
||||
},
|
||||
},
|
||||
},
|
||||
},
|
||||
},
|
||||
@@ -638,6 +644,8 @@ describe("speech-core native voice-note routing", () => {
|
||||
});
|
||||
|
||||
expect(result.success).toBe(true);
|
||||
expect(result.providerModel).toBe("telephony-model");
|
||||
expect(result.providerVoice).toBe("directed-voice");
|
||||
expect(synthesizeTelephony).toHaveBeenCalledWith(
|
||||
expect.objectContaining({
|
||||
providerOverrides: {
|
||||
|
||||
@@ -123,6 +123,8 @@ export type TtsSynthesisResult = {
|
||||
error?: string;
|
||||
latencyMs?: number;
|
||||
provider?: string;
|
||||
providerModel?: string;
|
||||
providerVoice?: string;
|
||||
persona?: string;
|
||||
fallbackFrom?: string;
|
||||
attemptedProviders?: string[];
|
||||
@@ -139,6 +141,8 @@ export type TtsTelephonyResult = {
|
||||
error?: string;
|
||||
latencyMs?: number;
|
||||
provider?: string;
|
||||
providerModel?: string;
|
||||
providerVoice?: string;
|
||||
persona?: string;
|
||||
fallbackFrom?: string;
|
||||
attemptedProviders?: string[];
|
||||
@@ -1064,6 +1068,36 @@ function resolveTtsRequestSetup(params: {
|
||||
};
|
||||
}
|
||||
|
||||
function readTtsResultString(value: unknown): string | undefined {
|
||||
return typeof value === "string" && value.trim() ? value.trim() : undefined;
|
||||
}
|
||||
|
||||
function resolveTtsResultModel(
|
||||
providerConfig: SpeechProviderConfig,
|
||||
providerOverrides?: SpeechProviderOverrides,
|
||||
): string | undefined {
|
||||
return (
|
||||
readTtsResultString(providerOverrides?.modelId) ??
|
||||
readTtsResultString(providerOverrides?.model) ??
|
||||
readTtsResultString(providerConfig.modelId) ??
|
||||
readTtsResultString(providerConfig.model)
|
||||
);
|
||||
}
|
||||
|
||||
function resolveTtsResultVoice(
|
||||
providerConfig: SpeechProviderConfig,
|
||||
providerOverrides?: SpeechProviderOverrides,
|
||||
): string | undefined {
|
||||
return (
|
||||
readTtsResultString(providerOverrides?.voiceId) ??
|
||||
readTtsResultString(providerOverrides?.voiceName) ??
|
||||
readTtsResultString(providerOverrides?.voice) ??
|
||||
readTtsResultString(providerConfig.voiceId) ??
|
||||
readTtsResultString(providerConfig.voiceName) ??
|
||||
readTtsResultString(providerConfig.voice)
|
||||
);
|
||||
}
|
||||
|
||||
export async function textToSpeech(params: {
|
||||
text: string;
|
||||
cfg: OpenClawConfig;
|
||||
@@ -1271,6 +1305,8 @@ export async function synthesizeSpeech(params: {
|
||||
audioBuffer: synthesis.audioBuffer,
|
||||
latencyMs,
|
||||
provider,
|
||||
providerModel: resolveTtsResultModel(prepared.providerConfig, prepared.providerOverrides),
|
||||
providerVoice: resolveTtsResultVoice(prepared.providerConfig, prepared.providerOverrides),
|
||||
persona: persona?.id,
|
||||
fallbackFrom: provider !== primaryProvider ? primaryProvider : undefined,
|
||||
attemptedProviders,
|
||||
@@ -1401,6 +1437,8 @@ export async function textToSpeechTelephony(params: {
|
||||
audioBuffer: synthesis.audioBuffer,
|
||||
latencyMs,
|
||||
provider,
|
||||
providerModel: resolveTtsResultModel(prepared.providerConfig, prepared.providerOverrides),
|
||||
providerVoice: resolveTtsResultVoice(prepared.providerConfig, prepared.providerOverrides),
|
||||
persona: persona?.id,
|
||||
fallbackFrom: provider !== primaryProvider ? primaryProvider : undefined,
|
||||
attemptedProviders,
|
||||
|
||||
@@ -154,6 +154,8 @@ export type TtsSynthesisResult = {
|
||||
error?: string;
|
||||
latencyMs?: number;
|
||||
provider?: string;
|
||||
providerModel?: string;
|
||||
providerVoice?: string;
|
||||
persona?: string;
|
||||
fallbackFrom?: string;
|
||||
attemptedProviders?: string[];
|
||||
@@ -170,6 +172,8 @@ export type TtsTelephonyResult = {
|
||||
error?: string;
|
||||
latencyMs?: number;
|
||||
provider?: string;
|
||||
providerModel?: string;
|
||||
providerVoice?: string;
|
||||
persona?: string;
|
||||
fallbackFrom?: string;
|
||||
attemptedProviders?: string[];
|
||||
|
||||
Reference in New Issue
Block a user