fix: log google meet agent tts backend

This commit is contained in:
Peter Steinberger
2026-05-04 06:40:59 +01:00
parent 47134d1ce6
commit cbd91676ac
8 changed files with 87 additions and 0 deletions

View File

@@ -53,6 +53,7 @@ Docs: https://docs.openclaw.ai
- Agents/Pi: suppress persistence for synthetic mid-turn overflow continuation prompts, so transcript-retry recovery does not write the "continue from transcript" prompt as a new user turn. Thanks @vincentkoc.
- Exec approvals: detect `env -S` split-string command-carrier risks when `-S`/`-s` is combined with other env short options, so approval explanations do not miss split payloads hidden behind `env -iS...`. Thanks @vincentkoc.
- Google Meet: log the concrete agent-mode TTS provider, model, voice, output format, and sample rate after speech synthesis, so Meet logs show which voice backend spoke each reply.
- Voice Call: mark realtime calls completed when the realtime provider closes normally, so Twilio/OpenAI/Google realtime stop events do not leave active call records behind. Thanks @vincentkoc.
- Exec approvals: treat POSIX `exec` as a command carrier for inline eval, shell-wrapper, and eval/source detection, so approval explanations and command-risk checks do not miss payloads hidden behind `exec`. Thanks @vincentkoc.
- Google Meet: log the resolved audio provider model when starting Chrome and paired-node Meet talk-back bridges, so agent-mode joins show the STT model and bidi joins show the realtime voice model.

View File

@@ -1175,6 +1175,9 @@ agent produces the answer, and regular OpenClaw TTS speaks it into Meet. Use
`mode: "bidi"` when you want the realtime voice model to answer directly.
Raw `mode: "realtime"` remains accepted as a legacy compatibility alias for
`mode: "agent"`, but it is no longer advertised in the agent tool schema.
Agent-mode logs include the resolved transcription provider/model at bridge
startup and the TTS provider, model, voice, output format, and sample rate after
each synthesized reply.
Use `action: "status"` to list active sessions or inspect a session ID. Use
`action: "speak"` with `sessionId` and `message` to make the realtime agent

View File

@@ -3848,6 +3848,10 @@ describe("google-meet plugin", () => {
success: true,
audioBuffer: Buffer.from([1, 0, 2, 0]),
sampleRate: 24_000,
provider: "elevenlabs",
providerModel: "eleven_multilingual_v2",
providerVoice: "pMsXgVXv3BLzUgSXRplE",
outputFormat: "pcm16",
})),
},
agent: {
@@ -3896,6 +3900,9 @@ describe("google-meet plugin", () => {
text: "Use the Portugal launch data.",
cfg: {},
});
expect(noopLogger.info).toHaveBeenCalledWith(
"[google-meet] agent TTS: provider=elevenlabs model=eleven_multilingual_v2 voice=pMsXgVXv3BLzUgSXRplE outputFormat=pcm16 sampleRate=24000",
);
expect(Buffer.concat(outputStdinWrites)).toEqual(Buffer.from([1, 0, 2, 0]));
expect(handle.getHealth()).toMatchObject({
providerConnected: true,

View File

@@ -32,6 +32,7 @@ import {
convertGoogleMeetBridgeAudioForStt,
convertGoogleMeetTtsAudioForBridge,
formatGoogleMeetAgentAudioModelLog,
formatGoogleMeetAgentTtsResultLog,
formatGoogleMeetRealtimeVoiceModelLog,
type GoogleMeetRealtimeEventEntry,
type GoogleMeetRealtimeTranscriptEntry,
@@ -184,6 +185,7 @@ export async function startNodeAgentAudioBridge(params: {
if (!result.success || !result.audioBuffer || !result.sampleRate) {
throw new Error(result.error ?? "TTS conversion failed");
}
params.logger.info(formatGoogleMeetAgentTtsResultLog("node agent", result));
await pushOutputAudio(
convertGoogleMeetTtsAudioForBridge(
result.audioBuffer,

View File

@@ -473,6 +473,29 @@ export function formatGoogleMeetAgentAudioModelLog(params: {
].join(" ");
}
type GoogleMeetTtsResultLogFields = {
  provider?: string;
  providerModel?: string;
  providerVoice?: string;
  outputFormat?: string;
  sampleRate?: number;
  fallbackFrom?: string;
};
/**
 * Builds the one-line log entry emitted after a Meet agent-mode TTS synthesis,
 * e.g. `[google-meet] agent TTS: provider=... model=... voice=... outputFormat=... sampleRate=...`.
 * Missing string fields are rendered via `formatLogValue`; a missing sampleRate
 * prints as `unknown`; `fallbackFrom=` is appended only when a fallback occurred.
 */
export function formatGoogleMeetAgentTtsResultLog(
  prefix: string,
  result: GoogleMeetTtsResultLogFields,
): string {
  const segments = [
    `[google-meet] ${prefix} TTS: provider=${formatLogValue(result.provider)}`,
    `model=${formatLogValue(result.providerModel)}`,
    `voice=${formatLogValue(result.providerVoice)}`,
    `outputFormat=${formatLogValue(result.outputFormat)}`,
    `sampleRate=${result.sampleRate ?? "unknown"}`,
  ];
  if (result.fallbackFrom) {
    segments.push(`fallbackFrom=${formatLogValue(result.fallbackFrom)}`);
  }
  return segments.join(" ");
}
function normalizeGoogleMeetTtsPromptText(text: string | undefined): string | undefined {
const trimmed = text?.trim();
if (!trimmed) {
@@ -648,6 +671,7 @@ export async function startCommandAgentAudioBridge(params: {
if (!result.success || !result.audioBuffer || !result.sampleRate) {
throw new Error(result.error ?? "TTS conversion failed");
}
params.logger.info(formatGoogleMeetAgentTtsResultLog("agent", result));
writeOutputAudio(
convertGoogleMeetTtsAudioForBridge(
result.audioBuffer,

View File

@@ -625,6 +625,12 @@ describe("speech-core native voice-note routing", () => {
tts: {
enabled: true,
provider: "mock",
providers: {
mock: {
modelId: "telephony-model",
voiceId: "default-voice",
},
},
},
},
},
@@ -638,6 +644,8 @@ describe("speech-core native voice-note routing", () => {
});
expect(result.success).toBe(true);
expect(result.providerModel).toBe("telephony-model");
expect(result.providerVoice).toBe("directed-voice");
expect(synthesizeTelephony).toHaveBeenCalledWith(
expect.objectContaining({
providerOverrides: {

View File

@@ -123,6 +123,8 @@ export type TtsSynthesisResult = {
error?: string;
latencyMs?: number;
provider?: string;
providerModel?: string;
providerVoice?: string;
persona?: string;
fallbackFrom?: string;
attemptedProviders?: string[];
@@ -139,6 +141,8 @@ export type TtsTelephonyResult = {
error?: string;
latencyMs?: number;
provider?: string;
providerModel?: string;
providerVoice?: string;
persona?: string;
fallbackFrom?: string;
attemptedProviders?: string[];
@@ -1064,6 +1068,36 @@ function resolveTtsRequestSetup(params: {
};
}
/** Returns the trimmed string when `value` is a non-blank string, otherwise undefined. */
function readTtsResultString(value: unknown): string | undefined {
  if (typeof value !== "string") {
    return undefined;
  }
  const trimmed = value.trim();
  return trimmed ? trimmed : undefined;
}
/**
 * Resolves the effective TTS model id for result reporting: per-request
 * overrides win over provider config, and `modelId` wins over `model`.
 */
function resolveTtsResultModel(
  providerConfig: SpeechProviderConfig,
  providerOverrides?: SpeechProviderOverrides,
): string | undefined {
  const candidates = [
    providerOverrides?.modelId,
    providerOverrides?.model,
    providerConfig.modelId,
    providerConfig.model,
  ];
  for (const candidate of candidates) {
    const resolved = readTtsResultString(candidate);
    if (resolved !== undefined) {
      return resolved;
    }
  }
  return undefined;
}
/**
 * Resolves the effective TTS voice for result reporting: per-request overrides
 * win over provider config, checking `voiceId`, then `voiceName`, then `voice`.
 */
function resolveTtsResultVoice(
  providerConfig: SpeechProviderConfig,
  providerOverrides?: SpeechProviderOverrides,
): string | undefined {
  const candidates = [
    providerOverrides?.voiceId,
    providerOverrides?.voiceName,
    providerOverrides?.voice,
    providerConfig.voiceId,
    providerConfig.voiceName,
    providerConfig.voice,
  ];
  for (const candidate of candidates) {
    const resolved = readTtsResultString(candidate);
    if (resolved !== undefined) {
      return resolved;
    }
  }
  return undefined;
}
export async function textToSpeech(params: {
text: string;
cfg: OpenClawConfig;
@@ -1271,6 +1305,8 @@ export async function synthesizeSpeech(params: {
audioBuffer: synthesis.audioBuffer,
latencyMs,
provider,
providerModel: resolveTtsResultModel(prepared.providerConfig, prepared.providerOverrides),
providerVoice: resolveTtsResultVoice(prepared.providerConfig, prepared.providerOverrides),
persona: persona?.id,
fallbackFrom: provider !== primaryProvider ? primaryProvider : undefined,
attemptedProviders,
@@ -1401,6 +1437,8 @@ export async function textToSpeechTelephony(params: {
audioBuffer: synthesis.audioBuffer,
latencyMs,
provider,
providerModel: resolveTtsResultModel(prepared.providerConfig, prepared.providerOverrides),
providerVoice: resolveTtsResultVoice(prepared.providerConfig, prepared.providerOverrides),
persona: persona?.id,
fallbackFrom: provider !== primaryProvider ? primaryProvider : undefined,
attemptedProviders,

View File

@@ -154,6 +154,8 @@ export type TtsSynthesisResult = {
error?: string;
latencyMs?: number;
provider?: string;
providerModel?: string;
providerVoice?: string;
persona?: string;
fallbackFrom?: string;
attemptedProviders?: string[];
@@ -170,6 +172,8 @@ export type TtsTelephonyResult = {
error?: string;
latencyMs?: number;
provider?: string;
providerModel?: string;
providerVoice?: string;
persona?: string;
fallbackFrom?: string;
attemptedProviders?: string[];