mirror of
https://github.com/openclaw/openclaw.git
synced 2026-05-06 07:50:43 +00:00
fix: log google meet agent tts backend
This commit is contained in:
@@ -53,6 +53,7 @@ Docs: https://docs.openclaw.ai
|
||||
|
||||
- Agents/Pi: suppress persistence for synthetic mid-turn overflow continuation prompts, so transcript-retry recovery does not write the "continue from transcript" prompt as a new user turn. Thanks @vincentkoc.
|
||||
- Exec approvals: detect `env -S` split-string command-carrier risks when `-S`/`-s` is combined with other env short options, so approval explanations do not miss split payloads hidden behind `env -iS...`. Thanks @vincentkoc.
|
||||
- Google Meet: log the concrete agent-mode TTS provider, model, voice, output format, and sample rate after speech synthesis, so Meet logs show which voice backend spoke each reply.
|
||||
- Voice Call: mark realtime calls completed when the realtime provider closes normally, so Twilio/OpenAI/Google realtime stop events do not leave active call records behind. Thanks @vincentkoc.
|
||||
- Exec approvals: treat POSIX `exec` as a command carrier for inline eval, shell-wrapper, and eval/source detection, so approval explanations and command-risk checks do not miss payloads hidden behind `exec`. Thanks @vincentkoc.
|
||||
- Google Meet: log the resolved audio provider model when starting Chrome and paired-node Meet talk-back bridges, so agent-mode joins show the STT model and bidi joins show the realtime voice model.
|
||||
|
||||
@@ -1175,6 +1175,9 @@ agent produces the answer, and regular OpenClaw TTS speaks it into Meet. Use
|
||||
`mode: "bidi"` when you want the realtime voice model to answer directly.
Raw `mode: "realtime"` remains accepted as a legacy compatibility alias for
`mode: "agent"`, but it is no longer advertised in the agent tool schema.
Agent-mode logs include the resolved transcription provider/model at bridge
startup, and the TTS provider, model, voice, output format, and sample rate after
each synthesized reply.
|
||||
|
||||
Use `action: "status"` to list active sessions or inspect a session ID. Use
|
||||
`action: "speak"` with `sessionId` and `message` to make the realtime agent
|
||||
|
||||
@@ -3848,6 +3848,10 @@ describe("google-meet plugin", () => {
|
||||
success: true,
|
||||
audioBuffer: Buffer.from([1, 0, 2, 0]),
|
||||
sampleRate: 24_000,
|
||||
provider: "elevenlabs",
|
||||
providerModel: "eleven_multilingual_v2",
|
||||
providerVoice: "pMsXgVXv3BLzUgSXRplE",
|
||||
outputFormat: "pcm16",
|
||||
})),
|
||||
},
|
||||
agent: {
|
||||
@@ -3896,6 +3900,9 @@ describe("google-meet plugin", () => {
|
||||
text: "Use the Portugal launch data.",
|
||||
cfg: {},
|
||||
});
|
||||
expect(noopLogger.info).toHaveBeenCalledWith(
|
||||
"[google-meet] agent TTS: provider=elevenlabs model=eleven_multilingual_v2 voice=pMsXgVXv3BLzUgSXRplE outputFormat=pcm16 sampleRate=24000",
|
||||
);
|
||||
expect(Buffer.concat(outputStdinWrites)).toEqual(Buffer.from([1, 0, 2, 0]));
|
||||
expect(handle.getHealth()).toMatchObject({
|
||||
providerConnected: true,
|
||||
|
||||
@@ -32,6 +32,7 @@ import {
|
||||
convertGoogleMeetBridgeAudioForStt,
|
||||
convertGoogleMeetTtsAudioForBridge,
|
||||
formatGoogleMeetAgentAudioModelLog,
|
||||
formatGoogleMeetAgentTtsResultLog,
|
||||
formatGoogleMeetRealtimeVoiceModelLog,
|
||||
type GoogleMeetRealtimeEventEntry,
|
||||
type GoogleMeetRealtimeTranscriptEntry,
|
||||
@@ -184,6 +185,7 @@ export async function startNodeAgentAudioBridge(params: {
|
||||
if (!result.success || !result.audioBuffer || !result.sampleRate) {
|
||||
throw new Error(result.error ?? "TTS conversion failed");
|
||||
}
|
||||
params.logger.info(formatGoogleMeetAgentTtsResultLog("node agent", result));
|
||||
await pushOutputAudio(
|
||||
convertGoogleMeetTtsAudioForBridge(
|
||||
result.audioBuffer,
|
||||
|
||||
@@ -473,6 +473,29 @@ export function formatGoogleMeetAgentAudioModelLog(params: {
|
||||
].join(" ");
|
||||
}
|
||||
|
||||
type GoogleMeetTtsResultLogFields = {
|
||||
provider?: string;
|
||||
providerModel?: string;
|
||||
providerVoice?: string;
|
||||
outputFormat?: string;
|
||||
sampleRate?: number;
|
||||
fallbackFrom?: string;
|
||||
};
|
||||
|
||||
export function formatGoogleMeetAgentTtsResultLog(
|
||||
prefix: string,
|
||||
result: GoogleMeetTtsResultLogFields,
|
||||
): string {
|
||||
return [
|
||||
`[google-meet] ${prefix} TTS: provider=${formatLogValue(result.provider)}`,
|
||||
`model=${formatLogValue(result.providerModel)}`,
|
||||
`voice=${formatLogValue(result.providerVoice)}`,
|
||||
`outputFormat=${formatLogValue(result.outputFormat)}`,
|
||||
`sampleRate=${result.sampleRate ?? "unknown"}`,
|
||||
...(result.fallbackFrom ? [`fallbackFrom=${formatLogValue(result.fallbackFrom)}`] : []),
|
||||
].join(" ");
|
||||
}
|
||||
|
||||
function normalizeGoogleMeetTtsPromptText(text: string | undefined): string | undefined {
|
||||
const trimmed = text?.trim();
|
||||
if (!trimmed) {
|
||||
@@ -648,6 +671,7 @@ export async function startCommandAgentAudioBridge(params: {
|
||||
if (!result.success || !result.audioBuffer || !result.sampleRate) {
|
||||
throw new Error(result.error ?? "TTS conversion failed");
|
||||
}
|
||||
params.logger.info(formatGoogleMeetAgentTtsResultLog("agent", result));
|
||||
writeOutputAudio(
|
||||
convertGoogleMeetTtsAudioForBridge(
|
||||
result.audioBuffer,
|
||||
|
||||
@@ -625,6 +625,12 @@ describe("speech-core native voice-note routing", () => {
|
||||
tts: {
|
||||
enabled: true,
|
||||
provider: "mock",
|
||||
providers: {
|
||||
mock: {
|
||||
modelId: "telephony-model",
|
||||
voiceId: "default-voice",
|
||||
},
|
||||
},
|
||||
},
|
||||
},
|
||||
},
|
||||
@@ -638,6 +644,8 @@ describe("speech-core native voice-note routing", () => {
|
||||
});
|
||||
|
||||
expect(result.success).toBe(true);
|
||||
expect(result.providerModel).toBe("telephony-model");
|
||||
expect(result.providerVoice).toBe("directed-voice");
|
||||
expect(synthesizeTelephony).toHaveBeenCalledWith(
|
||||
expect.objectContaining({
|
||||
providerOverrides: {
|
||||
|
||||
@@ -123,6 +123,8 @@ export type TtsSynthesisResult = {
|
||||
error?: string;
|
||||
latencyMs?: number;
|
||||
provider?: string;
|
||||
providerModel?: string;
|
||||
providerVoice?: string;
|
||||
persona?: string;
|
||||
fallbackFrom?: string;
|
||||
attemptedProviders?: string[];
|
||||
@@ -139,6 +141,8 @@ export type TtsTelephonyResult = {
|
||||
error?: string;
|
||||
latencyMs?: number;
|
||||
provider?: string;
|
||||
providerModel?: string;
|
||||
providerVoice?: string;
|
||||
persona?: string;
|
||||
fallbackFrom?: string;
|
||||
attemptedProviders?: string[];
|
||||
@@ -1064,6 +1068,36 @@ function resolveTtsRequestSetup(params: {
|
||||
};
|
||||
}
|
||||
|
||||
function readTtsResultString(value: unknown): string | undefined {
|
||||
return typeof value === "string" && value.trim() ? value.trim() : undefined;
|
||||
}
|
||||
|
||||
function resolveTtsResultModel(
|
||||
providerConfig: SpeechProviderConfig,
|
||||
providerOverrides?: SpeechProviderOverrides,
|
||||
): string | undefined {
|
||||
return (
|
||||
readTtsResultString(providerOverrides?.modelId) ??
|
||||
readTtsResultString(providerOverrides?.model) ??
|
||||
readTtsResultString(providerConfig.modelId) ??
|
||||
readTtsResultString(providerConfig.model)
|
||||
);
|
||||
}
|
||||
|
||||
function resolveTtsResultVoice(
|
||||
providerConfig: SpeechProviderConfig,
|
||||
providerOverrides?: SpeechProviderOverrides,
|
||||
): string | undefined {
|
||||
return (
|
||||
readTtsResultString(providerOverrides?.voiceId) ??
|
||||
readTtsResultString(providerOverrides?.voiceName) ??
|
||||
readTtsResultString(providerOverrides?.voice) ??
|
||||
readTtsResultString(providerConfig.voiceId) ??
|
||||
readTtsResultString(providerConfig.voiceName) ??
|
||||
readTtsResultString(providerConfig.voice)
|
||||
);
|
||||
}
|
||||
|
||||
export async function textToSpeech(params: {
|
||||
text: string;
|
||||
cfg: OpenClawConfig;
|
||||
@@ -1271,6 +1305,8 @@ export async function synthesizeSpeech(params: {
|
||||
audioBuffer: synthesis.audioBuffer,
|
||||
latencyMs,
|
||||
provider,
|
||||
providerModel: resolveTtsResultModel(prepared.providerConfig, prepared.providerOverrides),
|
||||
providerVoice: resolveTtsResultVoice(prepared.providerConfig, prepared.providerOverrides),
|
||||
persona: persona?.id,
|
||||
fallbackFrom: provider !== primaryProvider ? primaryProvider : undefined,
|
||||
attemptedProviders,
|
||||
@@ -1401,6 +1437,8 @@ export async function textToSpeechTelephony(params: {
|
||||
audioBuffer: synthesis.audioBuffer,
|
||||
latencyMs,
|
||||
provider,
|
||||
providerModel: resolveTtsResultModel(prepared.providerConfig, prepared.providerOverrides),
|
||||
providerVoice: resolveTtsResultVoice(prepared.providerConfig, prepared.providerOverrides),
|
||||
persona: persona?.id,
|
||||
fallbackFrom: provider !== primaryProvider ? primaryProvider : undefined,
|
||||
attemptedProviders,
|
||||
|
||||
@@ -154,6 +154,8 @@ export type TtsSynthesisResult = {
|
||||
error?: string;
|
||||
latencyMs?: number;
|
||||
provider?: string;
|
||||
providerModel?: string;
|
||||
providerVoice?: string;
|
||||
persona?: string;
|
||||
fallbackFrom?: string;
|
||||
attemptedProviders?: string[];
|
||||
@@ -170,6 +172,8 @@ export type TtsTelephonyResult = {
|
||||
error?: string;
|
||||
latencyMs?: number;
|
||||
provider?: string;
|
||||
providerModel?: string;
|
||||
providerVoice?: string;
|
||||
persona?: string;
|
||||
fallbackFrom?: string;
|
||||
attemptedProviders?: string[];
|
||||
|
||||
Reference in New Issue
Block a user