diff --git a/docs/channels/discord.md b/docs/channels/discord.md index f8277b658a8..867b1fdb92a 100644 --- a/docs/channels/discord.md +++ b/docs/channels/discord.md @@ -1234,7 +1234,7 @@ Notes: - In `stt-tts` mode, STT uses `tools.media.audio`; `voice.model` does not affect transcription. - In realtime modes, `voice.realtime.provider`, `voice.realtime.model`, and `voice.realtime.voice` configure the realtime audio session. For OpenAI Realtime 2 plus the Codex brain, use `voice.realtime.model: "gpt-realtime-2"` and `voice.model: "openai-codex/gpt-5.5"`. - Realtime voice modes include small `IDENTITY.md`, `USER.md`, and `SOUL.md` profile files in the realtime provider instructions by default so fast direct turns keep the same identity, user grounding, and persona as the routed OpenClaw agent. Set `voice.realtime.bootstrapContextFiles` to a subset to customize this, or `[]` to disable it. The supported realtime bootstrap files are limited to those profile files; `AGENTS.md` stays in the normal agent context. The injected profile context does not replace `openclaw_agent_consult` for workspace work, current facts, memory lookup, or tool-backed actions. -- In OpenAI `agent-proxy` realtime mode, set `voice.realtime.requireWakeName: true` to keep Discord realtime voice silent until a transcript starts or ends with a wake name. Configured wake names must be one or two words. If `voice.realtime.wakeNames` is unset, OpenClaw uses the routed agent `name` plus `OpenClaw`, falling back to the agent id plus `OpenClaw`. Wake-name gating disables realtime provider auto-response and routes accepted turns through the OpenClaw agent consult path. +- In OpenAI `agent-proxy` realtime mode, set `voice.realtime.requireWakeName: true` to keep Discord realtime voice silent until a transcript starts or ends with a wake name. Configured wake names must be one or two words. 
If `voice.realtime.wakeNames` is unset, OpenClaw uses the routed agent `name` plus `OpenClaw`, falling back to the agent id plus `OpenClaw`. Wake-name gating disables realtime provider auto-response, routes accepted turns through the OpenClaw agent consult path, and gives a short spoken acknowledgement when a leading wake name is recognized from partial transcription before the final transcript arrives. - The OpenAI realtime provider accepts current Realtime 2 event names and legacy Codex-compatible aliases for output audio and transcript events, so compatible provider snapshots can drift without dropping assistant audio. - `voice.realtime.bargeIn` controls whether Discord speaker-start events interrupt active realtime playback. If unset, it follows the realtime provider's input-audio interruption setting. - `voice.realtime.minBargeInAudioEndMs` controls the minimum assistant playback duration before an OpenAI realtime barge-in truncates audio. Default: `250`. Set `0` for immediate interruption in low-echo rooms, or raise it for echo-heavy speaker setups. 
diff --git a/extensions/discord/src/voice/manager.e2e.test.ts b/extensions/discord/src/voice/manager.e2e.test.ts index 6413780f412..2793140fabe 100644 --- a/extensions/discord/src/voice/manager.e2e.test.ts +++ b/extensions/discord/src/voice/manager.e2e.test.ts @@ -2241,6 +2241,7 @@ describe("DiscordVoiceManager", () => { | undefined; expect(bridgeParams?.autoRespondToAudio).toBe(false); expect(bridgeParams?.instructions).toContain("same OpenClaw agent"); + expect(bridgeParams?.instructions).toContain("short natural backchannel"); expect(bridgeParams?.tools?.map((tool) => tool.name)).toContain("openclaw_agent_consult"); expect(bridgeParams?.tools?.map((tool) => tool.name)).toContain("openclaw_agent_control"); const player = getLastAudioPlayer(); @@ -2915,6 +2916,65 @@ describe("DiscordVoiceManager", () => { expect(lastAgentCommandArgs().message).not.toContain("Hey"); }); + it("acknowledges leading wake names from partial realtime transcripts", async () => { + agentCommandMock.mockResolvedValueOnce({ payloads: [{ text: "wake answer" }] }); + const manager = createManager( + { + groupPolicy: "open", + voice: { + enabled: true, + mode: "agent-proxy", + realtime: { provider: "openai", consultPolicy: "auto", requireWakeName: true }, + }, + }, + undefined, + { + agents: { + list: [{ id: "agent-1", identity: { name: "Molty" } }], + }, + }, + ); + + await manager.join({ guildId: "g1", channelId: "1001" }); + const entry = getSessionEntry(manager) as { + realtime?: { + beginSpeakerTurn: ( + context: { extraSystemPrompt?: string; senderIsOwner: boolean; speakerLabel: string }, + userId: string, + ) => { close: () => void; sendInputAudio: (audio: Buffer) => void }; + }; + }; + const bridgeParams = lastRealtimeBridgeParams() as + | { + onEvent?: (event: { direction: "server"; type: string }) => void; + onTranscript?: (role: "user" | "assistant", text: string, isFinal: boolean) => void; + } + | undefined; + + const ownerTurn = entry.realtime?.beginSpeakerTurn( + { 
extraSystemPrompt: undefined, senderIsOwner: true, speakerLabel: "Owner" }, + "u-owner", + ); + ownerTurn?.sendInputAudio(Buffer.alloc(8)); + bridgeParams?.onEvent?.({ direction: "server", type: "input_audio_buffer.speech_started" }); + bridgeParams?.onTranscript?.("user", "Hey, Molty", false); + + expectUserMessageIncludes('Answer: "Yeah."'); + expect(controlRealtimeVoiceAgentRunMock).not.toHaveBeenCalled(); + expect(agentCommandMock).not.toHaveBeenCalled(); + + bridgeParams?.onEvent?.({ direction: "server", type: "response.done" }); + bridgeParams?.onTranscript?.("user", "Hey, Molty, how is it going", true); + await new Promise((resolve) => setTimeout(resolve, 260)); + + expect(controlRealtimeVoiceAgentRunMock).toHaveBeenCalledWith({ + sessionKey: "discord:g1:c1", + text: "how is it going", + }); + expect(lastAgentCommandArgs().message).toContain("how is it going"); + expectUserMessageIncludes("wake answer"); + }); + it("reuses recently ignored speaker context when wake-name consult has no pending turn", async () => { agentCommandMock.mockResolvedValueOnce({ payloads: [{ text: "wake answer" }] }); const manager = createManager( @@ -3185,16 +3245,16 @@ describe("DiscordVoiceManager", () => { expect(agentCommandArgsAt(6).message).toContain("can you hear me too?"); expect(agentCommandArgsAt(6).message).not.toContain("Open Cloud"); - const trailingMultiTurn = entry.realtime?.beginSpeakerTurn( + const trailingMoltyTurn = entry.realtime?.beginSpeakerTurn( { extraSystemPrompt: undefined, senderIsOwner: true, speakerLabel: "Owner" }, "u-owner", ); - trailingMultiTurn?.sendInputAudio(Buffer.alloc(8)); - bridgeParams?.onTranscript?.("user", "Can you still hear trailing, Multi.", true); + trailingMoltyTurn?.sendInputAudio(Buffer.alloc(8)); + bridgeParams?.onTranscript?.("user", "Can you still hear trailing, Molty.", true); await new Promise((resolve) => setTimeout(resolve, 260)); expect(agentCommandArgsAt(7).message).toContain("Can you still hear trailing"); - 
expect(agentCommandArgsAt(7).message).not.toContain("Multi"); + expect(agentCommandArgsAt(7).message).not.toContain("Molty"); const openChatTurn = entry.realtime?.beginSpeakerTurn( { extraSystemPrompt: undefined, senderIsOwner: true, speakerLabel: "Owner" }, @@ -3264,6 +3324,14 @@ describe("DiscordVoiceManager", () => { bridgeParams?.onTranscript?.("user", "Open law is not the wake phrase.", true); await new Promise((resolve) => setTimeout(resolve, 260)); + const fuzzyTrailingTurn = entry.realtime?.beginSpeakerTurn( + { extraSystemPrompt: undefined, senderIsOwner: true, speakerLabel: "Owner" }, + "u-owner", + ); + fuzzyTrailingTurn?.sendInputAudio(Buffer.alloc(8)); + bridgeParams?.onTranscript?.("user", "I miss the nonsensical German ranting from Multy.", true); + await new Promise((resolve) => setTimeout(resolve, 260)); + expect(agentCommandMock).not.toHaveBeenCalled(); }); @@ -4400,6 +4468,7 @@ describe("DiscordVoiceManager", () => { | undefined; expect(bridgeParams?.instructions).toContain("OpenClaw realtime voice profile context"); expect(bridgeParams?.instructions).toContain("Name: Wilfred"); + expect(bridgeParams?.instructions).toContain("short natural backchannel"); expect(bridgeParams?.instructions).toContain("Call openclaw_agent_consult"); }); diff --git a/extensions/discord/src/voice/realtime.ts b/extensions/discord/src/voice/realtime.ts index dce9f4ee612..99a3a024b44 100644 --- a/extensions/discord/src/voice/realtime.ts +++ b/extensions/discord/src/voice/realtime.ts @@ -71,6 +71,8 @@ const DISCORD_REALTIME_FORCED_CONSULT_FALLBACK_DELAY_MS = 200; const DISCORD_REALTIME_DUPLICATE_ERROR_SUPPRESS_MS = 60_000; const DISCORD_REALTIME_CONTROL_SPEECH_DEDUPE_MS = 5_000; const DISCORD_REALTIME_OUTPUT_PLAYBACK_WATCHDOG_MARGIN_MS = 1_500; +const DISCORD_REALTIME_WAKE_ACKS = ["Yeah.", "Mm-hmm.", "Got it.", "One sec."]; +const DISCORD_REALTIME_PARTIAL_TRANSCRIPT_MAX_CHARS = 240; const REALTIME_PCM16_BYTES_PER_SAMPLE = 2; const DISCORD_RAW_PCM_FRAME_BYTES = 3_840; 
const DISCORD_REALTIME_OUTPUT_PREROLL_FRAMES = 25; @@ -314,6 +316,15 @@ function normalizeControlSpeechText(text: string): string { return text.toLowerCase().replace(/\s+/g, " ").trim(); } +/** Folds one partial user transcript into the running buffer: a snapshot that already starts with the buffer replaces it, anything else is appended as a delta; blank input leaves the buffer unchanged. */ function mergeRealtimePartialTranscript(previous: string, next: string): string { + const trimmed = next.trim(); + if (!trimmed) { + return previous; + } + const merged = trimmed.startsWith(previous) ? trimmed : `${previous}${next}`; + /* Cap from the front: partial wake-name handling only acts on a leading match, so the start of the utterance must survive truncation. Keeping the tail would let a mid-sentence name slide to the window start and would also break the startsWith snapshot check above. */ return merged.slice(0, DISCORD_REALTIME_PARTIAL_TRANSCRIPT_MAX_CHARS); +} + function resolveDiscordRealtimeWakeNames(params: { config: DiscordRealtimeVoiceConfig; cfg: OpenClawConfig; @@ -380,6 +391,9 @@ export class DiscordRealtimeVoiceSession implements VoiceRealtimeSession { private queuedExactSpeechMessages: string[] = []; private exactSpeechResponseActive = false; private exactSpeechAudioStarted = false; + private partialUserTranscript = ""; + private wakeNameAckedForTurn = false; + private wakeNameAckIndex = 0; private lastControlSpeech: | { normalizedText: string; sentAt: number; assistantTranscriptCount: number } | undefined; @@ -499,7 +513,11 @@ export class DiscordRealtimeVoiceSession implements VoiceRealtimeSession { if (isFinal && role === "assistant") { this.suppressDuplicateControlSpeech(text); } - if (!isFinal || role !== "user") { + if (role !== "user") { + return; + } + if (!isFinal) { + this.handlePartialUserTranscript(text); return; } void this.handleFinalUserTranscript(text, { usesRealtimeAgentHandoff }); @@ -507,6 +525,9 @@ export class DiscordRealtimeVoiceSession implements VoiceRealtimeSession { onToolCall: (event, session) => this.handleToolCall(event, session), onEvent: (event) => { const detail = event.detail ? 
` ${event.detail}` : ""; + if (event.direction === "server" && event.type === "input_audio_buffer.speech_started") { + this.resetPartialWakeNameTracking(); + } if (shouldLogRealtimeVerboseEvent(event)) { logVoiceVerbose(`realtime ${event.direction}:${event.type}${detail}`); } @@ -567,6 +588,7 @@ export class DiscordRealtimeVoiceSession implements VoiceRealtimeSession { this.queuedExactSpeechMessages = []; this.exactSpeechResponseActive = false; this.exactSpeechAudioStarted = false; + this.resetPartialWakeNameTracking(); this.clearOutputAudio("session-close"); this.bridge?.close(); this.bridge = null; @@ -600,6 +622,7 @@ export class DiscordRealtimeVoiceSession implements VoiceRealtimeSession { } beginSpeakerTurn(context: VoiceRealtimeSpeakerContext, userId: string): VoiceRealtimeSpeakerTurn { + this.resetPartialWakeNameTracking(); const turn: PendingSpeakerTurn = { context: { ...context, userId }, hasAudio: false, @@ -882,6 +905,25 @@ export class DiscordRealtimeVoiceSession implements VoiceRealtimeSession { this.bridge?.sendUserMessage(buildDiscordSpeakExactUserMessage(text)); } + private sendWakeNameAck(result: RealtimeVoiceActivationNameTranscriptResult): void { + if (!result.allowed || this.stopped || this.exactSpeechResponseActive) { + return; + } + if (this.hasInterruptibleOutputAudio()) { + logger.info( + `discord voice: realtime wake-name ack skipped outputActive=true voiceSession=${this.params.entry.voiceSessionKey} agent=${this.params.entry.route.agentId}`, + ); + return; + } + const ack = + DISCORD_REALTIME_WAKE_ACKS[this.wakeNameAckIndex % DISCORD_REALTIME_WAKE_ACKS.length]; + this.wakeNameAckIndex += 1; + logger.info( + `discord voice: realtime wake-name ack canonical=${result.activationName} heard=${result.heardName} match=${result.match} voiceSession=${this.params.entry.voiceSessionKey} agent=${this.params.entry.route.agentId}`, + ); + this.sendExactSpeechMessage(ack ?? 
"Yeah."); + } + private speakControlResult(text: string): void { const trimmed = text.trim(); if (this.stopped || !trimmed) { @@ -1151,6 +1193,7 @@ export class DiscordRealtimeVoiceSession implements VoiceRealtimeSession { if (!trimmed) { return; } + this.partialUserTranscript = ""; const meetingNotesTurn = this.peekPendingSpeakerTurn(); this.recordMeetingNotesUtterance(trimmed, meetingNotesTurn); const wakeNameResult = this.resolveWakeNameTranscript(trimmed); @@ -1200,6 +1243,27 @@ export class DiscordRealtimeVoiceSession implements VoiceRealtimeSession { this.talkback.enqueue(acceptedText, this.consumePendingSpeakerContext()); } + private handlePartialUserTranscript(text: string): void { + if (!this.requireWakeName || this.wakeNameAckedForTurn) { + return; + } + this.partialUserTranscript = mergeRealtimePartialTranscript(this.partialUserTranscript, text); + const wakeNameResult = matchRealtimeVoiceActivationName( + this.partialUserTranscript, + this.wakeNames, + ); + if (!wakeNameResult || wakeNameResult.edge !== "leading") { + return; + } + this.wakeNameAckedForTurn = true; + this.sendWakeNameAck(wakeNameResult); + } + + private resetPartialWakeNameTracking(): void { + this.partialUserTranscript = ""; + this.wakeNameAckedForTurn = false; + } + private resolveWakeNameTranscript(text: string): RealtimeVoiceActivationNameTranscriptResult { if (!this.requireWakeName) { return { @@ -1672,6 +1736,7 @@ function buildDiscordRealtimeInstructions(params: { "Delegate substantive requests, actions, tool work, current facts, memory, workspace context, and user-specific context with openclaw_agent_consult.", "Do not block, refuse, or downscope at the voice layer. 
Delegate to OpenClaw and treat its result as authoritative.", "Answer directly only for greetings, acknowledgements, brief latency tests, or filler while waiting.", + 'While waiting for OpenClaw data or tool results, use at most one short natural backchannel such as "yeah", "mm-hmm", "got it", or "one sec"; vary it and do not treat it as the final answer.', "When OpenClaw sends an internal exact answer to speak, do not call tools. Say only that answer.", buildRealtimeVoiceAgentConsultPolicyInstructions({ toolPolicy: params.toolPolicy, @@ -1682,6 +1747,7 @@ function buildDiscordRealtimeInstructions(params: { return [ base, params.bootstrapContextInstructions?.trim(), + 'While waiting for OpenClaw data or tool results, use at most one short natural backchannel such as "yeah", "mm-hmm", "got it", or "one sec"; vary it and do not treat it as the final answer.', buildRealtimeVoiceAgentConsultPolicyInstructions({ toolPolicy: params.toolPolicy, consultPolicy: params.consultPolicy, diff --git a/src/talk/activation-name.test.ts b/src/talk/activation-name.test.ts index d90792389bc..b4ebc846a87 100644 --- a/src/talk/activation-name.test.ts +++ b/src/talk/activation-name.test.ts @@ -50,6 +50,14 @@ describe("realtime voice activation names", () => { }); }); + it("does not accept fuzzy trailing matches in ambient speech", () => { + expect( + matchRealtimeVoiceActivationName("I miss the nonsensical German ranting from Multy.", [ + "molty", + ]), + ).toBeUndefined(); + }); + it("does not fuzzy match inside a larger phrase without an edge boundary", () => { expect(matchRealtimeVoiceActivationName("maltiness is not a wake name", ["molty"])).toBe( undefined, diff --git a/src/talk/activation-name.ts b/src/talk/activation-name.ts index 66244176be3..aa15cae09fc 100644 --- a/src/talk/activation-name.ts +++ b/src/talk/activation-name.ts @@ -88,16 +88,16 @@ export function matchRealtimeVoiceActivationName( } const heardCompact = compactActivationName(candidate.heardName); const 
activationCompact = compactActivationName(normalizedActivationName); - if ( - heardCompact === activationCompact || - isFuzzyActivationNameMatch(candidate, activationName) - ) { + const exactMatch = heardCompact === activationCompact; + const fuzzyMatch = + candidate.edge === "leading" && isFuzzyActivationNameMatch(candidate, activationName); + if (exactMatch || fuzzyMatch) { return { allowed: true, text: stripEdgeActivationNameCandidate(text, candidate), activationName, heardName: candidate.heardName, - match: heardCompact === activationCompact ? "exact" : "fuzzy", + match: exactMatch ? "exact" : "fuzzy", edge: candidate.edge, }; }