From 2a31dae5e88a99e194a4d6fc8beb229bd68d8b4e Mon Sep 17 00:00:00 2001 From: Peter Steinberger Date: Sat, 9 May 2026 23:22:07 +0100 Subject: [PATCH] fix(discord): sync realtime voice playback timestamps --- CHANGELOG.md | 1 + .../discord/src/voice/manager.e2e.test.ts | 6 +++++ extensions/discord/src/voice/realtime.ts | 22 +++++++++++++++++++ 3 files changed, 29 insertions(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index 385aa4a74b2..38bfaadc84b 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -10,6 +10,7 @@ Docs: https://docs.openclaw.ai ### Fixes +- Discord/voice: synthesize realtime playback timestamps from emitted Discord PCM so OpenAI realtime barge-in truncation no longer sees `audioEndMs=0` and skips legitimate interruptions. - Plugin SDK: keep activated linked plugin runtime facades loadable when bundled plugin fallback is disabled. Thanks @shakkernerd. - Feishu: auto-thread `message(action="send")` replies inside the topic when the active session is group_topic or group_topic_sender, and propagate `replyInThread` through text, card, and media outbound adapters so topic-scoped sessions no longer post at the group root. Fixes #74903. (#77151) Thanks @ai-hpc. - WhatsApp: pass routing context into voice-note transcript echo preflight so echoed transcripts can deliver to the originating chat. Fixes #79778. (#79788) Thanks @hclsys. diff --git a/extensions/discord/src/voice/manager.e2e.test.ts b/extensions/discord/src/voice/manager.e2e.test.ts index 54420a11c85..0efc08dee50 100644 --- a/extensions/discord/src/voice/manager.e2e.test.ts +++ b/extensions/discord/src/voice/manager.e2e.test.ts @@ -287,6 +287,7 @@ describe("DiscordVoiceManager", () => { realtimeSessionMock.sendAudio.mockClear(); realtimeSessionMock.sendUserMessage.mockClear(); realtimeSessionMock.handleBargeIn.mockClear(); + realtimeSessionMock.setMediaTimestamp.mockClear(); realtimeSessionMock.submitToolResult.mockClear(); createRealtimeVoiceBridgeSessionMock.mockClear(); createRealtimeVoiceBridgeSessionMock.mockReturnValue(realtimeSessionMock); @@ -641,7 +642,12 @@ describe("DiscordVoiceManager", () => { bridgeParams?.audioSink?.sendAudio(Buffer.alloc(480)); turn?.sendInputAudio(Buffer.alloc(3840)); + expect(realtimeSessionMock.setMediaTimestamp).toHaveBeenCalledWith(0); + expect(realtimeSessionMock.setMediaTimestamp).toHaveBeenCalledWith(10); expect(realtimeSessionMock.handleBargeIn).toHaveBeenCalled(); + const lastTimestampCall = realtimeSessionMock.setMediaTimestamp.mock.invocationCallOrder.at(-1); + const firstBargeInCall = realtimeSessionMock.handleBargeIn.mock.invocationCallOrder[0]; + expect(lastTimestampCall).toBeLessThan(firstBargeInCall); expect(player.stop).not.toHaveBeenCalled(); expect(realtimeSessionMock.sendAudio).toHaveBeenCalled(); }); diff --git a/extensions/discord/src/voice/realtime.ts b/extensions/discord/src/voice/realtime.ts index d03c41d46e0..b0d60c03e4b 100644 --- a/extensions/discord/src/voice/realtime.ts +++ b/extensions/discord/src/voice/realtime.ts @@ -43,6 +43,7 @@ const DISCORD_REALTIME_PENDING_SPEAKER_CONTEXT_LIMIT = 32; const DISCORD_REALTIME_LOG_PREVIEW_CHARS = 500; const DISCORD_REALTIME_DEFAULT_MIN_BARGE_IN_AUDIO_END_MS = 250; const DISCORD_REALTIME_FORCED_CONSULT_FALLBACK_DELAY_MS = 200; +const REALTIME_PCM16_BYTES_PER_SAMPLE = 2; export type DiscordVoiceMode = "stt-tts" | "agent-proxy" | "bidi"; @@ -245,6 +246,7 @@ export class DiscordRealtimeVoiceSession implements VoiceRealtimeSession { private consultPolicy: "auto" | "always" = "auto"; private pendingAgentProxyConsultContexts: PendingAgentProxyConsultContext[] = []; private readonly pendingSpeakerTurns: PendingSpeakerTurn[] = []; + private outputAudioTimestampMs = 0; private readonly playerIdleHandler = () => { this.resetOutputStream(); }; @@ -430,6 +432,7 @@ export class DiscordRealtimeVoiceSession implements VoiceRealtimeSession { if (!this.isBargeInEnabled()) { return; } + this.syncOutputAudioTimestamp(); this.bridge?.handleBargeIn({ audioPlaybackActive: true }); } @@ -450,8 +453,13 @@ export class DiscordRealtimeVoiceSession implements VoiceRealtimeSession { if (discordPcm.length === 0) { return; } + this.syncOutputAudioTimestamp(); const stream = this.ensureOutputStream(); stream.write(discordPcm); + this.outputAudioTimestampMs += pcm16MonoDurationMs( + realtimePcm24kMono, + REALTIME_VOICE_AUDIO_FORMAT_PCM16_24KHZ.sampleRateHz, + ); } private ensureOutputStream(): PassThrough { @@ -485,10 +493,15 @@ export class DiscordRealtimeVoiceSession implements VoiceRealtimeSession { private resetOutputStream(): void { const stream = this.outputStream; this.outputStream = null; + this.outputAudioTimestampMs = 0; stream?.end(); stream?.destroy(); } + private syncOutputAudioTimestamp(): void { + this.bridge?.setMediaTimestamp(Math.floor(this.outputAudioTimestampMs)); + } + private handleToolCall( event: RealtimeVoiceToolCallEvent, session: RealtimeVoiceBridgeSession, @@ -640,6 +653,7 @@ export class DiscordRealtimeVoiceSession implements VoiceRealtimeSession { logger.info( `discord voice: realtime forced agent consult starting chars=${question.length} voiceSession=${this.params.entry.voiceSessionKey} supervisorSession=${this.params.entry.route.sessionKey} agent=${this.params.entry.route.agentId} speaker=${context.speakerLabel} owner=${context.senderIsOwner}`, ); + this.syncOutputAudioTimestamp(); this.bridge?.handleBargeIn({ audioPlaybackActive: true, force: true }); this.clearOutputAudio(); try { @@ -718,6 +732,14 @@ function isDiscordRealtimeSpeakerContext(value: unknown): value is DiscordRealti ); } +function pcm16MonoDurationMs(audio: Buffer, sampleRate: number): number { + if (audio.length === 0 || sampleRate <= 0) { + return 0; + } + const samples = audio.length / REALTIME_PCM16_BYTES_PER_SAMPLE; + return (samples * 1000) / sampleRate; +} + function buildProviderConfigs( realtimeConfig: DiscordRealtimeVoiceConfig, ): Record | undefined {