From 98d593956493a2e942e17e102d266ba9631bcf48 Mon Sep 17 00:00:00 2001
From: Ayaan Zaidi
Date: Sat, 4 Apr 2026 22:03:54 +0530
Subject: [PATCH] feat(android): add talk.speak playback path

---
Review notes (free-form area; not part of the commit message):
- This patch text was re-flowed from a whitespace-collapsed copy. Indentation,
  blank context lines and generic type arguments that had been stripped
  (CompletableDeferred<Unit>, decodeFromString<TalkSpeakResponse>,
  StateFlow<Boolean>, ConcurrentHashMap<String, CompletableDeferred<RpcResponse>>)
  are restored by inference; verify context lines against the tree before applying.
  The author e-mail was not recoverable.
- TalkAudioPlayer.kt and TalkSpeakClient.kt are amended relative to the original
  commit (cancellation propagation, MediaPlayer/temp-file cleanup, PCM stop race,
  blank file-extension fallback, KDoc). Their `index` blob ids still name the
  original content.

 .../ai/openclaw/app/gateway/GatewaySession.kt |  15 +-
 .../ai/openclaw/app/voice/TalkAudioPlayer.kt  | 275 ++++++++++++++++++
 .../ai/openclaw/app/voice/TalkModeManager.kt  |  30 ++-
 .../ai/openclaw/app/voice/TalkSpeakClient.kt  | 165 +++++++++++
 4 files changed, 477 insertions(+), 8 deletions(-)
 create mode 100644 apps/android/app/src/main/java/ai/openclaw/app/voice/TalkAudioPlayer.kt
 create mode 100644 apps/android/app/src/main/java/ai/openclaw/app/voice/TalkSpeakClient.kt

diff --git a/apps/android/app/src/main/java/ai/openclaw/app/gateway/GatewaySession.kt b/apps/android/app/src/main/java/ai/openclaw/app/gateway/GatewaySession.kt
index c52712b8ff0..8d437da5e3c 100644
--- a/apps/android/app/src/main/java/ai/openclaw/app/gateway/GatewaySession.kt
+++ b/apps/android/app/src/main/java/ai/openclaw/app/gateway/GatewaySession.kt
@@ -64,6 +64,7 @@ data class GatewayConnectErrorDetails(
   val code: String?,
   val canRetryWithDeviceToken: Boolean,
   val recommendedNextStep: String?,
+  val reason: String? = null,
 )
 
 private data class SelectedConnectAuth(
@@ -116,6 +117,8 @@ class GatewaySession(
     val details: GatewayConnectErrorDetails? = null,
   )
 
+  data class RpcResult(val ok: Boolean, val payloadJson: String?, val error: ErrorShape?)
+
   private val json = Json { ignoreUnknownKeys = true }
   private val writeLock = Mutex()
   private val pending = ConcurrentHashMap<String, CompletableDeferred<RpcResponse>>()
@@ -196,6 +199,13 @@ class GatewaySession(
   }
 
   suspend fun request(method: String, paramsJson: String?, timeoutMs: Long = 15_000): String {
+    val res = requestDetailed(method = method, paramsJson = paramsJson, timeoutMs = timeoutMs)
+    if (res.ok) return res.payloadJson ?: ""
+    val err = res.error
+    throw IllegalStateException("${err?.code ?: "UNAVAILABLE"}: ${err?.message ?: "request failed"}")
+  }
+
+  suspend fun requestDetailed(method: String, paramsJson: String?, timeoutMs: Long = 15_000): RpcResult {
     val conn = currentConnection ?: throw IllegalStateException("not connected")
     val params =
       if (paramsJson.isNullOrBlank()) {
@@ -204,9 +214,7 @@ class GatewaySession(
         json.parseToJsonElement(paramsJson)
       }
     val res = conn.request(method, params, timeoutMs)
-    if (res.ok) return res.payloadJson ?: ""
-    val err = res.error
-    throw IllegalStateException("${err?.code ?: "UNAVAILABLE"}: ${err?.message ?: "request failed"}")
+    return RpcResult(ok = res.ok, payloadJson = res.payloadJson, error = res.error)
   }
 
   suspend fun refreshNodeCanvasCapability(timeoutMs: Long = 8_000): Boolean {
@@ -631,6 +639,7 @@ class GatewaySession(
             code = it["code"].asStringOrNull(),
             canRetryWithDeviceToken = it["canRetryWithDeviceToken"].asBooleanOrNull() == true,
             recommendedNextStep = it["recommendedNextStep"].asStringOrNull(),
+            reason = it["reason"].asStringOrNull(),
           )
         }
       ErrorShape(code, msg, details)
diff --git a/apps/android/app/src/main/java/ai/openclaw/app/voice/TalkAudioPlayer.kt b/apps/android/app/src/main/java/ai/openclaw/app/voice/TalkAudioPlayer.kt
new file mode 100644
index 00000000000..ea6ab467d09
--- /dev/null
+++ b/apps/android/app/src/main/java/ai/openclaw/app/voice/TalkAudioPlayer.kt
@@ -0,0 +1,275 @@
+package ai.openclaw.app.voice
+
+import android.content.Context
+import android.media.AudioAttributes
+import android.media.AudioFormat
+import android.media.AudioTrack
+import android.media.MediaPlayer
+import kotlinx.coroutines.CancellationException
+import kotlinx.coroutines.CompletableDeferred
+import kotlinx.coroutines.Dispatchers
+import kotlinx.coroutines.NonCancellable
+import kotlinx.coroutines.delay
+import kotlinx.coroutines.withContext
+import java.io.File
+
+/**
+ * Plays `talk.speak` audio: raw PCM through [AudioTrack], any other format through [MediaPlayer].
+ *
+ * One playback is tracked as active at a time; registering a new one or calling [stop] cancels it.
+ */
+internal class TalkAudioPlayer(
+  private val context: Context,
+) {
+  private val lock = Any()
+  private var active: ActivePlayback? = null
+
+  /**
+   * Plays [audio] and suspends until playback completes.
+   *
+   * @throws IllegalStateException if the format is unsupported or the platform player fails.
+   * @throws CancellationException if the playback is cancelled through [stop] or superseded.
+   */
+  suspend fun play(audio: TalkSpeakAudio) {
+    when (val mode = resolvePlaybackMode(audio)) {
+      is TalkPlaybackMode.Pcm -> playPcm(audio.bytes, mode.sampleRate)
+      is TalkPlaybackMode.Compressed -> playCompressed(audio.bytes, mode.fileExtension)
+    }
+  }
+
+  /** Cancels the active playback, if there is one. */
+  fun stop() {
+    synchronized(lock) {
+      active?.cancel()
+      active = null
+    }
+  }
+
+  /** Resolves the playback mode from the format hints carried by [audio]. */
+  internal fun resolvePlaybackMode(audio: TalkSpeakAudio): TalkPlaybackMode {
+    return resolvePlaybackMode(
+      outputFormat = audio.outputFormat,
+      mimeType = audio.mimeType,
+      fileExtension = audio.fileExtension,
+    )
+  }
+
+  companion object {
+    /**
+     * Chooses PCM playback for the known `pcm_<rate>` output formats; otherwise picks compressed
+     * playback using [fileExtension], or an extension inferred from [outputFormat] / [mimeType].
+     *
+     * @throws IllegalStateException when no supported format can be determined.
+     */
+    internal fun resolvePlaybackMode(
+      outputFormat: String?,
+      mimeType: String?,
+      fileExtension: String?,
+    ): TalkPlaybackMode {
+      val normalizedOutputFormat = outputFormat?.trim()?.lowercase()
+      if (normalizedOutputFormat != null) {
+        val pcmSampleRate = parsePcmSampleRate(normalizedOutputFormat)
+        if (pcmSampleRate != null) {
+          return TalkPlaybackMode.Pcm(sampleRate = pcmSampleRate)
+        }
+      }
+      val normalizedMimeType = mimeType?.trim()?.lowercase()
+      // A blank explicit extension must not mask an extension that can still be inferred.
+      val extension =
+        normalizeExtension(fileExtension)
+          ?: normalizeExtension(
+            inferExtension(outputFormat = normalizedOutputFormat, mimeType = normalizedMimeType),
+          )
+      if (extension != null) {
+        return TalkPlaybackMode.Compressed(fileExtension = extension)
+      }
+      throw IllegalStateException("Unsupported talk audio format")
+    }
+
+    private fun parsePcmSampleRate(outputFormat: String): Int? {
+      return when (outputFormat) {
+        "pcm_16000" -> 16_000
+        "pcm_22050" -> 22_050
+        "pcm_24000" -> 24_000
+        "pcm_44100" -> 44_100
+        else -> null
+      }
+    }
+
+    private fun inferExtension(outputFormat: String?, mimeType: String?): String? {
+      return when {
+        outputFormat == "mp3" || outputFormat?.startsWith("mp3_") == true || mimeType == "audio/mpeg" -> ".mp3"
+        outputFormat == "opus" || outputFormat?.startsWith("opus_") == true || mimeType == "audio/ogg" -> ".ogg"
+        outputFormat?.endsWith("-wav") == true || mimeType == "audio/wav" -> ".wav"
+        outputFormat?.endsWith("-webm") == true || mimeType == "audio/webm" -> ".webm"
+        else -> null
+      }
+    }
+
+    private fun normalizeExtension(value: String?): String? {
+      val trimmed = value?.trim()?.lowercase().orEmpty()
+      if (trimmed.isEmpty()) return null
+      return if (trimmed.startsWith(".")) trimmed else ".$trimmed"
+    }
+  }
+
+  /** Plays 16-bit mono PCM [bytes] at [sampleRate] through a static-mode [AudioTrack]. */
+  private suspend fun playPcm(bytes: ByteArray, sampleRate: Int) {
+    withContext(Dispatchers.IO) {
+      val minBufferSize =
+        AudioTrack.getMinBufferSize(
+          sampleRate,
+          AudioFormat.CHANNEL_OUT_MONO,
+          AudioFormat.ENCODING_PCM_16BIT,
+        )
+      if (minBufferSize <= 0) {
+        throw IllegalStateException("AudioTrack buffer unavailable")
+      }
+      val track =
+        AudioTrack.Builder()
+          .setAudioAttributes(
+            AudioAttributes.Builder()
+              .setUsage(AudioAttributes.USAGE_MEDIA)
+              .setContentType(AudioAttributes.CONTENT_TYPE_SPEECH)
+              .build(),
+          )
+          .setAudioFormat(
+            AudioFormat.Builder()
+              .setEncoding(AudioFormat.ENCODING_PCM_16BIT)
+              .setSampleRate(sampleRate)
+              .setChannelMask(AudioFormat.CHANNEL_OUT_MONO)
+              .build(),
+          )
+          .setTransferMode(AudioTrack.MODE_STATIC)
+          .setBufferSizeInBytes(maxOf(minBufferSize, bytes.size))
+          .build()
+      val finished = CompletableDeferred<Unit>()
+      val playback =
+        ActivePlayback(
+          cancel = {
+            finished.completeExceptionally(CancellationException("assistant speech cancelled"))
+            runCatching { track.pause() }
+            runCatching { track.flush() }
+            runCatching { track.stop() }
+          },
+        )
+      register(playback)
+      try {
+        val written = track.write(bytes, 0, bytes.size)
+        if (written != bytes.size) {
+          throw IllegalStateException("AudioTrack write failed")
+        }
+        // 16-bit mono: one frame is two bytes.
+        val totalFrames = bytes.size / 2
+        // stop() may already have fired between register() and here; do not start in that case.
+        if (!finished.isCompleted) {
+          track.play()
+        }
+        while (!finished.isCompleted && track.playState == AudioTrack.PLAYSTATE_PLAYING) {
+          if (track.playbackHeadPosition >= totalFrames) break
+          delay(20)
+        }
+        // No-op if cancel() already failed the deferred; await() then rethrows the cancellation.
+        finished.complete(Unit)
+        finished.await()
+      } finally {
+        clear(playback)
+        runCatching { track.pause() }
+        runCatching { track.flush() }
+        runCatching { track.stop() }
+        track.release()
+      }
+    }
+  }
+
+  /**
+   * Writes [bytes] to a temp file with [fileExtension] and plays it through [MediaPlayer].
+   *
+   * The player and the temp file are released even when setup fails or the caller is cancelled.
+   */
+  private suspend fun playCompressed(bytes: ByteArray, fileExtension: String) {
+    val tempFile =
+      withContext(Dispatchers.IO) {
+        File.createTempFile("talk-audio-", fileExtension, context.cacheDir)
+      }
+    val finished = CompletableDeferred<Unit>()
+    var player: MediaPlayer? = null
+    var playback: ActivePlayback? = null
+    try {
+      withContext(Dispatchers.IO) { tempFile.writeBytes(bytes) }
+      val started =
+        withContext(Dispatchers.Main) {
+          MediaPlayer().also { created ->
+            // Publish before configuring so a failing setDataSource/prepare still gets released.
+            player = created
+            created.setAudioAttributes(
+              AudioAttributes.Builder()
+                .setUsage(AudioAttributes.USAGE_MEDIA)
+                .setContentType(AudioAttributes.CONTENT_TYPE_SPEECH)
+                .build(),
+            )
+            created.setDataSource(tempFile.absolutePath)
+            created.setOnCompletionListener {
+              finished.complete(Unit)
+            }
+            created.setOnErrorListener { _, what, extra ->
+              finished.completeExceptionally(IllegalStateException("MediaPlayer error ($what/$extra)"))
+              true
+            }
+            created.prepare()
+            created.start()
+          }
+        }
+      val registered =
+        ActivePlayback(
+          cancel = {
+            finished.completeExceptionally(CancellationException("assistant speech cancelled"))
+            runCatching { started.stop() }
+          },
+        )
+      playback = registered
+      register(registered)
+      finished.await()
+    } finally {
+      playback?.let { clear(it) }
+      val createdPlayer = player
+      // NonCancellable: a plain withContext is skipped when the caller's job is already
+      // cancelled, which would leak the MediaPlayer and the temp file.
+      if (createdPlayer != null) {
+        withContext(NonCancellable + Dispatchers.Main) {
+          runCatching { createdPlayer.stop() }
+          createdPlayer.release()
+        }
+      }
+      withContext(NonCancellable + Dispatchers.IO) {
+        tempFile.delete()
+      }
+    }
+  }
+
+  private fun register(playback: ActivePlayback) {
+    synchronized(lock) {
+      active?.cancel()
+      active = playback
+    }
+  }
+
+  private fun clear(playback: ActivePlayback) {
+    synchronized(lock) {
+      if (active === playback) {
+        active = null
+      }
+    }
+  }
+
+}
+
+internal sealed interface TalkPlaybackMode {
+  data class Pcm(val sampleRate: Int) : TalkPlaybackMode
+
+  data class Compressed(val fileExtension: String) : TalkPlaybackMode
+}
+
+private class ActivePlayback(
+  val cancel: () -> Unit,
+)
diff --git a/apps/android/app/src/main/java/ai/openclaw/app/voice/TalkModeManager.kt b/apps/android/app/src/main/java/ai/openclaw/app/voice/TalkModeManager.kt
index 1bd1b9bec60..8f1e0bae3cf 100644
--- a/apps/android/app/src/main/java/ai/openclaw/app/voice/TalkModeManager.kt
+++ b/apps/android/app/src/main/java/ai/openclaw/app/voice/TalkModeManager.kt
@@ -14,9 +14,9 @@ import android.os.SystemClock
 import android.speech.RecognitionListener
 import android.speech.RecognizerIntent
 import android.speech.SpeechRecognizer
-import android.util.Log
 import android.speech.tts.TextToSpeech
 import android.speech.tts.UtteranceProgressListener
+import android.util.Log
 import androidx.core.content.ContextCompat
 import ai.openclaw.app.gateway.GatewaySession
 import java.util.Locale
@@ -61,6 +61,8 @@ class TalkModeManager(
 
   private val mainHandler = Handler(Looper.getMainLooper())
   private val json = Json { ignoreUnknownKeys = true }
+  private val talkSpeakClient = TalkSpeakClient(session = session, json = json)
+  private val talkAudioPlayer = TalkAudioPlayer(context)
 
   private val _isEnabled = MutableStateFlow(false)
   val isEnabled: StateFlow<Boolean> = _isEnabled
@@ -663,16 +665,32 @@ class TalkModeManager(
     requestAudioFocusForTts()
 
     try {
-      val ttsStarted = SystemClock.elapsedRealtime()
-      speakWithSystemTts(cleaned, directive, playbackToken)
-      Log.d(tag, "system tts ok durMs=${SystemClock.elapsedRealtime() - ttsStarted}")
+      val started = SystemClock.elapsedRealtime()
+      when (val result = talkSpeakClient.synthesize(text = cleaned, directive = directive)) {
+        is TalkSpeakResult.Success -> {
+          ensurePlaybackActive(playbackToken)
+          talkAudioPlayer.play(result.audio)
+          ensurePlaybackActive(playbackToken)
+          Log.d(tag, "talk.speak ok durMs=${SystemClock.elapsedRealtime() - started}")
+        }
+        is TalkSpeakResult.FallbackToLocal -> {
+          Log.d(tag, "talk.speak unavailable; using local TTS: ${result.message}")
+          speakWithSystemTts(cleaned, directive, playbackToken)
+          Log.d(tag, "system tts ok durMs=${SystemClock.elapsedRealtime() - started}")
+        }
+        is TalkSpeakResult.Failure -> {
+          throw IllegalStateException(result.message)
+        }
+      }
     } catch (err: Throwable) {
       if (isPlaybackCancelled(err, playbackToken)) {
         Log.d(tag, "assistant speech cancelled")
         return
       }
       _statusText.value = "Speak failed: ${err.message ?: err::class.simpleName}"
-      Log.w(tag, "system tts failed: ${err.message ?: err::class.simpleName}")
+      Log.w(tag, "talk playback failed: ${err.message ?: err::class.simpleName}")
+    } finally {
+      _isSpeaking.value = false
     }
   }
 
@@ -812,6 +830,7 @@ class TalkModeManager(
 
   private fun stopSpeaking(resetInterrupt: Boolean = true) {
     if (!_isSpeaking.value) {
+      talkAudioPlayer.stop()
       stopTextToSpeechPlayback()
       abandonAudioFocus()
       return
@@ -819,6 +838,7 @@ class TalkModeManager(
     if (resetInterrupt) {
       lastInterruptedAtSeconds = null
     }
+    talkAudioPlayer.stop()
     stopTextToSpeechPlayback()
     _isSpeaking.value = false
     abandonAudioFocus()
diff --git a/apps/android/app/src/main/java/ai/openclaw/app/voice/TalkSpeakClient.kt b/apps/android/app/src/main/java/ai/openclaw/app/voice/TalkSpeakClient.kt
new file mode 100644
index 00000000000..88d0b79b0e6
--- /dev/null
+++ b/apps/android/app/src/main/java/ai/openclaw/app/voice/TalkSpeakClient.kt
@@ -0,0 +1,165 @@
+package ai.openclaw.app.voice
+
+import ai.openclaw.app.gateway.GatewaySession
+import kotlinx.coroutines.currentCoroutineContext
+import kotlinx.coroutines.ensureActive
+import kotlinx.serialization.Serializable
+import kotlinx.serialization.encodeToString
+import kotlinx.serialization.json.Json
+
+/**
+ * Decoded `talk.speak` audio together with the format hints returned alongside it.
+ *
+ * NOTE(review): [bytes] is an array, so the generated `equals`/`hashCode` compare it by identity.
+ */
+internal data class TalkSpeakAudio(
+  val bytes: ByteArray,
+  val provider: String,
+  val outputFormat: String?,
+  val voiceCompatible: Boolean?,
+  val mimeType: String?,
+  val fileExtension: String?,
+)
+
+/** Outcome of [TalkSpeakClient.synthesize]. */
+internal sealed interface TalkSpeakResult {
+  data class Success(val audio: TalkSpeakAudio) : TalkSpeakResult
+
+  data class FallbackToLocal(val message: String) : TalkSpeakResult
+
+  data class Failure(val message: String) : TalkSpeakResult
+}
+
+/**
+ * Calls the gateway `talk.speak` method and decodes the returned base64 audio.
+ *
+ * Requests go through [requestDetailed] when it is provided, otherwise through [session].
+ */
+internal class TalkSpeakClient(
+  private val session: GatewaySession? = null,
+  private val json: Json = Json { ignoreUnknownKeys = true },
+  private val requestDetailed: (suspend (String, String, Long) -> GatewaySession.RpcResult)? = null,
+) {
+  /**
+   * Synthesizes [text] with the optional [directive] overrides.
+   *
+   * Request, payload and decode errors are reported as [TalkSpeakResult.Failure]; an error whose
+   * `details.reason` is fallback-eligible is reported as [TalkSpeakResult.FallbackToLocal].
+   * Cancellation of the calling coroutine is rethrown rather than reported as a failure.
+   */
+  suspend fun synthesize(text: String, directive: TalkDirective?): TalkSpeakResult {
+    val response =
+      try {
+        performRequest(
+          method = "talk.speak",
+          paramsJson = json.encodeToString(TalkSpeakRequest.from(text = text, directive = directive)),
+          timeoutMs = 45_000,
+        )
+      } catch (err: Throwable) {
+        // Rethrows only when this coroutine itself was cancelled; any other error becomes Failure.
+        currentCoroutineContext().ensureActive()
+        return TalkSpeakResult.Failure(err.message ?: "talk.speak request failed")
+      }
+    if (!response.ok) {
+      val error = response.error
+      val message = error?.message ?: "talk.speak request failed"
+      return if (isFallbackEligible(error?.details?.reason)) {
+        TalkSpeakResult.FallbackToLocal(message)
+      } else {
+        TalkSpeakResult.Failure(message)
+      }
+    }
+    val payload =
+      try {
+        json.decodeFromString<TalkSpeakResponse>(response.payloadJson ?: "")
+      } catch (err: Throwable) {
+        return TalkSpeakResult.Failure(err.message ?: "talk.speak payload invalid")
+      }
+    val bytes =
+      try {
+        android.util.Base64.decode(payload.audioBase64, android.util.Base64.DEFAULT)
+      } catch (err: Throwable) {
+        return TalkSpeakResult.Failure(err.message ?: "talk.speak audio decode failed")
+      }
+    if (bytes.isEmpty()) {
+      return TalkSpeakResult.Failure("talk.speak returned empty audio")
+    }
+    return TalkSpeakResult.Success(
+      TalkSpeakAudio(
+        bytes = bytes,
+        provider = payload.provider,
+        outputFormat = payload.outputFormat,
+        voiceCompatible = payload.voiceCompatible,
+        mimeType = payload.mimeType,
+        fileExtension = payload.fileExtension,
+      ),
+    )
+  }
+
+  private fun isFallbackEligible(reason: String?): Boolean {
+    return reason == "talk_unconfigured" ||
+      reason == "talk_provider_unsupported" ||
+      reason == "method_unavailable"
+  }
+
+  private suspend fun performRequest(
+    method: String,
+    paramsJson: String,
+    timeoutMs: Long,
+  ): GatewaySession.RpcResult {
+    requestDetailed?.let { return it(method, paramsJson, timeoutMs) }
+    val activeSession = session ?: throw IllegalStateException("session missing")
+    return activeSession.requestDetailed(method = method, paramsJson = paramsJson, timeoutMs = timeoutMs)
+  }
+}
+
+/** JSON parameters of a `talk.speak` request; unset optional fields default to `null`. */
+@Serializable
+internal data class TalkSpeakRequest(
+  val text: String,
+  val voiceId: String? = null,
+  val modelId: String? = null,
+  val outputFormat: String? = null,
+  val speed: Double? = null,
+  val rateWpm: Int? = null,
+  val stability: Double? = null,
+  val similarity: Double? = null,
+  val style: Double? = null,
+  val speakerBoost: Boolean? = null,
+  val seed: Long? = null,
+  val normalize: String? = null,
+  val language: String? = null,
+  val latencyTier: Int? = null,
+) {
+  companion object {
+    fun from(text: String, directive: TalkDirective?): TalkSpeakRequest {
+      return TalkSpeakRequest(
+        text = text,
+        voiceId = directive?.voiceId,
+        modelId = directive?.modelId,
+        outputFormat = directive?.outputFormat,
+        speed = directive?.speed,
+        rateWpm = directive?.rateWpm,
+        stability = directive?.stability,
+        similarity = directive?.similarity,
+        style = directive?.style,
+        speakerBoost = directive?.speakerBoost,
+        seed = directive?.seed,
+        normalize = directive?.normalize,
+        language = directive?.language,
+        latencyTier = directive?.latencyTier,
+      )
+    }
+  }
+}
+
+/** JSON payload of a successful `talk.speak` response. */
+@Serializable
+private data class TalkSpeakResponse(
+  val audioBase64: String,
+  val provider: String,
+  val outputFormat: String? = null,
+  val voiceCompatible: Boolean? = null,
+  val mimeType: String? = null,
+  val fileExtension: String? = null,
+)