From 98d593956493a2e942e17e102d266ba9631bcf48 Mon Sep 17 00:00:00 2001
From: Ayaan Zaidi <hi@obviy.us>
Date: Sat, 4 Apr 2026 22:03:54 +0530
Subject: [PATCH] feat(android): add talk.speak playback path

---
 .../ai/openclaw/app/gateway/GatewaySession.kt |  15 +-
 .../ai/openclaw/app/voice/TalkAudioPlayer.kt  | 237 ++++++++++++++++++
 .../ai/openclaw/app/voice/TalkModeManager.kt  |  30 ++-
 .../ai/openclaw/app/voice/TalkSpeakClient.kt  | 141 +++++++++++
 4 files changed, 415 insertions(+), 8 deletions(-)
 create mode 100644 apps/android/app/src/main/java/ai/openclaw/app/voice/TalkAudioPlayer.kt
 create mode 100644 apps/android/app/src/main/java/ai/openclaw/app/voice/TalkSpeakClient.kt
diff --git a/apps/android/app/src/main/java/ai/openclaw/app/gateway/GatewaySession.kt b/apps/android/app/src/main/java/ai/openclaw/app/gateway/GatewaySession.kt
index c52712b8ff0..8d437da5e3c 100644
--- a/apps/android/app/src/main/java/ai/openclaw/app/gateway/GatewaySession.kt
+++ b/apps/android/app/src/main/java/ai/openclaw/app/gateway/GatewaySession.kt
@@ -64,6 +64,7 @@ data class GatewayConnectErrorDetails(
   val code: String?,
   val canRetryWithDeviceToken: Boolean,
   val recommendedNextStep: String?,
+  val reason: String? = null,
 )
 
 private data class SelectedConnectAuth(
@@ -116,6 +117,8 @@ class GatewaySession(
     val details: GatewayConnectErrorDetails? = null,
   )
 
+  data class RpcResult(val ok: Boolean, val payloadJson: String?, val error: ErrorShape?) // RPC response as returned by requestDetailed: a failed response is carried in ok/error, not thrown
+
   private val json = Json { ignoreUnknownKeys = true }
   private val writeLock = Mutex()
   private val pending = ConcurrentHashMap<String, CompletableDeferred<RpcResponse>>()
@@ -196,6 +199,13 @@ class GatewaySession(
   }
 
   suspend fun request(method: String, paramsJson: String?, timeoutMs: Long = 15_000): String {
+    // Throwing wrapper over requestDetailed; a failed RPC surfaces as IllegalStateException("<code>: <message>").
+    val outcome = requestDetailed(method = method, paramsJson = paramsJson, timeoutMs = timeoutMs)
+    check(outcome.ok) { "${outcome.error?.code ?: "UNAVAILABLE"}: ${outcome.error?.message ?: "request failed"}" }
+    return outcome.payloadJson ?: ""
+  }
+
+  suspend fun requestDetailed(method: String, paramsJson: String?, timeoutMs: Long = 15_000): RpcResult {
     val conn = currentConnection ?: throw IllegalStateException("not connected")
     val params =
       if (paramsJson.isNullOrBlank()) {
@@ -204,9 +214,7 @@ class GatewaySession(
         json.parseToJsonElement(paramsJson)
       }
     val res = conn.request(method, params, timeoutMs)
-    if (res.ok) return res.payloadJson ?: ""
-    val err = res.error
-    throw IllegalStateException("${err?.code ?: "UNAVAILABLE"}: ${err?.message ?: "request failed"}")
+    return RpcResult(ok = res.ok, payloadJson = res.payloadJson, error = res.error)
   }
 
   suspend fun refreshNodeCanvasCapability(timeoutMs: Long = 8_000): Boolean {
@@ -631,6 +639,7 @@ class GatewaySession(
                 code = it["code"].asStringOrNull(),
                 canRetryWithDeviceToken = it["canRetryWithDeviceToken"].asBooleanOrNull() == true,
                 recommendedNextStep = it["recommendedNextStep"].asStringOrNull(),
+                reason = it["reason"].asStringOrNull(),
               )
             }
           ErrorShape(code, msg, details)
diff --git a/apps/android/app/src/main/java/ai/openclaw/app/voice/TalkAudioPlayer.kt b/apps/android/app/src/main/java/ai/openclaw/app/voice/TalkAudioPlayer.kt
new file mode 100644
index 00000000000..ea6ab467d09
--- /dev/null
+++ b/apps/android/app/src/main/java/ai/openclaw/app/voice/TalkAudioPlayer.kt
@@ -0,0 +1,237 @@
+package ai.openclaw.app.voice
+
+import android.content.Context
+import android.media.AudioAttributes
+import android.media.AudioFormat
+import android.media.AudioTrack
+import android.media.MediaPlayer
+import kotlinx.coroutines.CancellationException
+import kotlinx.coroutines.CompletableDeferred
+import kotlinx.coroutines.Dispatchers
+import kotlinx.coroutines.delay
+import kotlinx.coroutines.withContext
+import java.io.File
+
+/**
+ * Plays synthesized talk audio: raw 16-bit mono PCM through [AudioTrack], everything else through
+ * [MediaPlayer] via a temp file in the cache dir. Registering a playback cancels the previous one.
+ */
+internal class TalkAudioPlayer(
+  private val context: Context,
+) {
+  private val lock = Any()
+  private var active: ActivePlayback? = null
+
+  /**
+   * Plays [audio] and suspends until playback ends.
+   *
+   * @throws IllegalStateException if the format is unsupported or the platform player fails.
+   * @throws CancellationException if [stop] or a newer playback interrupts this one.
+   */
+  suspend fun play(audio: TalkSpeakAudio) {
+    when (val mode = resolvePlaybackMode(audio)) {
+      is TalkPlaybackMode.Pcm -> playPcm(audio.bytes, mode.sampleRate)
+      is TalkPlaybackMode.Compressed -> playCompressed(audio.bytes, mode.fileExtension)
+    }
+  }
+
+  /** Cancels the active playback, if any; its pending [play] call then fails with cancellation. */
+  fun stop() {
+    synchronized(lock) {
+      active?.cancel()
+      active = null
+    }
+  }
+
+  /** Resolves the playback mode from [audio]'s format metadata via the companion resolver. */
+  internal fun resolvePlaybackMode(audio: TalkSpeakAudio): TalkPlaybackMode =
+    resolvePlaybackMode(
+      outputFormat = audio.outputFormat,
+      mimeType = audio.mimeType,
+      fileExtension = audio.fileExtension,
+    )
+
+  companion object {
+    /**
+     * Picks the playback path. A recognized `pcm_<rate>` [outputFormat] selects PCM; otherwise the
+     * explicit [fileExtension], or one inferred from [outputFormat]/[mimeType], selects compressed.
+     *
+     * @throws IllegalStateException when neither a PCM rate nor a file extension can be determined.
+     */
+    internal fun resolvePlaybackMode(
+      outputFormat: String?,
+      mimeType: String?,
+      fileExtension: String?,
+    ): TalkPlaybackMode {
+      val format = outputFormat?.trim()?.lowercase()
+      val pcmSampleRate = format?.let { parsePcmSampleRate(it) }
+      if (pcmSampleRate != null) return TalkPlaybackMode.Pcm(sampleRate = pcmSampleRate)
+      // Normalize first so a blank extension falls back to inference instead of suppressing it.
+      val extension =
+        normalizeExtension(fileExtension)
+          ?: inferExtension(outputFormat = format, mimeType = mimeType?.trim()?.lowercase())
+          ?: throw IllegalStateException("Unsupported talk audio format")
+      return TalkPlaybackMode.Compressed(fileExtension = extension)
+    }
+
+    /** Maps the supported `pcm_<rate>` output formats to their sample rate in Hz. */
+    private fun parsePcmSampleRate(outputFormat: String): Int? =
+      when (outputFormat) {
+        "pcm_16000" -> 16_000
+        "pcm_22050" -> 22_050
+        "pcm_24000" -> 24_000
+        "pcm_44100" -> 44_100
+        else -> null
+      }
+
+    /** Infers a file extension from already-normalized format metadata, or null if unrecognized. */
+    private fun inferExtension(outputFormat: String?, mimeType: String?): String? =
+      when {
+        outputFormat == "mp3" || outputFormat?.startsWith("mp3_") == true || mimeType == "audio/mpeg" -> ".mp3"
+        outputFormat == "opus" || outputFormat?.startsWith("opus_") == true || mimeType == "audio/ogg" -> ".ogg"
+        outputFormat?.endsWith("-wav") == true || mimeType == "audio/wav" -> ".wav"
+        outputFormat?.endsWith("-webm") == true || mimeType == "audio/webm" -> ".webm"
+        else -> null
+      }
+
+    /** Trims and lowercases [value] and ensures a leading dot; null or blank input yields null. */
+    private fun normalizeExtension(value: String?): String? {
+      val trimmed = value?.trim()?.lowercase().orEmpty()
+      if (trimmed.isEmpty()) return null
+      return if (trimmed.startsWith(".")) trimmed else ".$trimmed"
+    }
+  }
+
+  /** Audio attributes shared by both playback paths: media usage, speech content. */
+  private fun speechAttributes(): AudioAttributes =
+    AudioAttributes.Builder()
+      .setUsage(AudioAttributes.USAGE_MEDIA)
+      .setContentType(AudioAttributes.CONTENT_TYPE_SPEECH)
+      .build()
+
+  /** Plays raw 16-bit mono PCM through a static [AudioTrack], polling until the clip has played out. */
+  private suspend fun playPcm(bytes: ByteArray, sampleRate: Int) {
+    withContext(Dispatchers.IO) {
+      val minBufferSize =
+        AudioTrack.getMinBufferSize(sampleRate, AudioFormat.CHANNEL_OUT_MONO, AudioFormat.ENCODING_PCM_16BIT)
+      if (minBufferSize <= 0) throw IllegalStateException("AudioTrack buffer unavailable")
+      val track =
+        AudioTrack.Builder()
+          .setAudioAttributes(speechAttributes())
+          .setAudioFormat(
+            AudioFormat.Builder()
+              .setEncoding(AudioFormat.ENCODING_PCM_16BIT)
+              .setSampleRate(sampleRate)
+              .setChannelMask(AudioFormat.CHANNEL_OUT_MONO)
+              .build(),
+          )
+          .setTransferMode(AudioTrack.MODE_STATIC)
+          .setBufferSizeInBytes(maxOf(minBufferSize, bytes.size))
+          .build()
+      val finished = CompletableDeferred<Unit>()
+      val playback =
+        ActivePlayback(
+          cancel = {
+            finished.completeExceptionally(CancellationException("assistant speech cancelled"))
+            runCatching { track.pause() }
+            runCatching { track.flush() }
+            runCatching { track.stop() }
+          },
+        )
+      register(playback)
+      try {
+        val written = track.write(bytes, 0, bytes.size)
+        if (written != bytes.size) throw IllegalStateException("AudioTrack write failed")
+        val totalFrames = bytes.size / 2 // 16-bit mono: two bytes per frame
+        // Honor a stop() that raced ahead of play(): do not start, and leave the poll loop at once.
+        if (!finished.isCompleted) track.play()
+        while (!finished.isCompleted && track.playState == AudioTrack.PLAYSTATE_PLAYING) {
+          if (track.playbackHeadPosition >= totalFrames) break
+          delay(20)
+        }
+        finished.complete(Unit) // no-op when cancel() already failed it
+        finished.await()
+      } finally {
+        clear(playback)
+        runCatching { track.pause() }
+        runCatching { track.flush() }
+        runCatching { track.stop() }
+        track.release()
+      }
+    }
+  }
+
+  /**
+   * Plays encoded audio by spooling it to a cache file for [MediaPlayer]. The player and the file
+   * are cleaned up even when setup fails or the calling coroutine is cancelled.
+   */
+  private suspend fun playCompressed(bytes: ByteArray, fileExtension: String) {
+    val uncancellable = kotlinx.coroutines.NonCancellable // fully qualified: not among this file's imports
+    val tempFile =
+      withContext(uncancellable + Dispatchers.IO) { File.createTempFile("talk-audio-", fileExtension, context.cacheDir) }
+    try {
+      withContext(Dispatchers.IO) { tempFile.writeBytes(bytes) }
+      val finished = CompletableDeferred<Unit>()
+      val player = withContext(uncancellable + Dispatchers.Main) { MediaPlayer() }
+      val playback =
+        ActivePlayback(
+          cancel = {
+            finished.completeExceptionally(CancellationException("assistant speech cancelled"))
+            runCatching { player.stop() }
+          },
+        )
+      try {
+        // Register before setup so a stop() issued during prepare/start is not lost.
+        register(playback)
+        withContext(Dispatchers.Main) {
+          if (finished.isCompleted) return@withContext // stop() already fired: skip setup entirely
+          player.setAudioAttributes(speechAttributes())
+          player.setDataSource(tempFile.absolutePath)
+          player.setOnCompletionListener { finished.complete(Unit) }
+          player.setOnErrorListener { _, what, extra ->
+            finished.completeExceptionally(IllegalStateException("MediaPlayer error ($what/$extra)"))
+            true
+          }
+          player.prepare()
+          player.start()
+        }
+        finished.await()
+      } finally {
+        clear(playback)
+        withContext(uncancellable + Dispatchers.Main) {
+          runCatching { player.stop() }
+          player.release()
+        }
+      }
+    } finally {
+      withContext(uncancellable + Dispatchers.IO) { tempFile.delete() }
+    }
+  }
+
+  /** Makes [playback] the active one, cancelling whichever playback was active before. */
+  private fun register(playback: ActivePlayback) {
+    synchronized(lock) {
+      active?.cancel()
+      active = playback
+    }
+  }
+
+  /** Forgets [playback] if it is still the active one; a newer registration is left untouched. */
+  private fun clear(playback: ActivePlayback) {
+    synchronized(lock) {
+      if (active === playback) {
+        active = null
+      }
+    }
+  }
+}
+
+internal sealed interface TalkPlaybackMode { // how TalkAudioPlayer plays a clip, as chosen by resolvePlaybackMode
+  data class Pcm(val sampleRate: Int) : TalkPlaybackMode // raw 16-bit mono PCM at sampleRate Hz, played via AudioTrack
+
+  data class Compressed(val fileExtension: String) : TalkPlaybackMode // encoded audio; fileExtension is the temp-file suffix handed to MediaPlayer
+}
+
+private class ActivePlayback( // handle TalkAudioPlayer keeps for the playback currently in flight
+  val cancel: () -> Unit, // stops the underlying player and fails the awaiting play() with CancellationException
+)
diff --git a/apps/android/app/src/main/java/ai/openclaw/app/voice/TalkModeManager.kt b/apps/android/app/src/main/java/ai/openclaw/app/voice/TalkModeManager.kt
index 1bd1b9bec60..8f1e0bae3cf 100644
--- a/apps/android/app/src/main/java/ai/openclaw/app/voice/TalkModeManager.kt
+++ b/apps/android/app/src/main/java/ai/openclaw/app/voice/TalkModeManager.kt
@@ -14,9 +14,9 @@ import android.os.SystemClock
 import android.speech.RecognitionListener
 import android.speech.RecognizerIntent
 import android.speech.SpeechRecognizer
-import android.util.Log
 import android.speech.tts.TextToSpeech
 import android.speech.tts.UtteranceProgressListener
+import android.util.Log
 import androidx.core.content.ContextCompat
 import ai.openclaw.app.gateway.GatewaySession
 import java.util.Locale
@@ -61,6 +61,8 @@ class TalkModeManager(
 
   private val mainHandler = Handler(Looper.getMainLooper())
   private val json = Json { ignoreUnknownKeys = true }
+  private val talkSpeakClient = TalkSpeakClient(session = session, json = json)
+  private val talkAudioPlayer = TalkAudioPlayer(context)
 
   private val _isEnabled = MutableStateFlow(false)
   val isEnabled: StateFlow<Boolean> = _isEnabled
@@ -663,16 +665,32 @@ class TalkModeManager(
     requestAudioFocusForTts()
 
     try {
-      val ttsStarted = SystemClock.elapsedRealtime()
-      speakWithSystemTts(cleaned, directive, playbackToken)
-      Log.d(tag, "system tts ok durMs=${SystemClock.elapsedRealtime() - ttsStarted}")
+      val started = SystemClock.elapsedRealtime()
+      when (val result = talkSpeakClient.synthesize(text = cleaned, directive = directive)) {
+        is TalkSpeakResult.Success -> {
+          ensurePlaybackActive(playbackToken)
+          talkAudioPlayer.play(result.audio)
+          ensurePlaybackActive(playbackToken)
+          Log.d(tag, "talk.speak ok durMs=${SystemClock.elapsedRealtime() - started}")
+        }
+        is TalkSpeakResult.FallbackToLocal -> {
+          Log.d(tag, "talk.speak unavailable; using local TTS: ${result.message}")
+          speakWithSystemTts(cleaned, directive, playbackToken)
+          Log.d(tag, "system tts ok durMs=${SystemClock.elapsedRealtime() - started}")
+        }
+        is TalkSpeakResult.Failure -> {
+          throw IllegalStateException(result.message)
+        }
+      }
     } catch (err: Throwable) {
       if (isPlaybackCancelled(err, playbackToken)) {
         Log.d(tag, "assistant speech cancelled")
         return
       }
       _statusText.value = "Speak failed: ${err.message ?: err::class.simpleName}"
-      Log.w(tag, "system tts failed: ${err.message ?: err::class.simpleName}")
+      Log.w(tag, "talk playback failed: ${err.message ?: err::class.simpleName}")
+    } finally {
+      _isSpeaking.value = false
     }
   }
 
@@ -812,6 +830,7 @@ class TalkModeManager(
 
   private fun stopSpeaking(resetInterrupt: Boolean = true) {
     if (!_isSpeaking.value) {
+      talkAudioPlayer.stop()
       stopTextToSpeechPlayback()
       abandonAudioFocus()
       return
@@ -819,6 +838,7 @@ class TalkModeManager(
     if (resetInterrupt) {
       lastInterruptedAtSeconds = null
     }
+    talkAudioPlayer.stop()
     stopTextToSpeechPlayback()
     _isSpeaking.value = false
     abandonAudioFocus()
diff --git a/apps/android/app/src/main/java/ai/openclaw/app/voice/TalkSpeakClient.kt b/apps/android/app/src/main/java/ai/openclaw/app/voice/TalkSpeakClient.kt
new file mode 100644
index 00000000000..88d0b79b0e6
--- /dev/null
+++ b/apps/android/app/src/main/java/ai/openclaw/app/voice/TalkSpeakClient.kt
@@ -0,0 +1,141 @@
+package ai.openclaw.app.voice
+
+import ai.openclaw.app.gateway.GatewaySession
+import kotlinx.serialization.Serializable
+import kotlinx.serialization.encodeToString
+import kotlinx.serialization.json.Json
+
+internal data class TalkSpeakAudio( // decoded talk.speak audio plus the format metadata copied from the response
+  val bytes: ByteArray, // Base64-decoded audio; NOTE: the generated equals/hashCode compare arrays by reference
+  val provider: String,
+  val outputFormat: String?, // e.g. "pcm_16000"; TalkAudioPlayer checks this first when choosing the playback path
+  val voiceCompatible: Boolean?,
+  val mimeType: String?, // used together with outputFormat to infer a file extension when fileExtension is null
+  val fileExtension: String?,
+)
+
+internal sealed interface TalkSpeakResult { // outcome of TalkSpeakClient.synthesize
+  data class Success(val audio: TalkSpeakAudio) : TalkSpeakResult // audio was returned and decoded to non-empty bytes
+
+  data class FallbackToLocal(val message: String) : TalkSpeakResult // gateway error whose details.reason is talk_unconfigured, talk_provider_unsupported or method_unavailable
+
+  data class Failure(val message: String) : TalkSpeakResult // any other request, payload or decode error
+}
+
+/** Client for the gateway's `talk.speak` RPC; uses [requestDetailed] when provided, else [session]. */
+internal class TalkSpeakClient(
+  private val session: GatewaySession? = null,
+  private val json: Json = Json { ignoreUnknownKeys = true },
+  private val requestDetailed: (suspend (String, String, Long) -> GatewaySession.RpcResult)? = null,
+) {
+  /**
+   * Synthesizes [text], shaped by [directive], with a 45 s RPC timeout. Request, payload and decode
+   * errors are returned as [TalkSpeakResult.Failure] (or [TalkSpeakResult.FallbackToLocal] for
+   * fallback-eligible gateway reasons), not thrown; only cancellation of the caller is rethrown.
+   */
+  suspend fun synthesize(text: String, directive: TalkDirective?): TalkSpeakResult {
+    val response =
+      try {
+        val paramsJson = json.encodeToString(TalkSpeakRequest.from(text = text, directive = directive))
+        performRequest(method = "talk.speak", paramsJson = paramsJson, timeoutMs = 45_000)
+      } catch (err: Throwable) {
+        // A cancelled caller must observe cancellation; timeouts and other errors become Failure.
+        if (isCallerCancelled(err)) throw err
+        return TalkSpeakResult.Failure(err.message ?: "talk.speak request failed")
+      }
+    if (!response.ok) {
+      val error = response.error
+      val message = error?.message ?: "talk.speak request failed"
+      val fallback = isFallbackEligible(error?.details?.reason)
+      return if (fallback) TalkSpeakResult.FallbackToLocal(message) else TalkSpeakResult.Failure(message)
+    }
+    val payload =
+      try {
+        json.decodeFromString<TalkSpeakResponse>(response.payloadJson ?: "")
+      } catch (err: Throwable) {
+        return TalkSpeakResult.Failure(err.message ?: "talk.speak payload invalid")
+      }
+    val bytes =
+      try {
+        android.util.Base64.decode(payload.audioBase64, android.util.Base64.DEFAULT)
+      } catch (err: Throwable) {
+        return TalkSpeakResult.Failure(err.message ?: "talk.speak audio decode failed")
+      }
+    if (bytes.isEmpty()) return TalkSpeakResult.Failure("talk.speak returned empty audio")
+    return TalkSpeakResult.Success(
+      TalkSpeakAudio(
+        bytes = bytes,
+        provider = payload.provider,
+        outputFormat = payload.outputFormat,
+        voiceCompatible = payload.voiceCompatible,
+        mimeType = payload.mimeType,
+        fileExtension = payload.fileExtension,
+      ),
+    )
+  }
+
+  /** True when [err] is cancellation of the calling coroutine itself, not e.g. a nested timeout. */
+  private suspend fun isCallerCancelled(err: Throwable): Boolean =
+    err is kotlin.coroutines.cancellation.CancellationException &&
+      kotlin.coroutines.coroutineContext[kotlinx.coroutines.Job]?.isActive == false
+
+  /** Gateway error reasons for which the result is [TalkSpeakResult.FallbackToLocal]. */
+  private fun isFallbackEligible(reason: String?): Boolean =
+    reason == "talk_unconfigured" || reason == "talk_provider_unsupported" || reason == "method_unavailable"
+
+  /** Sends the RPC through the [requestDetailed] hook if set, else through [session]. */
+  private suspend fun performRequest(method: String, paramsJson: String, timeoutMs: Long): GatewaySession.RpcResult {
+    requestDetailed?.let { return it(method, paramsJson, timeoutMs) }
+    val activeSession = session ?: throw IllegalStateException("session missing")
+    return activeSession.requestDetailed(method = method, paramsJson = paramsJson, timeoutMs = timeoutMs)
+  }
+}
+
+/** Parameters of a `talk.speak` request; everything except [text] is optional and defaults to null. */
+@Serializable
+internal data class TalkSpeakRequest(
+  val text: String,
+  val voiceId: String? = null,
+  val modelId: String? = null,
+  val outputFormat: String? = null,
+  val speed: Double? = null,
+  val rateWpm: Int? = null,
+  val stability: Double? = null,
+  val similarity: Double? = null,
+  val style: Double? = null,
+  val speakerBoost: Boolean? = null,
+  val seed: Long? = null,
+  val normalize: String? = null,
+  val language: String? = null,
+  val latencyTier: Int? = null,
+) {
+  companion object {
+    /** Builds a request for [text], copying every tuning field from [directive] (null when absent). */
+    fun from(text: String, directive: TalkDirective?): TalkSpeakRequest = TalkSpeakRequest(
+      text = text,
+      voiceId = directive?.voiceId,
+      modelId = directive?.modelId,
+      outputFormat = directive?.outputFormat,
+      speed = directive?.speed,
+      rateWpm = directive?.rateWpm,
+      stability = directive?.stability,
+      similarity = directive?.similarity,
+      style = directive?.style,
+      speakerBoost = directive?.speakerBoost,
+      seed = directive?.seed,
+      normalize = directive?.normalize,
+      language = directive?.language,
+      latencyTier = directive?.latencyTier,
+    )
+  }
+}
+
+@Serializable
+private data class TalkSpeakResponse( // JSON payload of a successful talk.speak response, as decoded by TalkSpeakClient.synthesize
+  val audioBase64: String, // Base64-encoded audio; synthesize decodes it and rejects an empty result
+  val provider: String,
+  val outputFormat: String? = null,
+  val voiceCompatible: Boolean? = null,
+  val mimeType: String? = null,
+  val fileExtension: String? = null,
+)