feat(android): add talk.speak playback path

This commit is contained in:
Ayaan Zaidi
2026-04-04 22:03:54 +05:30
parent b558610ef3
commit 98d5939564
4 changed files with 415 additions and 8 deletions

View File

@@ -64,6 +64,7 @@ data class GatewayConnectErrorDetails(
val code: String?,
val canRetryWithDeviceToken: Boolean,
val recommendedNextStep: String?,
val reason: String? = null,
)
private data class SelectedConnectAuth(
@@ -116,6 +117,8 @@ class GatewaySession(
val details: GatewayConnectErrorDetails? = null,
)
data class RpcResult(val ok: Boolean, val payloadJson: String?, val error: ErrorShape?)
private val json = Json { ignoreUnknownKeys = true }
private val writeLock = Mutex()
private val pending = ConcurrentHashMap<String, CompletableDeferred<RpcResponse>>()
@@ -196,6 +199,13 @@ class GatewaySession(
}
/**
 * Sends an RPC and returns the payload JSON ("" when the payload is absent).
 * Throws IllegalStateException formatted as "CODE: message" when the gateway
 * reports an error (defaults: "UNAVAILABLE" / "request failed").
 */
suspend fun request(method: String, paramsJson: String?, timeoutMs: Long = 15_000): String {
val res = requestDetailed(method = method, paramsJson = paramsJson, timeoutMs = timeoutMs)
if (res.ok) return res.payloadJson ?: ""
val err = res.error
throw IllegalStateException("${err?.code ?: "UNAVAILABLE"}: ${err?.message ?: "request failed"}")
}
suspend fun requestDetailed(method: String, paramsJson: String?, timeoutMs: Long = 15_000): RpcResult {
val conn = currentConnection ?: throw IllegalStateException("not connected")
val params =
if (paramsJson.isNullOrBlank()) {
@@ -204,9 +214,7 @@ class GatewaySession(
json.parseToJsonElement(paramsJson)
}
val res = conn.request(method, params, timeoutMs)
if (res.ok) return res.payloadJson ?: ""
val err = res.error
throw IllegalStateException("${err?.code ?: "UNAVAILABLE"}: ${err?.message ?: "request failed"}")
return RpcResult(ok = res.ok, payloadJson = res.payloadJson, error = res.error)
}
suspend fun refreshNodeCanvasCapability(timeoutMs: Long = 8_000): Boolean {
@@ -631,6 +639,7 @@ class GatewaySession(
code = it["code"].asStringOrNull(),
canRetryWithDeviceToken = it["canRetryWithDeviceToken"].asBooleanOrNull() == true,
recommendedNextStep = it["recommendedNextStep"].asStringOrNull(),
reason = it["reason"].asStringOrNull(),
)
}
ErrorShape(code, msg, details)

View File

@@ -0,0 +1,237 @@
package ai.openclaw.app.voice
import android.content.Context
import android.media.AudioAttributes
import android.media.AudioFormat
import android.media.AudioTrack
import android.media.MediaPlayer
import kotlinx.coroutines.CancellationException
import kotlinx.coroutines.CompletableDeferred
import kotlinx.coroutines.Dispatchers
import kotlinx.coroutines.delay
import kotlinx.coroutines.withContext
import java.io.File
/**
 * Plays synthesized talk-mode audio on-device.
 *
 * Two playback paths, chosen by [resolvePlaybackMode]:
 *  - raw PCM ("pcm_<rate>" output formats) → a static [AudioTrack];
 *  - compressed audio (mp3/ogg/wav/webm) → a cache temp file + [MediaPlayer].
 *
 * At most one playback is active at a time: starting a new one cancels the
 * previous, and [stop] cancels whatever is in flight.
 */
internal class TalkAudioPlayer(
    private val context: Context,
) {
    // Guards [active]; stop() may run on a different thread than play().
    private val lock = Any()
    // The playback currently in flight, if any.
    private var active: ActivePlayback? = null

    /**
     * Plays [audio] and suspends until playback finishes or is cancelled.
     * Throws IllegalStateException for unsupported formats or playback failures;
     * cancellation surfaces as a CancellationException from the playback paths.
     */
    suspend fun play(audio: TalkSpeakAudio) {
        when (val mode = resolvePlaybackMode(audio)) {
            is TalkPlaybackMode.Pcm -> playPcm(audio.bytes, mode.sampleRate)
            is TalkPlaybackMode.Compressed -> playCompressed(audio.bytes, mode.fileExtension)
        }
    }

    /** Cancels the in-flight playback, if any. Safe to call when idle. */
    fun stop() {
        synchronized(lock) {
            active?.cancel()
            active = null
        }
    }

    /** Instance convenience over the companion [resolvePlaybackMode]. */
    internal fun resolvePlaybackMode(audio: TalkSpeakAudio): TalkPlaybackMode {
        return resolvePlaybackMode(
            outputFormat = audio.outputFormat,
            mimeType = audio.mimeType,
            fileExtension = audio.fileExtension,
        )
    }

    companion object {
        /**
         * Decides how to play audio from the server-reported format hints.
         *
         * Precedence: a recognized PCM output format (e.g. "pcm_24000") wins →
         * [TalkPlaybackMode.Pcm]; otherwise an explicit [fileExtension] (or one
         * inferred from format/MIME) → [TalkPlaybackMode.Compressed]; otherwise
         * throws IllegalStateException.
         */
        internal fun resolvePlaybackMode(
            outputFormat: String?,
            mimeType: String?,
            fileExtension: String?,
        ): TalkPlaybackMode {
            val normalizedOutputFormat = outputFormat?.trim()?.lowercase()
            if (normalizedOutputFormat != null) {
                val pcmSampleRate = parsePcmSampleRate(normalizedOutputFormat)
                if (pcmSampleRate != null) {
                    return TalkPlaybackMode.Pcm(sampleRate = pcmSampleRate)
                }
            }
            val normalizedMimeType = mimeType?.trim()?.lowercase()
            // An explicit fileExtension takes precedence over inference.
            val extension =
                normalizeExtension(
                    fileExtension ?: inferExtension(outputFormat = normalizedOutputFormat, mimeType = normalizedMimeType),
                )
            if (extension != null) {
                return TalkPlaybackMode.Compressed(fileExtension = extension)
            }
            throw IllegalStateException("Unsupported talk audio format")
        }

        /** Maps the supported "pcm_<rate>" output formats to a sample rate in Hz. */
        private fun parsePcmSampleRate(outputFormat: String): Int? {
            return when (outputFormat) {
                "pcm_16000" -> 16_000
                "pcm_22050" -> 22_050
                "pcm_24000" -> 24_000
                "pcm_44100" -> 44_100
                else -> null
            }
        }

        /** Guesses a playable file extension from the output format id or MIME type. */
        private fun inferExtension(outputFormat: String?, mimeType: String?): String? {
            return when {
                outputFormat == "mp3" || outputFormat?.startsWith("mp3_") == true || mimeType == "audio/mpeg" -> ".mp3"
                outputFormat == "opus" || outputFormat?.startsWith("opus_") == true || mimeType == "audio/ogg" -> ".ogg"
                // "-wav"/"-webm" suffix matching — presumably gateway format ids
                // like "pcm-wav"; TODO confirm against the gateway's format names.
                outputFormat?.endsWith("-wav") == true || mimeType == "audio/wav" -> ".wav"
                outputFormat?.endsWith("-webm") == true || mimeType == "audio/webm" -> ".webm"
                else -> null
            }
        }

        /** Lowercases/trims and ensures a leading dot; returns null for blank input. */
        private fun normalizeExtension(value: String?): String? {
            val trimmed = value?.trim()?.lowercase().orEmpty()
            if (trimmed.isEmpty()) return null
            return if (trimmed.startsWith(".")) trimmed else ".$trimmed"
        }
    }

    /**
     * Plays 16-bit mono PCM via a MODE_STATIC AudioTrack: the whole clip is
     * written up front, then completion is polled via the playback head.
     */
    private suspend fun playPcm(bytes: ByteArray, sampleRate: Int) {
        withContext(Dispatchers.IO) {
            val minBufferSize =
                AudioTrack.getMinBufferSize(
                    sampleRate,
                    AudioFormat.CHANNEL_OUT_MONO,
                    AudioFormat.ENCODING_PCM_16BIT,
                )
            if (minBufferSize <= 0) {
                throw IllegalStateException("AudioTrack buffer unavailable")
            }
            val track =
                AudioTrack.Builder()
                    .setAudioAttributes(
                        AudioAttributes.Builder()
                            .setUsage(AudioAttributes.USAGE_MEDIA)
                            .setContentType(AudioAttributes.CONTENT_TYPE_SPEECH)
                            .build(),
                    )
                    .setAudioFormat(
                        AudioFormat.Builder()
                            .setEncoding(AudioFormat.ENCODING_PCM_16BIT)
                            .setSampleRate(sampleRate)
                            .setChannelMask(AudioFormat.CHANNEL_OUT_MONO)
                            .build(),
                    )
                    // MODE_STATIC: the buffer must be able to hold the whole clip.
                    .setTransferMode(AudioTrack.MODE_STATIC)
                    .setBufferSizeInBytes(maxOf(minBufferSize, bytes.size))
                    .build()
            val finished = CompletableDeferred<Unit>()
            val playback =
                ActivePlayback(
                    cancel = {
                        // Fail the deferred first so the awaiting coroutine unblocks,
                        // then halt the track best-effort (pause → flush → stop).
                        finished.completeExceptionally(CancellationException("assistant speech cancelled"))
                        runCatching { track.pause() }
                        runCatching { track.flush() }
                        runCatching { track.stop() }
                    },
                )
            register(playback)
            try {
                val written = track.write(bytes, 0, bytes.size)
                if (written != bytes.size) {
                    throw IllegalStateException("AudioTrack write failed")
                }
                // 16-bit mono → 2 bytes per frame.
                val totalFrames = bytes.size / 2
                track.play()
                // Poll until the head reaches the end, or the track leaves the
                // PLAYING state (e.g. cancel() paused it).
                while (track.playState == AudioTrack.PLAYSTATE_PLAYING) {
                    if (track.playbackHeadPosition >= totalFrames) {
                        finished.complete(Unit)
                        break
                    }
                    delay(20)
                }
                if (!finished.isCompleted) {
                    finished.complete(Unit)
                }
                // Rethrows the CancellationException set by cancel(), if any.
                finished.await()
            } finally {
                clear(playback)
                runCatching { track.pause() }
                runCatching { track.flush() }
                runCatching { track.stop() }
                track.release()
            }
        }
    }

    /**
     * Plays compressed audio by writing it to a cache temp file and playing it
     * with MediaPlayer on the main dispatcher; the file is deleted afterwards.
     */
    private suspend fun playCompressed(bytes: ByteArray, fileExtension: String) {
        val tempFile = withContext(Dispatchers.IO) {
            File.createTempFile("talk-audio-", fileExtension, context.cacheDir).apply {
                writeBytes(bytes)
            }
        }
        val finished = CompletableDeferred<Unit>()
        val player =
            withContext(Dispatchers.Main) {
                MediaPlayer().apply {
                    setAudioAttributes(
                        AudioAttributes.Builder()
                            .setUsage(AudioAttributes.USAGE_MEDIA)
                            .setContentType(AudioAttributes.CONTENT_TYPE_SPEECH)
                            .build(),
                    )
                    setDataSource(tempFile.absolutePath)
                    setOnCompletionListener {
                        finished.complete(Unit)
                    }
                    setOnErrorListener { _, what, extra ->
                        finished.completeExceptionally(IllegalStateException("MediaPlayer error ($what/$extra)"))
                        true // consume the error so completion is not also dispatched
                    }
                    // NOTE(review): synchronous prepare() runs on Main here; the
                    // source is a local file, but consider prepareAsync for large clips.
                    prepare()
                    start()
                }
            }
        val playback =
            ActivePlayback(
                cancel = {
                    finished.completeExceptionally(CancellationException("assistant speech cancelled"))
                    runCatching { player.stop() }
                },
            )
        register(playback)
        try {
            finished.await()
        } finally {
            clear(playback)
            withContext(Dispatchers.Main) {
                runCatching { player.stop() }
                player.release()
            }
            withContext(Dispatchers.IO) {
                tempFile.delete()
            }
        }
    }

    /** Installs [playback] as the active one, cancelling any previous playback. */
    private fun register(playback: ActivePlayback) {
        synchronized(lock) {
            active?.cancel()
            active = playback
        }
    }

    /** Clears [playback] only if it is still the active one (a newer playback may have replaced it). */
    private fun clear(playback: ActivePlayback) {
        synchronized(lock) {
            if (active === playback) {
                active = null
            }
        }
    }
}
/** How a talk audio payload should be played back. */
internal sealed interface TalkPlaybackMode {
    /** Raw 16-bit mono PCM at [sampleRate] Hz, played through AudioTrack. */
    data class Pcm(val sampleRate: Int) : TalkPlaybackMode
    /** Compressed audio written to a temp file with [fileExtension] (leading dot) and played via MediaPlayer. */
    data class Compressed(val fileExtension: String) : TalkPlaybackMode
}
/** Handle to an in-flight playback; invoking [cancel] requests that it stop and unblocks its awaiter. */
private class ActivePlayback(
    val cancel: () -> Unit,
)

View File

@@ -14,9 +14,9 @@ import android.os.SystemClock
import android.speech.RecognitionListener
import android.speech.RecognizerIntent
import android.speech.SpeechRecognizer
import android.util.Log
import android.speech.tts.TextToSpeech
import android.speech.tts.UtteranceProgressListener
import android.util.Log
import androidx.core.content.ContextCompat
import ai.openclaw.app.gateway.GatewaySession
import java.util.Locale
@@ -61,6 +61,8 @@ class TalkModeManager(
private val mainHandler = Handler(Looper.getMainLooper())
private val json = Json { ignoreUnknownKeys = true }
private val talkSpeakClient = TalkSpeakClient(session = session, json = json)
private val talkAudioPlayer = TalkAudioPlayer(context)
private val _isEnabled = MutableStateFlow(false)
val isEnabled: StateFlow<Boolean> = _isEnabled
@@ -663,16 +665,32 @@ class TalkModeManager(
requestAudioFocusForTts()
try {
val ttsStarted = SystemClock.elapsedRealtime()
speakWithSystemTts(cleaned, directive, playbackToken)
Log.d(tag, "system tts ok durMs=${SystemClock.elapsedRealtime() - ttsStarted}")
val started = SystemClock.elapsedRealtime()
when (val result = talkSpeakClient.synthesize(text = cleaned, directive = directive)) {
is TalkSpeakResult.Success -> {
ensurePlaybackActive(playbackToken)
talkAudioPlayer.play(result.audio)
ensurePlaybackActive(playbackToken)
Log.d(tag, "talk.speak ok durMs=${SystemClock.elapsedRealtime() - started}")
}
is TalkSpeakResult.FallbackToLocal -> {
Log.d(tag, "talk.speak unavailable; using local TTS: ${result.message}")
speakWithSystemTts(cleaned, directive, playbackToken)
Log.d(tag, "system tts ok durMs=${SystemClock.elapsedRealtime() - started}")
}
is TalkSpeakResult.Failure -> {
throw IllegalStateException(result.message)
}
}
} catch (err: Throwable) {
if (isPlaybackCancelled(err, playbackToken)) {
Log.d(tag, "assistant speech cancelled")
return
}
_statusText.value = "Speak failed: ${err.message ?: err::class.simpleName}"
Log.w(tag, "system tts failed: ${err.message ?: err::class.simpleName}")
Log.w(tag, "talk playback failed: ${err.message ?: err::class.simpleName}")
} finally {
_isSpeaking.value = false
}
}
@@ -812,6 +830,7 @@ class TalkModeManager(
private fun stopSpeaking(resetInterrupt: Boolean = true) {
if (!_isSpeaking.value) {
talkAudioPlayer.stop()
stopTextToSpeechPlayback()
abandonAudioFocus()
return
@@ -819,6 +838,7 @@ class TalkModeManager(
if (resetInterrupt) {
lastInterruptedAtSeconds = null
}
talkAudioPlayer.stop()
stopTextToSpeechPlayback()
_isSpeaking.value = false
abandonAudioFocus()

View File

@@ -0,0 +1,141 @@
package ai.openclaw.app.voice
import ai.openclaw.app.gateway.GatewaySession
import kotlinx.serialization.Serializable
import kotlinx.serialization.encodeToString
import kotlinx.serialization.json.Json
/**
 * Decoded audio returned by the `talk.speak` RPC.
 *
 * @property bytes raw audio payload (PCM or a compressed container).
 * @property provider TTS provider id reported by the gateway.
 * @property outputFormat provider output-format id (e.g. "pcm_24000"), if reported.
 * @property voiceCompatible provider-reported voice-compatibility flag, if any.
 * @property mimeType MIME type hint, if reported.
 * @property fileExtension file-extension hint, if reported.
 *
 * [bytes] is an array, so the compiler-generated data-class equals/hashCode
 * would compare it by reference; they are overridden here to use content
 * equality so two results carrying identical audio compare equal.
 */
internal data class TalkSpeakAudio(
    val bytes: ByteArray,
    val provider: String,
    val outputFormat: String?,
    val voiceCompatible: Boolean?,
    val mimeType: String?,
    val fileExtension: String?,
) {
    override fun equals(other: Any?): Boolean {
        if (this === other) return true
        if (other !is TalkSpeakAudio) return false
        return bytes.contentEquals(other.bytes) &&
            provider == other.provider &&
            outputFormat == other.outputFormat &&
            voiceCompatible == other.voiceCompatible &&
            mimeType == other.mimeType &&
            fileExtension == other.fileExtension
    }

    override fun hashCode(): Int {
        var result = bytes.contentHashCode()
        result = 31 * result + provider.hashCode()
        result = 31 * result + (outputFormat?.hashCode() ?: 0)
        result = 31 * result + (voiceCompatible?.hashCode() ?: 0)
        result = 31 * result + (mimeType?.hashCode() ?: 0)
        result = 31 * result + (fileExtension?.hashCode() ?: 0)
        return result
    }
}
/** Outcome of one `talk.speak` synthesis attempt. */
internal sealed interface TalkSpeakResult {
    /** The gateway returned playable audio. */
    data class Success(val audio: TalkSpeakAudio) : TalkSpeakResult
    /** The gateway cannot synthesize for a recoverable reason; caller should use local TTS. */
    data class FallbackToLocal(val message: String) : TalkSpeakResult
    /** Hard failure; no fallback is attempted. */
    data class Failure(val message: String) : TalkSpeakResult
}
/**
 * Client for the gateway `talk.speak` RPC.
 *
 * Normally constructed with a live [session]; tests may instead inject a
 * [requestDetailed] transport, which takes precedence when non-null.
 */
internal class TalkSpeakClient(
    private val session: GatewaySession? = null,
    private val json: Json = Json { ignoreUnknownKeys = true },
    private val requestDetailed: (suspend (String, String, Long) -> GatewaySession.RpcResult)? = null,
) {
    /**
     * Synthesizes [text] (with optional [directive] tuning) via `talk.speak`.
     *
     * Returns [TalkSpeakResult.Success] with the decoded audio bytes,
     * [TalkSpeakResult.FallbackToLocal] when the error reason indicates local
     * TTS should take over, or [TalkSpeakResult.Failure] for any other error.
     * Coroutine cancellation is always rethrown, never mapped into a result.
     */
    suspend fun synthesize(text: String, directive: TalkDirective?): TalkSpeakResult {
        val response =
            try {
                performRequest(
                    method = "talk.speak",
                    paramsJson = json.encodeToString(TalkSpeakRequest.from(text = text, directive = directive)),
                    timeoutMs = 45_000,
                )
            } catch (err: Throwable) {
                // Fix: do not swallow coroutine cancellation as a Failure result —
                // the awaiting scope must observe it so cancellation propagates.
                if (err is kotlin.coroutines.cancellation.CancellationException) throw err
                return TalkSpeakResult.Failure(err.message ?: "talk.speak request failed")
            }
        if (!response.ok) {
            val error = response.error
            val message = error?.message ?: "talk.speak request failed"
            return if (isFallbackEligible(error?.details?.reason)) {
                TalkSpeakResult.FallbackToLocal(message)
            } else {
                TalkSpeakResult.Failure(message)
            }
        }
        // Decode the JSON payload; an absent payload decodes as "" and fails here.
        val payload =
            try {
                json.decodeFromString<TalkSpeakResponse>(response.payloadJson ?: "")
            } catch (err: Throwable) {
                return TalkSpeakResult.Failure(err.message ?: "talk.speak payload invalid")
            }
        // Audio travels base64-encoded inside the JSON payload.
        val bytes =
            try {
                android.util.Base64.decode(payload.audioBase64, android.util.Base64.DEFAULT)
            } catch (err: Throwable) {
                return TalkSpeakResult.Failure(err.message ?: "talk.speak audio decode failed")
            }
        if (bytes.isEmpty()) {
            return TalkSpeakResult.Failure("talk.speak returned empty audio")
        }
        return TalkSpeakResult.Success(
            TalkSpeakAudio(
                bytes = bytes,
                provider = payload.provider,
                outputFormat = payload.outputFormat,
                voiceCompatible = payload.voiceCompatible,
                mimeType = payload.mimeType,
                fileExtension = payload.fileExtension,
            ),
        )
    }

    /** Error reasons for which falling back to on-device TTS is appropriate. */
    private fun isFallbackEligible(reason: String?): Boolean {
        return reason == "talk_unconfigured" ||
            reason == "talk_provider_unsupported" ||
            reason == "method_unavailable"
    }

    /** Routes through the injected transport when present, else the live session. */
    private suspend fun performRequest(
        method: String,
        paramsJson: String,
        timeoutMs: Long,
    ): GatewaySession.RpcResult {
        requestDetailed?.let { return it(method, paramsJson, timeoutMs) }
        val activeSession = session ?: throw IllegalStateException("session missing")
        return activeSession.requestDetailed(method = method, paramsJson = paramsJson, timeoutMs = timeoutMs)
    }
}
/**
 * JSON-serializable request body for the `talk.speak` RPC. Every tuning field
 * is optional and stays null (omitted) unless the talk directive supplies it.
 */
@Serializable
internal data class TalkSpeakRequest(
    val text: String,
    val voiceId: String? = null,
    val modelId: String? = null,
    val outputFormat: String? = null,
    val speed: Double? = null,
    val rateWpm: Int? = null,
    val stability: Double? = null,
    val similarity: Double? = null,
    val style: Double? = null,
    val speakerBoost: Boolean? = null,
    val seed: Long? = null,
    val normalize: String? = null,
    val language: String? = null,
    val latencyTier: Int? = null,
) {
    companion object {
        /** Builds a request for [text], copying any tuning fields present on [directive]. */
        fun from(text: String, directive: TalkDirective?): TalkSpeakRequest =
            when (directive) {
                // No directive: plain text request, all tuning fields left at their defaults.
                null -> TalkSpeakRequest(text = text)
                else ->
                    TalkSpeakRequest(
                        text = text,
                        voiceId = directive.voiceId,
                        modelId = directive.modelId,
                        outputFormat = directive.outputFormat,
                        speed = directive.speed,
                        rateWpm = directive.rateWpm,
                        stability = directive.stability,
                        similarity = directive.similarity,
                        style = directive.style,
                        speakerBoost = directive.speakerBoost,
                        seed = directive.seed,
                        normalize = directive.normalize,
                        language = directive.language,
                        latencyTier = directive.latencyTier,
                    )
            }
    }
}
/**
 * Wire shape of the `talk.speak` payload. [audioBase64] and [provider] are
 * required; the remaining fields are optional format hints used by playback.
 */
@Serializable
private data class TalkSpeakResponse(
    val audioBase64: String,
    val provider: String,
    val outputFormat: String? = null,
    val voiceCompatible: Boolean? = null,
    val mimeType: String? = null,
    val fileExtension: String? = null,
)