mirror of
https://github.com/openclaw/openclaw.git
synced 2026-04-10 16:51:13 +00:00
feat(android): add talk.speak playback path
This commit is contained in:
@@ -64,6 +64,7 @@ data class GatewayConnectErrorDetails(
|
||||
val code: String?,
|
||||
val canRetryWithDeviceToken: Boolean,
|
||||
val recommendedNextStep: String?,
|
||||
val reason: String? = null,
|
||||
)
|
||||
|
||||
private data class SelectedConnectAuth(
|
||||
@@ -116,6 +117,8 @@ class GatewaySession(
|
||||
val details: GatewayConnectErrorDetails? = null,
|
||||
)
|
||||
|
||||
data class RpcResult(val ok: Boolean, val payloadJson: String?, val error: ErrorShape?)
|
||||
|
||||
private val json = Json { ignoreUnknownKeys = true }
|
||||
private val writeLock = Mutex()
|
||||
private val pending = ConcurrentHashMap<String, CompletableDeferred<RpcResponse>>()
|
||||
@@ -196,6 +199,13 @@ class GatewaySession(
|
||||
}
|
||||
|
||||
/**
 * Convenience wrapper around [requestDetailed] that returns the raw payload JSON on
 * success and throws on failure.
 *
 * @return the response payload JSON, or "" when the gateway returned no payload.
 * @throws IllegalStateException when the RPC reports an error; the message is
 *   "<code>: <message>" with "UNAVAILABLE" / "request failed" as fallbacks.
 */
suspend fun request(method: String, paramsJson: String?, timeoutMs: Long = 15_000): String {
    val result = requestDetailed(method = method, paramsJson = paramsJson, timeoutMs = timeoutMs)
    if (!result.ok) {
        val failure = result.error
        val code = failure?.code ?: "UNAVAILABLE"
        val message = failure?.message ?: "request failed"
        throw IllegalStateException("$code: $message")
    }
    return result.payloadJson ?: ""
}
|
||||
|
||||
suspend fun requestDetailed(method: String, paramsJson: String?, timeoutMs: Long = 15_000): RpcResult {
|
||||
val conn = currentConnection ?: throw IllegalStateException("not connected")
|
||||
val params =
|
||||
if (paramsJson.isNullOrBlank()) {
|
||||
@@ -204,9 +214,7 @@ class GatewaySession(
|
||||
json.parseToJsonElement(paramsJson)
|
||||
}
|
||||
val res = conn.request(method, params, timeoutMs)
|
||||
if (res.ok) return res.payloadJson ?: ""
|
||||
val err = res.error
|
||||
throw IllegalStateException("${err?.code ?: "UNAVAILABLE"}: ${err?.message ?: "request failed"}")
|
||||
return RpcResult(ok = res.ok, payloadJson = res.payloadJson, error = res.error)
|
||||
}
|
||||
|
||||
suspend fun refreshNodeCanvasCapability(timeoutMs: Long = 8_000): Boolean {
|
||||
@@ -631,6 +639,7 @@ class GatewaySession(
|
||||
code = it["code"].asStringOrNull(),
|
||||
canRetryWithDeviceToken = it["canRetryWithDeviceToken"].asBooleanOrNull() == true,
|
||||
recommendedNextStep = it["recommendedNextStep"].asStringOrNull(),
|
||||
reason = it["reason"].asStringOrNull(),
|
||||
)
|
||||
}
|
||||
ErrorShape(code, msg, details)
|
||||
|
||||
@@ -0,0 +1,237 @@
|
||||
package ai.openclaw.app.voice
|
||||
|
||||
import android.content.Context
|
||||
import android.media.AudioAttributes
|
||||
import android.media.AudioFormat
|
||||
import android.media.AudioTrack
|
||||
import android.media.MediaPlayer
|
||||
import kotlinx.coroutines.CancellationException
|
||||
import kotlinx.coroutines.CompletableDeferred
|
||||
import kotlinx.coroutines.Dispatchers
|
||||
import kotlinx.coroutines.delay
|
||||
import kotlinx.coroutines.withContext
|
||||
import java.io.File
|
||||
|
||||
/**
 * Plays synthesized talk audio on-device.
 *
 * Supports two paths, chosen by [resolvePlaybackMode]:
 *  - raw 16-bit mono PCM at a known sample rate, streamed through [AudioTrack];
 *  - compressed audio (mp3/ogg/wav/webm), written to a cache temp file and played
 *    via [MediaPlayer].
 *
 * Only one playback is active at a time: starting a new one cancels the previous
 * (see [register]). [stop] cancels whatever is currently playing.
 */
internal class TalkAudioPlayer(
    private val context: Context,
) {
    // Guards [active]; playback may be started/stopped from different threads.
    private val lock = Any()
    // The currently playing (cancellable) playback, if any.
    private var active: ActivePlayback? = null

    /**
     * Plays [audio] to completion (suspends until done or cancelled).
     *
     * @throws IllegalStateException if the audio format is unsupported or the
     *   underlying player fails.
     * @throws CancellationException if [stop] (or a newer [play]) cancels this playback.
     */
    suspend fun play(audio: TalkSpeakAudio) {
        when (val mode = resolvePlaybackMode(audio)) {
            is TalkPlaybackMode.Pcm -> playPcm(audio.bytes, mode.sampleRate)
            is TalkPlaybackMode.Compressed -> playCompressed(audio.bytes, mode.fileExtension)
        }
    }

    /** Cancels the active playback, if any. Safe to call when nothing is playing. */
    fun stop() {
        synchronized(lock) {
            active?.cancel()
            active = null
        }
    }

    /** Instance-level convenience over the companion resolver, fed from [audio]'s metadata. */
    internal fun resolvePlaybackMode(audio: TalkSpeakAudio): TalkPlaybackMode {
        return resolvePlaybackMode(
            outputFormat = audio.outputFormat,
            mimeType = audio.mimeType,
            fileExtension = audio.fileExtension,
        )
    }

    companion object {
        /**
         * Decides how to play audio described by the server-provided metadata.
         *
         * Resolution order:
         *  1. a recognized `pcm_<rate>` output format -> [TalkPlaybackMode.Pcm];
         *  2. an explicit or inferred file extension -> [TalkPlaybackMode.Compressed];
         *  3. otherwise throws [IllegalStateException].
         */
        internal fun resolvePlaybackMode(
            outputFormat: String?,
            mimeType: String?,
            fileExtension: String?,
        ): TalkPlaybackMode {
            val normalizedOutputFormat = outputFormat?.trim()?.lowercase()
            if (normalizedOutputFormat != null) {
                val pcmSampleRate = parsePcmSampleRate(normalizedOutputFormat)
                if (pcmSampleRate != null) {
                    return TalkPlaybackMode.Pcm(sampleRate = pcmSampleRate)
                }
            }
            val normalizedMimeType = mimeType?.trim()?.lowercase()
            // Explicit fileExtension wins; fall back to inferring from format/mime.
            val extension =
                normalizeExtension(
                    fileExtension ?: inferExtension(outputFormat = normalizedOutputFormat, mimeType = normalizedMimeType),
                )
            if (extension != null) {
                return TalkPlaybackMode.Compressed(fileExtension = extension)
            }
            throw IllegalStateException("Unsupported talk audio format")
        }

        // Maps the fixed set of supported pcm_* format ids to their sample rates (Hz).
        private fun parsePcmSampleRate(outputFormat: String): Int? {
            return when (outputFormat) {
                "pcm_16000" -> 16_000
                "pcm_22050" -> 22_050
                "pcm_24000" -> 24_000
                "pcm_44100" -> 44_100
                else -> null
            }
        }

        // Infers a file extension from the output format id or MIME type.
        // NOTE(review): matching is asymmetric — "mp3"/"opus" use a prefix check
        // ("mp3_*", "opus_*") while wav/webm use a "-wav"/"-webm" suffix check;
        // presumably this mirrors the provider's format-id naming — confirm.
        private fun inferExtension(outputFormat: String?, mimeType: String?): String? {
            return when {
                outputFormat == "mp3" || outputFormat?.startsWith("mp3_") == true || mimeType == "audio/mpeg" -> ".mp3"
                outputFormat == "opus" || outputFormat?.startsWith("opus_") == true || mimeType == "audio/ogg" -> ".ogg"
                outputFormat?.endsWith("-wav") == true || mimeType == "audio/wav" -> ".wav"
                outputFormat?.endsWith("-webm") == true || mimeType == "audio/webm" -> ".webm"
                else -> null
            }
        }

        // Lowercases/trims an extension and guarantees a leading dot; null for blank input.
        private fun normalizeExtension(value: String?): String? {
            val trimmed = value?.trim()?.lowercase().orEmpty()
            if (trimmed.isEmpty()) return null
            return if (trimmed.startsWith(".")) trimmed else ".$trimmed"
        }
    }

    /**
     * Plays raw 16-bit mono PCM via a MODE_STATIC [AudioTrack].
     *
     * The whole buffer is written up front, then playback progress is polled every
     * 20 ms until the head position reaches the final frame. Cancellation (via the
     * registered [ActivePlayback]) completes [finished] exceptionally and pauses/
     * flushes/stops the track, which also breaks the polling loop.
     */
    private suspend fun playPcm(bytes: ByteArray, sampleRate: Int) {
        withContext(Dispatchers.IO) {
            val minBufferSize =
                AudioTrack.getMinBufferSize(
                    sampleRate,
                    AudioFormat.CHANNEL_OUT_MONO,
                    AudioFormat.ENCODING_PCM_16BIT,
                )
            if (minBufferSize <= 0) {
                throw IllegalStateException("AudioTrack buffer unavailable")
            }
            val track =
                AudioTrack.Builder()
                    .setAudioAttributes(
                        AudioAttributes.Builder()
                            .setUsage(AudioAttributes.USAGE_MEDIA)
                            .setContentType(AudioAttributes.CONTENT_TYPE_SPEECH)
                            .build(),
                    )
                    .setAudioFormat(
                        AudioFormat.Builder()
                            .setEncoding(AudioFormat.ENCODING_PCM_16BIT)
                            .setSampleRate(sampleRate)
                            .setChannelMask(AudioFormat.CHANNEL_OUT_MONO)
                            .build(),
                    )
                    // MODE_STATIC: buffer must hold the entire clip.
                    .setTransferMode(AudioTrack.MODE_STATIC)
                    .setBufferSizeInBytes(maxOf(minBufferSize, bytes.size))
                    .build()
            val finished = CompletableDeferred<Unit>()
            val playback =
                ActivePlayback(
                    cancel = {
                        finished.completeExceptionally(CancellationException("assistant speech cancelled"))
                        runCatching { track.pause() }
                        runCatching { track.flush() }
                        runCatching { track.stop() }
                    },
                )
            register(playback)
            try {
                val written = track.write(bytes, 0, bytes.size)
                if (written != bytes.size) {
                    throw IllegalStateException("AudioTrack write failed")
                }
                // 16-bit mono => 2 bytes per frame.
                val totalFrames = bytes.size / 2
                track.play()
                while (track.playState == AudioTrack.PLAYSTATE_PLAYING) {
                    if (track.playbackHeadPosition >= totalFrames) {
                        finished.complete(Unit)
                        break
                    }
                    delay(20)
                }
                // Loop may exit because the track stopped (e.g. cancel); ensure completion.
                if (!finished.isCompleted) {
                    finished.complete(Unit)
                }
                // Rethrows the CancellationException if playback was cancelled.
                finished.await()
            } finally {
                clear(playback)
                runCatching { track.pause() }
                runCatching { track.flush() }
                runCatching { track.stop() }
                track.release()
            }
        }
    }

    /**
     * Plays compressed audio by staging it in a cache temp file and handing the
     * path to [MediaPlayer]. The temp file is deleted and the player released in
     * the finally block regardless of outcome.
     */
    private suspend fun playCompressed(bytes: ByteArray, fileExtension: String) {
        val tempFile = withContext(Dispatchers.IO) {
            File.createTempFile("talk-audio-", fileExtension, context.cacheDir).apply {
                writeBytes(bytes)
            }
        }
        val finished = CompletableDeferred<Unit>()
        val player =
            // MediaPlayer is configured and started on the main thread.
            withContext(Dispatchers.Main) {
                MediaPlayer().apply {
                    setAudioAttributes(
                        AudioAttributes.Builder()
                            .setUsage(AudioAttributes.USAGE_MEDIA)
                            .setContentType(AudioAttributes.CONTENT_TYPE_SPEECH)
                            .build(),
                    )
                    setDataSource(tempFile.absolutePath)
                    setOnCompletionListener {
                        finished.complete(Unit)
                    }
                    setOnErrorListener { _, what, extra ->
                        finished.completeExceptionally(IllegalStateException("MediaPlayer error ($what/$extra)"))
                        true
                    }
                    prepare()
                    start()
                }
            }
        val playback =
            ActivePlayback(
                cancel = {
                    finished.completeExceptionally(CancellationException("assistant speech cancelled"))
                    runCatching { player.stop() }
                },
            )
        register(playback)
        try {
            finished.await()
        } finally {
            clear(playback)
            withContext(Dispatchers.Main) {
                runCatching { player.stop() }
                player.release()
            }
            withContext(Dispatchers.IO) {
                tempFile.delete()
            }
        }
    }

    // Makes [playback] the single active playback, cancelling any predecessor.
    private fun register(playback: ActivePlayback) {
        synchronized(lock) {
            active?.cancel()
            active = playback
        }
    }

    // Clears [playback] from [active] only if it is still the active one
    // (a newer playback may have replaced it).
    private fun clear(playback: ActivePlayback) {
        synchronized(lock) {
            if (active === playback) {
                active = null
            }
        }
    }
}
|
||||
|
||||
/** How a [TalkSpeakAudio] payload should be played back. */
internal sealed interface TalkPlaybackMode {
    /** Raw 16-bit mono PCM at [sampleRate] Hz, played via AudioTrack. */
    data class Pcm(val sampleRate: Int) : TalkPlaybackMode

    /** Compressed audio; [fileExtension] (with leading dot) names the container for MediaPlayer. */
    data class Compressed(val fileExtension: String) : TalkPlaybackMode
}
|
||||
|
||||
/**
 * Handle for an in-flight playback; [cancel] aborts it.
 * [cancel] may be invoked from any thread and must be safe to call more than once.
 */
private class ActivePlayback(
    val cancel: () -> Unit,
)
|
||||
@@ -14,9 +14,9 @@ import android.os.SystemClock
|
||||
import android.speech.RecognitionListener
|
||||
import android.speech.RecognizerIntent
|
||||
import android.speech.SpeechRecognizer
|
||||
import android.util.Log
|
||||
import android.speech.tts.TextToSpeech
|
||||
import android.speech.tts.UtteranceProgressListener
|
||||
import android.util.Log
|
||||
import androidx.core.content.ContextCompat
|
||||
import ai.openclaw.app.gateway.GatewaySession
|
||||
import java.util.Locale
|
||||
@@ -61,6 +61,8 @@ class TalkModeManager(
|
||||
|
||||
private val mainHandler = Handler(Looper.getMainLooper())
|
||||
private val json = Json { ignoreUnknownKeys = true }
|
||||
private val talkSpeakClient = TalkSpeakClient(session = session, json = json)
|
||||
private val talkAudioPlayer = TalkAudioPlayer(context)
|
||||
|
||||
private val _isEnabled = MutableStateFlow(false)
|
||||
val isEnabled: StateFlow<Boolean> = _isEnabled
|
||||
@@ -663,16 +665,32 @@ class TalkModeManager(
|
||||
requestAudioFocusForTts()
|
||||
|
||||
try {
|
||||
val ttsStarted = SystemClock.elapsedRealtime()
|
||||
speakWithSystemTts(cleaned, directive, playbackToken)
|
||||
Log.d(tag, "system tts ok durMs=${SystemClock.elapsedRealtime() - ttsStarted}")
|
||||
val started = SystemClock.elapsedRealtime()
|
||||
when (val result = talkSpeakClient.synthesize(text = cleaned, directive = directive)) {
|
||||
is TalkSpeakResult.Success -> {
|
||||
ensurePlaybackActive(playbackToken)
|
||||
talkAudioPlayer.play(result.audio)
|
||||
ensurePlaybackActive(playbackToken)
|
||||
Log.d(tag, "talk.speak ok durMs=${SystemClock.elapsedRealtime() - started}")
|
||||
}
|
||||
is TalkSpeakResult.FallbackToLocal -> {
|
||||
Log.d(tag, "talk.speak unavailable; using local TTS: ${result.message}")
|
||||
speakWithSystemTts(cleaned, directive, playbackToken)
|
||||
Log.d(tag, "system tts ok durMs=${SystemClock.elapsedRealtime() - started}")
|
||||
}
|
||||
is TalkSpeakResult.Failure -> {
|
||||
throw IllegalStateException(result.message)
|
||||
}
|
||||
}
|
||||
} catch (err: Throwable) {
|
||||
if (isPlaybackCancelled(err, playbackToken)) {
|
||||
Log.d(tag, "assistant speech cancelled")
|
||||
return
|
||||
}
|
||||
_statusText.value = "Speak failed: ${err.message ?: err::class.simpleName}"
|
||||
Log.w(tag, "system tts failed: ${err.message ?: err::class.simpleName}")
|
||||
Log.w(tag, "talk playback failed: ${err.message ?: err::class.simpleName}")
|
||||
} finally {
|
||||
_isSpeaking.value = false
|
||||
}
|
||||
}
|
||||
|
||||
@@ -812,6 +830,7 @@ class TalkModeManager(
|
||||
|
||||
private fun stopSpeaking(resetInterrupt: Boolean = true) {
|
||||
if (!_isSpeaking.value) {
|
||||
talkAudioPlayer.stop()
|
||||
stopTextToSpeechPlayback()
|
||||
abandonAudioFocus()
|
||||
return
|
||||
@@ -819,6 +838,7 @@ class TalkModeManager(
|
||||
if (resetInterrupt) {
|
||||
lastInterruptedAtSeconds = null
|
||||
}
|
||||
talkAudioPlayer.stop()
|
||||
stopTextToSpeechPlayback()
|
||||
_isSpeaking.value = false
|
||||
abandonAudioFocus()
|
||||
|
||||
@@ -0,0 +1,141 @@
|
||||
package ai.openclaw.app.voice
|
||||
|
||||
import ai.openclaw.app.gateway.GatewaySession
import kotlinx.coroutines.CancellationException
import kotlinx.serialization.Serializable
import kotlinx.serialization.encodeToString
import kotlinx.serialization.json.Json
|
||||
|
||||
/**
 * Decoded audio returned by the `talk.speak` RPC, plus the format metadata
 * needed to choose a playback path.
 *
 * Note: a data class with a [ByteArray] property would otherwise get
 * reference-based equals/hashCode from the generated members; we override both
 * to compare audio content so two responses carrying identical bytes compare equal.
 */
internal data class TalkSpeakAudio(
    val bytes: ByteArray,
    val provider: String,
    val outputFormat: String?,
    val voiceCompatible: Boolean?,
    val mimeType: String?,
    val fileExtension: String?,
) {
    override fun equals(other: Any?): Boolean {
        if (this === other) return true
        if (other !is TalkSpeakAudio) return false
        return bytes.contentEquals(other.bytes) &&
            provider == other.provider &&
            outputFormat == other.outputFormat &&
            voiceCompatible == other.voiceCompatible &&
            mimeType == other.mimeType &&
            fileExtension == other.fileExtension
    }

    override fun hashCode(): Int {
        var result = bytes.contentHashCode()
        result = 31 * result + provider.hashCode()
        result = 31 * result + (outputFormat?.hashCode() ?: 0)
        result = 31 * result + (voiceCompatible?.hashCode() ?: 0)
        result = 31 * result + (mimeType?.hashCode() ?: 0)
        result = 31 * result + (fileExtension?.hashCode() ?: 0)
        return result
    }
}
|
||||
|
||||
/** Outcome of a `talk.speak` synthesis attempt. */
internal sealed interface TalkSpeakResult {
    /** Synthesis succeeded; [audio] is ready for playback. */
    data class Success(val audio: TalkSpeakAudio) : TalkSpeakResult

    /** Gateway cannot serve talk.speak (unconfigured/unsupported); caller should use local TTS. */
    data class FallbackToLocal(val message: String) : TalkSpeakResult

    /** Hard failure; no fallback recommended. */
    data class Failure(val message: String) : TalkSpeakResult
}
|
||||
|
||||
/**
 * Client for the gateway `talk.speak` RPC: sends text (plus optional directive
 * overrides) and returns decoded audio, a fallback-to-local signal, or a failure.
 *
 * Either [session] or [requestDetailed] must be supplied; [requestDetailed] is a
 * test seam that bypasses the real session when present.
 */
internal class TalkSpeakClient(
    private val session: GatewaySession? = null,
    private val json: Json = Json { ignoreUnknownKeys = true },
    private val requestDetailed: (suspend (String, String, Long) -> GatewaySession.RpcResult)? = null,
) {
    /**
     * Synthesizes [text] via `talk.speak`.
     *
     * Never throws for RPC/decode failures — those are mapped to
     * [TalkSpeakResult.Failure] or [TalkSpeakResult.FallbackToLocal] — but
     * coroutine cancellation is always propagated.
     */
    suspend fun synthesize(text: String, directive: TalkDirective?): TalkSpeakResult {
        val response =
            try {
                performRequest(
                    method = "talk.speak",
                    paramsJson = json.encodeToString(TalkSpeakRequest.from(text = text, directive = directive)),
                    timeoutMs = 45_000,
                )
            } catch (err: CancellationException) {
                // Fix: do not swallow cancellation — converting it to Failure would
                // break structured concurrency and keep cancelled speech "running".
                throw err
            } catch (err: Throwable) {
                return TalkSpeakResult.Failure(err.message ?: "talk.speak request failed")
            }
        if (!response.ok) {
            val error = response.error
            val message = error?.message ?: "talk.speak request failed"
            // Only specific, well-known reasons justify falling back to local TTS.
            return if (isFallbackEligible(error?.details?.reason)) {
                TalkSpeakResult.FallbackToLocal(message)
            } else {
                TalkSpeakResult.Failure(message)
            }
        }
        val payload =
            try {
                json.decodeFromString<TalkSpeakResponse>(response.payloadJson ?: "")
            } catch (err: Throwable) {
                return TalkSpeakResult.Failure(err.message ?: "talk.speak payload invalid")
            }
        val bytes =
            try {
                android.util.Base64.decode(payload.audioBase64, android.util.Base64.DEFAULT)
            } catch (err: Throwable) {
                return TalkSpeakResult.Failure(err.message ?: "talk.speak audio decode failed")
            }
        if (bytes.isEmpty()) {
            return TalkSpeakResult.Failure("talk.speak returned empty audio")
        }
        return TalkSpeakResult.Success(
            TalkSpeakAudio(
                bytes = bytes,
                provider = payload.provider,
                outputFormat = payload.outputFormat,
                voiceCompatible = payload.voiceCompatible,
                mimeType = payload.mimeType,
                fileExtension = payload.fileExtension,
            ),
        )
    }

    // Error reasons for which local system TTS is an acceptable substitute.
    private fun isFallbackEligible(reason: String?): Boolean {
        return reason == "talk_unconfigured" ||
            reason == "talk_provider_unsupported" ||
            reason == "method_unavailable"
    }

    // Routes the raw RPC through the injected hook when present, else the session.
    private suspend fun performRequest(
        method: String,
        paramsJson: String,
        timeoutMs: Long,
    ): GatewaySession.RpcResult {
        requestDetailed?.let { return it(method, paramsJson, timeoutMs) }
        val activeSession = session ?: throw IllegalStateException("session missing")
        return activeSession.requestDetailed(method = method, paramsJson = paramsJson, timeoutMs = timeoutMs)
    }
}
|
||||
|
||||
@Serializable
|
||||
internal data class TalkSpeakRequest(
|
||||
val text: String,
|
||||
val voiceId: String? = null,
|
||||
val modelId: String? = null,
|
||||
val outputFormat: String? = null,
|
||||
val speed: Double? = null,
|
||||
val rateWpm: Int? = null,
|
||||
val stability: Double? = null,
|
||||
val similarity: Double? = null,
|
||||
val style: Double? = null,
|
||||
val speakerBoost: Boolean? = null,
|
||||
val seed: Long? = null,
|
||||
val normalize: String? = null,
|
||||
val language: String? = null,
|
||||
val latencyTier: Int? = null,
|
||||
) {
|
||||
companion object {
|
||||
fun from(text: String, directive: TalkDirective?): TalkSpeakRequest {
|
||||
return TalkSpeakRequest(
|
||||
text = text,
|
||||
voiceId = directive?.voiceId,
|
||||
modelId = directive?.modelId,
|
||||
outputFormat = directive?.outputFormat,
|
||||
speed = directive?.speed,
|
||||
rateWpm = directive?.rateWpm,
|
||||
stability = directive?.stability,
|
||||
similarity = directive?.similarity,
|
||||
style = directive?.style,
|
||||
speakerBoost = directive?.speakerBoost,
|
||||
seed = directive?.seed,
|
||||
normalize = directive?.normalize,
|
||||
language = directive?.language,
|
||||
latencyTier = directive?.latencyTier,
|
||||
)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/** Wire payload returned by the `talk.speak` RPC. */
@Serializable
private data class TalkSpeakResponse(
    // Base64-encoded audio bytes (decoded with android.util.Base64.DEFAULT by the caller).
    val audioBase64: String,
    // Which synthesis backend produced the audio.
    val provider: String,
    // Provider format id (e.g. "pcm_24000"); used to pick the playback path.
    val outputFormat: String? = null,
    // NOTE(review): semantics not visible here — presumably whether the requested
    // voice was honored; confirm against the gateway implementation.
    val voiceCompatible: Boolean? = null,
    // Optional MIME type hint for compressed formats.
    val mimeType: String? = null,
    // Optional explicit file extension hint for compressed formats.
    val fileExtension: String? = null,
)
|
||||
Reference in New Issue
Block a user