fix: restore android talk mode reply tts (#60306) (thanks @MKV21)

* Android: keep talk-mode session key synced for TTS replies

* Android: harden talk-mode reply playback state

* Android: harden talk-mode playback cancellation

* Android: avoid stale talk-mode playback preemption

* Android: tighten talk-mode playback claiming

* fix: distill android talk-mode playback ownership

* fix: restore android talk mode reply tts (#60306) (thanks @MKV21)

---------

Co-authored-by: Michael Faath <michaelfaath@macbookpro.speedport.ip>
Co-authored-by: Ayaan Zaidi <hi@obviy.us>
This commit is contained in:
Michael Faath
2026-04-04 03:58:56 +02:00
committed by GitHub
parent 858bf405f4
commit 85c76e83b7
4 changed files with 152 additions and 18 deletions

View File

@@ -345,6 +345,8 @@ class NodeRuntime(
session = operatorSession,
supportsChatSubscribe = false,
isConnected = { operatorConnected },
onBeforeSpeak = { micCapture.pauseForTts() },
onAfterSpeak = { micCapture.resumeAfterTts() },
).also { speaker ->
speaker.setPlaybackEnabled(prefs.speakerEnabled.value)
}
@@ -416,14 +418,19 @@ class NodeRuntime(
session = operatorSession,
supportsChatSubscribe = true,
isConnected = { operatorConnected },
onBeforeSpeak = { micCapture.pauseForTts() },
onAfterSpeak = { micCapture.resumeAfterTts() },
)
}
private fun syncMainSessionKey(agentId: String?) {
val resolvedKey = resolveNodeMainSessionKey(agentId)
// Always push the resolved session key into TalkMode, even when the
// state flow value is unchanged, so lazy TalkMode instances do not
// stay on the default "main" session key.
talkMode.setMainSessionKey(resolvedKey)
if (_mainSessionKey.value == resolvedKey) return
_mainSessionKey.value = resolvedKey
talkMode.setMainSessionKey(resolvedKey)
chat.applyMainSessionKey(resolvedKey)
updateHomeCanvasState()
}

View File

@@ -14,11 +14,13 @@ import android.speech.SpeechRecognizer
import androidx.core.content.ContextCompat
import java.util.UUID
import kotlinx.coroutines.CoroutineScope
import kotlinx.coroutines.Dispatchers
import kotlinx.coroutines.Job
import kotlinx.coroutines.delay
import kotlinx.coroutines.flow.MutableStateFlow
import kotlinx.coroutines.flow.StateFlow
import kotlinx.coroutines.launch
import kotlinx.coroutines.withContext
import kotlinx.serialization.json.Json
import kotlinx.serialization.json.JsonArray
import kotlinx.serialization.json.JsonObject
@@ -99,11 +101,27 @@ class MicCaptureManager(
private var transcriptFlushJob: Job? = null
private var pendingRunTimeoutJob: Job? = null
private var stopRequested = false
private val ttsPauseLock = Any()
private var ttsPauseDepth = 0
private var resumeMicAfterTts = false
fun setMicEnabled(enabled: Boolean) {
if (_micEnabled.value == enabled) return
_micEnabled.value = enabled
if (enabled) {
val pausedForTts =
synchronized(ttsPauseLock) {
if (ttsPauseDepth > 0) {
resumeMicAfterTts = true
true
} else {
false
}
}
if (pausedForTts) {
_statusText.value = if (_isSending.value) "Speaking · waiting for reply" else "Speaking…"
return
}
start()
sendQueuedIfIdle()
} else {
@@ -126,6 +144,58 @@ class MicCaptureManager(
}
}
suspend fun pauseForTts() {
val shouldPause =
synchronized(ttsPauseLock) {
ttsPauseDepth += 1
if (ttsPauseDepth > 1) return@synchronized false
resumeMicAfterTts = _micEnabled.value
val active = resumeMicAfterTts || recognizer != null || _isListening.value
if (!active) return@synchronized false
stopRequested = true
restartJob?.cancel()
restartJob = null
transcriptFlushJob?.cancel()
transcriptFlushJob = null
_isListening.value = false
_inputLevel.value = 0f
_liveTranscript.value = null
_statusText.value = if (_isSending.value) "Speaking · waiting for reply" else "Speaking…"
true
}
if (!shouldPause) return
withContext(Dispatchers.Main) {
recognizer?.cancel()
recognizer?.destroy()
recognizer = null
}
}
suspend fun resumeAfterTts() {
val shouldResume =
synchronized(ttsPauseLock) {
if (ttsPauseDepth == 0) return@synchronized false
ttsPauseDepth -= 1
if (ttsPauseDepth > 0) return@synchronized false
val resume = resumeMicAfterTts && _micEnabled.value
resumeMicAfterTts = false
if (!resume) {
_statusText.value =
when {
_micEnabled.value && _isSending.value -> "Listening · sending queued voice"
_micEnabled.value -> "Listening"
_isSending.value -> "Mic off · sending…"
else -> "Mic off"
}
}
resume
}
if (!shouldResume) return
stopRequested = false
start()
sendQueuedIfIdle()
}
fun onGatewayConnectionChanged(connected: Boolean) {
gatewayConnected = connected
if (connected) {

View File

@@ -22,11 +22,13 @@ import ai.openclaw.app.gateway.GatewaySession
import java.util.Locale
import java.util.UUID
import java.util.concurrent.atomic.AtomicLong
import kotlin.coroutines.coroutineContext
import kotlinx.coroutines.CancellationException
import kotlinx.coroutines.CompletableDeferred
import kotlinx.coroutines.CoroutineScope
import kotlinx.coroutines.Dispatchers
import kotlinx.coroutines.Job
import kotlinx.coroutines.NonCancellable
import kotlinx.coroutines.delay
import kotlinx.coroutines.ensureActive
import kotlinx.coroutines.flow.MutableStateFlow
@@ -46,6 +48,8 @@ class TalkModeManager(
private val session: GatewaySession,
private val supportsChatSubscribe: Boolean,
private val isConnected: () -> Boolean,
private val onBeforeSpeak: suspend () -> Unit = {},
private val onAfterSpeak: suspend () -> Unit = {},
) {
companion object {
private const val tag = "TalkMode"
@@ -101,6 +105,7 @@ class TalkModeManager(
private val playbackGeneration = AtomicLong(0L)
private var ttsJob: Job? = null
private val ttsJobLock = Any()
private val ttsLock = Any()
private var textToSpeech: TextToSpeech? = null
private var textToSpeechInit: CompletableDeferred<TextToSpeech>? = null
@@ -163,8 +168,11 @@ class TalkModeManager(
?: waitForAssistantText(session, startedAt, if (ok) 12_000 else 25_000)
if (!assistant.isNullOrBlank()) {
val playbackToken = playbackGeneration.incrementAndGet()
cancelActivePlayback()
_statusText.value = "Speaking…"
playAssistant(assistant, playbackToken)
runPlaybackSession(playbackToken) {
playAssistant(assistant, playbackToken)
}
} else {
_statusText.value = "No reply"
}
@@ -180,14 +188,12 @@ class TalkModeManager(
fun playTtsForText(text: String) {
val playbackToken = playbackGeneration.incrementAndGet()
ttsJob?.cancel()
ttsJob = scope.launch {
cancelActivePlayback()
scope.launch {
reloadConfig()
ensurePlaybackActive(playbackToken)
_isSpeaking.value = true
_statusText.value = "Speaking…"
playAssistant(text, playbackToken)
ttsJob = null
runPlaybackSession(playbackToken) {
playAssistant(text, playbackToken)
}
}
}
@@ -270,10 +276,11 @@ class TalkModeManager(
suspend fun speakAssistantReply(text: String) {
if (!playbackEnabled) return
val playbackToken = playbackGeneration.incrementAndGet()
stopSpeaking(resetInterrupt = false)
cancelActivePlayback()
ensureConfigLoaded()
ensurePlaybackActive(playbackToken)
playAssistant(text, playbackToken)
runPlaybackSession(playbackToken) {
playAssistant(text, playbackToken)
}
}
private fun start() {
@@ -483,9 +490,10 @@ class TalkModeManager(
}
Log.d(tag, "assistant text ok chars=${assistant.length}")
val playbackToken = playbackGeneration.incrementAndGet()
stopSpeaking(resetInterrupt = false)
ensurePlaybackActive(playbackToken)
playAssistant(assistant, playbackToken)
cancelActivePlayback()
runPlaybackSession(playbackToken) {
playAssistant(assistant, playbackToken)
}
} catch (err: Throwable) {
if (err is CancellationException) {
Log.d(tag, "finalize speech cancelled")
@@ -665,12 +673,60 @@ class TalkModeManager(
}
_statusText.value = "Speak failed: ${err.message ?: err::class.simpleName}"
Log.w(tag, "system tts failed: ${err.message ?: err::class.simpleName}")
} finally {
_isSpeaking.value = false
}
}
private suspend fun runPlaybackSession(
playbackToken: Long,
block: suspend () -> Unit,
) {
val currentJob = coroutineContext[Job]
var shouldResumeAfterSpeak = false
try {
val claimedPlayback =
synchronized(ttsJobLock) {
if (!playbackEnabled || playbackToken != playbackGeneration.get()) {
false
} else {
ttsJob = currentJob
true
}
}
if (!claimedPlayback) {
ensurePlaybackActive(playbackToken)
return
}
ensurePlaybackActive(playbackToken)
shouldResumeAfterSpeak = true
onBeforeSpeak()
ensurePlaybackActive(playbackToken)
_isSpeaking.value = true
_statusText.value = "Speaking…"
block()
} finally {
synchronized(ttsJobLock) {
if (ttsJob === currentJob) {
ttsJob = null
}
}
_isSpeaking.value = false
if (shouldResumeAfterSpeak) {
withContext(NonCancellable) {
onAfterSpeak()
}
}
}
}
private fun cancelActivePlayback() {
val activeJob =
synchronized(ttsJobLock) {
ttsJob
}
activeJob?.cancel()
stopTextToSpeechPlayback()
}
private suspend fun speakWithSystemTts(text: String, directive: TalkDirective?, playbackToken: Long) {
ensurePlaybackActive(playbackToken)
val engine = ensureTextToSpeech()