fix: restore android talk mode reply tts (#60306) (thanks @MKV21)

* Android: keep talk-mode session key synced for TTS replies * Android: harden talk-mode reply playback state * Android: harden talk-mode playback cancellation * Android: avoid stale talk-mode playback preemption * Android: tighten talk-mode playback claiming * fix: distill android talk-mode playback ownership * fix: restore android talk mode reply tts (#60306) (thanks @MKV21) --------- Co-authored-by: Michael Faath <michaelfaath@macbookpro.speedport.ip> Co-authored-by: Ayaan Zaidi <hi@obviy.us>
2026-05-05 04:30:24 +00:00 · 2026-04-04 03:58:56 +02:00
parent 858bf405f4
commit 85c76e83b7
4 changed files with 152 additions and 18 deletions
--- a/apps/android/app/src/main/java/ai/openclaw/app/NodeRuntime.kt
+++ b/apps/android/app/src/main/java/ai/openclaw/app/NodeRuntime.kt
@@ -345,6 +345,8 @@ class NodeRuntime(
      session = operatorSession,
      supportsChatSubscribe = false,
      isConnected = { operatorConnected },
+      onBeforeSpeak = { micCapture.pauseForTts() },
+      onAfterSpeak = { micCapture.resumeAfterTts() },
    ).also { speaker ->
      speaker.setPlaybackEnabled(prefs.speakerEnabled.value)
    }
@@ -416,14 +418,19 @@ class NodeRuntime(
      session = operatorSession,
      supportsChatSubscribe = true,
      isConnected = { operatorConnected },
+      onBeforeSpeak = { micCapture.pauseForTts() },
+      onAfterSpeak = { micCapture.resumeAfterTts() },
    )
  }

  private fun syncMainSessionKey(agentId: String?) {
    val resolvedKey = resolveNodeMainSessionKey(agentId)
+    // Always push the resolved session key into TalkMode, even when the
+    // state flow value is unchanged, so lazy TalkMode instances do not
+    // stay on the default "main" session key.
+    talkMode.setMainSessionKey(resolvedKey)
    if (_mainSessionKey.value == resolvedKey) return
    _mainSessionKey.value = resolvedKey
-    talkMode.setMainSessionKey(resolvedKey)
    chat.applyMainSessionKey(resolvedKey)
    updateHomeCanvasState()
  }
--- a/apps/android/app/src/main/java/ai/openclaw/app/voice/MicCaptureManager.kt
+++ b/apps/android/app/src/main/java/ai/openclaw/app/voice/MicCaptureManager.kt
@@ -14,11 +14,13 @@ import android.speech.SpeechRecognizer
 import androidx.core.content.ContextCompat
 import java.util.UUID
 import kotlinx.coroutines.CoroutineScope
+import kotlinx.coroutines.Dispatchers
 import kotlinx.coroutines.Job
 import kotlinx.coroutines.delay
 import kotlinx.coroutines.flow.MutableStateFlow
 import kotlinx.coroutines.flow.StateFlow
 import kotlinx.coroutines.launch
+import kotlinx.coroutines.withContext
 import kotlinx.serialization.json.Json
 import kotlinx.serialization.json.JsonArray
 import kotlinx.serialization.json.JsonObject
@@ -99,11 +101,27 @@ class MicCaptureManager(
  private var transcriptFlushJob: Job? = null
  private var pendingRunTimeoutJob: Job? = null
  private var stopRequested = false
+  private val ttsPauseLock = Any()
+  private var ttsPauseDepth = 0
+  private var resumeMicAfterTts = false

  fun setMicEnabled(enabled: Boolean) {
    if (_micEnabled.value == enabled) return
    _micEnabled.value = enabled
    if (enabled) {
+      val pausedForTts =
+        synchronized(ttsPauseLock) {
+          if (ttsPauseDepth > 0) {
+            resumeMicAfterTts = true
+            true
+          } else {
+            false
+          }
+        }
+      if (pausedForTts) {
+        _statusText.value = if (_isSending.value) "Speaking · waiting for reply" else "Speaking…"
+        return
+      }
      start()
      sendQueuedIfIdle()
    } else {
@@ -126,6 +144,58 @@ class MicCaptureManager(
    }
  }

+  suspend fun pauseForTts() {
+    val shouldPause =
+      synchronized(ttsPauseLock) {
+        ttsPauseDepth += 1
+        if (ttsPauseDepth > 1) return@synchronized false
+        resumeMicAfterTts = _micEnabled.value
+        val active = resumeMicAfterTts || recognizer != null || _isListening.value
+        if (!active) return@synchronized false
+        stopRequested = true
+        restartJob?.cancel()
+        restartJob = null
+        transcriptFlushJob?.cancel()
+        transcriptFlushJob = null
+        _isListening.value = false
+        _inputLevel.value = 0f
+        _liveTranscript.value = null
+        _statusText.value = if (_isSending.value) "Speaking · waiting for reply" else "Speaking…"
+        true
+      }
+    if (!shouldPause) return
+    withContext(Dispatchers.Main) {
+      recognizer?.cancel()
+      recognizer?.destroy()
+      recognizer = null
+    }
+  }
+
+  suspend fun resumeAfterTts() {
+    val shouldResume =
+      synchronized(ttsPauseLock) {
+        if (ttsPauseDepth == 0) return@synchronized false
+        ttsPauseDepth -= 1
+        if (ttsPauseDepth > 0) return@synchronized false
+        val resume = resumeMicAfterTts && _micEnabled.value
+        resumeMicAfterTts = false
+        if (!resume) {
+          _statusText.value =
+            when {
+              _micEnabled.value && _isSending.value -> "Listening · sending queued voice"
+              _micEnabled.value -> "Listening"
+              _isSending.value -> "Mic off · sending…"
+              else -> "Mic off"
+            }
+        }
+        resume
+      }
+    if (!shouldResume) return
+    stopRequested = false
+    start()
+    sendQueuedIfIdle()
+  }
+
  fun onGatewayConnectionChanged(connected: Boolean) {
    gatewayConnected = connected
    if (connected) {
--- a/apps/android/app/src/main/java/ai/openclaw/app/voice/TalkModeManager.kt
+++ b/apps/android/app/src/main/java/ai/openclaw/app/voice/TalkModeManager.kt
@@ -22,11 +22,13 @@ import ai.openclaw.app.gateway.GatewaySession
 import java.util.Locale
 import java.util.UUID
 import java.util.concurrent.atomic.AtomicLong
+import kotlin.coroutines.coroutineContext
 import kotlinx.coroutines.CancellationException
 import kotlinx.coroutines.CompletableDeferred
 import kotlinx.coroutines.CoroutineScope
 import kotlinx.coroutines.Dispatchers
 import kotlinx.coroutines.Job
+import kotlinx.coroutines.NonCancellable
 import kotlinx.coroutines.delay
 import kotlinx.coroutines.ensureActive
 import kotlinx.coroutines.flow.MutableStateFlow
@@ -46,6 +48,8 @@ class TalkModeManager(
  private val session: GatewaySession,
  private val supportsChatSubscribe: Boolean,
  private val isConnected: () -> Boolean,
+  private val onBeforeSpeak: suspend () -> Unit = {},
+  private val onAfterSpeak: suspend () -> Unit = {},
 ) {
  companion object {
    private const val tag = "TalkMode"
@@ -101,6 +105,7 @@ class TalkModeManager(
  private val playbackGeneration = AtomicLong(0L)

  private var ttsJob: Job? = null
+  private val ttsJobLock = Any()
  private val ttsLock = Any()
  private var textToSpeech: TextToSpeech? = null
  private var textToSpeechInit: CompletableDeferred<TextToSpeech>? = null
@@ -163,8 +168,11 @@ class TalkModeManager(
          ?: waitForAssistantText(session, startedAt, if (ok) 12_000 else 25_000)
        if (!assistant.isNullOrBlank()) {
          val playbackToken = playbackGeneration.incrementAndGet()
+          cancelActivePlayback()
          _statusText.value = "Speaking…"
-          playAssistant(assistant, playbackToken)
+          runPlaybackSession(playbackToken) {
+            playAssistant(assistant, playbackToken)
+          }
        } else {
          _statusText.value = "No reply"
        }
@@ -180,14 +188,12 @@ class TalkModeManager(

  fun playTtsForText(text: String) {
    val playbackToken = playbackGeneration.incrementAndGet()
-    ttsJob?.cancel()
-    ttsJob = scope.launch {
+    cancelActivePlayback()
+    scope.launch {
      reloadConfig()
-      ensurePlaybackActive(playbackToken)
-      _isSpeaking.value = true
-      _statusText.value = "Speaking…"
-      playAssistant(text, playbackToken)
-      ttsJob = null
+      runPlaybackSession(playbackToken) {
+        playAssistant(text, playbackToken)
+      }
    }
  }

@@ -270,10 +276,11 @@ class TalkModeManager(
  suspend fun speakAssistantReply(text: String) {
    if (!playbackEnabled) return
    val playbackToken = playbackGeneration.incrementAndGet()
-    stopSpeaking(resetInterrupt = false)
+    cancelActivePlayback()
    ensureConfigLoaded()
-    ensurePlaybackActive(playbackToken)
-    playAssistant(text, playbackToken)
+    runPlaybackSession(playbackToken) {
+      playAssistant(text, playbackToken)
+    }
  }

  private fun start() {
@@ -483,9 +490,10 @@ class TalkModeManager(
      }
      Log.d(tag, "assistant text ok chars=${assistant.length}")
      val playbackToken = playbackGeneration.incrementAndGet()
-      stopSpeaking(resetInterrupt = false)
-      ensurePlaybackActive(playbackToken)
-      playAssistant(assistant, playbackToken)
+      cancelActivePlayback()
+      runPlaybackSession(playbackToken) {
+        playAssistant(assistant, playbackToken)
+      }
    } catch (err: Throwable) {
      if (err is CancellationException) {
        Log.d(tag, "finalize speech cancelled")
@@ -665,12 +673,60 @@ class TalkModeManager(
      }
      _statusText.value = "Speak failed: ${err.message ?: err::class.simpleName}"
      Log.w(tag, "system tts failed: ${err.message ?: err::class.simpleName}")
-    } finally {
-
-      _isSpeaking.value = false
    }
  }

+  private suspend fun runPlaybackSession(
+    playbackToken: Long,
+    block: suspend () -> Unit,
+  ) {
+    val currentJob = coroutineContext[Job]
+    var shouldResumeAfterSpeak = false
+    try {
+      val claimedPlayback =
+        synchronized(ttsJobLock) {
+          if (!playbackEnabled || playbackToken != playbackGeneration.get()) {
+            false
+          } else {
+            ttsJob = currentJob
+            true
+          }
+        }
+      if (!claimedPlayback) {
+        ensurePlaybackActive(playbackToken)
+        return
+      }
+      ensurePlaybackActive(playbackToken)
+      shouldResumeAfterSpeak = true
+      onBeforeSpeak()
+      ensurePlaybackActive(playbackToken)
+      _isSpeaking.value = true
+      _statusText.value = "Speaking…"
+      block()
+    } finally {
+      synchronized(ttsJobLock) {
+        if (ttsJob === currentJob) {
+          ttsJob = null
+        }
+      }
+      _isSpeaking.value = false
+      if (shouldResumeAfterSpeak) {
+        withContext(NonCancellable) {
+          onAfterSpeak()
+        }
+      }
+    }
+  }
+
+  private fun cancelActivePlayback() {
+    val activeJob =
+      synchronized(ttsJobLock) {
+        ttsJob
+      }
+    activeJob?.cancel()
+    stopTextToSpeechPlayback()
+  }
+
  private suspend fun speakWithSystemTts(text: String, directive: TalkDirective?, playbackToken: Long) {
    ensurePlaybackActive(playbackToken)
    val engine = ensureTextToSpeech()