fix(android): talk mode stability — thread safety, TTS fallback, mic cooldown

Bug fixes:
- @Synchronized on ElevenLabsStreamingTts.sendText/finish to prevent
  sentFullText/sentTextLength races across OkHttp and caller threads
- Pre-set pendingRunId via onRunIdKnown callback before chat.send to
  eliminate race where gateway events arrive before runId is stored
- Track drain coroutine as Job; cancel prior on rapid mic toggle to
  prevent duplicate TTS and stale transcript sends
- Mic button disabled during 2s drain cooldown (micCooldown StateFlow)

Codex review fixes:
- Gate agent streaming TTS on sessionKey to prevent cross-session audio
  leaks (P1)
- Clear ElevenLabs credentials when talk.provider is not elevenlabs;
  gate streaming TTS on activeProviderIsElevenLabs (P2)

System TTS fallback fixes:
- Null streamingTts immediately in finishStreamingTts so next response
  gets a fresh TTS instance
- Add hasReceivedAudio flag to ElevenLabsStreamingTts to detect when
  WebSocket connects but returns no audio (invalid key, network error)
- Fall back to playTtsForText when streaming TTS produced no audio
- Track ttsJob to cleanly cancel prior playTtsForText on new response
- Re-throw CancellationException instead of cascading into fallback
  attempts that also get cancelled
2026-05-06 17:20:45 +00:00 · 2026-02-28 12:32:15 -05:00
parent 4748ba491d
commit 587790e84a
5 changed files with 46 additions and 8 deletions
--- a/apps/android/app/src/main/java/ai/openclaw/android/MainViewModel.kt
+++ b/apps/android/app/src/main/java/ai/openclaw/android/MainViewModel.kt
@@ -47,6 +47,7 @@ class MainViewModel(app: Application) : AndroidViewModel(app) {
  val locationPreciseEnabled: StateFlow<Boolean> = runtime.locationPreciseEnabled
  val preventSleep: StateFlow<Boolean> = runtime.preventSleep
  val micEnabled: StateFlow<Boolean> = runtime.micEnabled
+  val micCooldown: StateFlow<Boolean> = runtime.micCooldown
  val micStatusText: StateFlow<String> = runtime.micStatusText
  val micLiveTranscript: StateFlow<String?> = runtime.micLiveTranscript
  val micIsListening: StateFlow<Boolean> = runtime.micIsListening
--- a/apps/android/app/src/main/java/ai/openclaw/android/NodeRuntime.kt
+++ b/apps/android/app/src/main/java/ai/openclaw/android/NodeRuntime.kt
@@ -344,8 +344,11 @@ class NodeRuntime(context: Context) {
    MicCaptureManager(
      context = appContext,
      scope = scope,
-      sendToGateway = { message ->
+      sendToGateway = { message, onRunIdKnown ->
        val idempotencyKey = UUID.randomUUID().toString()
+        // Notify MicCaptureManager of the idempotency key *before* the network
+        // call so pendingRunId is set before any chat events can arrive.
+        onRunIdKnown(idempotencyKey)
        val params =
          buildJsonObject {
            put("sessionKey", JsonPrimitive(resolveMainSessionKey()))
@@ -375,6 +378,9 @@ class NodeRuntime(context: Context) {
  val micEnabled: StateFlow<Boolean>
    get() = micCapture.micEnabled

+  val micCooldown: StateFlow<Boolean>
+    get() = micCapture.micCooldown
+
  val micQueuedMessages: StateFlow<List<String>>
    get() = micCapture.queuedMessages

--- a/apps/android/app/src/main/java/ai/openclaw/android/ui/VoiceTabScreen.kt
+++ b/apps/android/app/src/main/java/ai/openclaw/android/ui/VoiceTabScreen.kt
@@ -80,7 +80,9 @@ fun VoiceTabScreen(viewModel: MainViewModel) {

  val gatewayStatus by viewModel.statusText.collectAsState()
  val micEnabled by viewModel.micEnabled.collectAsState()
+  val micCooldown by viewModel.micCooldown.collectAsState()
  val speakerEnabled by viewModel.speakerEnabled.collectAsState()
+  val micStatusText by viewModel.micStatusText.collectAsState()
  val micLiveTranscript by viewModel.micLiveTranscript.collectAsState()
  val micQueuedMessages by viewModel.micQueuedMessages.collectAsState()
  val micConversation by viewModel.micConversation.collectAsState()
@@ -244,6 +246,7 @@ fun VoiceTabScreen(viewModel: MainViewModel) {
          }
          Button(
            onClick = {
+              if (micCooldown) return@Button
              if (micEnabled) {
                viewModel.setMicEnabled(false)
                return@Button
@@ -255,13 +258,16 @@ fun VoiceTabScreen(viewModel: MainViewModel) {
                requestMicPermission.launch(Manifest.permission.RECORD_AUDIO)
              }
            },
+            enabled = !micCooldown,
            shape = CircleShape,
            contentPadding = PaddingValues(0.dp),
            modifier = Modifier.size(60.dp),
            colors =
              ButtonDefaults.buttonColors(
-                containerColor = if (micEnabled) mobileDanger else mobileAccent,
+                containerColor = if (micCooldown) mobileTextSecondary else if (micEnabled) mobileDanger else mobileAccent,
                contentColor = Color.White,
+                disabledContainerColor = mobileTextSecondary,
+                disabledContentColor = Color.White.copy(alpha = 0.5f),
              ),
          ) {
            Icon(
@@ -282,6 +288,7 @@ fun VoiceTabScreen(viewModel: MainViewModel) {
        when {
          queueCount > 0 -> "$queueCount queued"
          micIsSending -> "Sending"
+          micCooldown -> "Cooldown"
          micEnabled -> "Listening"
          else -> "Mic off"
        }
--- a/apps/android/app/src/main/java/ai/openclaw/android/voice/ElevenLabsStreamingTts.kt
+++ b/apps/android/app/src/main/java/ai/openclaw/android/voice/ElevenLabsStreamingTts.kt
@@ -63,6 +63,8 @@ class ElevenLabsStreamingTts(
  private var client: OkHttpClient? = null
  @Volatile private var stopped = false
  @Volatile private var finished = false
+  @Volatile var hasReceivedAudio = false
+    private set
  private var drainJob: Job? = null

  // Track text already sent so we only send incremental chunks
@@ -77,6 +79,7 @@ class ElevenLabsStreamingTts(
  fun start() {
    stopped = false
    finished = false
+    hasReceivedAudio = false
    sentTextLength = 0
    trackStarted = false
    wsReady = false
@@ -199,6 +202,7 @@ class ElevenLabsStreamingTts(
  /**
   * Returns true if text was accepted, false if text diverged (caller should restart).
   */
+  @Synchronized
  fun sendText(fullText: String): Boolean {
    if (stopped) return false
    if (finished) return true  // Already finishing — not a diverge, don't restart
@@ -233,6 +237,7 @@ class ElevenLabsStreamingTts(
   * Signal that no more text is coming. Sends EOS to ElevenLabs.
   * The WebSocket will close after generating remaining audio.
   */
+  @Synchronized
  fun finish() {
    if (stopped || finished) return
    finished = true
@@ -278,6 +283,7 @@ class ElevenLabsStreamingTts(
    if (!trackStarted) {
      track.play()
      trackStarted = true
+      hasReceivedAudio = true
      Log.d(TAG, "AudioTrack started on first chunk")
    }

--- a/apps/android/app/src/main/java/ai/openclaw/android/voice/MicCaptureManager.kt
+++ b/apps/android/app/src/main/java/ai/openclaw/android/voice/MicCaptureManager.kt
@@ -11,7 +11,6 @@ import android.util.Log
 import android.speech.RecognitionListener
 import android.speech.RecognizerIntent
 import android.speech.SpeechRecognizer
-import android.util.Log
 import androidx.core.content.ContextCompat
 import java.util.UUID
 import kotlinx.coroutines.CoroutineScope
@@ -40,7 +39,12 @@ data class VoiceConversationEntry(
 class MicCaptureManager(
  private val context: Context,
  private val scope: CoroutineScope,
-  private val sendToGateway: suspend (String) -> String?,
+  /**
+   * Send [message] to the gateway and return the run ID.
+   * [onRunIdKnown] is called with the idempotency key *before* the network
+   * round-trip so [pendingRunId] is set before any chat events can arrive.
+   */
+  private val sendToGateway: suspend (message: String, onRunIdKnown: (String) -> Unit) -> String?,
  private val speakAssistantReply: suspend (String) -> Unit = {},
 ) {
  companion object {
@@ -58,6 +62,9 @@ class MicCaptureManager(
  private val _micEnabled = MutableStateFlow(false)
  val micEnabled: StateFlow<Boolean> = _micEnabled

+  private val _micCooldown = MutableStateFlow(false)
+  val micCooldown: StateFlow<Boolean> = _micCooldown
+
  private val _isListening = MutableStateFlow(false)
  val isListening: StateFlow<Boolean> = _isListening

@@ -88,6 +95,7 @@ class MicCaptureManager(

  private var recognizer: SpeechRecognizer? = null
  private var restartJob: Job? = null
+  private var drainJob: Job? = null
  private var pendingRunTimeoutJob: Job? = null
  private var stopRequested = false

@@ -98,8 +106,11 @@ class MicCaptureManager(
      start()
      sendQueuedIfIdle()
    } else {
-      // Give the recognizer time to finish processing buffered audio
-      scope.launch {
+      // Give the recognizer time to finish processing buffered audio.
+      // Cancel any prior drain to prevent duplicate sends on rapid toggle.
+      drainJob?.cancel()
+      _micCooldown.value = true
+      drainJob = scope.launch {
        delay(2000L)
        stop()
        // Capture any partial transcript that didn't get a final result from the recognizer
@@ -108,6 +119,8 @@ class MicCaptureManager(
          sessionSegments.add(partial)
        }
        flushSessionToQueue()
+        drainJob = null
+        _micCooldown.value = false
        sendQueuedIfIdle()
      }
    }
@@ -296,8 +309,13 @@ class MicCaptureManager(

    scope.launch {
      try {
-        val runId = sendToGateway(next)
-        pendingRunId = runId
+        val runId = sendToGateway(next) { earlyRunId ->
+          // Called with the idempotency key before chat.send fires so that
+          // pendingRunId is populated before any chat events can arrive.
+          pendingRunId = earlyRunId
+        }
+        // Update to the real runId if the gateway returned a different one.
+        if (runId != null && runId != pendingRunId) pendingRunId = runId
        if (runId == null) {
          pendingRunTimeoutJob?.cancel()
          pendingRunTimeoutJob = null