fix(android): talk mode stability — thread safety, TTS fallback, mic cooldown

Bug fixes:
- Add @Synchronized to ElevenLabsStreamingTts.sendText/finish to prevent
  races on sentFullText/sentTextLength between OkHttp and caller threads
- Pre-set pendingRunId via onRunIdKnown callback before chat.send to
  eliminate race where gateway events arrive before runId is stored
- Track the drain coroutine as a Job; cancel the prior one on rapid mic
  toggle to prevent duplicate TTS and stale transcript sends
- Disable the mic button during the 2s drain cooldown (micCooldown
  StateFlow)

Codex review fixes:
- Gate agent streaming TTS on sessionKey to prevent cross-session
  audio leaks (P1)
- Clear ElevenLabs credentials when talk.provider is not elevenlabs;
  gate streaming TTS on activeProviderIsElevenLabs (P2)

System TTS fallback fixes:
- Set streamingTts to null immediately in finishStreamingTts so the next
  response gets a fresh TTS instance
- Add hasReceivedAudio flag to ElevenLabsStreamingTts to detect when the
  WebSocket connects but returns no audio (e.g. invalid key, network error)
- Fall back to playTtsForText when streaming TTS produced no audio
- Track ttsJob to cleanly cancel prior playTtsForText on new response
- Re-throw CancellationException instead of cascading into fallback
  attempts that also get cancelled
This commit is contained in:
Greg Mousseau
2026-02-28 12:32:15 -05:00
committed by Ayaan Zaidi
parent 4748ba491d
commit 587790e84a
5 changed files with 46 additions and 8 deletions

View File

@@ -47,6 +47,7 @@ class MainViewModel(app: Application) : AndroidViewModel(app) {
val locationPreciseEnabled: StateFlow<Boolean> = runtime.locationPreciseEnabled
val preventSleep: StateFlow<Boolean> = runtime.preventSleep
val micEnabled: StateFlow<Boolean> = runtime.micEnabled
val micCooldown: StateFlow<Boolean> = runtime.micCooldown
val micStatusText: StateFlow<String> = runtime.micStatusText
val micLiveTranscript: StateFlow<String?> = runtime.micLiveTranscript
val micIsListening: StateFlow<Boolean> = runtime.micIsListening

View File

@@ -344,8 +344,11 @@ class NodeRuntime(context: Context) {
MicCaptureManager(
context = appContext,
scope = scope,
sendToGateway = { message ->
sendToGateway = { message, onRunIdKnown ->
val idempotencyKey = UUID.randomUUID().toString()
// Notify MicCaptureManager of the idempotency key *before* the network
// call so pendingRunId is set before any chat events can arrive.
onRunIdKnown(idempotencyKey)
val params =
buildJsonObject {
put("sessionKey", JsonPrimitive(resolveMainSessionKey()))
@@ -375,6 +378,9 @@ class NodeRuntime(context: Context) {
val micEnabled: StateFlow<Boolean>
get() = micCapture.micEnabled
val micCooldown: StateFlow<Boolean>
get() = micCapture.micCooldown
val micQueuedMessages: StateFlow<List<String>>
get() = micCapture.queuedMessages

View File

@@ -80,7 +80,9 @@ fun VoiceTabScreen(viewModel: MainViewModel) {
val gatewayStatus by viewModel.statusText.collectAsState()
val micEnabled by viewModel.micEnabled.collectAsState()
val micCooldown by viewModel.micCooldown.collectAsState()
val speakerEnabled by viewModel.speakerEnabled.collectAsState()
val micStatusText by viewModel.micStatusText.collectAsState()
val micLiveTranscript by viewModel.micLiveTranscript.collectAsState()
val micQueuedMessages by viewModel.micQueuedMessages.collectAsState()
val micConversation by viewModel.micConversation.collectAsState()
@@ -244,6 +246,7 @@ fun VoiceTabScreen(viewModel: MainViewModel) {
}
Button(
onClick = {
if (micCooldown) return@Button
if (micEnabled) {
viewModel.setMicEnabled(false)
return@Button
@@ -255,13 +258,16 @@ fun VoiceTabScreen(viewModel: MainViewModel) {
requestMicPermission.launch(Manifest.permission.RECORD_AUDIO)
}
},
enabled = !micCooldown,
shape = CircleShape,
contentPadding = PaddingValues(0.dp),
modifier = Modifier.size(60.dp),
colors =
ButtonDefaults.buttonColors(
containerColor = if (micEnabled) mobileDanger else mobileAccent,
containerColor = if (micCooldown) mobileTextSecondary else if (micEnabled) mobileDanger else mobileAccent,
contentColor = Color.White,
disabledContainerColor = mobileTextSecondary,
disabledContentColor = Color.White.copy(alpha = 0.5f),
),
) {
Icon(
@@ -282,6 +288,7 @@ fun VoiceTabScreen(viewModel: MainViewModel) {
when {
queueCount > 0 -> "$queueCount queued"
micIsSending -> "Sending"
micCooldown -> "Cooldown"
micEnabled -> "Listening"
else -> "Mic off"
}

View File

@@ -63,6 +63,8 @@ class ElevenLabsStreamingTts(
private var client: OkHttpClient? = null
@Volatile private var stopped = false
@Volatile private var finished = false
@Volatile var hasReceivedAudio = false
private set
private var drainJob: Job? = null
// Track text already sent so we only send incremental chunks
@@ -77,6 +79,7 @@ class ElevenLabsStreamingTts(
fun start() {
stopped = false
finished = false
hasReceivedAudio = false
sentTextLength = 0
trackStarted = false
wsReady = false
@@ -199,6 +202,7 @@ class ElevenLabsStreamingTts(
/**
* Returns true if text was accepted, false if text diverged (caller should restart).
*/
@Synchronized
fun sendText(fullText: String): Boolean {
if (stopped) return false
if (finished) return true // Already finishing — not a diverge, don't restart
@@ -233,6 +237,7 @@ class ElevenLabsStreamingTts(
* Signal that no more text is coming. Sends EOS to ElevenLabs.
* The WebSocket will close after generating remaining audio.
*/
@Synchronized
fun finish() {
if (stopped || finished) return
finished = true
@@ -278,6 +283,7 @@ class ElevenLabsStreamingTts(
if (!trackStarted) {
track.play()
trackStarted = true
hasReceivedAudio = true
Log.d(TAG, "AudioTrack started on first chunk")
}

View File

@@ -11,7 +11,6 @@ import android.util.Log
import android.speech.RecognitionListener
import android.speech.RecognizerIntent
import android.speech.SpeechRecognizer
import android.util.Log
import androidx.core.content.ContextCompat
import java.util.UUID
import kotlinx.coroutines.CoroutineScope
@@ -40,7 +39,12 @@ data class VoiceConversationEntry(
class MicCaptureManager(
private val context: Context,
private val scope: CoroutineScope,
private val sendToGateway: suspend (String) -> String?,
/**
* Send [message] to the gateway and return the run ID.
* [onRunIdKnown] is called with the idempotency key *before* the network
* round-trip so [pendingRunId] is set before any chat events can arrive.
*/
private val sendToGateway: suspend (message: String, onRunIdKnown: (String) -> Unit) -> String?,
private val speakAssistantReply: suspend (String) -> Unit = {},
) {
companion object {
@@ -58,6 +62,9 @@ class MicCaptureManager(
private val _micEnabled = MutableStateFlow(false)
val micEnabled: StateFlow<Boolean> = _micEnabled
private val _micCooldown = MutableStateFlow(false)
val micCooldown: StateFlow<Boolean> = _micCooldown
private val _isListening = MutableStateFlow(false)
val isListening: StateFlow<Boolean> = _isListening
@@ -88,6 +95,7 @@ class MicCaptureManager(
private var recognizer: SpeechRecognizer? = null
private var restartJob: Job? = null
private var drainJob: Job? = null
private var pendingRunTimeoutJob: Job? = null
private var stopRequested = false
@@ -98,8 +106,11 @@ class MicCaptureManager(
start()
sendQueuedIfIdle()
} else {
// Give the recognizer time to finish processing buffered audio
scope.launch {
// Give the recognizer time to finish processing buffered audio.
// Cancel any prior drain to prevent duplicate sends on rapid toggle.
drainJob?.cancel()
_micCooldown.value = true
drainJob = scope.launch {
delay(2000L)
stop()
// Capture any partial transcript that didn't get a final result from the recognizer
@@ -108,6 +119,8 @@ class MicCaptureManager(
sessionSegments.add(partial)
}
flushSessionToQueue()
drainJob = null
_micCooldown.value = false
sendQueuedIfIdle()
}
}
@@ -296,8 +309,13 @@ class MicCaptureManager(
scope.launch {
try {
val runId = sendToGateway(next)
pendingRunId = runId
val runId = sendToGateway(next) { earlyRunId ->
// Called with the idempotency key before chat.send fires so that
// pendingRunId is populated before any chat events can arrive.
pendingRunId = earlyRunId
}
// Update to the real runId if the gateway returned a different one.
if (runId != null && runId != pendingRunId) pendingRunId = runId
if (runId == null) {
pendingRunTimeoutJob?.cancel()
pendingRunTimeoutJob = null