mirror of
https://github.com/openclaw/openclaw.git
synced 2026-03-12 07:20:45 +00:00
fix(android): talk mode stability — thread safety, TTS fallback, mic cooldown
Bug fixes:
- @Synchronized on ElevenLabsStreamingTts.sendText/finish to prevent sentFullText/sentTextLength races across OkHttp and caller threads
- Pre-set pendingRunId via onRunIdKnown callback before chat.send to eliminate race where gateway events arrive before runId is stored
- Track drain coroutine as Job; cancel prior on rapid mic toggle to prevent duplicate TTS and stale transcript sends
- Mic button disabled during 2s drain cooldown (micCooldown StateFlow)

Codex review fixes:
- Gate agent streaming TTS on sessionKey to prevent cross-session audio leaks (P1)
- Clear ElevenLabs credentials when talk.provider is not elevenlabs; gate streaming TTS on activeProviderIsElevenLabs (P2)

System TTS fallback fixes:
- Null streamingTts immediately in finishStreamingTts so next response gets a fresh TTS instance
- Add hasReceivedAudio flag to ElevenLabsStreamingTts to detect when WebSocket connects but returns no audio (invalid key, network error)
- Fall back to playTtsForText when streaming TTS produced no audio
- Track ttsJob to cleanly cancel prior playTtsForText on new response
- Re-throw CancellationException instead of cascading into fallback attempts that also get cancelled
This commit is contained in:
committed by
Ayaan Zaidi
parent
4748ba491d
commit
587790e84a
@@ -47,6 +47,7 @@ class MainViewModel(app: Application) : AndroidViewModel(app) {
|
||||
val locationPreciseEnabled: StateFlow<Boolean> = runtime.locationPreciseEnabled
|
||||
val preventSleep: StateFlow<Boolean> = runtime.preventSleep
|
||||
val micEnabled: StateFlow<Boolean> = runtime.micEnabled
|
||||
val micCooldown: StateFlow<Boolean> = runtime.micCooldown
|
||||
val micStatusText: StateFlow<String> = runtime.micStatusText
|
||||
val micLiveTranscript: StateFlow<String?> = runtime.micLiveTranscript
|
||||
val micIsListening: StateFlow<Boolean> = runtime.micIsListening
|
||||
|
||||
@@ -344,8 +344,11 @@ class NodeRuntime(context: Context) {
|
||||
MicCaptureManager(
|
||||
context = appContext,
|
||||
scope = scope,
|
||||
sendToGateway = { message ->
|
||||
sendToGateway = { message, onRunIdKnown ->
|
||||
val idempotencyKey = UUID.randomUUID().toString()
|
||||
// Notify MicCaptureManager of the idempotency key *before* the network
|
||||
// call so pendingRunId is set before any chat events can arrive.
|
||||
onRunIdKnown(idempotencyKey)
|
||||
val params =
|
||||
buildJsonObject {
|
||||
put("sessionKey", JsonPrimitive(resolveMainSessionKey()))
|
||||
@@ -375,6 +378,9 @@ class NodeRuntime(context: Context) {
|
||||
val micEnabled: StateFlow<Boolean>
|
||||
get() = micCapture.micEnabled
|
||||
|
||||
val micCooldown: StateFlow<Boolean>
|
||||
get() = micCapture.micCooldown
|
||||
|
||||
val micQueuedMessages: StateFlow<List<String>>
|
||||
get() = micCapture.queuedMessages
|
||||
|
||||
|
||||
@@ -80,7 +80,9 @@ fun VoiceTabScreen(viewModel: MainViewModel) {
|
||||
|
||||
val gatewayStatus by viewModel.statusText.collectAsState()
|
||||
val micEnabled by viewModel.micEnabled.collectAsState()
|
||||
val micCooldown by viewModel.micCooldown.collectAsState()
|
||||
val speakerEnabled by viewModel.speakerEnabled.collectAsState()
|
||||
val micStatusText by viewModel.micStatusText.collectAsState()
|
||||
val micLiveTranscript by viewModel.micLiveTranscript.collectAsState()
|
||||
val micQueuedMessages by viewModel.micQueuedMessages.collectAsState()
|
||||
val micConversation by viewModel.micConversation.collectAsState()
|
||||
@@ -244,6 +246,7 @@ fun VoiceTabScreen(viewModel: MainViewModel) {
|
||||
}
|
||||
Button(
|
||||
onClick = {
|
||||
if (micCooldown) return@Button
|
||||
if (micEnabled) {
|
||||
viewModel.setMicEnabled(false)
|
||||
return@Button
|
||||
@@ -255,13 +258,16 @@ fun VoiceTabScreen(viewModel: MainViewModel) {
|
||||
requestMicPermission.launch(Manifest.permission.RECORD_AUDIO)
|
||||
}
|
||||
},
|
||||
enabled = !micCooldown,
|
||||
shape = CircleShape,
|
||||
contentPadding = PaddingValues(0.dp),
|
||||
modifier = Modifier.size(60.dp),
|
||||
colors =
|
||||
ButtonDefaults.buttonColors(
|
||||
containerColor = if (micEnabled) mobileDanger else mobileAccent,
|
||||
containerColor = if (micCooldown) mobileTextSecondary else if (micEnabled) mobileDanger else mobileAccent,
|
||||
contentColor = Color.White,
|
||||
disabledContainerColor = mobileTextSecondary,
|
||||
disabledContentColor = Color.White.copy(alpha = 0.5f),
|
||||
),
|
||||
) {
|
||||
Icon(
|
||||
@@ -282,6 +288,7 @@ fun VoiceTabScreen(viewModel: MainViewModel) {
|
||||
when {
|
||||
queueCount > 0 -> "$queueCount queued"
|
||||
micIsSending -> "Sending"
|
||||
micCooldown -> "Cooldown"
|
||||
micEnabled -> "Listening"
|
||||
else -> "Mic off"
|
||||
}
|
||||
|
||||
@@ -63,6 +63,8 @@ class ElevenLabsStreamingTts(
|
||||
private var client: OkHttpClient? = null
|
||||
@Volatile private var stopped = false
|
||||
@Volatile private var finished = false
|
||||
@Volatile var hasReceivedAudio = false
|
||||
private set
|
||||
private var drainJob: Job? = null
|
||||
|
||||
// Track text already sent so we only send incremental chunks
|
||||
@@ -77,6 +79,7 @@ class ElevenLabsStreamingTts(
|
||||
fun start() {
|
||||
stopped = false
|
||||
finished = false
|
||||
hasReceivedAudio = false
|
||||
sentTextLength = 0
|
||||
trackStarted = false
|
||||
wsReady = false
|
||||
@@ -199,6 +202,7 @@ class ElevenLabsStreamingTts(
|
||||
/**
|
||||
* Returns true if text was accepted, false if text diverged (caller should restart).
|
||||
*/
|
||||
@Synchronized
|
||||
fun sendText(fullText: String): Boolean {
|
||||
if (stopped) return false
|
||||
if (finished) return true // Already finishing — not a diverge, don't restart
|
||||
@@ -233,6 +237,7 @@ class ElevenLabsStreamingTts(
|
||||
* Signal that no more text is coming. Sends EOS to ElevenLabs.
|
||||
* The WebSocket will close after generating remaining audio.
|
||||
*/
|
||||
@Synchronized
|
||||
fun finish() {
|
||||
if (stopped || finished) return
|
||||
finished = true
|
||||
@@ -278,6 +283,7 @@ class ElevenLabsStreamingTts(
|
||||
if (!trackStarted) {
|
||||
track.play()
|
||||
trackStarted = true
|
||||
hasReceivedAudio = true
|
||||
Log.d(TAG, "AudioTrack started on first chunk")
|
||||
}
|
||||
|
||||
|
||||
@@ -11,7 +11,6 @@ import android.util.Log
|
||||
import android.speech.RecognitionListener
|
||||
import android.speech.RecognizerIntent
|
||||
import android.speech.SpeechRecognizer
|
||||
import android.util.Log
|
||||
import androidx.core.content.ContextCompat
|
||||
import java.util.UUID
|
||||
import kotlinx.coroutines.CoroutineScope
|
||||
@@ -40,7 +39,12 @@ data class VoiceConversationEntry(
|
||||
class MicCaptureManager(
|
||||
private val context: Context,
|
||||
private val scope: CoroutineScope,
|
||||
private val sendToGateway: suspend (String) -> String?,
|
||||
/**
|
||||
* Send [message] to the gateway and return the run ID.
|
||||
* [onRunIdKnown] is called with the idempotency key *before* the network
|
||||
* round-trip so [pendingRunId] is set before any chat events can arrive.
|
||||
*/
|
||||
private val sendToGateway: suspend (message: String, onRunIdKnown: (String) -> Unit) -> String?,
|
||||
private val speakAssistantReply: suspend (String) -> Unit = {},
|
||||
) {
|
||||
companion object {
|
||||
@@ -58,6 +62,9 @@ class MicCaptureManager(
|
||||
private val _micEnabled = MutableStateFlow(false)
|
||||
val micEnabled: StateFlow<Boolean> = _micEnabled
|
||||
|
||||
private val _micCooldown = MutableStateFlow(false)
|
||||
val micCooldown: StateFlow<Boolean> = _micCooldown
|
||||
|
||||
private val _isListening = MutableStateFlow(false)
|
||||
val isListening: StateFlow<Boolean> = _isListening
|
||||
|
||||
@@ -88,6 +95,7 @@ class MicCaptureManager(
|
||||
|
||||
private var recognizer: SpeechRecognizer? = null
|
||||
private var restartJob: Job? = null
|
||||
private var drainJob: Job? = null
|
||||
private var pendingRunTimeoutJob: Job? = null
|
||||
private var stopRequested = false
|
||||
|
||||
@@ -98,8 +106,11 @@ class MicCaptureManager(
|
||||
start()
|
||||
sendQueuedIfIdle()
|
||||
} else {
|
||||
// Give the recognizer time to finish processing buffered audio
|
||||
scope.launch {
|
||||
// Give the recognizer time to finish processing buffered audio.
|
||||
// Cancel any prior drain to prevent duplicate sends on rapid toggle.
|
||||
drainJob?.cancel()
|
||||
_micCooldown.value = true
|
||||
drainJob = scope.launch {
|
||||
delay(2000L)
|
||||
stop()
|
||||
// Capture any partial transcript that didn't get a final result from the recognizer
|
||||
@@ -108,6 +119,8 @@ class MicCaptureManager(
|
||||
sessionSegments.add(partial)
|
||||
}
|
||||
flushSessionToQueue()
|
||||
drainJob = null
|
||||
_micCooldown.value = false
|
||||
sendQueuedIfIdle()
|
||||
}
|
||||
}
|
||||
@@ -296,8 +309,13 @@ class MicCaptureManager(
|
||||
|
||||
scope.launch {
|
||||
try {
|
||||
val runId = sendToGateway(next)
|
||||
pendingRunId = runId
|
||||
val runId = sendToGateway(next) { earlyRunId ->
|
||||
// Called with the idempotency key before chat.send fires so that
|
||||
// pendingRunId is populated before any chat events can arrive.
|
||||
pendingRunId = earlyRunId
|
||||
}
|
||||
// Update to the real runId if the gateway returned a different one.
|
||||
if (runId != null && runId != pendingRunId) pendingRunId = runId
|
||||
if (runId == null) {
|
||||
pendingRunTimeoutJob?.cancel()
|
||||
pendingRunTimeoutJob = null
|
||||
|
||||
Reference in New Issue
Block a user