diff --git a/apps/android/app/src/main/java/ai/openclaw/android/MainViewModel.kt b/apps/android/app/src/main/java/ai/openclaw/android/MainViewModel.kt index 6af97c87543..6d10da0f5fe 100644 --- a/apps/android/app/src/main/java/ai/openclaw/android/MainViewModel.kt +++ b/apps/android/app/src/main/java/ai/openclaw/android/MainViewModel.kt @@ -47,6 +47,7 @@ class MainViewModel(app: Application) : AndroidViewModel(app) { val locationPreciseEnabled: StateFlow = runtime.locationPreciseEnabled val preventSleep: StateFlow = runtime.preventSleep val micEnabled: StateFlow = runtime.micEnabled + val micCooldown: StateFlow = runtime.micCooldown val micStatusText: StateFlow = runtime.micStatusText val micLiveTranscript: StateFlow = runtime.micLiveTranscript val micIsListening: StateFlow = runtime.micIsListening diff --git a/apps/android/app/src/main/java/ai/openclaw/android/NodeRuntime.kt b/apps/android/app/src/main/java/ai/openclaw/android/NodeRuntime.kt index d85673bf75d..2f488b4686b 100644 --- a/apps/android/app/src/main/java/ai/openclaw/android/NodeRuntime.kt +++ b/apps/android/app/src/main/java/ai/openclaw/android/NodeRuntime.kt @@ -344,8 +344,11 @@ class NodeRuntime(context: Context) { MicCaptureManager( context = appContext, scope = scope, - sendToGateway = { message -> + sendToGateway = { message, onRunIdKnown -> val idempotencyKey = UUID.randomUUID().toString() + // Notify MicCaptureManager of the idempotency key *before* the network + // call so pendingRunId is set before any chat events can arrive. + onRunIdKnown(idempotencyKey) val params = buildJsonObject { put("sessionKey", JsonPrimitive(resolveMainSessionKey())) @@ -375,6 +378,9 @@ class NodeRuntime(context: Context) { val micEnabled: StateFlow get() = micCapture.micEnabled + val micCooldown: StateFlow + get() = micCapture.micCooldown + val micQueuedMessages: StateFlow> get() = micCapture.queuedMessages diff --git a/apps/android/app/src/main/java/ai/openclaw/android/ui/VoiceTabScreen.kt b/apps/android/app/src/main/java/ai/openclaw/android/ui/VoiceTabScreen.kt index fd0e0a8a4b9..921f5ed016e 100644 --- a/apps/android/app/src/main/java/ai/openclaw/android/ui/VoiceTabScreen.kt +++ b/apps/android/app/src/main/java/ai/openclaw/android/ui/VoiceTabScreen.kt @@ -80,7 +80,9 @@ fun VoiceTabScreen(viewModel: MainViewModel) { val gatewayStatus by viewModel.statusText.collectAsState() val micEnabled by viewModel.micEnabled.collectAsState() + val micCooldown by viewModel.micCooldown.collectAsState() val speakerEnabled by viewModel.speakerEnabled.collectAsState() + val micStatusText by viewModel.micStatusText.collectAsState() val micLiveTranscript by viewModel.micLiveTranscript.collectAsState() val micQueuedMessages by viewModel.micQueuedMessages.collectAsState() val micConversation by viewModel.micConversation.collectAsState() @@ -244,6 +246,7 @@ fun VoiceTabScreen(viewModel: MainViewModel) { } Button( onClick = { + if (micCooldown) return@Button if (micEnabled) { viewModel.setMicEnabled(false) return@Button @@ -255,13 +258,16 @@ fun VoiceTabScreen(viewModel: MainViewModel) { requestMicPermission.launch(Manifest.permission.RECORD_AUDIO) } }, + enabled = !micCooldown, shape = CircleShape, contentPadding = PaddingValues(0.dp), modifier = Modifier.size(60.dp), colors = ButtonDefaults.buttonColors( - containerColor = if (micEnabled) mobileDanger else mobileAccent, + containerColor = if (micCooldown) mobileTextSecondary else if (micEnabled) mobileDanger else mobileAccent, contentColor = Color.White, + disabledContainerColor = mobileTextSecondary, + disabledContentColor = Color.White.copy(alpha = 0.5f), ), ) { Icon( @@ -282,6 +288,7 @@ fun VoiceTabScreen(viewModel: MainViewModel) { when { queueCount > 0 -> "$queueCount queued" micIsSending -> "Sending" + micCooldown -> "Cooldown" micEnabled -> "Listening" else -> "Mic off" } diff --git a/apps/android/app/src/main/java/ai/openclaw/android/voice/ElevenLabsStreamingTts.kt b/apps/android/app/src/main/java/ai/openclaw/android/voice/ElevenLabsStreamingTts.kt index da011ebd5f9..76583565b4c 100644 --- a/apps/android/app/src/main/java/ai/openclaw/android/voice/ElevenLabsStreamingTts.kt +++ b/apps/android/app/src/main/java/ai/openclaw/android/voice/ElevenLabsStreamingTts.kt @@ -63,6 +63,8 @@ class ElevenLabsStreamingTts( private var client: OkHttpClient? = null @Volatile private var stopped = false @Volatile private var finished = false + @Volatile var hasReceivedAudio = false + private set private var drainJob: Job? = null // Track text already sent so we only send incremental chunks @@ -77,6 +79,7 @@ class ElevenLabsStreamingTts( fun start() { stopped = false finished = false + hasReceivedAudio = false sentTextLength = 0 trackStarted = false wsReady = false @@ -199,6 +202,7 @@ class ElevenLabsStreamingTts( /** * Returns true if text was accepted, false if text diverged (caller should restart). */ + @Synchronized fun sendText(fullText: String): Boolean { if (stopped) return false if (finished) return true // Already finishing — not a diverge, don't restart @@ -233,6 +237,7 @@ class ElevenLabsStreamingTts( * Signal that no more text is coming. Sends EOS to ElevenLabs. * The WebSocket will close after generating remaining audio. */ + @Synchronized fun finish() { if (stopped || finished) return finished = true @@ -278,6 +283,7 @@ class ElevenLabsStreamingTts( if (!trackStarted) { track.play() trackStarted = true + hasReceivedAudio = true Log.d(TAG, "AudioTrack started on first chunk") } diff --git a/apps/android/app/src/main/java/ai/openclaw/android/voice/MicCaptureManager.kt b/apps/android/app/src/main/java/ai/openclaw/android/voice/MicCaptureManager.kt index 5457d2dc1aa..099c7c1cd1e 100644 --- a/apps/android/app/src/main/java/ai/openclaw/android/voice/MicCaptureManager.kt +++ b/apps/android/app/src/main/java/ai/openclaw/android/voice/MicCaptureManager.kt @@ -11,7 +11,6 @@ import android.util.Log import android.speech.RecognitionListener import android.speech.RecognizerIntent import android.speech.SpeechRecognizer -import android.util.Log import androidx.core.content.ContextCompat import java.util.UUID import kotlinx.coroutines.CoroutineScope @@ -40,7 +39,12 @@ data class VoiceConversationEntry( class MicCaptureManager( private val context: Context, private val scope: CoroutineScope, - private val sendToGateway: suspend (String) -> String?, + /** + * Send [message] to the gateway and return the run ID. + * [onRunIdKnown] is called with the idempotency key *before* the network + * round-trip so [pendingRunId] is set before any chat events can arrive. + */ + private val sendToGateway: suspend (message: String, onRunIdKnown: (String) -> Unit) -> String?, private val speakAssistantReply: suspend (String) -> Unit = {}, ) { companion object { @@ -58,6 +62,9 @@ class MicCaptureManager( private val _micEnabled = MutableStateFlow(false) val micEnabled: StateFlow = _micEnabled + private val _micCooldown = MutableStateFlow(false) + val micCooldown: StateFlow = _micCooldown + private val _isListening = MutableStateFlow(false) val isListening: StateFlow = _isListening @@ -88,6 +95,7 @@ class MicCaptureManager( private var recognizer: SpeechRecognizer? = null private var restartJob: Job? = null + private var drainJob: Job? = null private var pendingRunTimeoutJob: Job? = null private var stopRequested = false @@ -98,8 +106,11 @@ class MicCaptureManager( start() sendQueuedIfIdle() } else { - // Give the recognizer time to finish processing buffered audio - scope.launch { + // Give the recognizer time to finish processing buffered audio. + // Cancel any prior drain to prevent duplicate sends on rapid toggle. + drainJob?.cancel() + _micCooldown.value = true + drainJob = scope.launch { delay(2000L) stop() // Capture any partial transcript that didn't get a final result from the recognizer @@ -108,6 +119,8 @@ class MicCaptureManager( sessionSegments.add(partial) } flushSessionToQueue() + drainJob = null + _micCooldown.value = false sendQueuedIfIdle() } } @@ -296,8 +309,13 @@ class MicCaptureManager( scope.launch { try { - val runId = sendToGateway(next) - pendingRunId = runId + val runId = sendToGateway(next) { earlyRunId -> + // Called with the idempotency key before chat.send fires so that + // pendingRunId is populated before any chat events can arrive. + pendingRunId = earlyRunId + } + // Update to the real runId if the gateway returned a different one. + if (runId != null && runId != pendingRunId) pendingRunId = runId if (runId == null) { pendingRunTimeoutJob?.cancel() pendingRunTimeoutJob = null