diff --git a/apps/android/app/src/main/java/ai/openclaw/android/NodeRuntime.kt b/apps/android/app/src/main/java/ai/openclaw/android/NodeRuntime.kt
index 3e8548c5df4..202635a07c6 100644
--- a/apps/android/app/src/main/java/ai/openclaw/android/NodeRuntime.kt
+++ b/apps/android/app/src/main/java/ai/openclaw/android/NodeRuntime.kt
@@ -20,6 +20,7 @@ import ai.openclaw.android.gateway.probeGatewayTlsFingerprint
 import ai.openclaw.android.node.*
 import ai.openclaw.android.protocol.OpenClawCanvasA2UIAction
 import ai.openclaw.android.voice.MicCaptureManager
+import ai.openclaw.android.voice.TalkModeManager
 import ai.openclaw.android.voice.VoiceConversationEntry
 import kotlinx.coroutines.CoroutineScope
 import kotlinx.coroutines.Dispatchers
@@ -318,6 +319,18 @@ class NodeRuntime(context: Context) {
     json = json,
     supportsChatSubscribe = false,
   )
+  private val voiceReplySpeaker: TalkModeManager by lazy {
+    // Reuse the existing TalkMode speech engine (ElevenLabs + deterministic system-TTS fallback)
+    // without enabling the legacy talk capture loop.
+    TalkModeManager(
+      context = appContext,
+      scope = scope,
+      session = operatorSession,
+      supportsChatSubscribe = false,
+      isConnected = { operatorConnected },
+    )
+  }
+
   private val micCapture: MicCaptureManager by lazy {
     MicCaptureManager(
       context = appContext,
@@ -335,6 +348,9 @@ class NodeRuntime(context: Context) {
         val response = operatorSession.request("chat.send", params.toString())
         parseChatSendRunId(response) ?: idempotencyKey
       },
+      speakAssistantReply = { text ->
+        voiceReplySpeaker.speakAssistantReply(text)
+      },
     )
   }
 
diff --git a/apps/android/app/src/main/java/ai/openclaw/android/voice/MicCaptureManager.kt b/apps/android/app/src/main/java/ai/openclaw/android/voice/MicCaptureManager.kt
index c28e523a182..70f228a733a 100644
--- a/apps/android/app/src/main/java/ai/openclaw/android/voice/MicCaptureManager.kt
+++ b/apps/android/app/src/main/java/ai/openclaw/android/voice/MicCaptureManager.kt
@@ -10,6 +10,8 @@ import android.os.Looper
 import android.speech.RecognitionListener
 import android.speech.RecognizerIntent
 import android.speech.SpeechRecognizer
+import android.util.Log
 import androidx.core.content.ContextCompat
 import java.util.UUID
+import kotlinx.coroutines.CancellationException
 import kotlinx.coroutines.CoroutineScope
@@ -39,8 +41,10 @@ class MicCaptureManager(
   private val context: Context,
   private val scope: CoroutineScope,
   private val sendToGateway: suspend (String) -> String?,
+  private val speakAssistantReply: suspend (String) -> Unit = {},
 ) {
   companion object {
+    private const val tag = "MicCapture"
     private const val speechMinSessionMs = 30_000L
     private const val speechCompleteSilenceMs = 1_500L
     private const val speechPossibleSilenceMs = 900L
@@ -140,6 +144,7 @@ class MicCaptureManager(
     val finalText = parseAssistantText(payload)?.trim().orEmpty()
     if (finalText.isNotEmpty()) {
       upsertPendingAssistant(text = finalText, isStreaming = false)
+      playAssistantReplyAsync(finalText)
     } else if (pendingAssistantEntryId != null) {
       updateConversationEntry(pendingAssistantEntryId!!, text = null, isStreaming = false)
     }
@@ -386,6 +391,23 @@ class MicCaptureManager(
     updateConversationEntry(id = currentId, text = text, isStreaming = isStreaming)
   }
 
+  // Fire-and-forget speech of a finalized assistant reply; never blocks transcript handling.
+  private fun playAssistantReplyAsync(text: String) {
+    val spoken = text.trim()
+    if (spoken.isEmpty()) return
+    scope.launch {
+      try {
+        speakAssistantReply(spoken)
+      } catch (err: CancellationException) {
+        // Cooperative cancellation must propagate so the scope can wind down cleanly.
+        throw err
+      } catch (err: Throwable) {
+        // Best-effort: a speech failure must never surface as a capture error.
+        Log.w(tag, "assistant speech failed: ${err.message ?: err::class.simpleName}")
+      }
+    }
+  }
+
   private fun onFinalTranscript(text: String) {
     val trimmed = text.trim()
     if (trimmed.isEmpty()) return