fix(android): speak final voice replies in mic capture flow

2026-05-06 09:50:42 +00:00 · 2026-02-28 18:51:34 +05:30
parent fcf3e5b0a0
commit fb92a91ef7
2 changed files with 32 additions and 0 deletions
--- a/apps/android/app/src/main/java/ai/openclaw/android/NodeRuntime.kt
+++ b/apps/android/app/src/main/java/ai/openclaw/android/NodeRuntime.kt
@@ -20,6 +20,7 @@ import ai.openclaw.android.gateway.probeGatewayTlsFingerprint
 import ai.openclaw.android.node.*
 import ai.openclaw.android.protocol.OpenClawCanvasA2UIAction
 import ai.openclaw.android.voice.MicCaptureManager
+import ai.openclaw.android.voice.TalkModeManager
 import ai.openclaw.android.voice.VoiceConversationEntry
 import kotlinx.coroutines.CoroutineScope
 import kotlinx.coroutines.Dispatchers
@@ -318,6 +319,18 @@ class NodeRuntime(context: Context) {
      json = json,
      supportsChatSubscribe = false,
    )
+  private val voiceReplySpeaker: TalkModeManager by lazy {
+    // Reuse the existing TalkMode speech engine (ElevenLabs + deterministic system-TTS fallback)
+    // without enabling the legacy talk capture loop.
+    // Built lazily on first access; the caller visible here is micCapture's speakAssistantReply
+    // callback, which forwards final reply text to speakAssistantReply(text) on this instance.
+    TalkModeManager(
+      context = appContext,
+      scope = scope,
+      session = operatorSession,
+      supportsChatSubscribe = false,
+      isConnected = { operatorConnected },
+    )
+  }
+
  private val micCapture: MicCaptureManager by lazy {
    MicCaptureManager(
      context = appContext,
@@ -335,6 +348,9 @@ class NodeRuntime(context: Context) {
        val response = operatorSession.request("chat.send", params.toString())
        parseChatSendRunId(response) ?: idempotencyKey
      },
+      speakAssistantReply = { text ->
+        voiceReplySpeaker.speakAssistantReply(text)
+      },
    )
  }

--- a/apps/android/app/src/main/java/ai/openclaw/android/voice/MicCaptureManager.kt
+++ b/apps/android/app/src/main/java/ai/openclaw/android/voice/MicCaptureManager.kt
@@ -10,6 +10,7 @@ import android.os.Looper
 import android.speech.RecognitionListener
 import android.speech.RecognizerIntent
 import android.speech.SpeechRecognizer
+import android.util.Log
 import androidx.core.content.ContextCompat
 import java.util.UUID
 import kotlinx.coroutines.CoroutineScope
@@ -39,8 +40,10 @@ class MicCaptureManager(
  private val context: Context,
  private val scope: CoroutineScope,
  private val sendToGateway: suspend (String) -> String?,
+  private val speakAssistantReply: suspend (String) -> Unit = {},
 ) {
  companion object {
+    private const val tag = "MicCapture"
    private const val speechMinSessionMs = 30_000L
    private const val speechCompleteSilenceMs = 1_500L
    private const val speechPossibleSilenceMs = 900L
@@ -140,6 +143,7 @@ class MicCaptureManager(
        val finalText = parseAssistantText(payload)?.trim().orEmpty()
        if (finalText.isNotEmpty()) {
          upsertPendingAssistant(text = finalText, isStreaming = false)
+          playAssistantReplyAsync(finalText)
        } else if (pendingAssistantEntryId != null) {
          updateConversationEntry(pendingAssistantEntryId!!, text = null, isStreaming = false)
        }
@@ -386,6 +390,18 @@ class MicCaptureManager(
    updateConversationEntry(id = currentId, text = text, isStreaming = isStreaming)
  }

+  /**
+   * Speaks [text] through the injected [speakAssistantReply] callback without blocking the caller.
+   *
+   * Blank text is ignored. The callback runs in a coroutine launched on [scope]; playback is
+   * best-effort, so failures are logged at warning level and never propagate to the caller.
+   */
+  private fun playAssistantReplyAsync(text: String) {
+    val spoken = text.trim()
+    if (spoken.isEmpty()) return
+    scope.launch {
+      try {
+        speakAssistantReply(spoken)
+      } catch (err: kotlin.coroutines.cancellation.CancellationException) {
+        // Cancellation is control flow, not a speech failure: rethrow so the coroutine
+        // completes as cancelled instead of being logged and swallowed below.
+        throw err
+      } catch (err: Throwable) {
+        Log.w(tag, "assistant speech failed: ${err.message ?: err::class.simpleName}")
+      }
+    }
+  }
+
  private fun onFinalTranscript(text: String) {
    val trimmed = text.trim()
    if (trimmed.isEmpty()) return