fix(android): speak final voice replies in mic capture flow

This commit is contained in:
Ayaan Zaidi
2026-02-28 18:51:34 +05:30
committed by Ayaan Zaidi
parent fcf3e5b0a0
commit fb92a91ef7
2 changed files with 32 additions and 0 deletions

View File

@@ -20,6 +20,7 @@ import ai.openclaw.android.gateway.probeGatewayTlsFingerprint
import ai.openclaw.android.node.*
import ai.openclaw.android.protocol.OpenClawCanvasA2UIAction
import ai.openclaw.android.voice.MicCaptureManager
import ai.openclaw.android.voice.TalkModeManager
import ai.openclaw.android.voice.VoiceConversationEntry
import kotlinx.coroutines.CoroutineScope
import kotlinx.coroutines.Dispatchers
@@ -318,6 +319,18 @@ class NodeRuntime(context: Context) {
json = json,
supportsChatSubscribe = false,
)
private val voiceReplySpeaker: TalkModeManager by lazy {
// Reuse the existing TalkMode speech engine (ElevenLabs + deterministic system-TTS fallback)
// without enabling the legacy talk capture loop.
TalkModeManager(
context = appContext,
scope = scope,
session = operatorSession,
supportsChatSubscribe = false,
isConnected = { operatorConnected },
)
}
private val micCapture: MicCaptureManager by lazy {
MicCaptureManager(
context = appContext,
@@ -335,6 +348,9 @@ class NodeRuntime(context: Context) {
val response = operatorSession.request("chat.send", params.toString())
parseChatSendRunId(response) ?: idempotencyKey
},
speakAssistantReply = { text ->
voiceReplySpeaker.speakAssistantReply(text)
},
)
}

View File

@@ -10,6 +10,7 @@ import android.os.Looper
import android.speech.RecognitionListener
import android.speech.RecognizerIntent
import android.speech.SpeechRecognizer
import android.util.Log
import androidx.core.content.ContextCompat
import java.util.UUID
import kotlinx.coroutines.CoroutineScope
@@ -39,8 +40,10 @@ class MicCaptureManager(
private val context: Context,
private val scope: CoroutineScope,
private val sendToGateway: suspend (String) -> String?,
private val speakAssistantReply: suspend (String) -> Unit = {},
) {
companion object {
private const val tag = "MicCapture"
private const val speechMinSessionMs = 30_000L
private const val speechCompleteSilenceMs = 1_500L
private const val speechPossibleSilenceMs = 900L
@@ -140,6 +143,7 @@ class MicCaptureManager(
val finalText = parseAssistantText(payload)?.trim().orEmpty()
if (finalText.isNotEmpty()) {
upsertPendingAssistant(text = finalText, isStreaming = false)
playAssistantReplyAsync(finalText)
} else if (pendingAssistantEntryId != null) {
updateConversationEntry(pendingAssistantEntryId!!, text = null, isStreaming = false)
}
@@ -386,6 +390,18 @@ class MicCaptureManager(
updateConversationEntry(id = currentId, text = text, isStreaming = isStreaming)
}
private fun playAssistantReplyAsync(text: String) {
val spoken = text.trim()
if (spoken.isEmpty()) return
scope.launch {
try {
speakAssistantReply(spoken)
} catch (err: Throwable) {
Log.w(tag, "assistant speech failed: ${err.message ?: err::class.simpleName}")
}
}
}
private fun onFinalTranscript(text: String) {
val trimmed = text.trim()
if (trimmed.isEmpty()) return