mirror of
https://github.com/openclaw/openclaw.git
synced 2026-05-05 04:30:24 +00:00
fix: restore android talk mode reply tts (#60306) (thanks @MKV21)
* Android: keep talk-mode session key synced for TTS replies * Android: harden talk-mode reply playback state * Android: harden talk-mode playback cancellation * Android: avoid stale talk-mode playback preemption * Android: tighten talk-mode playback claiming * fix: distill android talk-mode playback ownership * fix: restore android talk mode reply tts (#60306) (thanks @MKV21) --------- Co-authored-by: Michael Faath <michaelfaath@macbookpro.speedport.ip> Co-authored-by: Ayaan Zaidi <hi@obviy.us>
This commit is contained in:
@@ -345,6 +345,8 @@ class NodeRuntime(
|
||||
session = operatorSession,
|
||||
supportsChatSubscribe = false,
|
||||
isConnected = { operatorConnected },
|
||||
onBeforeSpeak = { micCapture.pauseForTts() },
|
||||
onAfterSpeak = { micCapture.resumeAfterTts() },
|
||||
).also { speaker ->
|
||||
speaker.setPlaybackEnabled(prefs.speakerEnabled.value)
|
||||
}
|
||||
@@ -416,14 +418,19 @@ class NodeRuntime(
|
||||
session = operatorSession,
|
||||
supportsChatSubscribe = true,
|
||||
isConnected = { operatorConnected },
|
||||
onBeforeSpeak = { micCapture.pauseForTts() },
|
||||
onAfterSpeak = { micCapture.resumeAfterTts() },
|
||||
)
|
||||
}
|
||||
|
||||
private fun syncMainSessionKey(agentId: String?) {
|
||||
val resolvedKey = resolveNodeMainSessionKey(agentId)
|
||||
// Always push the resolved session key into TalkMode, even when the
|
||||
// state flow value is unchanged, so lazy TalkMode instances do not
|
||||
// stay on the default "main" session key.
|
||||
talkMode.setMainSessionKey(resolvedKey)
|
||||
if (_mainSessionKey.value == resolvedKey) return
|
||||
_mainSessionKey.value = resolvedKey
|
||||
talkMode.setMainSessionKey(resolvedKey)
|
||||
chat.applyMainSessionKey(resolvedKey)
|
||||
updateHomeCanvasState()
|
||||
}
|
||||
|
||||
@@ -14,11 +14,13 @@ import android.speech.SpeechRecognizer
|
||||
import androidx.core.content.ContextCompat
|
||||
import java.util.UUID
|
||||
import kotlinx.coroutines.CoroutineScope
|
||||
import kotlinx.coroutines.Dispatchers
|
||||
import kotlinx.coroutines.Job
|
||||
import kotlinx.coroutines.delay
|
||||
import kotlinx.coroutines.flow.MutableStateFlow
|
||||
import kotlinx.coroutines.flow.StateFlow
|
||||
import kotlinx.coroutines.launch
|
||||
import kotlinx.coroutines.withContext
|
||||
import kotlinx.serialization.json.Json
|
||||
import kotlinx.serialization.json.JsonArray
|
||||
import kotlinx.serialization.json.JsonObject
|
||||
@@ -99,11 +101,27 @@ class MicCaptureManager(
|
||||
private var transcriptFlushJob: Job? = null
|
||||
private var pendingRunTimeoutJob: Job? = null
|
||||
private var stopRequested = false
|
||||
private val ttsPauseLock = Any()
|
||||
private var ttsPauseDepth = 0
|
||||
private var resumeMicAfterTts = false
|
||||
|
||||
fun setMicEnabled(enabled: Boolean) {
|
||||
if (_micEnabled.value == enabled) return
|
||||
_micEnabled.value = enabled
|
||||
if (enabled) {
|
||||
val pausedForTts =
|
||||
synchronized(ttsPauseLock) {
|
||||
if (ttsPauseDepth > 0) {
|
||||
resumeMicAfterTts = true
|
||||
true
|
||||
} else {
|
||||
false
|
||||
}
|
||||
}
|
||||
if (pausedForTts) {
|
||||
_statusText.value = if (_isSending.value) "Speaking · waiting for reply" else "Speaking…"
|
||||
return
|
||||
}
|
||||
start()
|
||||
sendQueuedIfIdle()
|
||||
} else {
|
||||
@@ -126,6 +144,58 @@ class MicCaptureManager(
|
||||
}
|
||||
}
|
||||
|
||||
suspend fun pauseForTts() {
|
||||
val shouldPause =
|
||||
synchronized(ttsPauseLock) {
|
||||
ttsPauseDepth += 1
|
||||
if (ttsPauseDepth > 1) return@synchronized false
|
||||
resumeMicAfterTts = _micEnabled.value
|
||||
val active = resumeMicAfterTts || recognizer != null || _isListening.value
|
||||
if (!active) return@synchronized false
|
||||
stopRequested = true
|
||||
restartJob?.cancel()
|
||||
restartJob = null
|
||||
transcriptFlushJob?.cancel()
|
||||
transcriptFlushJob = null
|
||||
_isListening.value = false
|
||||
_inputLevel.value = 0f
|
||||
_liveTranscript.value = null
|
||||
_statusText.value = if (_isSending.value) "Speaking · waiting for reply" else "Speaking…"
|
||||
true
|
||||
}
|
||||
if (!shouldPause) return
|
||||
withContext(Dispatchers.Main) {
|
||||
recognizer?.cancel()
|
||||
recognizer?.destroy()
|
||||
recognizer = null
|
||||
}
|
||||
}
|
||||
|
||||
suspend fun resumeAfterTts() {
|
||||
val shouldResume =
|
||||
synchronized(ttsPauseLock) {
|
||||
if (ttsPauseDepth == 0) return@synchronized false
|
||||
ttsPauseDepth -= 1
|
||||
if (ttsPauseDepth > 0) return@synchronized false
|
||||
val resume = resumeMicAfterTts && _micEnabled.value
|
||||
resumeMicAfterTts = false
|
||||
if (!resume) {
|
||||
_statusText.value =
|
||||
when {
|
||||
_micEnabled.value && _isSending.value -> "Listening · sending queued voice"
|
||||
_micEnabled.value -> "Listening"
|
||||
_isSending.value -> "Mic off · sending…"
|
||||
else -> "Mic off"
|
||||
}
|
||||
}
|
||||
resume
|
||||
}
|
||||
if (!shouldResume) return
|
||||
stopRequested = false
|
||||
start()
|
||||
sendQueuedIfIdle()
|
||||
}
|
||||
|
||||
fun onGatewayConnectionChanged(connected: Boolean) {
|
||||
gatewayConnected = connected
|
||||
if (connected) {
|
||||
|
||||
@@ -22,11 +22,13 @@ import ai.openclaw.app.gateway.GatewaySession
|
||||
import java.util.Locale
|
||||
import java.util.UUID
|
||||
import java.util.concurrent.atomic.AtomicLong
|
||||
import kotlin.coroutines.coroutineContext
|
||||
import kotlinx.coroutines.CancellationException
|
||||
import kotlinx.coroutines.CompletableDeferred
|
||||
import kotlinx.coroutines.CoroutineScope
|
||||
import kotlinx.coroutines.Dispatchers
|
||||
import kotlinx.coroutines.Job
|
||||
import kotlinx.coroutines.NonCancellable
|
||||
import kotlinx.coroutines.delay
|
||||
import kotlinx.coroutines.ensureActive
|
||||
import kotlinx.coroutines.flow.MutableStateFlow
|
||||
@@ -46,6 +48,8 @@ class TalkModeManager(
|
||||
private val session: GatewaySession,
|
||||
private val supportsChatSubscribe: Boolean,
|
||||
private val isConnected: () -> Boolean,
|
||||
private val onBeforeSpeak: suspend () -> Unit = {},
|
||||
private val onAfterSpeak: suspend () -> Unit = {},
|
||||
) {
|
||||
companion object {
|
||||
private const val tag = "TalkMode"
|
||||
@@ -101,6 +105,7 @@ class TalkModeManager(
|
||||
private val playbackGeneration = AtomicLong(0L)
|
||||
|
||||
private var ttsJob: Job? = null
|
||||
private val ttsJobLock = Any()
|
||||
private val ttsLock = Any()
|
||||
private var textToSpeech: TextToSpeech? = null
|
||||
private var textToSpeechInit: CompletableDeferred<TextToSpeech>? = null
|
||||
@@ -163,8 +168,11 @@ class TalkModeManager(
|
||||
?: waitForAssistantText(session, startedAt, if (ok) 12_000 else 25_000)
|
||||
if (!assistant.isNullOrBlank()) {
|
||||
val playbackToken = playbackGeneration.incrementAndGet()
|
||||
cancelActivePlayback()
|
||||
_statusText.value = "Speaking…"
|
||||
playAssistant(assistant, playbackToken)
|
||||
runPlaybackSession(playbackToken) {
|
||||
playAssistant(assistant, playbackToken)
|
||||
}
|
||||
} else {
|
||||
_statusText.value = "No reply"
|
||||
}
|
||||
@@ -180,14 +188,12 @@ class TalkModeManager(
|
||||
|
||||
fun playTtsForText(text: String) {
|
||||
val playbackToken = playbackGeneration.incrementAndGet()
|
||||
ttsJob?.cancel()
|
||||
ttsJob = scope.launch {
|
||||
cancelActivePlayback()
|
||||
scope.launch {
|
||||
reloadConfig()
|
||||
ensurePlaybackActive(playbackToken)
|
||||
_isSpeaking.value = true
|
||||
_statusText.value = "Speaking…"
|
||||
playAssistant(text, playbackToken)
|
||||
ttsJob = null
|
||||
runPlaybackSession(playbackToken) {
|
||||
playAssistant(text, playbackToken)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -270,10 +276,11 @@ class TalkModeManager(
|
||||
suspend fun speakAssistantReply(text: String) {
|
||||
if (!playbackEnabled) return
|
||||
val playbackToken = playbackGeneration.incrementAndGet()
|
||||
stopSpeaking(resetInterrupt = false)
|
||||
cancelActivePlayback()
|
||||
ensureConfigLoaded()
|
||||
ensurePlaybackActive(playbackToken)
|
||||
playAssistant(text, playbackToken)
|
||||
runPlaybackSession(playbackToken) {
|
||||
playAssistant(text, playbackToken)
|
||||
}
|
||||
}
|
||||
|
||||
private fun start() {
|
||||
@@ -483,9 +490,10 @@ class TalkModeManager(
|
||||
}
|
||||
Log.d(tag, "assistant text ok chars=${assistant.length}")
|
||||
val playbackToken = playbackGeneration.incrementAndGet()
|
||||
stopSpeaking(resetInterrupt = false)
|
||||
ensurePlaybackActive(playbackToken)
|
||||
playAssistant(assistant, playbackToken)
|
||||
cancelActivePlayback()
|
||||
runPlaybackSession(playbackToken) {
|
||||
playAssistant(assistant, playbackToken)
|
||||
}
|
||||
} catch (err: Throwable) {
|
||||
if (err is CancellationException) {
|
||||
Log.d(tag, "finalize speech cancelled")
|
||||
@@ -665,12 +673,60 @@ class TalkModeManager(
|
||||
}
|
||||
_statusText.value = "Speak failed: ${err.message ?: err::class.simpleName}"
|
||||
Log.w(tag, "system tts failed: ${err.message ?: err::class.simpleName}")
|
||||
} finally {
|
||||
|
||||
_isSpeaking.value = false
|
||||
}
|
||||
}
|
||||
|
||||
private suspend fun runPlaybackSession(
|
||||
playbackToken: Long,
|
||||
block: suspend () -> Unit,
|
||||
) {
|
||||
val currentJob = coroutineContext[Job]
|
||||
var shouldResumeAfterSpeak = false
|
||||
try {
|
||||
val claimedPlayback =
|
||||
synchronized(ttsJobLock) {
|
||||
if (!playbackEnabled || playbackToken != playbackGeneration.get()) {
|
||||
false
|
||||
} else {
|
||||
ttsJob = currentJob
|
||||
true
|
||||
}
|
||||
}
|
||||
if (!claimedPlayback) {
|
||||
ensurePlaybackActive(playbackToken)
|
||||
return
|
||||
}
|
||||
ensurePlaybackActive(playbackToken)
|
||||
shouldResumeAfterSpeak = true
|
||||
onBeforeSpeak()
|
||||
ensurePlaybackActive(playbackToken)
|
||||
_isSpeaking.value = true
|
||||
_statusText.value = "Speaking…"
|
||||
block()
|
||||
} finally {
|
||||
synchronized(ttsJobLock) {
|
||||
if (ttsJob === currentJob) {
|
||||
ttsJob = null
|
||||
}
|
||||
}
|
||||
_isSpeaking.value = false
|
||||
if (shouldResumeAfterSpeak) {
|
||||
withContext(NonCancellable) {
|
||||
onAfterSpeak()
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
private fun cancelActivePlayback() {
|
||||
val activeJob =
|
||||
synchronized(ttsJobLock) {
|
||||
ttsJob
|
||||
}
|
||||
activeJob?.cancel()
|
||||
stopTextToSpeechPlayback()
|
||||
}
|
||||
|
||||
private suspend fun speakWithSystemTts(text: String, directive: TalkDirective?, playbackToken: Long) {
|
||||
ensurePlaybackActive(playbackToken)
|
||||
val engine = ensureTextToSpeech()
|
||||
|
||||
Reference in New Issue
Block a user