feat: wire talk handoff into native nodes

This commit is contained in:
Peter Steinberger
2026-05-05 20:59:46 +01:00
parent c434d7720b
commit 466f718320
34 changed files with 2474 additions and 89 deletions

View File

@@ -36,6 +36,7 @@ import ai.openclaw.app.node.Quad
import ai.openclaw.app.node.SmsHandler
import ai.openclaw.app.node.SmsManager
import ai.openclaw.app.node.SystemHandler
import ai.openclaw.app.node.TalkHandler
import ai.openclaw.app.node.asObjectOrNull
import ai.openclaw.app.node.asStringOrNull
import ai.openclaw.app.node.invokeErrorFromThrowable
@@ -205,6 +206,16 @@ class NodeRuntime(
deviceHandler = deviceHandler,
notificationsHandler = notificationsHandler,
systemHandler = systemHandler,
talkHandler =
object : TalkHandler {
override suspend fun handlePttStart(paramsJson: String?): GatewaySession.InvokeResult = handleTalkPttStart()
override suspend fun handlePttStop(paramsJson: String?): GatewaySession.InvokeResult = handleTalkPttStop()
override suspend fun handlePttCancel(paramsJson: String?): GatewaySession.InvokeResult = handleTalkPttCancel()
override suspend fun handlePttOnce(paramsJson: String?): GatewaySession.InvokeResult = handleTalkPttOnce()
},
photosHandler = photosHandler,
contactsHandler = contactsHandler,
calendarHandler = calendarHandler,
@@ -881,6 +892,80 @@ class NodeRuntime(
setVoiceCaptureMode(if (value) VoiceCaptureMode.TalkMode else VoiceCaptureMode.Off)
}
private suspend fun handleTalkPttStart(): GatewaySession.InvokeResult =
runPreparedTalkPttCommand {
val payload = talkMode.beginPushToTalk()
GatewaySession.InvokeResult.ok(payload.toJson())
}
private suspend fun handleTalkPttStop(): GatewaySession.InvokeResult =
runTalkPttCommand {
val payload = talkMode.endPushToTalk()
finishTalkCaptureIfIdle()
GatewaySession.InvokeResult.ok(payload.toJson())
}
private suspend fun handleTalkPttCancel(): GatewaySession.InvokeResult =
runTalkPttCommand {
val payload = talkMode.cancelPushToTalk()
finishTalkCaptureIfIdle()
GatewaySession.InvokeResult.ok(payload.toJson())
}
private suspend fun handleTalkPttOnce(): GatewaySession.InvokeResult =
runPreparedTalkPttCommand {
val payload = talkMode.runPushToTalkOnce()
finishTalkCaptureIfIdle()
GatewaySession.InvokeResult.ok(payload.toJson())
}
private suspend fun runPreparedTalkPttCommand(block: suspend () -> GatewaySession.InvokeResult): GatewaySession.InvokeResult =
runTalkPttCommand {
prepareTalkCapture()
try {
block()
} catch (err: Throwable) {
cleanupFailedTalkCapture()
throw err
}
}
private suspend fun runTalkPttCommand(block: suspend () -> GatewaySession.InvokeResult): GatewaySession.InvokeResult =
try {
block()
} catch (err: Throwable) {
val (code, message) = invokeErrorFromThrowable(err)
GatewaySession.InvokeResult.error(code = code, message = message)
}
private suspend fun prepareTalkCapture() {
if (!hasRecordAudioPermission()) {
throw IllegalStateException("MIC_PERMISSION_REQUIRED: grant Microphone permission")
}
micCapture.setMicEnabled(false)
stopVoicePlayback()
NodeForegroundService.setVoiceCaptureMode(appContext, VoiceCaptureMode.TalkMode)
talkMode.ttsOnAllResponses = true
talkMode.setPlaybackEnabled(speakerEnabled.value)
talkMode.ensureChatSubscribed()
externalAudioCaptureActive.value = true
}
private suspend fun cleanupFailedTalkCapture() {
runCatching { talkMode.cancelPushToTalk() }
talkMode.ttsOnAllResponses = false
NodeForegroundService.setVoiceCaptureMode(appContext, VoiceCaptureMode.Off)
externalAudioCaptureActive.value = false
}
private fun finishTalkCaptureIfIdle() {
if (!talkMode.isEnabled.value && !talkMode.isListening.value && !talkMode.isSpeaking.value) {
talkMode.ttsOnAllResponses = false
NodeForegroundService.setVoiceCaptureMode(appContext, VoiceCaptureMode.Off)
externalAudioCaptureActive.value = false
}
}
val speakerEnabled: StateFlow<Boolean>
get() = prefs.speakerEnabled

View File

@@ -278,14 +278,13 @@ class GatewayDiscovery(
return legacyHostAddress(resolved)
}
private fun legacyHostAddress(resolved: NsdServiceInfo): String? {
return try {
private fun legacyHostAddress(resolved: NsdServiceInfo): String? =
try {
val host = NsdServiceInfo::class.java.getMethod("getHost").invoke(resolved) as? InetAddress
host?.hostAddress
} catch (_: Throwable) {
null
}
}
private fun publish() {
_gateways.value =
@@ -529,20 +528,20 @@ class GatewayDiscovery(
val cm = connectivity ?: return null
// Prefer VPN (Tailscale) when present; otherwise use the active network.
trackedNetworks(cm).firstOrNull { n ->
val caps = cm.getNetworkCapabilities(n) ?: return@firstOrNull false
caps.hasTransport(NetworkCapabilities.TRANSPORT_VPN)
}?.let { return it }
trackedNetworks(cm)
.firstOrNull { n ->
val caps = cm.getNetworkCapabilities(n) ?: return@firstOrNull false
caps.hasTransport(NetworkCapabilities.TRANSPORT_VPN)
}?.let { return it }
return cm.activeNetwork
}
private fun trackedNetworks(cm: ConnectivityManager): List<Network> {
return buildList {
private fun trackedNetworks(cm: ConnectivityManager): List<Network> =
buildList {
cm.activeNetwork?.let(::add)
addAll(availableNetworks)
}.distinct()
}
private fun createDirectResolver(): Resolver? {
val cm = connectivity ?: return null

View File

@@ -14,6 +14,7 @@ import ai.openclaw.app.protocol.OpenClawNotificationsCommand
import ai.openclaw.app.protocol.OpenClawPhotosCommand
import ai.openclaw.app.protocol.OpenClawSmsCommand
import ai.openclaw.app.protocol.OpenClawSystemCommand
import ai.openclaw.app.protocol.OpenClawTalkCommand
data class NodeRuntimeFlags(
val cameraEnabled: Boolean,
@@ -81,6 +82,7 @@ object InvokeCommandRegistry {
name = OpenClawCapability.VoiceWake.rawValue,
availability = NodeCapabilityAvailability.VoiceWakeEnabled,
),
NodeCapabilitySpec(name = OpenClawCapability.Talk.rawValue),
NodeCapabilitySpec(
name = OpenClawCapability.Location.rawValue,
availability = NodeCapabilityAvailability.LocationEnabled,
@@ -135,6 +137,18 @@ object InvokeCommandRegistry {
InvokeCommandSpec(
name = OpenClawSystemCommand.Notify.rawValue,
),
InvokeCommandSpec(
name = OpenClawTalkCommand.PttStart.rawValue,
),
InvokeCommandSpec(
name = OpenClawTalkCommand.PttStop.rawValue,
),
InvokeCommandSpec(
name = OpenClawTalkCommand.PttCancel.rawValue,
),
InvokeCommandSpec(
name = OpenClawTalkCommand.PttOnce.rawValue,
),
InvokeCommandSpec(
name = OpenClawCameraCommand.List.rawValue,
requiresForeground = true,

View File

@@ -13,6 +13,7 @@ import ai.openclaw.app.protocol.OpenClawMotionCommand
import ai.openclaw.app.protocol.OpenClawNotificationsCommand
import ai.openclaw.app.protocol.OpenClawSmsCommand
import ai.openclaw.app.protocol.OpenClawSystemCommand
import ai.openclaw.app.protocol.OpenClawTalkCommand
internal enum class SmsSearchAvailabilityReason {
Available,
@@ -59,6 +60,7 @@ class InvokeDispatcher(
private val deviceHandler: DeviceHandler,
private val notificationsHandler: NotificationsHandler,
private val systemHandler: SystemHandler,
private val talkHandler: TalkHandler,
private val photosHandler: PhotosHandler,
private val contactsHandler: ContactsHandler,
private val calendarHandler: CalendarHandler,
@@ -188,6 +190,12 @@ class InvokeDispatcher(
// System command
OpenClawSystemCommand.Notify.rawValue -> systemHandler.handleSystemNotify(paramsJson)
// Talk commands
OpenClawTalkCommand.PttStart.rawValue -> talkHandler.handlePttStart(paramsJson)
OpenClawTalkCommand.PttStop.rawValue -> talkHandler.handlePttStop(paramsJson)
OpenClawTalkCommand.PttCancel.rawValue -> talkHandler.handlePttCancel(paramsJson)
OpenClawTalkCommand.PttOnce.rawValue -> talkHandler.handlePttOnce(paramsJson)
// Photos command
ai.openclaw.app.protocol.OpenClawPhotosCommand.Latest.rawValue ->
photosHandler.handlePhotosLatest(
@@ -336,3 +344,13 @@ class InvokeDispatcher(
}
}
}
interface TalkHandler {
suspend fun handlePttStart(paramsJson: String?): GatewaySession.InvokeResult
suspend fun handlePttStop(paramsJson: String?): GatewaySession.InvokeResult
suspend fun handlePttCancel(paramsJson: String?): GatewaySession.InvokeResult
suspend fun handlePttOnce(paramsJson: String?): GatewaySession.InvokeResult
}

View File

@@ -7,6 +7,7 @@ enum class OpenClawCapability(
Camera("camera"),
Sms("sms"),
VoiceWake("voiceWake"),
Talk("talk"),
Location("location"),
Device("device"),
Notifications("notifications"),
@@ -71,6 +72,20 @@ enum class OpenClawSmsCommand(
}
}
enum class OpenClawTalkCommand(
val rawValue: String,
) {
PttStart("talk.ptt.start"),
PttStop("talk.ptt.stop"),
PttCancel("talk.ptt.cancel"),
PttOnce("talk.ptt.once"),
;
companion object {
const val NamespacePrefix: String = "talk."
}
}
enum class OpenClawLocationCommand(
val rawValue: String,
) {

View File

@@ -0,0 +1,45 @@
package ai.openclaw.app.voice
import kotlinx.serialization.json.JsonArray
import kotlinx.serialization.json.JsonElement
import kotlinx.serialization.json.JsonObject
import kotlinx.serialization.json.JsonPrimitive
internal object ChatEventText {
fun assistantTextFromPayload(payload: JsonObject): String? = assistantTextFromMessage(payload["message"])
fun assistantTextFromMessage(messageEl: JsonElement?): String? {
val message = messageEl.asObjectOrNull() ?: return null
val role = message["role"].asStringOrNull()
if (role != null && role != "assistant") return null
return textFromContent(message["content"])
}
private fun textFromContent(content: JsonElement?): String? =
when (content) {
is JsonPrimitive -> content.asStringOrNull()?.trim()?.takeIf { it.isNotEmpty() }
is JsonArray ->
content
.mapNotNull(::textFromContentPart)
.filter { it.isNotEmpty() }
.joinToString("\n")
.takeIf { it.isNotBlank() }
else -> null
}
private fun textFromContentPart(part: JsonElement): String? {
part
.asStringOrNull()
?.trim()
?.takeIf { it.isNotEmpty() }
?.let { return it }
val obj = part.asObjectOrNull() ?: return null
val type = obj["type"].asStringOrNull()
if (type != null && type != "text") return null
return obj["text"].asStringOrNull()?.trim()?.takeIf { it.isNotEmpty() }
}
}
private fun JsonElement?.asObjectOrNull(): JsonObject? = this as? JsonObject
private fun JsonElement?.asStringOrNull(): String? = (this as? JsonPrimitive)?.takeIf { it.isString }?.content

View File

@@ -21,7 +21,6 @@ import kotlinx.coroutines.flow.StateFlow
import kotlinx.coroutines.launch
import kotlinx.coroutines.withContext
import kotlinx.serialization.json.Json
import kotlinx.serialization.json.JsonArray
import kotlinx.serialization.json.JsonObject
import kotlinx.serialization.json.JsonPrimitive
import java.util.UUID
@@ -596,20 +595,7 @@ class MicCaptureManager(
PackageManager.PERMISSION_GRANTED
)
private fun parseAssistantText(payload: JsonObject): String? {
val message = payload["message"].asObjectOrNull() ?: return null
if (message["role"].asStringOrNull() != "assistant") return null
val content = message["content"] as? JsonArray ?: return null
val parts =
content.mapNotNull { item ->
val obj = item.asObjectOrNull() ?: return@mapNotNull null
if (obj["type"].asStringOrNull() != "text") return@mapNotNull null
obj["text"].asStringOrNull()?.trim()?.takeIf { it.isNotEmpty() }
}
if (parts.isEmpty()) return null
return parts.joinToString("\n")
}
private fun parseAssistantText(payload: JsonObject): String? = ChatEventText.assistantTextFromPayload(payload)
private val listener =
object : RecognitionListener {

View File

@@ -12,20 +12,26 @@ import kotlinx.coroutines.delay
import kotlinx.coroutines.withContext
import java.io.File
internal interface TalkAudioPlaying {
suspend fun play(audio: TalkSpeakAudio)
fun stop()
}
internal class TalkAudioPlayer(
private val context: Context,
) {
) : TalkAudioPlaying {
private val lock = Any()
private var active: ActivePlayback? = null
suspend fun play(audio: TalkSpeakAudio) {
override suspend fun play(audio: TalkSpeakAudio) {
when (val mode = resolvePlaybackMode(audio)) {
is TalkPlaybackMode.Pcm -> playPcm(audio.bytes, mode.sampleRate)
is TalkPlaybackMode.Compressed -> playCompressed(audio.bytes, mode.fileExtension)
}
}
fun stop() {
override fun stop() {
synchronized(lock) {
active?.cancel()
active = null

View File

@@ -41,7 +41,28 @@ import java.util.UUID
import java.util.concurrent.atomic.AtomicLong
import kotlin.coroutines.coroutineContext
class TalkModeManager(
data class TalkPttStartPayload(
val captureId: String,
) {
fun toJson(): String = """{"captureId":"$captureId"}"""
}
data class TalkPttStopPayload(
val captureId: String,
val transcript: String?,
val status: String,
) {
fun toJson(): String =
buildJsonObject {
put("captureId", JsonPrimitive(captureId))
if (transcript != null) {
put("transcript", JsonPrimitive(transcript))
}
put("status", JsonPrimitive(status))
}.toString()
}
class TalkModeManager internal constructor(
private val context: Context,
private val scope: CoroutineScope,
private val session: GatewaySession,
@@ -49,6 +70,8 @@ class TalkModeManager(
private val isConnected: () -> Boolean,
private val onBeforeSpeak: suspend () -> Unit = {},
private val onAfterSpeak: suspend () -> Unit = {},
private val talkSpeakClient: TalkSpeechSynthesizing = TalkSpeakClient(session = session),
private val talkAudioPlayer: TalkAudioPlaying = TalkAudioPlayer(context),
) {
companion object {
private const val tag = "TalkMode"
@@ -60,9 +83,6 @@ class TalkModeManager(
private val mainHandler = Handler(Looper.getMainLooper())
private val json = Json { ignoreUnknownKeys = true }
private val talkSpeakClient = TalkSpeakClient(session = session, json = json)
private val talkAudioPlayer = TalkAudioPlayer(context)
private val _isEnabled = MutableStateFlow(false)
val isEnabled: StateFlow<Boolean> = _isEnabled
@@ -82,6 +102,10 @@ class TalkModeManager(
private var restartJob: Job? = null
private var stopRequested = false
private var listeningMode = false
private var activePttCaptureId: String? = null
private var pttAutoStopEnabled = false
private var pttTimeoutJob: Job? = null
private var pttCompletion: CompletableDeferred<TalkPttStopPayload>? = null
private var silenceJob: Job? = null
private var silenceWindowMs = TalkDefaults.defaultSilenceTimeoutMs
@@ -156,6 +180,127 @@ class TalkModeManager(
}
}
suspend fun beginPushToTalk(): TalkPttStartPayload {
if (!isConnected()) {
_statusText.value = "Gateway not connected"
throw IllegalStateException("UNAVAILABLE: Gateway not connected")
}
activePttCaptureId?.let { return TalkPttStartPayload(captureId = it) }
stopSpeaking(resetInterrupt = false)
pttTimeoutJob?.cancel()
pttTimeoutJob = null
pttAutoStopEnabled = false
pttCompletion = null
silenceJob?.cancel()
silenceJob = null
listeningMode = false
finalizeInFlight = false
stopRequested = false
lastTranscript = ""
lastHeardAtMs = null
val micOk =
ContextCompat.checkSelfPermission(context, Manifest.permission.RECORD_AUDIO) ==
PackageManager.PERMISSION_GRANTED
if (!micOk) {
_statusText.value = "Microphone permission required"
throw IllegalStateException("MIC_PERMISSION_REQUIRED: grant Microphone permission")
}
if (!SpeechRecognizer.isRecognitionAvailable(context)) {
_statusText.value = "Speech recognizer unavailable"
throw IllegalStateException("UNAVAILABLE: Speech recognizer unavailable")
}
val captureId = UUID.randomUUID().toString()
activePttCaptureId = captureId
withContext(Dispatchers.Main) {
recognizer?.cancel()
recognizer?.destroy()
recognizer = SpeechRecognizer.createSpeechRecognizer(context).also { it.setRecognitionListener(listener) }
startListeningInternal(markListening = true)
}
_statusText.value = "Listening (PTT)"
return TalkPttStartPayload(captureId = captureId)
}
suspend fun endPushToTalk(): TalkPttStopPayload {
val captureId = activePttCaptureId ?: UUID.randomUUID().toString()
if (activePttCaptureId == null) {
return finishPushToTalk(TalkPttStopPayload(captureId = captureId, transcript = null, status = "idle"))
}
clearPushToTalkRecognition()
val transcript = lastTranscript.trim()
lastTranscript = ""
lastHeardAtMs = null
if (transcript.isEmpty()) {
_statusText.value = if (_isEnabled.value) "Listening" else "Ready"
if (_isEnabled.value) {
start()
}
return finishPushToTalk(TalkPttStopPayload(captureId = captureId, transcript = null, status = "empty"))
}
if (!isConnected()) {
_statusText.value = "Gateway not connected"
if (_isEnabled.value) {
start()
}
return finishPushToTalk(TalkPttStopPayload(captureId = captureId, transcript = transcript, status = "offline"))
}
_statusText.value = "Thinking…"
scope.launch {
finalizeTranscript(transcript)
}
return finishPushToTalk(TalkPttStopPayload(captureId = captureId, transcript = transcript, status = "queued"))
}
suspend fun cancelPushToTalk(): TalkPttStopPayload {
val captureId = activePttCaptureId ?: UUID.randomUUID().toString()
if (activePttCaptureId == null) {
return finishPushToTalk(TalkPttStopPayload(captureId = captureId, transcript = null, status = "idle"))
}
clearPushToTalkRecognition()
lastTranscript = ""
lastHeardAtMs = null
_statusText.value = if (_isEnabled.value) "Listening" else "Ready"
if (_isEnabled.value) {
start()
}
return finishPushToTalk(TalkPttStopPayload(captureId = captureId, transcript = null, status = "cancelled"))
}
suspend fun runPushToTalkOnce(maxDurationMs: Long = 12_000L): TalkPttStopPayload {
if (pttCompletion != null) {
cancelPushToTalk()
}
if (activePttCaptureId != null) {
return TalkPttStopPayload(
captureId = activePttCaptureId ?: UUID.randomUUID().toString(),
transcript = null,
status = "busy",
)
}
beginPushToTalk()
val completion = CompletableDeferred<TalkPttStopPayload>()
pttCompletion = completion
pttAutoStopEnabled = true
startSilenceMonitor()
pttTimeoutJob =
scope.launch {
delay(maxDurationMs)
if (pttAutoStopEnabled && activePttCaptureId != null) {
endPushToTalk()
}
}
return completion.await()
}
/**
* Speak a wake-word command through TalkMode's full pipeline:
* chat.send → wait for final → read assistant text → TTS.
@@ -335,6 +480,12 @@ class TalkModeManager(
stopRequested = true
finalizeInFlight = false
listeningMode = false
activePttCaptureId = null
pttAutoStopEnabled = false
pttCompletion?.cancel()
pttCompletion = null
pttTimeoutJob?.cancel()
pttTimeoutJob = null
restartJob?.cancel()
restartJob = null
silenceJob?.cancel()
@@ -434,7 +585,7 @@ class TalkModeManager(
silenceJob?.cancel()
silenceJob =
scope.launch {
while (_isEnabled.value) {
while (_isEnabled.value || pttAutoStopEnabled) {
delay(200)
checkSilence()
}
@@ -448,6 +599,12 @@ class TalkModeManager(
val lastHeard = lastHeardAtMs ?: return
val elapsed = SystemClock.elapsedRealtime() - lastHeard
if (elapsed < silenceWindowMs) return
if (activePttCaptureId != null) {
if (pttAutoStopEnabled) {
scope.launch { endPushToTalk() }
}
return
}
if (finalizeInFlight) return
finalizeInFlight = true
scope.launch {
@@ -525,6 +682,27 @@ class TalkModeManager(
}
}
private suspend fun clearPushToTalkRecognition() {
pttTimeoutJob?.cancel()
pttTimeoutJob = null
pttAutoStopEnabled = false
activePttCaptureId = null
_isListening.value = false
listeningMode = false
clearListenWatchdog()
withContext(Dispatchers.Main) {
recognizer?.cancel()
recognizer?.destroy()
recognizer = null
}
}
private fun finishPushToTalk(payload: TalkPttStopPayload): TalkPttStopPayload {
pttCompletion?.complete(payload)
pttCompletion = null
return payload
}
private suspend fun subscribeChatIfNeeded(
session: GatewaySession,
sessionKey: String,
@@ -656,20 +834,7 @@ class TalkModeManager(
}
}
private fun extractTextFromChatEventMessage(messageEl: JsonElement?): String? {
val msg = messageEl?.asObjectOrNull() ?: return null
val content = msg["content"] as? JsonArray ?: return null
return content
.mapNotNull { entry ->
entry
.asObjectOrNull()
?.get("text")
?.asStringOrNull()
?.trim()
}.filter { it.isNotEmpty() }
.joinToString("\n")
.takeIf { it.isNotBlank() }
}
private fun extractTextFromChatEventMessage(messageEl: JsonElement?): String? = ChatEventText.assistantTextFromMessage(messageEl)
private suspend fun waitForAssistantText(
session: GatewaySession,
@@ -729,17 +894,16 @@ class TalkModeManager(
_lastAssistantText.value = cleaned
ensurePlaybackActive(playbackToken)
_statusText.value = "Speaking"
_isSpeaking.value = true
_statusText.value = "Generating voice"
_isSpeaking.value = false
lastSpokenText = cleaned
ensureInterruptListener()
requestAudioFocusForTts()
try {
val started = SystemClock.elapsedRealtime()
when (val result = talkSpeakClient.synthesize(text = cleaned, directive = directive)) {
is TalkSpeakResult.Success -> {
ensurePlaybackActive(playbackToken)
markAudioPlaybackStarting(playbackToken)
talkAudioPlayer.play(result.audio)
ensurePlaybackActive(playbackToken)
Log.d(tag, "talk.speak ok durMs=${SystemClock.elapsedRealtime() - started}")
@@ -789,8 +953,6 @@ class TalkModeManager(
shouldResumeAfterSpeak = true
onBeforeSpeak()
ensurePlaybackActive(playbackToken)
_isSpeaking.value = true
_statusText.value = "Speaking…"
block()
} finally {
synchronized(ttsJobLock) {
@@ -888,6 +1050,7 @@ class TalkModeManager(
}
},
)
markAudioPlaybackStarting(playbackToken)
val result = engine.speak(text, TextToSpeech.QUEUE_FLUSH, null, utteranceId)
if (result != TextToSpeech.SUCCESS) {
throw IllegalStateException("TextToSpeech start failed")
@@ -905,6 +1068,14 @@ class TalkModeManager(
}
}
private fun markAudioPlaybackStarting(playbackToken: Long) {
ensurePlaybackActive(playbackToken)
_statusText.value = "Speaking…"
_isSpeaking.value = true
ensureInterruptListener()
requestAudioFocusForTts()
}
fun stopTts() {
stopSpeaking(resetInterrupt = true)
_isSpeaking.value = false

View File

@@ -28,12 +28,19 @@ internal sealed interface TalkSpeakResult {
) : TalkSpeakResult
}
internal interface TalkSpeechSynthesizing {
suspend fun synthesize(
text: String,
directive: TalkDirective?,
): TalkSpeakResult
}
internal class TalkSpeakClient(
private val session: GatewaySession? = null,
private val json: Json = Json { ignoreUnknownKeys = true },
private val requestDetailed: (suspend (String, String, Long) -> GatewaySession.RpcResult)? = null,
) {
suspend fun synthesize(
) : TalkSpeechSynthesizing {
override suspend fun synthesize(
text: String,
directive: TalkDirective?,
): TalkSpeakResult {

View File

@@ -6,6 +6,11 @@ import ai.openclaw.app.gateway.GatewayEndpoint
import ai.openclaw.app.gateway.GatewaySession
import ai.openclaw.app.gateway.GatewayTlsProbeFailure
import ai.openclaw.app.gateway.GatewayTlsProbeResult
import ai.openclaw.app.node.InvokeDispatcher
import ai.openclaw.app.protocol.OpenClawTalkCommand
import ai.openclaw.app.voice.TalkModeManager
import android.Manifest
import kotlinx.coroutines.flow.MutableStateFlow
import kotlinx.coroutines.runBlocking
import org.junit.Assert.assertEquals
import org.junit.Assert.assertFalse
@@ -15,6 +20,7 @@ import org.junit.Test
import org.junit.runner.RunWith
import org.robolectric.RobolectricTestRunner
import org.robolectric.RuntimeEnvironment
import org.robolectric.Shadows.shadowOf
import org.robolectric.annotation.Config
import java.lang.reflect.Field
import java.util.UUID
@@ -221,6 +227,23 @@ class GatewayBootstrapAuthTest {
assertNull(authStore.loadToken(deviceId, "operator"))
}
@Test
fun talkPttStart_cleansPreparedCaptureWhenBeginFails() =
runBlocking {
val app = RuntimeEnvironment.getApplication()
shadowOf(app).grantPermissions(Manifest.permission.RECORD_AUDIO)
val runtime = NodeRuntime(app)
val dispatcher = readField<InvokeDispatcher>(runtime, "invokeDispatcher")
val result = dispatcher.handleInvoke(OpenClawTalkCommand.PttStart.rawValue, null)
assertEquals("UNAVAILABLE", result.error?.code)
assertEquals(VoiceCaptureMode.Off, runtime.voiceCaptureMode.value)
assertFalse(readField<MutableStateFlow<Boolean>>(runtime, "externalAudioCaptureActive").value)
val talkMode = readField<Lazy<TalkModeManager>>(runtime, "talkMode\$delegate").value
assertFalse(talkMode.ttsOnAllResponses)
}
private fun waitForGatewayTrustPrompt(runtime: NodeRuntime): NodeRuntime.GatewayTrustPrompt {
repeat(50) {
runtime.pendingGatewayTrust.value?.let { return it }

View File

@@ -12,6 +12,7 @@ import ai.openclaw.app.protocol.OpenClawNotificationsCommand
import ai.openclaw.app.protocol.OpenClawPhotosCommand
import ai.openclaw.app.protocol.OpenClawSmsCommand
import ai.openclaw.app.protocol.OpenClawSystemCommand
import ai.openclaw.app.protocol.OpenClawTalkCommand
import org.junit.Assert.assertEquals
import org.junit.Assert.assertFalse
import org.junit.Assert.assertNotNull
@@ -26,6 +27,7 @@ class InvokeCommandRegistryTest {
OpenClawCapability.Device.rawValue,
OpenClawCapability.Notifications.rawValue,
OpenClawCapability.System.rawValue,
OpenClawCapability.Talk.rawValue,
OpenClawCapability.Photos.rawValue,
OpenClawCapability.Contacts.rawValue,
OpenClawCapability.Calendar.rawValue,
@@ -50,6 +52,10 @@ class InvokeCommandRegistryTest {
OpenClawNotificationsCommand.List.rawValue,
OpenClawNotificationsCommand.Actions.rawValue,
OpenClawSystemCommand.Notify.rawValue,
OpenClawTalkCommand.PttStart.rawValue,
OpenClawTalkCommand.PttStop.rawValue,
OpenClawTalkCommand.PttCancel.rawValue,
OpenClawTalkCommand.PttOnce.rawValue,
OpenClawPhotosCommand.Latest.rawValue,
OpenClawContactsCommand.Search.rawValue,
OpenClawContactsCommand.Add.rawValue,

View File

@@ -1,11 +1,13 @@
package ai.openclaw.app.node
import ai.openclaw.app.gateway.DeviceIdentityStore
import ai.openclaw.app.gateway.GatewaySession
import ai.openclaw.app.protocol.OpenClawCallLogCommand
import ai.openclaw.app.protocol.OpenClawCameraCommand
import ai.openclaw.app.protocol.OpenClawLocationCommand
import ai.openclaw.app.protocol.OpenClawMotionCommand
import ai.openclaw.app.protocol.OpenClawSmsCommand
import ai.openclaw.app.protocol.OpenClawTalkCommand
import android.content.Context
import android.content.pm.PackageManager
import kotlinx.coroutines.flow.MutableStateFlow
@@ -208,6 +210,27 @@ class InvokeDispatcherTest {
assertEquals("INVALID_REQUEST: unknown command", result.error?.message)
}
@Test
fun handleInvoke_routesTalkPttCommands() =
runTest {
val talk = InvokeDispatcherFakeTalkHandler()
val dispatcher = newDispatcher(talkHandler = talk)
val start = dispatcher.handleInvoke(OpenClawTalkCommand.PttStart.rawValue, null)
val stop = dispatcher.handleInvoke(OpenClawTalkCommand.PttStop.rawValue, null)
val cancel = dispatcher.handleInvoke(OpenClawTalkCommand.PttCancel.rawValue, null)
val once = dispatcher.handleInvoke(OpenClawTalkCommand.PttOnce.rawValue, null)
assertEquals("""{"captureId":"start"}""", start.payloadJson)
assertEquals("""{"status":"stop"}""", stop.payloadJson)
assertEquals("""{"status":"cancel"}""", cancel.payloadJson)
assertEquals("""{"status":"once"}""", once.payloadJson)
assertEquals(
listOf("start", "stop", "cancel", "once"),
talk.calls,
)
}
private fun newDispatcher(
cameraEnabled: Boolean = false,
locationEnabled: Boolean = false,
@@ -219,6 +242,7 @@ class InvokeDispatcherTest {
debugBuild: Boolean = false,
motionActivityAvailable: Boolean = false,
motionPedometerAvailable: Boolean = false,
talkHandler: TalkHandler = InvokeDispatcherFakeTalkHandler(),
): InvokeDispatcher {
val appContext = RuntimeEnvironment.getApplication()
shadowOf(appContext.packageManager).setSystemFeature(PackageManager.FEATURE_TELEPHONY, smsTelephonyAvailable)
@@ -238,6 +262,7 @@ class InvokeDispatcherTest {
stateProvider = InvokeDispatcherFakeNotificationsStateProvider(),
),
systemHandler = SystemHandler.forTesting(InvokeDispatcherFakeSystemNotificationPoster()),
talkHandler = talkHandler,
photosHandler = PhotosHandler.forTesting(appContext, InvokeDispatcherFakePhotosDataSource()),
contactsHandler = ContactsHandler.forTesting(appContext, InvokeDispatcherFakeContactsDataSource()),
calendarHandler = CalendarHandler.forTesting(appContext, InvokeDispatcherFakeCalendarDataSource()),
@@ -312,6 +337,30 @@ private class InvokeDispatcherFakeSystemNotificationPoster : SystemNotificationP
override fun post(request: SystemNotifyRequest) = Unit
}
private class InvokeDispatcherFakeTalkHandler : TalkHandler {
val calls = mutableListOf<String>()
override suspend fun handlePttStart(paramsJson: String?): GatewaySession.InvokeResult {
calls.add("start")
return GatewaySession.InvokeResult.ok("""{"captureId":"start"}""")
}
override suspend fun handlePttStop(paramsJson: String?): GatewaySession.InvokeResult {
calls.add("stop")
return GatewaySession.InvokeResult.ok("""{"status":"stop"}""")
}
override suspend fun handlePttCancel(paramsJson: String?): GatewaySession.InvokeResult {
calls.add("cancel")
return GatewaySession.InvokeResult.ok("""{"status":"cancel"}""")
}
override suspend fun handlePttOnce(paramsJson: String?): GatewaySession.InvokeResult {
calls.add("once")
return GatewaySession.InvokeResult.ok("""{"status":"once"}""")
}
}
private class InvokeDispatcherFakePhotosDataSource : PhotosDataSource {
override fun hasPermission(context: Context): Boolean = true

View File

@@ -25,6 +25,7 @@ class OpenClawProtocolConstantsTest {
assertEquals("canvas", OpenClawCapability.Canvas.rawValue)
assertEquals("camera", OpenClawCapability.Camera.rawValue)
assertEquals("voiceWake", OpenClawCapability.VoiceWake.rawValue)
assertEquals("talk", OpenClawCapability.Talk.rawValue)
assertEquals("location", OpenClawCapability.Location.rawValue)
assertEquals("sms", OpenClawCapability.Sms.rawValue)
assertEquals("device", OpenClawCapability.Device.rawValue)
@@ -92,6 +93,14 @@ class OpenClawProtocolConstantsTest {
assertEquals("sms.search", OpenClawSmsCommand.Search.rawValue)
}
@Test
fun talkCommandsUseStableStrings() {
assertEquals("talk.ptt.start", OpenClawTalkCommand.PttStart.rawValue)
assertEquals("talk.ptt.stop", OpenClawTalkCommand.PttStop.rawValue)
assertEquals("talk.ptt.cancel", OpenClawTalkCommand.PttCancel.rawValue)
assertEquals("talk.ptt.once", OpenClawTalkCommand.PttOnce.rawValue)
}
@Test
fun callLogCommandsUseStableStrings() {
assertEquals("callLog.search", OpenClawCallLogCommand.Search.rawValue)

View File

@@ -0,0 +1,69 @@
package ai.openclaw.app.voice
import kotlinx.serialization.json.Json
import kotlinx.serialization.json.JsonObject
import org.junit.Assert.assertEquals
import org.junit.Assert.assertNull
import org.junit.Test
class ChatEventTextTest {
private val json = Json { ignoreUnknownKeys = true }
@Test
fun extractsAssistantTextParts() {
val payload =
payload(
"""
{
"message": {
"role": "assistant",
"content": [
{ "type": "text", "text": "hello" },
{ "type": "text", "text": "world" }
]
}
}
""",
)
assertEquals("hello\nworld", ChatEventText.assistantTextFromPayload(payload))
}
@Test
fun extractsPlainStringContent() {
val payload =
payload(
"""
{
"message": {
"role": "assistant",
"content": "plain reply"
}
}
""",
)
assertEquals("plain reply", ChatEventText.assistantTextFromPayload(payload))
}
@Test
fun ignoresUserMessages() {
val payload =
payload(
"""
{
"message": {
"role": "user",
"content": [
{ "type": "text", "text": "do not speak" }
]
}
}
""",
)
assertNull(ChatEventText.assistantTextFromPayload(payload))
}
private fun payload(source: String): JsonObject = json.parseToJsonElement(source.trimIndent()) as JsonObject
}

View File

@@ -9,7 +9,10 @@ import kotlinx.coroutines.CoroutineScope
import kotlinx.coroutines.Dispatchers
import kotlinx.coroutines.Job
import kotlinx.coroutines.SupervisorJob
import kotlinx.coroutines.launch
import kotlinx.coroutines.test.runTest
import org.junit.Assert.assertEquals
import org.junit.Assert.assertFalse
import org.junit.Assert.assertTrue
import org.junit.Test
import org.junit.runner.RunWith
@@ -78,7 +81,54 @@ class TalkModeManagerTest {
assertEquals(1L, playbackGeneration(manager).get())
}
private fun createManager(): TalkModeManager {
@Test
fun nonPendingUserFinalDoesNotUseAllResponseTts() {
val manager = createManager()
manager.ttsOnAllResponses = true
manager.handleGatewayEvent("chat", chatFinalPayload(runId = "run-user", text = "do not speak", role = "user"))
assertEquals(0L, playbackGeneration(manager).get())
}
@Test
fun textReadyDoesNotEnterSpeakingUntilAudioPlaybackStarts() =
runTest {
val talkSpeakClient = FakeTalkSpeechSynthesizer()
val talkAudioPlayer = FakeTalkAudioPlayer()
val manager = createManager(talkSpeakClient = talkSpeakClient, talkAudioPlayer = talkAudioPlayer)
val job = launch { manager.speakAssistantReply("hello") }
talkSpeakClient.requested.await()
assertEquals("Generating voice…", manager.statusText.value)
assertFalse(manager.isSpeaking.value)
talkSpeakClient.result.complete(
TalkSpeakResult.Success(
TalkSpeakAudio(
bytes = byteArrayOf(1, 2, 3),
provider = "test",
outputFormat = "mp3_44100_128",
voiceCompatible = true,
mimeType = "audio/mpeg",
fileExtension = ".mp3",
),
),
)
talkAudioPlayer.started.await()
assertEquals("Speaking…", manager.statusText.value)
assertTrue(manager.isSpeaking.value)
talkAudioPlayer.finished.complete(Unit)
job.join()
}
private fun createManager(
talkSpeakClient: TalkSpeechSynthesizing = TalkSpeakClient(),
talkAudioPlayer: TalkAudioPlaying? = null,
): TalkModeManager {
val app = RuntimeEnvironment.getApplication()
val sessionJob = SupervisorJob()
val session =
@@ -96,6 +146,8 @@ class TalkModeManagerTest {
session = session,
supportsChatSubscribe = false,
isConnected = { true },
talkSpeakClient = talkSpeakClient,
talkAudioPlayer = talkAudioPlayer ?: TalkAudioPlayer(app),
)
}
@@ -124,6 +176,7 @@ class TalkModeManagerTest {
private fun chatFinalPayload(
runId: String,
text: String,
role: String = "assistant",
): String =
"""
{
@@ -131,7 +184,7 @@ class TalkModeManagerTest {
"sessionKey": "main",
"state": "final",
"message": {
"role": "assistant",
"role": "$role",
"content": [
{ "type": "text", "text": "$text" }
]
@@ -140,6 +193,34 @@ class TalkModeManagerTest {
""".trimIndent()
}
private class FakeTalkSpeechSynthesizer : TalkSpeechSynthesizing {
val requested = CompletableDeferred<Unit>()
val result = CompletableDeferred<TalkSpeakResult>()
override suspend fun synthesize(
text: String,
directive: TalkDirective?,
): TalkSpeakResult {
requested.complete(Unit)
return result.await()
}
}
private class FakeTalkAudioPlayer : TalkAudioPlaying {
val started = CompletableDeferred<Unit>()
val finished = CompletableDeferred<Unit>()
var stopped = false
override suspend fun play(audio: TalkSpeakAudio) {
started.complete(Unit)
finished.await()
}
override fun stop() {
stopped = true
}
}
private class InMemoryDeviceAuthStore : DeviceAuthTokenStore {
override fun loadEntry(
deviceId: String,