diff --git a/apps/android/app/src/main/java/ai/openclaw/app/NodeRuntime.kt b/apps/android/app/src/main/java/ai/openclaw/app/NodeRuntime.kt index 166c31d4557..379bb0f5908 100644 --- a/apps/android/app/src/main/java/ai/openclaw/app/NodeRuntime.kt +++ b/apps/android/app/src/main/java/ai/openclaw/app/NodeRuntime.kt @@ -36,6 +36,7 @@ import ai.openclaw.app.node.Quad import ai.openclaw.app.node.SmsHandler import ai.openclaw.app.node.SmsManager import ai.openclaw.app.node.SystemHandler +import ai.openclaw.app.node.TalkHandler import ai.openclaw.app.node.asObjectOrNull import ai.openclaw.app.node.asStringOrNull import ai.openclaw.app.node.invokeErrorFromThrowable @@ -205,6 +206,16 @@ class NodeRuntime( deviceHandler = deviceHandler, notificationsHandler = notificationsHandler, systemHandler = systemHandler, + talkHandler = + object : TalkHandler { + override suspend fun handlePttStart(paramsJson: String?): GatewaySession.InvokeResult = handleTalkPttStart() + + override suspend fun handlePttStop(paramsJson: String?): GatewaySession.InvokeResult = handleTalkPttStop() + + override suspend fun handlePttCancel(paramsJson: String?): GatewaySession.InvokeResult = handleTalkPttCancel() + + override suspend fun handlePttOnce(paramsJson: String?): GatewaySession.InvokeResult = handleTalkPttOnce() + }, photosHandler = photosHandler, contactsHandler = contactsHandler, calendarHandler = calendarHandler, @@ -881,6 +892,80 @@ class NodeRuntime( setVoiceCaptureMode(if (value) VoiceCaptureMode.TalkMode else VoiceCaptureMode.Off) } + private suspend fun handleTalkPttStart(): GatewaySession.InvokeResult = + runPreparedTalkPttCommand { + val payload = talkMode.beginPushToTalk() + GatewaySession.InvokeResult.ok(payload.toJson()) + } + + private suspend fun handleTalkPttStop(): GatewaySession.InvokeResult = + runTalkPttCommand { + val payload = talkMode.endPushToTalk() + finishTalkCaptureIfIdle() + GatewaySession.InvokeResult.ok(payload.toJson()) + } + + private suspend fun 
handleTalkPttCancel(): GatewaySession.InvokeResult = + runTalkPttCommand { + val payload = talkMode.cancelPushToTalk() + finishTalkCaptureIfIdle() + GatewaySession.InvokeResult.ok(payload.toJson()) + } + + private suspend fun handleTalkPttOnce(): GatewaySession.InvokeResult = + runPreparedTalkPttCommand { + val payload = talkMode.runPushToTalkOnce() + finishTalkCaptureIfIdle() + GatewaySession.InvokeResult.ok(payload.toJson()) + } + + private suspend fun runPreparedTalkPttCommand(block: suspend () -> GatewaySession.InvokeResult): GatewaySession.InvokeResult = + runTalkPttCommand { + prepareTalkCapture() + try { + block() + } catch (err: Throwable) { + cleanupFailedTalkCapture() + throw err + } + } + + private suspend fun runTalkPttCommand(block: suspend () -> GatewaySession.InvokeResult): GatewaySession.InvokeResult = + try { + block() + } catch (err: Throwable) { + val (code, message) = invokeErrorFromThrowable(err) + GatewaySession.InvokeResult.error(code = code, message = message) + } + + private suspend fun prepareTalkCapture() { + if (!hasRecordAudioPermission()) { + throw IllegalStateException("MIC_PERMISSION_REQUIRED: grant Microphone permission") + } + micCapture.setMicEnabled(false) + stopVoicePlayback() + NodeForegroundService.setVoiceCaptureMode(appContext, VoiceCaptureMode.TalkMode) + talkMode.ttsOnAllResponses = true + talkMode.setPlaybackEnabled(speakerEnabled.value) + talkMode.ensureChatSubscribed() + externalAudioCaptureActive.value = true + } + + private suspend fun cleanupFailedTalkCapture() { + runCatching { talkMode.cancelPushToTalk() } + talkMode.ttsOnAllResponses = false + NodeForegroundService.setVoiceCaptureMode(appContext, VoiceCaptureMode.Off) + externalAudioCaptureActive.value = false + } + + private fun finishTalkCaptureIfIdle() { + if (!talkMode.isEnabled.value && !talkMode.isListening.value && !talkMode.isSpeaking.value) { + talkMode.ttsOnAllResponses = false + NodeForegroundService.setVoiceCaptureMode(appContext, VoiceCaptureMode.Off) 
+ externalAudioCaptureActive.value = false + } + } + val speakerEnabled: StateFlow get() = prefs.speakerEnabled diff --git a/apps/android/app/src/main/java/ai/openclaw/app/gateway/GatewayDiscovery.kt b/apps/android/app/src/main/java/ai/openclaw/app/gateway/GatewayDiscovery.kt index e80dd4fa341..8820e4b53bb 100644 --- a/apps/android/app/src/main/java/ai/openclaw/app/gateway/GatewayDiscovery.kt +++ b/apps/android/app/src/main/java/ai/openclaw/app/gateway/GatewayDiscovery.kt @@ -278,14 +278,13 @@ class GatewayDiscovery( return legacyHostAddress(resolved) } - private fun legacyHostAddress(resolved: NsdServiceInfo): String? { - return try { + private fun legacyHostAddress(resolved: NsdServiceInfo): String? = + try { val host = NsdServiceInfo::class.java.getMethod("getHost").invoke(resolved) as? InetAddress host?.hostAddress } catch (_: Throwable) { null } - } private fun publish() { _gateways.value = @@ -529,20 +528,20 @@ class GatewayDiscovery( val cm = connectivity ?: return null // Prefer VPN (Tailscale) when present; otherwise use the active network. - trackedNetworks(cm).firstOrNull { n -> - val caps = cm.getNetworkCapabilities(n) ?: return@firstOrNull false - caps.hasTransport(NetworkCapabilities.TRANSPORT_VPN) - }?.let { return it } + trackedNetworks(cm) + .firstOrNull { n -> + val caps = cm.getNetworkCapabilities(n) ?: return@firstOrNull false + caps.hasTransport(NetworkCapabilities.TRANSPORT_VPN) + }?.let { return it } return cm.activeNetwork } - private fun trackedNetworks(cm: ConnectivityManager): List { - return buildList { + private fun trackedNetworks(cm: ConnectivityManager): List = + buildList { cm.activeNetwork?.let(::add) addAll(availableNetworks) }.distinct() - } private fun createDirectResolver(): Resolver? 
{ val cm = connectivity ?: return null diff --git a/apps/android/app/src/main/java/ai/openclaw/app/node/InvokeCommandRegistry.kt b/apps/android/app/src/main/java/ai/openclaw/app/node/InvokeCommandRegistry.kt index 9608472f526..bbab87d0a56 100644 --- a/apps/android/app/src/main/java/ai/openclaw/app/node/InvokeCommandRegistry.kt +++ b/apps/android/app/src/main/java/ai/openclaw/app/node/InvokeCommandRegistry.kt @@ -14,6 +14,7 @@ import ai.openclaw.app.protocol.OpenClawNotificationsCommand import ai.openclaw.app.protocol.OpenClawPhotosCommand import ai.openclaw.app.protocol.OpenClawSmsCommand import ai.openclaw.app.protocol.OpenClawSystemCommand +import ai.openclaw.app.protocol.OpenClawTalkCommand data class NodeRuntimeFlags( val cameraEnabled: Boolean, @@ -81,6 +82,7 @@ object InvokeCommandRegistry { name = OpenClawCapability.VoiceWake.rawValue, availability = NodeCapabilityAvailability.VoiceWakeEnabled, ), + NodeCapabilitySpec(name = OpenClawCapability.Talk.rawValue), NodeCapabilitySpec( name = OpenClawCapability.Location.rawValue, availability = NodeCapabilityAvailability.LocationEnabled, @@ -135,6 +137,18 @@ object InvokeCommandRegistry { InvokeCommandSpec( name = OpenClawSystemCommand.Notify.rawValue, ), + InvokeCommandSpec( + name = OpenClawTalkCommand.PttStart.rawValue, + ), + InvokeCommandSpec( + name = OpenClawTalkCommand.PttStop.rawValue, + ), + InvokeCommandSpec( + name = OpenClawTalkCommand.PttCancel.rawValue, + ), + InvokeCommandSpec( + name = OpenClawTalkCommand.PttOnce.rawValue, + ), InvokeCommandSpec( name = OpenClawCameraCommand.List.rawValue, requiresForeground = true, diff --git a/apps/android/app/src/main/java/ai/openclaw/app/node/InvokeDispatcher.kt b/apps/android/app/src/main/java/ai/openclaw/app/node/InvokeDispatcher.kt index 351e6862923..10d610ef8a4 100644 --- a/apps/android/app/src/main/java/ai/openclaw/app/node/InvokeDispatcher.kt +++ b/apps/android/app/src/main/java/ai/openclaw/app/node/InvokeDispatcher.kt @@ -13,6 +13,7 @@ import 
ai.openclaw.app.protocol.OpenClawMotionCommand import ai.openclaw.app.protocol.OpenClawNotificationsCommand import ai.openclaw.app.protocol.OpenClawSmsCommand import ai.openclaw.app.protocol.OpenClawSystemCommand +import ai.openclaw.app.protocol.OpenClawTalkCommand internal enum class SmsSearchAvailabilityReason { Available, @@ -59,6 +60,7 @@ class InvokeDispatcher( private val deviceHandler: DeviceHandler, private val notificationsHandler: NotificationsHandler, private val systemHandler: SystemHandler, + private val talkHandler: TalkHandler, private val photosHandler: PhotosHandler, private val contactsHandler: ContactsHandler, private val calendarHandler: CalendarHandler, @@ -188,6 +190,12 @@ class InvokeDispatcher( // System command OpenClawSystemCommand.Notify.rawValue -> systemHandler.handleSystemNotify(paramsJson) + // Talk commands + OpenClawTalkCommand.PttStart.rawValue -> talkHandler.handlePttStart(paramsJson) + OpenClawTalkCommand.PttStop.rawValue -> talkHandler.handlePttStop(paramsJson) + OpenClawTalkCommand.PttCancel.rawValue -> talkHandler.handlePttCancel(paramsJson) + OpenClawTalkCommand.PttOnce.rawValue -> talkHandler.handlePttOnce(paramsJson) + // Photos command ai.openclaw.app.protocol.OpenClawPhotosCommand.Latest.rawValue -> photosHandler.handlePhotosLatest( @@ -336,3 +344,13 @@ class InvokeDispatcher( } } } + +interface TalkHandler { + suspend fun handlePttStart(paramsJson: String?): GatewaySession.InvokeResult + + suspend fun handlePttStop(paramsJson: String?): GatewaySession.InvokeResult + + suspend fun handlePttCancel(paramsJson: String?): GatewaySession.InvokeResult + + suspend fun handlePttOnce(paramsJson: String?): GatewaySession.InvokeResult +} diff --git a/apps/android/app/src/main/java/ai/openclaw/app/protocol/OpenClawProtocolConstants.kt b/apps/android/app/src/main/java/ai/openclaw/app/protocol/OpenClawProtocolConstants.kt index 40169afa940..171f3c1d7ab 100644 --- 
a/apps/android/app/src/main/java/ai/openclaw/app/protocol/OpenClawProtocolConstants.kt +++ b/apps/android/app/src/main/java/ai/openclaw/app/protocol/OpenClawProtocolConstants.kt @@ -7,6 +7,7 @@ enum class OpenClawCapability( Camera("camera"), Sms("sms"), VoiceWake("voiceWake"), + Talk("talk"), Location("location"), Device("device"), Notifications("notifications"), @@ -71,6 +72,20 @@ enum class OpenClawSmsCommand( } } +enum class OpenClawTalkCommand( + val rawValue: String, +) { + PttStart("talk.ptt.start"), + PttStop("talk.ptt.stop"), + PttCancel("talk.ptt.cancel"), + PttOnce("talk.ptt.once"), + ; + + companion object { + const val NamespacePrefix: String = "talk." + } +} + enum class OpenClawLocationCommand( val rawValue: String, ) { diff --git a/apps/android/app/src/main/java/ai/openclaw/app/voice/ChatEventText.kt b/apps/android/app/src/main/java/ai/openclaw/app/voice/ChatEventText.kt new file mode 100644 index 00000000000..81828910f23 --- /dev/null +++ b/apps/android/app/src/main/java/ai/openclaw/app/voice/ChatEventText.kt @@ -0,0 +1,45 @@ +package ai.openclaw.app.voice + +import kotlinx.serialization.json.JsonArray +import kotlinx.serialization.json.JsonElement +import kotlinx.serialization.json.JsonObject +import kotlinx.serialization.json.JsonPrimitive + +internal object ChatEventText { + fun assistantTextFromPayload(payload: JsonObject): String? = assistantTextFromMessage(payload["message"]) + + fun assistantTextFromMessage(messageEl: JsonElement?): String? { + val message = messageEl.asObjectOrNull() ?: return null + val role = message["role"].asStringOrNull() + if (role != null && role != "assistant") return null + return textFromContent(message["content"]) + } + + private fun textFromContent(content: JsonElement?): String? 
= + when (content) { + is JsonPrimitive -> content.asStringOrNull()?.trim()?.takeIf { it.isNotEmpty() } + is JsonArray -> + content + .mapNotNull(::textFromContentPart) + .filter { it.isNotEmpty() } + .joinToString("\n") + .takeIf { it.isNotBlank() } + else -> null + } + + private fun textFromContentPart(part: JsonElement): String? { + part + .asStringOrNull() + ?.trim() + ?.takeIf { it.isNotEmpty() } + ?.let { return it } + val obj = part.asObjectOrNull() ?: return null + val type = obj["type"].asStringOrNull() + if (type != null && type != "text") return null + return obj["text"].asStringOrNull()?.trim()?.takeIf { it.isNotEmpty() } + } +} + +private fun JsonElement?.asObjectOrNull(): JsonObject? = this as? JsonObject + +private fun JsonElement?.asStringOrNull(): String? = (this as? JsonPrimitive)?.takeIf { it.isString }?.content diff --git a/apps/android/app/src/main/java/ai/openclaw/app/voice/MicCaptureManager.kt b/apps/android/app/src/main/java/ai/openclaw/app/voice/MicCaptureManager.kt index 3bcdc6871c7..428ab2cd189 100644 --- a/apps/android/app/src/main/java/ai/openclaw/app/voice/MicCaptureManager.kt +++ b/apps/android/app/src/main/java/ai/openclaw/app/voice/MicCaptureManager.kt @@ -21,7 +21,6 @@ import kotlinx.coroutines.flow.StateFlow import kotlinx.coroutines.launch import kotlinx.coroutines.withContext import kotlinx.serialization.json.Json -import kotlinx.serialization.json.JsonArray import kotlinx.serialization.json.JsonObject import kotlinx.serialization.json.JsonPrimitive import java.util.UUID @@ -596,20 +595,7 @@ class MicCaptureManager( PackageManager.PERMISSION_GRANTED ) - private fun parseAssistantText(payload: JsonObject): String? { - val message = payload["message"].asObjectOrNull() ?: return null - if (message["role"].asStringOrNull() != "assistant") return null - val content = message["content"] as? 
JsonArray ?: return null - - val parts = - content.mapNotNull { item -> - val obj = item.asObjectOrNull() ?: return@mapNotNull null - if (obj["type"].asStringOrNull() != "text") return@mapNotNull null - obj["text"].asStringOrNull()?.trim()?.takeIf { it.isNotEmpty() } - } - if (parts.isEmpty()) return null - return parts.joinToString("\n") - } + private fun parseAssistantText(payload: JsonObject): String? = ChatEventText.assistantTextFromPayload(payload) private val listener = object : RecognitionListener { diff --git a/apps/android/app/src/main/java/ai/openclaw/app/voice/TalkAudioPlayer.kt b/apps/android/app/src/main/java/ai/openclaw/app/voice/TalkAudioPlayer.kt index d04845d6496..47e0126eaec 100644 --- a/apps/android/app/src/main/java/ai/openclaw/app/voice/TalkAudioPlayer.kt +++ b/apps/android/app/src/main/java/ai/openclaw/app/voice/TalkAudioPlayer.kt @@ -12,20 +12,26 @@ import kotlinx.coroutines.delay import kotlinx.coroutines.withContext import java.io.File +internal interface TalkAudioPlaying { + suspend fun play(audio: TalkSpeakAudio) + + fun stop() +} + internal class TalkAudioPlayer( private val context: Context, -) { +) : TalkAudioPlaying { private val lock = Any() private var active: ActivePlayback? 
= null - suspend fun play(audio: TalkSpeakAudio) { + override suspend fun play(audio: TalkSpeakAudio) { when (val mode = resolvePlaybackMode(audio)) { is TalkPlaybackMode.Pcm -> playPcm(audio.bytes, mode.sampleRate) is TalkPlaybackMode.Compressed -> playCompressed(audio.bytes, mode.fileExtension) } } - fun stop() { + override fun stop() { synchronized(lock) { active?.cancel() active = null diff --git a/apps/android/app/src/main/java/ai/openclaw/app/voice/TalkModeManager.kt b/apps/android/app/src/main/java/ai/openclaw/app/voice/TalkModeManager.kt index b219b293af1..693cbae20da 100644 --- a/apps/android/app/src/main/java/ai/openclaw/app/voice/TalkModeManager.kt +++ b/apps/android/app/src/main/java/ai/openclaw/app/voice/TalkModeManager.kt @@ -41,7 +41,28 @@ import java.util.UUID import java.util.concurrent.atomic.AtomicLong import kotlin.coroutines.coroutineContext -class TalkModeManager( +data class TalkPttStartPayload( + val captureId: String, +) { + /** Serializes to `{"captureId":"…"}` via the JSON builder so the id is always escaped, matching [TalkPttStopPayload.toJson]. */ fun toJson(): String = buildJsonObject { put("captureId", JsonPrimitive(captureId)) }.toString() +} + +data class TalkPttStopPayload( + val captureId: String, + val transcript: String?, + val status: String, +) { + fun toJson(): String = + buildJsonObject { + put("captureId", JsonPrimitive(captureId)) + if (transcript != null) { + put("transcript", JsonPrimitive(transcript)) + } + put("status", JsonPrimitive(status)) + }.toString() +} + +class TalkModeManager internal constructor( private val context: Context, private val scope: CoroutineScope, private val session: GatewaySession, @@ -49,6 +70,8 @@ class TalkModeManager( private val isConnected: () -> Boolean, private val onBeforeSpeak: suspend () -> Unit = {}, private val onAfterSpeak: suspend () -> Unit = {}, + private val talkSpeakClient: TalkSpeechSynthesizing = TalkSpeakClient(session = session), + private val talkAudioPlayer: TalkAudioPlaying = TalkAudioPlayer(context), ) { companion object { private const val tag = "TalkMode" @@ -60,9 +83,6 @@ class TalkModeManager( private val mainHandler =
Handler(Looper.getMainLooper()) private val json = Json { ignoreUnknownKeys = true } - private val talkSpeakClient = TalkSpeakClient(session = session, json = json) - private val talkAudioPlayer = TalkAudioPlayer(context) - private val _isEnabled = MutableStateFlow(false) val isEnabled: StateFlow = _isEnabled @@ -82,6 +102,10 @@ class TalkModeManager( private var restartJob: Job? = null private var stopRequested = false private var listeningMode = false + private var activePttCaptureId: String? = null + private var pttAutoStopEnabled = false + private var pttTimeoutJob: Job? = null + private var pttCompletion: CompletableDeferred? = null private var silenceJob: Job? = null private var silenceWindowMs = TalkDefaults.defaultSilenceTimeoutMs @@ -156,6 +180,127 @@ class TalkModeManager( } } + suspend fun beginPushToTalk(): TalkPttStartPayload { + if (!isConnected()) { + _statusText.value = "Gateway not connected" + throw IllegalStateException("UNAVAILABLE: Gateway not connected") + } + activePttCaptureId?.let { return TalkPttStartPayload(captureId = it) } + + stopSpeaking(resetInterrupt = false) + pttTimeoutJob?.cancel() + pttTimeoutJob = null + pttAutoStopEnabled = false + pttCompletion = null + silenceJob?.cancel() + silenceJob = null + listeningMode = false + finalizeInFlight = false + stopRequested = false + lastTranscript = "" + lastHeardAtMs = null + + val micOk = + ContextCompat.checkSelfPermission(context, Manifest.permission.RECORD_AUDIO) == + PackageManager.PERMISSION_GRANTED + if (!micOk) { + _statusText.value = "Microphone permission required" + throw IllegalStateException("MIC_PERMISSION_REQUIRED: grant Microphone permission") + } + if (!SpeechRecognizer.isRecognitionAvailable(context)) { + _statusText.value = "Speech recognizer unavailable" + throw IllegalStateException("UNAVAILABLE: Speech recognizer unavailable") + } + + val captureId = UUID.randomUUID().toString() + activePttCaptureId = captureId + withContext(Dispatchers.Main) { + recognizer?.cancel() 
+ recognizer?.destroy() + recognizer = SpeechRecognizer.createSpeechRecognizer(context).also { it.setRecognitionListener(listener) } + startListeningInternal(markListening = true) + } + _statusText.value = "Listening (PTT)" + return TalkPttStartPayload(captureId = captureId) + } + + suspend fun endPushToTalk(): TalkPttStopPayload { + val captureId = activePttCaptureId ?: UUID.randomUUID().toString() + if (activePttCaptureId == null) { + return finishPushToTalk(TalkPttStopPayload(captureId = captureId, transcript = null, status = "idle")) + } + + clearPushToTalkRecognition() + val transcript = lastTranscript.trim() + lastTranscript = "" + lastHeardAtMs = null + + if (transcript.isEmpty()) { + _statusText.value = if (_isEnabled.value) "Listening" else "Ready" + if (_isEnabled.value) { + start() + } + return finishPushToTalk(TalkPttStopPayload(captureId = captureId, transcript = null, status = "empty")) + } + + if (!isConnected()) { + _statusText.value = "Gateway not connected" + if (_isEnabled.value) { + start() + } + return finishPushToTalk(TalkPttStopPayload(captureId = captureId, transcript = transcript, status = "offline")) + } + + _statusText.value = "Thinking…" + scope.launch { + finalizeTranscript(transcript) + } + return finishPushToTalk(TalkPttStopPayload(captureId = captureId, transcript = transcript, status = "queued")) + } + + suspend fun cancelPushToTalk(): TalkPttStopPayload { + val captureId = activePttCaptureId ?: UUID.randomUUID().toString() + if (activePttCaptureId == null) { + return finishPushToTalk(TalkPttStopPayload(captureId = captureId, transcript = null, status = "idle")) + } + + clearPushToTalkRecognition() + lastTranscript = "" + lastHeardAtMs = null + _statusText.value = if (_isEnabled.value) "Listening" else "Ready" + if (_isEnabled.value) { + start() + } + return finishPushToTalk(TalkPttStopPayload(captureId = captureId, transcript = null, status = "cancelled")) + } + + suspend fun runPushToTalkOnce(maxDurationMs: Long = 12_000L): 
TalkPttStopPayload { + if (pttCompletion != null) { + cancelPushToTalk() + } + if (activePttCaptureId != null) { + return TalkPttStopPayload( + captureId = activePttCaptureId ?: UUID.randomUUID().toString(), + transcript = null, + status = "busy", + ) + } + + beginPushToTalk() + val completion = CompletableDeferred() + pttCompletion = completion + pttAutoStopEnabled = true + startSilenceMonitor() + pttTimeoutJob = + scope.launch { + delay(maxDurationMs) + if (pttAutoStopEnabled && activePttCaptureId != null) { + endPushToTalk() + } + } + return completion.await() + } + /** * Speak a wake-word command through TalkMode's full pipeline: * chat.send → wait for final → read assistant text → TTS. @@ -335,6 +480,12 @@ class TalkModeManager( stopRequested = true finalizeInFlight = false listeningMode = false + activePttCaptureId = null + pttAutoStopEnabled = false + pttCompletion?.cancel() + pttCompletion = null + pttTimeoutJob?.cancel() + pttTimeoutJob = null restartJob?.cancel() restartJob = null silenceJob?.cancel() @@ -434,7 +585,7 @@ class TalkModeManager( silenceJob?.cancel() silenceJob = scope.launch { - while (_isEnabled.value) { + while (_isEnabled.value || pttAutoStopEnabled) { delay(200) checkSilence() } @@ -448,6 +599,12 @@ class TalkModeManager( val lastHeard = lastHeardAtMs ?: return val elapsed = SystemClock.elapsedRealtime() - lastHeard if (elapsed < silenceWindowMs) return + if (activePttCaptureId != null) { + if (pttAutoStopEnabled) { + scope.launch { endPushToTalk() } + } + return + } if (finalizeInFlight) return finalizeInFlight = true scope.launch { @@ -525,6 +682,27 @@ class TalkModeManager( } } + private suspend fun clearPushToTalkRecognition() { + pttTimeoutJob?.cancel() + pttTimeoutJob = null + pttAutoStopEnabled = false + activePttCaptureId = null + _isListening.value = false + listeningMode = false + clearListenWatchdog() + withContext(Dispatchers.Main) { + recognizer?.cancel() + recognizer?.destroy() + recognizer = null + } + } + + private fun 
finishPushToTalk(payload: TalkPttStopPayload): TalkPttStopPayload { + pttCompletion?.complete(payload) + pttCompletion = null + return payload + } + private suspend fun subscribeChatIfNeeded( session: GatewaySession, sessionKey: String, @@ -656,20 +834,7 @@ class TalkModeManager( } } - private fun extractTextFromChatEventMessage(messageEl: JsonElement?): String? { - val msg = messageEl?.asObjectOrNull() ?: return null - val content = msg["content"] as? JsonArray ?: return null - return content - .mapNotNull { entry -> - entry - .asObjectOrNull() - ?.get("text") - ?.asStringOrNull() - ?.trim() - }.filter { it.isNotEmpty() } - .joinToString("\n") - .takeIf { it.isNotBlank() } - } + private fun extractTextFromChatEventMessage(messageEl: JsonElement?): String? = ChatEventText.assistantTextFromMessage(messageEl) private suspend fun waitForAssistantText( session: GatewaySession, @@ -729,17 +894,16 @@ class TalkModeManager( _lastAssistantText.value = cleaned ensurePlaybackActive(playbackToken) - _statusText.value = "Speaking…" - _isSpeaking.value = true + _statusText.value = "Generating voice…" + _isSpeaking.value = false lastSpokenText = cleaned - ensureInterruptListener() - requestAudioFocusForTts() try { val started = SystemClock.elapsedRealtime() when (val result = talkSpeakClient.synthesize(text = cleaned, directive = directive)) { is TalkSpeakResult.Success -> { ensurePlaybackActive(playbackToken) + markAudioPlaybackStarting(playbackToken) talkAudioPlayer.play(result.audio) ensurePlaybackActive(playbackToken) Log.d(tag, "talk.speak ok durMs=${SystemClock.elapsedRealtime() - started}") @@ -789,8 +953,6 @@ class TalkModeManager( shouldResumeAfterSpeak = true onBeforeSpeak() ensurePlaybackActive(playbackToken) - _isSpeaking.value = true - _statusText.value = "Speaking…" block() } finally { synchronized(ttsJobLock) { @@ -888,6 +1050,7 @@ class TalkModeManager( } }, ) + markAudioPlaybackStarting(playbackToken) val result = engine.speak(text, TextToSpeech.QUEUE_FLUSH, 
null, utteranceId) if (result != TextToSpeech.SUCCESS) { throw IllegalStateException("TextToSpeech start failed") @@ -905,6 +1068,14 @@ class TalkModeManager( } } + private fun markAudioPlaybackStarting(playbackToken: Long) { + ensurePlaybackActive(playbackToken) + _statusText.value = "Speaking…" + _isSpeaking.value = true + ensureInterruptListener() + requestAudioFocusForTts() + } + fun stopTts() { stopSpeaking(resetInterrupt = true) _isSpeaking.value = false diff --git a/apps/android/app/src/main/java/ai/openclaw/app/voice/TalkSpeakClient.kt b/apps/android/app/src/main/java/ai/openclaw/app/voice/TalkSpeakClient.kt index b645ec230e7..f2eb32cca94 100644 --- a/apps/android/app/src/main/java/ai/openclaw/app/voice/TalkSpeakClient.kt +++ b/apps/android/app/src/main/java/ai/openclaw/app/voice/TalkSpeakClient.kt @@ -28,12 +28,19 @@ internal sealed interface TalkSpeakResult { ) : TalkSpeakResult } +internal interface TalkSpeechSynthesizing { + suspend fun synthesize( + text: String, + directive: TalkDirective?, + ): TalkSpeakResult +} + internal class TalkSpeakClient( private val session: GatewaySession? = null, private val json: Json = Json { ignoreUnknownKeys = true }, private val requestDetailed: (suspend (String, String, Long) -> GatewaySession.RpcResult)? 
= null, -) { - suspend fun synthesize( +) : TalkSpeechSynthesizing { + override suspend fun synthesize( text: String, directive: TalkDirective?, ): TalkSpeakResult { diff --git a/apps/android/app/src/test/java/ai/openclaw/app/GatewayBootstrapAuthTest.kt b/apps/android/app/src/test/java/ai/openclaw/app/GatewayBootstrapAuthTest.kt index 02a6106df29..54f31879b24 100644 --- a/apps/android/app/src/test/java/ai/openclaw/app/GatewayBootstrapAuthTest.kt +++ b/apps/android/app/src/test/java/ai/openclaw/app/GatewayBootstrapAuthTest.kt @@ -6,6 +6,11 @@ import ai.openclaw.app.gateway.GatewayEndpoint import ai.openclaw.app.gateway.GatewaySession import ai.openclaw.app.gateway.GatewayTlsProbeFailure import ai.openclaw.app.gateway.GatewayTlsProbeResult +import ai.openclaw.app.node.InvokeDispatcher +import ai.openclaw.app.protocol.OpenClawTalkCommand +import ai.openclaw.app.voice.TalkModeManager +import android.Manifest +import kotlinx.coroutines.flow.MutableStateFlow import kotlinx.coroutines.runBlocking import org.junit.Assert.assertEquals import org.junit.Assert.assertFalse @@ -15,6 +20,7 @@ import org.junit.Test import org.junit.runner.RunWith import org.robolectric.RobolectricTestRunner import org.robolectric.RuntimeEnvironment +import org.robolectric.Shadows.shadowOf import org.robolectric.annotation.Config import java.lang.reflect.Field import java.util.UUID @@ -221,6 +227,23 @@ class GatewayBootstrapAuthTest { assertNull(authStore.loadToken(deviceId, "operator")) } + @Test + fun talkPttStart_cleansPreparedCaptureWhenBeginFails() = + runBlocking { + val app = RuntimeEnvironment.getApplication() + shadowOf(app).grantPermissions(Manifest.permission.RECORD_AUDIO) + val runtime = NodeRuntime(app) + val dispatcher = readField<InvokeDispatcher>(runtime, "invokeDispatcher") + + val result = dispatcher.handleInvoke(OpenClawTalkCommand.PttStart.rawValue, null) + + assertEquals("UNAVAILABLE", result.error?.code) + assertEquals(VoiceCaptureMode.Off, runtime.voiceCaptureMode.value) + 
assertFalse(readField<MutableStateFlow<Boolean>>(runtime, "externalAudioCaptureActive").value) + val talkMode = readField<Lazy<TalkModeManager>>(runtime, "talkMode\$delegate").value /* assumes NodeRuntime.talkMode is a `by lazy` property (hence the $delegate field) — confirm */ + assertFalse(talkMode.ttsOnAllResponses) + } + private fun waitForGatewayTrustPrompt(runtime: NodeRuntime): NodeRuntime.GatewayTrustPrompt { repeat(50) { runtime.pendingGatewayTrust.value?.let { return it } diff --git a/apps/android/app/src/test/java/ai/openclaw/app/node/InvokeCommandRegistryTest.kt b/apps/android/app/src/test/java/ai/openclaw/app/node/InvokeCommandRegistryTest.kt index 78ceb30a0f1..dda7c574a3d 100644 --- a/apps/android/app/src/test/java/ai/openclaw/app/node/InvokeCommandRegistryTest.kt +++ b/apps/android/app/src/test/java/ai/openclaw/app/node/InvokeCommandRegistryTest.kt @@ -12,6 +12,7 @@ import ai.openclaw.app.protocol.OpenClawNotificationsCommand import ai.openclaw.app.protocol.OpenClawPhotosCommand import ai.openclaw.app.protocol.OpenClawSmsCommand import ai.openclaw.app.protocol.OpenClawSystemCommand +import ai.openclaw.app.protocol.OpenClawTalkCommand import org.junit.Assert.assertEquals import org.junit.Assert.assertFalse import org.junit.Assert.assertNotNull @@ -26,6 +27,7 @@ class InvokeCommandRegistryTest { OpenClawCapability.Device.rawValue, OpenClawCapability.Notifications.rawValue, OpenClawCapability.System.rawValue, + OpenClawCapability.Talk.rawValue, OpenClawCapability.Photos.rawValue, OpenClawCapability.Contacts.rawValue, OpenClawCapability.Calendar.rawValue, @@ -50,6 +52,10 @@ class InvokeCommandRegistryTest { OpenClawNotificationsCommand.List.rawValue, OpenClawNotificationsCommand.Actions.rawValue, OpenClawSystemCommand.Notify.rawValue, + OpenClawTalkCommand.PttStart.rawValue, + OpenClawTalkCommand.PttStop.rawValue, + OpenClawTalkCommand.PttCancel.rawValue, + OpenClawTalkCommand.PttOnce.rawValue, OpenClawPhotosCommand.Latest.rawValue, OpenClawContactsCommand.Search.rawValue, OpenClawContactsCommand.Add.rawValue, diff --git 
a/apps/android/app/src/test/java/ai/openclaw/app/node/InvokeDispatcherTest.kt b/apps/android/app/src/test/java/ai/openclaw/app/node/InvokeDispatcherTest.kt index 9b85084eb07..cad08b1f689 100644 --- a/apps/android/app/src/test/java/ai/openclaw/app/node/InvokeDispatcherTest.kt +++ b/apps/android/app/src/test/java/ai/openclaw/app/node/InvokeDispatcherTest.kt @@ -1,11 +1,13 @@ package ai.openclaw.app.node import ai.openclaw.app.gateway.DeviceIdentityStore +import ai.openclaw.app.gateway.GatewaySession import ai.openclaw.app.protocol.OpenClawCallLogCommand import ai.openclaw.app.protocol.OpenClawCameraCommand import ai.openclaw.app.protocol.OpenClawLocationCommand import ai.openclaw.app.protocol.OpenClawMotionCommand import ai.openclaw.app.protocol.OpenClawSmsCommand +import ai.openclaw.app.protocol.OpenClawTalkCommand import android.content.Context import android.content.pm.PackageManager import kotlinx.coroutines.flow.MutableStateFlow @@ -208,6 +210,27 @@ class InvokeDispatcherTest { assertEquals("INVALID_REQUEST: unknown command", result.error?.message) } + @Test + fun handleInvoke_routesTalkPttCommands() = + runTest { + val talk = InvokeDispatcherFakeTalkHandler() + val dispatcher = newDispatcher(talkHandler = talk) + + val start = dispatcher.handleInvoke(OpenClawTalkCommand.PttStart.rawValue, null) + val stop = dispatcher.handleInvoke(OpenClawTalkCommand.PttStop.rawValue, null) + val cancel = dispatcher.handleInvoke(OpenClawTalkCommand.PttCancel.rawValue, null) + val once = dispatcher.handleInvoke(OpenClawTalkCommand.PttOnce.rawValue, null) + + assertEquals("""{"captureId":"start"}""", start.payloadJson) + assertEquals("""{"status":"stop"}""", stop.payloadJson) + assertEquals("""{"status":"cancel"}""", cancel.payloadJson) + assertEquals("""{"status":"once"}""", once.payloadJson) + assertEquals( + listOf("start", "stop", "cancel", "once"), + talk.calls, + ) + } + private fun newDispatcher( cameraEnabled: Boolean = false, locationEnabled: Boolean = false, @@ -219,6 
+242,7 @@ class InvokeDispatcherTest { debugBuild: Boolean = false, motionActivityAvailable: Boolean = false, motionPedometerAvailable: Boolean = false, + talkHandler: TalkHandler = InvokeDispatcherFakeTalkHandler(), ): InvokeDispatcher { val appContext = RuntimeEnvironment.getApplication() shadowOf(appContext.packageManager).setSystemFeature(PackageManager.FEATURE_TELEPHONY, smsTelephonyAvailable) @@ -238,6 +262,7 @@ class InvokeDispatcherTest { stateProvider = InvokeDispatcherFakeNotificationsStateProvider(), ), systemHandler = SystemHandler.forTesting(InvokeDispatcherFakeSystemNotificationPoster()), + talkHandler = talkHandler, photosHandler = PhotosHandler.forTesting(appContext, InvokeDispatcherFakePhotosDataSource()), contactsHandler = ContactsHandler.forTesting(appContext, InvokeDispatcherFakeContactsDataSource()), calendarHandler = CalendarHandler.forTesting(appContext, InvokeDispatcherFakeCalendarDataSource()), @@ -312,6 +337,30 @@ private class InvokeDispatcherFakeSystemNotificationPoster : SystemNotificationP override fun post(request: SystemNotifyRequest) = Unit } +private class InvokeDispatcherFakeTalkHandler : TalkHandler { + val calls = mutableListOf() + + override suspend fun handlePttStart(paramsJson: String?): GatewaySession.InvokeResult { + calls.add("start") + return GatewaySession.InvokeResult.ok("""{"captureId":"start"}""") + } + + override suspend fun handlePttStop(paramsJson: String?): GatewaySession.InvokeResult { + calls.add("stop") + return GatewaySession.InvokeResult.ok("""{"status":"stop"}""") + } + + override suspend fun handlePttCancel(paramsJson: String?): GatewaySession.InvokeResult { + calls.add("cancel") + return GatewaySession.InvokeResult.ok("""{"status":"cancel"}""") + } + + override suspend fun handlePttOnce(paramsJson: String?): GatewaySession.InvokeResult { + calls.add("once") + return GatewaySession.InvokeResult.ok("""{"status":"once"}""") + } +} + private class InvokeDispatcherFakePhotosDataSource : PhotosDataSource { 
override fun hasPermission(context: Context): Boolean = true diff --git a/apps/android/app/src/test/java/ai/openclaw/app/protocol/OpenClawProtocolConstantsTest.kt b/apps/android/app/src/test/java/ai/openclaw/app/protocol/OpenClawProtocolConstantsTest.kt index 55c5d683fa0..069f51603c2 100644 --- a/apps/android/app/src/test/java/ai/openclaw/app/protocol/OpenClawProtocolConstantsTest.kt +++ b/apps/android/app/src/test/java/ai/openclaw/app/protocol/OpenClawProtocolConstantsTest.kt @@ -25,6 +25,7 @@ class OpenClawProtocolConstantsTest { assertEquals("canvas", OpenClawCapability.Canvas.rawValue) assertEquals("camera", OpenClawCapability.Camera.rawValue) assertEquals("voiceWake", OpenClawCapability.VoiceWake.rawValue) + assertEquals("talk", OpenClawCapability.Talk.rawValue) assertEquals("location", OpenClawCapability.Location.rawValue) assertEquals("sms", OpenClawCapability.Sms.rawValue) assertEquals("device", OpenClawCapability.Device.rawValue) @@ -92,6 +93,14 @@ class OpenClawProtocolConstantsTest { assertEquals("sms.search", OpenClawSmsCommand.Search.rawValue) } + @Test + fun talkCommandsUseStableStrings() { + assertEquals("talk.ptt.start", OpenClawTalkCommand.PttStart.rawValue) + assertEquals("talk.ptt.stop", OpenClawTalkCommand.PttStop.rawValue) + assertEquals("talk.ptt.cancel", OpenClawTalkCommand.PttCancel.rawValue) + assertEquals("talk.ptt.once", OpenClawTalkCommand.PttOnce.rawValue) + } + @Test fun callLogCommandsUseStableStrings() { assertEquals("callLog.search", OpenClawCallLogCommand.Search.rawValue) diff --git a/apps/android/app/src/test/java/ai/openclaw/app/voice/ChatEventTextTest.kt b/apps/android/app/src/test/java/ai/openclaw/app/voice/ChatEventTextTest.kt new file mode 100644 index 00000000000..36978812faa --- /dev/null +++ b/apps/android/app/src/test/java/ai/openclaw/app/voice/ChatEventTextTest.kt @@ -0,0 +1,69 @@ +package ai.openclaw.app.voice + +import kotlinx.serialization.json.Json +import kotlinx.serialization.json.JsonObject +import 
org.junit.Assert.assertEquals +import org.junit.Assert.assertNull +import org.junit.Test + +class ChatEventTextTest { + private val json = Json { ignoreUnknownKeys = true } + + @Test + fun extractsAssistantTextParts() { + val payload = + payload( + """ + { + "message": { + "role": "assistant", + "content": [ + { "type": "text", "text": "hello" }, + { "type": "text", "text": "world" } + ] + } + } + """, + ) + + assertEquals("hello\nworld", ChatEventText.assistantTextFromPayload(payload)) + } + + @Test + fun extractsPlainStringContent() { + val payload = + payload( + """ + { + "message": { + "role": "assistant", + "content": "plain reply" + } + } + """, + ) + + assertEquals("plain reply", ChatEventText.assistantTextFromPayload(payload)) + } + + @Test + fun ignoresUserMessages() { + val payload = + payload( + """ + { + "message": { + "role": "user", + "content": [ + { "type": "text", "text": "do not speak" } + ] + } + } + """, + ) + + assertNull(ChatEventText.assistantTextFromPayload(payload)) + } + + private fun payload(source: String): JsonObject = json.parseToJsonElement(source.trimIndent()) as JsonObject +} diff --git a/apps/android/app/src/test/java/ai/openclaw/app/voice/TalkModeManagerTest.kt b/apps/android/app/src/test/java/ai/openclaw/app/voice/TalkModeManagerTest.kt index 6bd5c1fcbcc..b8e67058c64 100644 --- a/apps/android/app/src/test/java/ai/openclaw/app/voice/TalkModeManagerTest.kt +++ b/apps/android/app/src/test/java/ai/openclaw/app/voice/TalkModeManagerTest.kt @@ -9,7 +9,10 @@ import kotlinx.coroutines.CoroutineScope import kotlinx.coroutines.Dispatchers import kotlinx.coroutines.Job import kotlinx.coroutines.SupervisorJob +import kotlinx.coroutines.launch +import kotlinx.coroutines.test.runTest import org.junit.Assert.assertEquals +import org.junit.Assert.assertFalse import org.junit.Assert.assertTrue import org.junit.Test import org.junit.runner.RunWith @@ -78,7 +81,54 @@ class TalkModeManagerTest { assertEquals(1L, playbackGeneration(manager).get()) } 
- private fun createManager(): TalkModeManager { + @Test + fun nonPendingUserFinalDoesNotUseAllResponseTts() { + val manager = createManager() + + manager.ttsOnAllResponses = true + manager.handleGatewayEvent("chat", chatFinalPayload(runId = "run-user", text = "do not speak", role = "user")) + + assertEquals(0L, playbackGeneration(manager).get()) + } + + @Test + fun textReadyDoesNotEnterSpeakingUntilAudioPlaybackStarts() = + runTest { + val talkSpeakClient = FakeTalkSpeechSynthesizer() + val talkAudioPlayer = FakeTalkAudioPlayer() + val manager = createManager(talkSpeakClient = talkSpeakClient, talkAudioPlayer = talkAudioPlayer) + + val job = launch { manager.speakAssistantReply("hello") } + talkSpeakClient.requested.await() + + assertEquals("Generating voice…", manager.statusText.value) + assertFalse(manager.isSpeaking.value) + + talkSpeakClient.result.complete( + TalkSpeakResult.Success( + TalkSpeakAudio( + bytes = byteArrayOf(1, 2, 3), + provider = "test", + outputFormat = "mp3_44100_128", + voiceCompatible = true, + mimeType = "audio/mpeg", + fileExtension = ".mp3", + ), + ), + ) + talkAudioPlayer.started.await() + + assertEquals("Speaking…", manager.statusText.value) + assertTrue(manager.isSpeaking.value) + + talkAudioPlayer.finished.complete(Unit) + job.join() + } + + private fun createManager( + talkSpeakClient: TalkSpeechSynthesizing = TalkSpeakClient(), + talkAudioPlayer: TalkAudioPlaying? 
= null, + ): TalkModeManager { val app = RuntimeEnvironment.getApplication() val sessionJob = SupervisorJob() val session = @@ -96,6 +146,8 @@ class TalkModeManagerTest { session = session, supportsChatSubscribe = false, isConnected = { true }, + talkSpeakClient = talkSpeakClient, + talkAudioPlayer = talkAudioPlayer ?: TalkAudioPlayer(app), ) } @@ -124,6 +176,7 @@ class TalkModeManagerTest { private fun chatFinalPayload( runId: String, text: String, + role: String = "assistant", ): String = """ { @@ -131,7 +184,7 @@ class TalkModeManagerTest { "sessionKey": "main", "state": "final", "message": { - "role": "assistant", + "role": "$role", "content": [ { "type": "text", "text": "$text" } ] @@ -140,6 +193,34 @@ class TalkModeManagerTest { """.trimIndent() } +private class FakeTalkSpeechSynthesizer : TalkSpeechSynthesizing { + val requested = CompletableDeferred<Unit>() + val result = CompletableDeferred<TalkSpeakResult>() + + override suspend fun synthesize( + text: String, + directive: TalkDirective?, + ): TalkSpeakResult { + requested.complete(Unit) + return result.await() + } +} + +private class FakeTalkAudioPlayer : TalkAudioPlaying { + val started = CompletableDeferred<Unit>() + val finished = CompletableDeferred<Unit>() + var stopped = false + + override suspend fun play(audio: TalkSpeakAudio) { + started.complete(Unit) + finished.await() + } + + override fun stop() { + stopped = true + } +} + private class InMemoryDeviceAuthStore : DeviceAuthTokenStore { override fun loadEntry( deviceId: String, diff --git a/apps/ios/Sources/Gateway/GatewayConnectionController.swift b/apps/ios/Sources/Gateway/GatewayConnectionController.swift index 25aea75b4f0..95d0af2e2bd 100644 --- a/apps/ios/Sources/Gateway/GatewayConnectionController.swift +++ b/apps/ios/Sources/Gateway/GatewayConnectionController.swift @@ -821,6 +821,7 @@ final class GatewayConnectionController { if locationMode != .off { caps.append(OpenClawCapability.location.rawValue) } caps.append(OpenClawCapability.device.rawValue) +
caps.append(OpenClawCapability.talk.rawValue) if WatchMessagingService.isSupportedOnDevice() { caps.append(OpenClawCapability.watch.rawValue) } diff --git a/apps/ios/Sources/Voice/TalkModeManager.swift b/apps/ios/Sources/Voice/TalkModeManager.swift index add9216dc77..65902a68eac 100644 --- a/apps/ios/Sources/Voice/TalkModeManager.swift +++ b/apps/ios/Sources/Voice/TalkModeManager.swift @@ -800,11 +800,11 @@ final class TalkModeManager: NSObject { } } let completion = await self.waitForChatCompletion(runId: runId, gateway: gateway, timeoutSeconds: 120) - if completion == .timeout { + if completion.state == .timeout { self.logger.warning( "chat completion timeout runId=\(runId, privacy: .public); attempting history fallback") GatewayDiagnostics.log("talk: chat completion timeout runId=\(runId)") - } else if completion == .aborted { + } else if completion.state == .aborted { self.statusText = "Aborted" self.logger.warning("chat completion aborted runId=\(runId, privacy: .public)") GatewayDiagnostics.log("talk: chat completion aborted runId=\(runId)") @@ -812,7 +812,7 @@ final class TalkModeManager: NSObject { await self.finishIncrementalSpeech() await self.start() return - } else if completion == .error { + } else if completion.state == .error { self.statusText = "Chat error" self.logger.warning("chat completion error runId=\(runId, privacy: .public)") GatewayDiagnostics.log("talk: chat completion error runId=\(runId)") @@ -822,16 +822,19 @@ final class TalkModeManager: NSObject { return } - var assistantText = try await self.waitForAssistantText( - gateway: gateway, - since: startedAt, - timeoutSeconds: completion == .final ? 
12 : 25) + var assistantText = completion.assistantText if assistantText == nil, shouldIncremental { let fallback = self.incrementalSpeechBuffer.latestText if !fallback.trimmingCharacters(in: .whitespacesAndNewlines).isEmpty { assistantText = fallback } } + if assistantText == nil { + assistantText = try await self.waitForAssistantTextFromHistory( + gateway: gateway, + since: startedAt, + timeoutSeconds: completion.state == .final ? 12 : 25) + } guard let assistantText else { self.statusText = "No reply" self.logger.warning("assistant text timeout runId=\(runId, privacy: .public)") @@ -898,6 +901,11 @@ final class TalkModeManager: NSObject { } } + private struct ChatCompletionResult { + var state: ChatCompletionState + var assistantText: String? + } + private func sendChat(_ message: String, gateway: GatewayNodeSession) async throws -> String { struct SendResponse: Decodable { let runId: String } let payload: [String: Any] = [ @@ -922,40 +930,51 @@ final class TalkModeManager: NSObject { private func waitForChatCompletion( runId: String, gateway: GatewayNodeSession, - timeoutSeconds: Int = 120) async -> ChatCompletionState + timeoutSeconds: Int = 120) async -> ChatCompletionResult { let stream = await gateway.subscribeServerEvents(bufferingNewest: 200) - return await withTaskGroup(of: ChatCompletionState.self) { group in + return await withTaskGroup(of: ChatCompletionResult.self) { group in group.addTask { [runId] in + var latestAssistantText: String? for await evt in stream { - if Task.isCancelled { return .timeout } + if Task.isCancelled { + return ChatCompletionResult(state: .timeout, assistantText: latestAssistantText) + } guard evt.event == "chat", let payload = evt.payload else { continue } - guard let chatEvent = try? GatewayPayloadDecoding.decode(payload, as: ChatEvent.self) else { + guard let chatEvent = try? 
GatewayPayloadDecoding.decode( + payload, + as: OpenClawChatEventPayload.self) + else { continue } - guard chatEvent.runid == runId else { continue } - if let state = chatEvent.state.value as? String { - switch state { - case "final": return .final - case "aborted": return .aborted - case "error": return .error - default: break - } + guard chatEvent.runId == runId else { continue } + if let text = OpenClawChatEventText.assistantText(from: chatEvent) { + latestAssistantText = text + } + switch chatEvent.state { + case "final": + return ChatCompletionResult(state: .final, assistantText: latestAssistantText) + case "aborted": + return ChatCompletionResult(state: .aborted, assistantText: nil) + case "error": + return ChatCompletionResult(state: .error, assistantText: nil) + default: + break } } - return .timeout + return ChatCompletionResult(state: .timeout, assistantText: latestAssistantText) } group.addTask { try? await Task.sleep(nanoseconds: UInt64(timeoutSeconds) * 1_000_000_000) - return .timeout + return ChatCompletionResult(state: .timeout, assistantText: nil) } - let result = await group.next() ?? .timeout + let result = await group.next() ?? ChatCompletionResult(state: .timeout, assistantText: nil) group.cancelAll() return result } } - private func waitForAssistantText( + private func waitForAssistantTextFromHistory( gateway: GatewayNodeSession, since: Double, timeoutSeconds: Int) async throws -> String? 
diff --git a/apps/ios/Tests/GatewayConnectionControllerTests.swift b/apps/ios/Tests/GatewayConnectionControllerTests.swift index 5acd0f6e774..b6341a01240 100644 --- a/apps/ios/Tests/GatewayConnectionControllerTests.swift +++ b/apps/ios/Tests/GatewayConnectionControllerTests.swift @@ -36,6 +36,7 @@ import UIKit #expect(caps.contains(OpenClawCapability.camera.rawValue)) #expect(caps.contains(OpenClawCapability.location.rawValue)) #expect(caps.contains(OpenClawCapability.voiceWake.rawValue)) + #expect(caps.contains(OpenClawCapability.talk.rawValue)) } } diff --git a/apps/macos/Sources/OpenClaw/TalkModeRuntime.swift b/apps/macos/Sources/OpenClaw/TalkModeRuntime.swift index 8f7bd5b0a10..da89b9ee098 100644 --- a/apps/macos/Sources/OpenClaw/TalkModeRuntime.swift +++ b/apps/macos/Sources/OpenClaw/TalkModeRuntime.swift @@ -395,10 +395,18 @@ actor TalkModeRuntime { "talk chat.send ok runId=\(response.runId, privacy: .public) " + "session=\(sessionKey, privacy: .public)") - guard let assistantText = await self.waitForAssistantText( + var assistantText = await self.waitForAssistantEventText( sessionKey: sessionKey, - since: startedAt, + runId: response.runId, timeoutSeconds: 45) + if assistantText == nil { + self.logger.warning("talk assistant event text missing; using history fallback") + assistantText = await self.waitForAssistantTextFromHistory( + sessionKey: sessionKey, + since: startedAt, + timeoutSeconds: 12) + } + guard let assistantText else { self.logger.warning("talk assistant text missing after timeout") await self.startListening() @@ -439,7 +447,67 @@ actor TalkModeRuntime { return TalkPromptBuilder.build(transcript: transcript, interruptedAtSeconds: interrupted) } - private func waitForAssistantText( + private func waitForAssistantEventText( + sessionKey: String, + runId: String, + timeoutSeconds: Int) async -> String? 
+ { + let stream = await GatewayConnection.shared.subscribe(bufferingNewest: 200) + return await withTaskGroup(of: String?.self) { group in + group.addTask { [runId, sessionKey] in + var latestText: String? + for await push in stream { + if Task.isCancelled { return latestText } + guard case let .event(evt) = push else { continue } + guard evt.event == "chat", let payload = evt.payload else { continue } + guard let chatEvent = try? GatewayPayloadDecoding.decode( + payload, + as: OpenClawChatEventPayload.self) + else { + continue + } + guard chatEvent.runId == runId else { continue } + if let eventSessionKey = chatEvent.sessionKey, + !Self.matchesSessionKey(eventSessionKey, sessionKey) + { + continue + } + if let text = OpenClawChatEventText.assistantText(from: chatEvent) { + latestText = text + } + switch chatEvent.state { + case "final": + return latestText + case "aborted", "error": + return nil + default: + break + } + } + return latestText + } + group.addTask { + try? await Task.sleep(nanoseconds: UInt64(timeoutSeconds) * 1_000_000_000) + return nil + } + guard let result = await group.next() else { + group.cancelAll() + return nil + } + group.cancelAll() + return result + } + } + + private static func matchesSessionKey(_ incoming: String, _ current: String) -> Bool { + let incoming = incoming.trimmingCharacters(in: .whitespacesAndNewlines).lowercased() + let current = current.trimmingCharacters(in: .whitespacesAndNewlines).lowercased() + if incoming == current { return true } + return (incoming == "agent:main:main" && current == "main") || + (incoming == "main" && current == "agent:main:main") + } + + private func waitForAssistantTextFromHistory( sessionKey: String, since: Double, timeoutSeconds: Int) async -> String? 
@@ -1111,7 +1179,10 @@ extension TalkModeRuntime { } else { self.ttsLogger .info( - "talk provider \(parsed.activeProvider, privacy: .public) uses gateway talk.speak with system voice fallback") + """ + talk provider \(parsed.activeProvider, privacy: .public) uses gateway talk.speak \ + with system voice fallback + """) } return parsed } catch { diff --git a/apps/macos/Sources/OpenClawProtocol/GatewayModels.swift b/apps/macos/Sources/OpenClawProtocol/GatewayModels.swift index 9a2945988ca..f9009626696 100644 --- a/apps/macos/Sources/OpenClawProtocol/GatewayModels.swift +++ b/apps/macos/Sources/OpenClawProtocol/GatewayModels.swift @@ -2630,6 +2630,116 @@ public struct TalkModeParams: Codable, Sendable { } } +public struct TalkEvent: Codable, Sendable { + public let id: String + public let type: AnyCodable + public let sessionid: String + public let turnid: String? + public let captureid: String? + public let seq: Int + public let timestamp: String + public let mode: AnyCodable + public let transport: AnyCodable + public let brain: AnyCodable + public let provider: String? + public let final: Bool? + public let callid: String? + public let itemid: String? + public let parentid: String? 
+ public let payload: AnyCodable + + public init( + id: String, + type: AnyCodable, + sessionid: String, + turnid: String?, + captureid: String?, + seq: Int, + timestamp: String, + mode: AnyCodable, + transport: AnyCodable, + brain: AnyCodable, + provider: String?, + final: Bool?, + callid: String?, + itemid: String?, + parentid: String?, + payload: AnyCodable) + { + self.id = id + self.type = type + self.sessionid = sessionid + self.turnid = turnid + self.captureid = captureid + self.seq = seq + self.timestamp = timestamp + self.mode = mode + self.transport = transport + self.brain = brain + self.provider = provider + self.final = final + self.callid = callid + self.itemid = itemid + self.parentid = parentid + self.payload = payload + } + + private enum CodingKeys: String, CodingKey { + case id + case type + case sessionid = "sessionId" + case turnid = "turnId" + case captureid = "captureId" + case seq + case timestamp + case mode + case transport + case brain + case provider + case final + case callid = "callId" + case itemid = "itemId" + case parentid = "parentId" + case payload + } +} + +public struct TalkCatalogParams: Codable, Sendable {} + +public struct TalkCatalogResult: Codable, Sendable { + public let modes: [AnyCodable] + public let transports: [AnyCodable] + public let brains: [AnyCodable] + public let speech: [String: AnyCodable] + public let transcription: [String: AnyCodable] + public let realtime: [String: AnyCodable] + + public init( + modes: [AnyCodable], + transports: [AnyCodable], + brains: [AnyCodable], + speech: [String: AnyCodable], + transcription: [String: AnyCodable], + realtime: [String: AnyCodable]) + { + self.modes = modes + self.transports = transports + self.brains = brains + self.speech = speech + self.transcription = transcription + self.realtime = realtime + } + + private enum CodingKeys: String, CodingKey { + case modes + case transports + case brains + case speech + case transcription + case realtime + } +} + public struct 
TalkConfigParams: Codable, Sendable { public let includesecrets: Bool? @@ -2658,22 +2768,383 @@ public struct TalkConfigResult: Codable, Sendable { } } +public struct TalkHandoffCreateParams: Codable, Sendable { + public let sessionkey: String + public let sessionid: String? + public let channel: String? + public let target: String? + public let provider: String? + public let model: String? + public let voice: String? + public let mode: AnyCodable? + public let transport: AnyCodable? + public let brain: AnyCodable? + public let ttlms: Int? + + public init( + sessionkey: String, + sessionid: String?, + channel: String?, + target: String?, + provider: String?, + model: String?, + voice: String?, + mode: AnyCodable?, + transport: AnyCodable?, + brain: AnyCodable?, + ttlms: Int?) + { + self.sessionkey = sessionkey + self.sessionid = sessionid + self.channel = channel + self.target = target + self.provider = provider + self.model = model + self.voice = voice + self.mode = mode + self.transport = transport + self.brain = brain + self.ttlms = ttlms + } + + private enum CodingKeys: String, CodingKey { + case sessionkey = "sessionKey" + case sessionid = "sessionId" + case channel + case target + case provider + case model + case voice + case mode + case transport + case brain + case ttlms = "ttlMs" + } +} + +public struct TalkHandoffCreateResult: Codable, Sendable { + public let id: String + public let roomid: String + public let roomurl: String + public let token: String + public let sessionkey: String + public let sessionid: String? + public let channel: String? + public let target: String? + public let provider: String? + public let model: String? + public let voice: String? 
+ public let mode: AnyCodable + public let transport: AnyCodable + public let brain: AnyCodable + public let createdat: Double + public let expiresat: Double + public let room: [String: AnyCodable] + + public init( + id: String, + roomid: String, + roomurl: String, + token: String, + sessionkey: String, + sessionid: String?, + channel: String?, + target: String?, + provider: String?, + model: String?, + voice: String?, + mode: AnyCodable, + transport: AnyCodable, + brain: AnyCodable, + createdat: Double, + expiresat: Double, + room: [String: AnyCodable]) + { + self.id = id + self.roomid = roomid + self.roomurl = roomurl + self.token = token + self.sessionkey = sessionkey + self.sessionid = sessionid + self.channel = channel + self.target = target + self.provider = provider + self.model = model + self.voice = voice + self.mode = mode + self.transport = transport + self.brain = brain + self.createdat = createdat + self.expiresat = expiresat + self.room = room + } + + private enum CodingKeys: String, CodingKey { + case id + case roomid = "roomId" + case roomurl = "roomUrl" + case token + case sessionkey = "sessionKey" + case sessionid = "sessionId" + case channel + case target + case provider + case model + case voice + case mode + case transport + case brain + case createdat = "createdAt" + case expiresat = "expiresAt" + case room + } +} + +public struct TalkHandoffJoinParams: Codable, Sendable { + public let id: String + public let token: String + + public init( + id: String, + token: String) + { + self.id = id + self.token = token + } + + private enum CodingKeys: String, CodingKey { + case id + case token + } +} + +public struct TalkHandoffJoinResult: Codable, Sendable { + public let id: String + public let roomid: String + public let roomurl: String + public let sessionkey: String + public let sessionid: String? + public let channel: String? + public let target: String? + public let provider: String? + public let model: String? + public let voice: String? 
+ public let mode: AnyCodable + public let transport: AnyCodable + public let brain: AnyCodable + public let createdat: Double + public let expiresat: Double + public let room: [String: AnyCodable] + + public init( + id: String, + roomid: String, + roomurl: String, + sessionkey: String, + sessionid: String?, + channel: String?, + target: String?, + provider: String?, + model: String?, + voice: String?, + mode: AnyCodable, + transport: AnyCodable, + brain: AnyCodable, + createdat: Double, + expiresat: Double, + room: [String: AnyCodable]) + { + self.id = id + self.roomid = roomid + self.roomurl = roomurl + self.sessionkey = sessionkey + self.sessionid = sessionid + self.channel = channel + self.target = target + self.provider = provider + self.model = model + self.voice = voice + self.mode = mode + self.transport = transport + self.brain = brain + self.createdat = createdat + self.expiresat = expiresat + self.room = room + } + + private enum CodingKeys: String, CodingKey { + case id + case roomid = "roomId" + case roomurl = "roomUrl" + case sessionkey = "sessionKey" + case sessionid = "sessionId" + case channel + case target + case provider + case model + case voice + case mode + case transport + case brain + case createdat = "createdAt" + case expiresat = "expiresAt" + case room + } +} + +public struct TalkHandoffRevokeParams: Codable, Sendable { + public let id: String + + public init( + id: String) + { + self.id = id + } + + private enum CodingKeys: String, CodingKey { + case id + } +} + +public struct TalkHandoffRevokeResult: Codable, Sendable { + public let ok: Bool + public let revoked: Bool + + public init( + ok: Bool, + revoked: Bool) + { + self.ok = ok + self.revoked = revoked + } + + private enum CodingKeys: String, CodingKey { + case ok + case revoked + } +} + +public struct TalkHandoffTurnStartParams: Codable, Sendable { + public let id: String + public let token: String + public let turnid: String? 
+ + public init( + id: String, + token: String, + turnid: String?) + { + self.id = id + self.token = token + self.turnid = turnid + } + + private enum CodingKeys: String, CodingKey { + case id + case token + case turnid = "turnId" + } +} + +public struct TalkHandoffTurnEndParams: Codable, Sendable { + public let id: String + public let token: String + public let turnid: String? + + public init( + id: String, + token: String, + turnid: String?) + { + self.id = id + self.token = token + self.turnid = turnid + } + + private enum CodingKeys: String, CodingKey { + case id + case token + case turnid = "turnId" + } +} + +public struct TalkHandoffTurnCancelParams: Codable, Sendable { + public let id: String + public let token: String + public let turnid: String? + public let reason: String? + + public init( + id: String, + token: String, + turnid: String?, + reason: String?) + { + self.id = id + self.token = token + self.turnid = turnid + self.reason = reason + } + + private enum CodingKeys: String, CodingKey { + case id + case token + case turnid = "turnId" + case reason + } +} + +public struct TalkHandoffTurnResult: Codable, Sendable { + public let ok: Bool + public let record: TalkHandoffJoinResult + public let turnid: String + public let events: [TalkEvent] + + public init( + ok: Bool, + record: TalkHandoffJoinResult, + turnid: String, + events: [TalkEvent]) + { + self.ok = ok + self.record = record + self.turnid = turnid + self.events = events + } + + private enum CodingKeys: String, CodingKey { + case ok + case record + case turnid = "turnId" + case events + } +} + public struct TalkRealtimeSessionParams: Codable, Sendable { public let sessionkey: String? public let provider: String? public let model: String? public let voice: String? + public let mode: AnyCodable? + public let transport: AnyCodable? + public let brain: AnyCodable? public init( sessionkey: String?, provider: String?, model: String?, - voice: String?) 
+ voice: String?, + mode: AnyCodable?, + transport: AnyCodable?, + brain: AnyCodable?) { self.sessionkey = sessionkey self.provider = provider self.model = model self.voice = voice + self.mode = mode + self.transport = transport + self.brain = brain } private enum CodingKeys: String, CodingKey { @@ -2681,6 +3152,9 @@ public struct TalkRealtimeSessionParams: Codable, Sendable { case provider case model case voice + case mode + case transport + case brain } } @@ -2706,6 +3180,24 @@ public struct TalkRealtimeRelayAudioParams: Codable, Sendable { } } +public struct TalkRealtimeRelayCancelParams: Codable, Sendable { + public let relaysessionid: String + public let reason: String? + + public init( + relaysessionid: String, + reason: String?) + { + self.relaysessionid = relaysessionid + self.reason = reason + } + + private enum CodingKeys: String, CodingKey { + case relaysessionid = "relaySessionId" + case reason + } +} + public struct TalkRealtimeRelayMarkParams: Codable, Sendable { public let relaysessionid: String public let markname: String? @@ -2774,6 +3266,166 @@ public struct TalkRealtimeRelayOkResult: Codable, Sendable { } } +public struct TalkRealtimeToolCallParams: Codable, Sendable { + public let sessionkey: String + public let callid: String + public let name: String + public let args: AnyCodable? + public let relaysessionid: String? + + public init( + sessionkey: String, + callid: String, + name: String, + args: AnyCodable?, + relaysessionid: String?) 
+ { + self.sessionkey = sessionkey + self.callid = callid + self.name = name + self.args = args + self.relaysessionid = relaysessionid + } + + private enum CodingKeys: String, CodingKey { + case sessionkey = "sessionKey" + case callid = "callId" + case name + case args + case relaysessionid = "relaySessionId" + } +} + +public struct TalkRealtimeToolCallResult: Codable, Sendable { + public let runid: String + public let idempotencykey: String + + public init( + runid: String, + idempotencykey: String) + { + self.runid = runid + self.idempotencykey = idempotencykey + } + + private enum CodingKeys: String, CodingKey { + case runid = "runId" + case idempotencykey = "idempotencyKey" + } +} + +public struct TalkTranscriptionSessionParams: Codable, Sendable { + public let provider: String? + + public init( + provider: String?) + { + self.provider = provider + } + + private enum CodingKeys: String, CodingKey { + case provider + } +} + +public struct TalkTranscriptionSessionResult: Codable, Sendable { + public let provider: String + public let mode: String + public let transport: String + public let transcriptionsessionid: String + public let audio: [String: AnyCodable] + public let expiresat: Double + + public init( + provider: String, + mode: String, + transport: String, + transcriptionsessionid: String, + audio: [String: AnyCodable], + expiresat: Double) + { + self.provider = provider + self.mode = mode + self.transport = transport + self.transcriptionsessionid = transcriptionsessionid + self.audio = audio + self.expiresat = expiresat + } + + private enum CodingKeys: String, CodingKey { + case provider + case mode + case transport + case transcriptionsessionid = "transcriptionSessionId" + case audio + case expiresat = "expiresAt" + } +} + +public struct TalkTranscriptionRelayAudioParams: Codable, Sendable { + public let transcriptionsessionid: String + public let audiobase64: String + + public init( + transcriptionsessionid: String, + audiobase64: String) + { + 
self.transcriptionsessionid = transcriptionsessionid + self.audiobase64 = audiobase64 + } + + private enum CodingKeys: String, CodingKey { + case transcriptionsessionid = "transcriptionSessionId" + case audiobase64 = "audioBase64" + } +} + +public struct TalkTranscriptionRelayCancelParams: Codable, Sendable { + public let transcriptionsessionid: String + public let reason: String? + + public init( + transcriptionsessionid: String, + reason: String?) + { + self.transcriptionsessionid = transcriptionsessionid + self.reason = reason + } + + private enum CodingKeys: String, CodingKey { + case transcriptionsessionid = "transcriptionSessionId" + case reason + } +} + +public struct TalkTranscriptionRelayStopParams: Codable, Sendable { + public let transcriptionsessionid: String + + public init( + transcriptionsessionid: String) + { + self.transcriptionsessionid = transcriptionsessionid + } + + private enum CodingKeys: String, CodingKey { + case transcriptionsessionid = "transcriptionSessionId" + } +} + +public struct TalkTranscriptionRelayOkResult: Codable, Sendable { + public let ok: Bool + + public init( + ok: Bool) + { + self.ok = ok + } + + private enum CodingKeys: String, CodingKey { + case ok + } +} + public struct TalkSpeakParams: Codable, Sendable { public let text: String public let voiceid: String? diff --git a/apps/shared/OpenClawKit/Sources/OpenClawChatUI/ChatEventText.swift b/apps/shared/OpenClawKit/Sources/OpenClawChatUI/ChatEventText.swift new file mode 100644 index 00000000000..5bf5530d2ac --- /dev/null +++ b/apps/shared/OpenClawKit/Sources/OpenClawChatUI/ChatEventText.swift @@ -0,0 +1,78 @@ +import OpenClawKit + +public enum OpenClawChatEventText { + public static func assistantText(from event: OpenClawChatEventPayload) -> String? { + self.assistantText(fromMessage: event.message) + } + + public static func assistantText(fromMessage message: AnyCodable?) -> String? 
{ + guard let message else { return nil } + return self.assistantText(fromValue: message.value) + } + + private static func assistantText(fromValue value: Any) -> String? { + if let text = value as? String { + return self.trimmed(text) + } + + guard let object = self.dictionary(from: value) else { return nil } + if let role = self.stringValue(object["role"])?.trimmingCharacters(in: .whitespacesAndNewlines), + !role.isEmpty, + role.lowercased() != "assistant" + { + return nil + } + + guard let content = object["content"] else { return nil } + return self.textContent(from: content) + } + + private static func textContent(from value: Any) -> String? { + if let text = value as? String { + return self.trimmed(text) + } + + let parts: [String] = if let array = value as? [AnyCodable] { + array.compactMap { self.textContentPart(from: $0.value) } + } else if let array = value as? [Any] { + array.compactMap { self.textContentPart(from: $0) } + } else { + self.textContentPart(from: value).map { [$0] } ?? [] + } + + return self.trimmed(parts.joined(separator: "\n")) + } + + private static func textContentPart(from value: Any) -> String? { + if let text = value as? String { + return self.trimmed(text) + } + guard let object = self.dictionary(from: value) else { return nil } + return self.trimmed(self.stringValue(object["text"]) ?? "") + } + + private static func dictionary(from value: Any) -> [String: Any]? { + if let dict = value as? [String: AnyCodable] { + return dict.mapValues(\.value) + } + if let dict = value as? [String: Any] { + return dict + } + return nil + } + + private static func stringValue(_ value: Any?) -> String? { + if let string = value as? String { + return string + } + if let wrapped = value as? AnyCodable { + return self.stringValue(wrapped.value) + } + return nil + } + + private static func trimmed(_ text: String) -> String? { + let trimmed = text.trimmingCharacters(in: .whitespacesAndNewlines) + return trimmed.isEmpty ? 
nil : trimmed + } +} diff --git a/apps/shared/OpenClawKit/Sources/OpenClawKit/Capabilities.swift b/apps/shared/OpenClawKit/Sources/OpenClawKit/Capabilities.swift index 3bbc03e937c..48f0f876595 100644 --- a/apps/shared/OpenClawKit/Sources/OpenClawKit/Capabilities.swift +++ b/apps/shared/OpenClawKit/Sources/OpenClawKit/Capabilities.swift @@ -6,6 +6,7 @@ public enum OpenClawCapability: String, Codable, Sendable { case camera case screen case voiceWake + case talk case location case device case watch diff --git a/apps/shared/OpenClawKit/Sources/OpenClawProtocol/GatewayModels.swift b/apps/shared/OpenClawKit/Sources/OpenClawProtocol/GatewayModels.swift index 9a2945988ca..f9009626696 100644 --- a/apps/shared/OpenClawKit/Sources/OpenClawProtocol/GatewayModels.swift +++ b/apps/shared/OpenClawKit/Sources/OpenClawProtocol/GatewayModels.swift @@ -2630,6 +2630,116 @@ public struct TalkModeParams: Codable, Sendable { } } +public struct TalkEvent: Codable, Sendable { + public let id: String + public let type: AnyCodable + public let sessionid: String + public let turnid: String? + public let captureid: String? + public let seq: Int + public let timestamp: String + public let mode: AnyCodable + public let transport: AnyCodable + public let brain: AnyCodable + public let provider: String? + public let final: Bool? + public let callid: String? + public let itemid: String? + public let parentid: String? 
+ public let payload: AnyCodable + + public init( + id: String, + type: AnyCodable, + sessionid: String, + turnid: String?, + captureid: String?, + seq: Int, + timestamp: String, + mode: AnyCodable, + transport: AnyCodable, + brain: AnyCodable, + provider: String?, + final: Bool?, + callid: String?, + itemid: String?, + parentid: String?, + payload: AnyCodable) + { + self.id = id + self.type = type + self.sessionid = sessionid + self.turnid = turnid + self.captureid = captureid + self.seq = seq + self.timestamp = timestamp + self.mode = mode + self.transport = transport + self.brain = brain + self.provider = provider + self.final = final + self.callid = callid + self.itemid = itemid + self.parentid = parentid + self.payload = payload + } + + private enum CodingKeys: String, CodingKey { + case id + case type + case sessionid = "sessionId" + case turnid = "turnId" + case captureid = "captureId" + case seq + case timestamp + case mode + case transport + case brain + case provider + case final + case callid = "callId" + case itemid = "itemId" + case parentid = "parentId" + case payload + } +} + +public struct TalkCatalogParams: Codable, Sendable {} + +public struct TalkCatalogResult: Codable, Sendable { + public let modes: [AnyCodable] + public let transports: [AnyCodable] + public let brains: [AnyCodable] + public let speech: [String: AnyCodable] + public let transcription: [String: AnyCodable] + public let realtime: [String: AnyCodable] + + public init( + modes: [AnyCodable], + transports: [AnyCodable], + brains: [AnyCodable], + speech: [String: AnyCodable], + transcription: [String: AnyCodable], + realtime: [String: AnyCodable]) + { + self.modes = modes + self.transports = transports + self.brains = brains + self.speech = speech + self.transcription = transcription + self.realtime = realtime + } + + private enum CodingKeys: String, CodingKey { + case modes + case transports + case brains + case speech + case transcription + case realtime + } +} + public struct 
TalkConfigParams: Codable, Sendable { public let includesecrets: Bool? @@ -2658,22 +2768,383 @@ public struct TalkConfigResult: Codable, Sendable { } } +public struct TalkHandoffCreateParams: Codable, Sendable { + public let sessionkey: String + public let sessionid: String? + public let channel: String? + public let target: String? + public let provider: String? + public let model: String? + public let voice: String? + public let mode: AnyCodable? + public let transport: AnyCodable? + public let brain: AnyCodable? + public let ttlms: Int? + + public init( + sessionkey: String, + sessionid: String?, + channel: String?, + target: String?, + provider: String?, + model: String?, + voice: String?, + mode: AnyCodable?, + transport: AnyCodable?, + brain: AnyCodable?, + ttlms: Int?) + { + self.sessionkey = sessionkey + self.sessionid = sessionid + self.channel = channel + self.target = target + self.provider = provider + self.model = model + self.voice = voice + self.mode = mode + self.transport = transport + self.brain = brain + self.ttlms = ttlms + } + + private enum CodingKeys: String, CodingKey { + case sessionkey = "sessionKey" + case sessionid = "sessionId" + case channel + case target + case provider + case model + case voice + case mode + case transport + case brain + case ttlms = "ttlMs" + } +} + +public struct TalkHandoffCreateResult: Codable, Sendable { + public let id: String + public let roomid: String + public let roomurl: String + public let token: String + public let sessionkey: String + public let sessionid: String? + public let channel: String? + public let target: String? + public let provider: String? + public let model: String? + public let voice: String? 
+ public let mode: AnyCodable + public let transport: AnyCodable + public let brain: AnyCodable + public let createdat: Double + public let expiresat: Double + public let room: [String: AnyCodable] + + public init( + id: String, + roomid: String, + roomurl: String, + token: String, + sessionkey: String, + sessionid: String?, + channel: String?, + target: String?, + provider: String?, + model: String?, + voice: String?, + mode: AnyCodable, + transport: AnyCodable, + brain: AnyCodable, + createdat: Double, + expiresat: Double, + room: [String: AnyCodable]) + { + self.id = id + self.roomid = roomid + self.roomurl = roomurl + self.token = token + self.sessionkey = sessionkey + self.sessionid = sessionid + self.channel = channel + self.target = target + self.provider = provider + self.model = model + self.voice = voice + self.mode = mode + self.transport = transport + self.brain = brain + self.createdat = createdat + self.expiresat = expiresat + self.room = room + } + + private enum CodingKeys: String, CodingKey { + case id + case roomid = "roomId" + case roomurl = "roomUrl" + case token + case sessionkey = "sessionKey" + case sessionid = "sessionId" + case channel + case target + case provider + case model + case voice + case mode + case transport + case brain + case createdat = "createdAt" + case expiresat = "expiresAt" + case room + } +} + +public struct TalkHandoffJoinParams: Codable, Sendable { + public let id: String + public let token: String + + public init( + id: String, + token: String) + { + self.id = id + self.token = token + } + + private enum CodingKeys: String, CodingKey { + case id + case token + } +} + +public struct TalkHandoffJoinResult: Codable, Sendable { + public let id: String + public let roomid: String + public let roomurl: String + public let sessionkey: String + public let sessionid: String? + public let channel: String? + public let target: String? + public let provider: String? + public let model: String? + public let voice: String? 
+ public let mode: AnyCodable + public let transport: AnyCodable + public let brain: AnyCodable + public let createdat: Double + public let expiresat: Double + public let room: [String: AnyCodable] + + public init( + id: String, + roomid: String, + roomurl: String, + sessionkey: String, + sessionid: String?, + channel: String?, + target: String?, + provider: String?, + model: String?, + voice: String?, + mode: AnyCodable, + transport: AnyCodable, + brain: AnyCodable, + createdat: Double, + expiresat: Double, + room: [String: AnyCodable]) + { + self.id = id + self.roomid = roomid + self.roomurl = roomurl + self.sessionkey = sessionkey + self.sessionid = sessionid + self.channel = channel + self.target = target + self.provider = provider + self.model = model + self.voice = voice + self.mode = mode + self.transport = transport + self.brain = brain + self.createdat = createdat + self.expiresat = expiresat + self.room = room + } + + private enum CodingKeys: String, CodingKey { + case id + case roomid = "roomId" + case roomurl = "roomUrl" + case sessionkey = "sessionKey" + case sessionid = "sessionId" + case channel + case target + case provider + case model + case voice + case mode + case transport + case brain + case createdat = "createdAt" + case expiresat = "expiresAt" + case room + } +} + +public struct TalkHandoffRevokeParams: Codable, Sendable { + public let id: String + + public init( + id: String) + { + self.id = id + } + + private enum CodingKeys: String, CodingKey { + case id + } +} + +public struct TalkHandoffRevokeResult: Codable, Sendable { + public let ok: Bool + public let revoked: Bool + + public init( + ok: Bool, + revoked: Bool) + { + self.ok = ok + self.revoked = revoked + } + + private enum CodingKeys: String, CodingKey { + case ok + case revoked + } +} + +public struct TalkHandoffTurnStartParams: Codable, Sendable { + public let id: String + public let token: String + public let turnid: String? 
+ + public init( + id: String, + token: String, + turnid: String?) + { + self.id = id + self.token = token + self.turnid = turnid + } + + private enum CodingKeys: String, CodingKey { + case id + case token + case turnid = "turnId" + } +} + +public struct TalkHandoffTurnEndParams: Codable, Sendable { + public let id: String + public let token: String + public let turnid: String? + + public init( + id: String, + token: String, + turnid: String?) + { + self.id = id + self.token = token + self.turnid = turnid + } + + private enum CodingKeys: String, CodingKey { + case id + case token + case turnid = "turnId" + } +} + +public struct TalkHandoffTurnCancelParams: Codable, Sendable { + public let id: String + public let token: String + public let turnid: String? + public let reason: String? + + public init( + id: String, + token: String, + turnid: String?, + reason: String?) + { + self.id = id + self.token = token + self.turnid = turnid + self.reason = reason + } + + private enum CodingKeys: String, CodingKey { + case id + case token + case turnid = "turnId" + case reason + } +} + +public struct TalkHandoffTurnResult: Codable, Sendable { + public let ok: Bool + public let record: TalkHandoffJoinResult + public let turnid: String + public let events: [TalkEvent] + + public init( + ok: Bool, + record: TalkHandoffJoinResult, + turnid: String, + events: [TalkEvent]) + { + self.ok = ok + self.record = record + self.turnid = turnid + self.events = events + } + + private enum CodingKeys: String, CodingKey { + case ok + case record + case turnid = "turnId" + case events + } +} + public struct TalkRealtimeSessionParams: Codable, Sendable { public let sessionkey: String? public let provider: String? public let model: String? public let voice: String? + public let mode: AnyCodable? + public let transport: AnyCodable? + public let brain: AnyCodable? public init( sessionkey: String?, provider: String?, model: String?, - voice: String?) 
+ voice: String?, + mode: AnyCodable?, + transport: AnyCodable?, + brain: AnyCodable?) { self.sessionkey = sessionkey self.provider = provider self.model = model self.voice = voice + self.mode = mode + self.transport = transport + self.brain = brain } private enum CodingKeys: String, CodingKey { @@ -2681,6 +3152,9 @@ public struct TalkRealtimeSessionParams: Codable, Sendable { case provider case model case voice + case mode + case transport + case brain } } @@ -2706,6 +3180,24 @@ public struct TalkRealtimeRelayAudioParams: Codable, Sendable { } } +public struct TalkRealtimeRelayCancelParams: Codable, Sendable { + public let relaysessionid: String + public let reason: String? + + public init( + relaysessionid: String, + reason: String?) + { + self.relaysessionid = relaysessionid + self.reason = reason + } + + private enum CodingKeys: String, CodingKey { + case relaysessionid = "relaySessionId" + case reason + } +} + public struct TalkRealtimeRelayMarkParams: Codable, Sendable { public let relaysessionid: String public let markname: String? @@ -2774,6 +3266,166 @@ public struct TalkRealtimeRelayOkResult: Codable, Sendable { } } +public struct TalkRealtimeToolCallParams: Codable, Sendable { + public let sessionkey: String + public let callid: String + public let name: String + public let args: AnyCodable? + public let relaysessionid: String? + + public init( + sessionkey: String, + callid: String, + name: String, + args: AnyCodable?, + relaysessionid: String?) 
+ { + self.sessionkey = sessionkey + self.callid = callid + self.name = name + self.args = args + self.relaysessionid = relaysessionid + } + + private enum CodingKeys: String, CodingKey { + case sessionkey = "sessionKey" + case callid = "callId" + case name + case args + case relaysessionid = "relaySessionId" + } +} + +public struct TalkRealtimeToolCallResult: Codable, Sendable { + public let runid: String + public let idempotencykey: String + + public init( + runid: String, + idempotencykey: String) + { + self.runid = runid + self.idempotencykey = idempotencykey + } + + private enum CodingKeys: String, CodingKey { + case runid = "runId" + case idempotencykey = "idempotencyKey" + } +} + +public struct TalkTranscriptionSessionParams: Codable, Sendable { + public let provider: String? + + public init( + provider: String?) + { + self.provider = provider + } + + private enum CodingKeys: String, CodingKey { + case provider + } +} + +public struct TalkTranscriptionSessionResult: Codable, Sendable { + public let provider: String + public let mode: String + public let transport: String + public let transcriptionsessionid: String + public let audio: [String: AnyCodable] + public let expiresat: Double + + public init( + provider: String, + mode: String, + transport: String, + transcriptionsessionid: String, + audio: [String: AnyCodable], + expiresat: Double) + { + self.provider = provider + self.mode = mode + self.transport = transport + self.transcriptionsessionid = transcriptionsessionid + self.audio = audio + self.expiresat = expiresat + } + + private enum CodingKeys: String, CodingKey { + case provider + case mode + case transport + case transcriptionsessionid = "transcriptionSessionId" + case audio + case expiresat = "expiresAt" + } +} + +public struct TalkTranscriptionRelayAudioParams: Codable, Sendable { + public let transcriptionsessionid: String + public let audiobase64: String + + public init( + transcriptionsessionid: String, + audiobase64: String) + { + 
self.transcriptionsessionid = transcriptionsessionid + self.audiobase64 = audiobase64 + } + + private enum CodingKeys: String, CodingKey { + case transcriptionsessionid = "transcriptionSessionId" + case audiobase64 = "audioBase64" + } +} + +public struct TalkTranscriptionRelayCancelParams: Codable, Sendable { + public let transcriptionsessionid: String + public let reason: String? + + public init( + transcriptionsessionid: String, + reason: String?) + { + self.transcriptionsessionid = transcriptionsessionid + self.reason = reason + } + + private enum CodingKeys: String, CodingKey { + case transcriptionsessionid = "transcriptionSessionId" + case reason + } +} + +public struct TalkTranscriptionRelayStopParams: Codable, Sendable { + public let transcriptionsessionid: String + + public init( + transcriptionsessionid: String) + { + self.transcriptionsessionid = transcriptionsessionid + } + + private enum CodingKeys: String, CodingKey { + case transcriptionsessionid = "transcriptionSessionId" + } +} + +public struct TalkTranscriptionRelayOkResult: Codable, Sendable { + public let ok: Bool + + public init( + ok: Bool) + { + self.ok = ok + } + + private enum CodingKeys: String, CodingKey { + case ok + } +} + public struct TalkSpeakParams: Codable, Sendable { public let text: String public let voiceid: String? 
diff --git a/apps/shared/OpenClawKit/Tests/OpenClawKitTests/ChatEventTextTests.swift b/apps/shared/OpenClawKit/Tests/OpenClawKitTests/ChatEventTextTests.swift new file mode 100644 index 00000000000..10ccb481e44 --- /dev/null +++ b/apps/shared/OpenClawKit/Tests/OpenClawKitTests/ChatEventTextTests.swift @@ -0,0 +1,50 @@ +import OpenClawKit +import Testing +@testable import OpenClawChatUI + +struct ChatEventTextTests { + @Test func `extracts assistant text from final chat event message`() { + let event = OpenClawChatEventPayload( + runId: "run-1", + sessionKey: "main", + state: "final", + message: AnyCodable([ + "role": "assistant", + "content": [ + ["type": "text", "text": "hello"], + ["type": "text", "text": "world"], + ], + ]), + errorMessage: nil) + + #expect(OpenClawChatEventText.assistantText(from: event) == "hello\nworld") + } + + @Test func `ignores user messages`() { + let event = OpenClawChatEventPayload( + runId: "run-1", + sessionKey: "main", + state: "delta", + message: AnyCodable([ + "role": "user", + "content": [["type": "text", "text": "ignore me"]], + ]), + errorMessage: nil) + + #expect(OpenClawChatEventText.assistantText(from: event) == nil) + } + + @Test func `extracts plain string content`() { + let event = OpenClawChatEventPayload( + runId: "run-1", + sessionKey: "main", + state: "final", + message: AnyCodable([ + "role": "assistant", + "content": "plain reply", + ]), + errorMessage: nil) + + #expect(OpenClawChatEventText.assistantText(from: event) == "plain reply") + } +} diff --git a/src/gateway/android-node.capabilities.live.test.ts b/src/gateway/android-node.capabilities.live.test.ts index 1c441e93975..5ce34076d38 100644 --- a/src/gateway/android-node.capabilities.live.test.ts +++ b/src/gateway/android-node.capabilities.live.test.ts @@ -534,6 +534,7 @@ describeLive("android node capability integration (preconditioned)", () => { const allowlist = resolveNodeCommandAllowlist(cfg, { platform: target.platform, deviceFamily: target.deviceFamily, + 
commands, }); commandsToRun = commands.filter( diff --git a/src/gateway/node-command-policy.test.ts b/src/gateway/node-command-policy.test.ts index f7526a9190a..348679aa501 100644 --- a/src/gateway/node-command-policy.test.ts +++ b/src/gateway/node-command-policy.test.ts @@ -1,5 +1,10 @@ import { describe, expect, it } from "vitest"; -import { normalizeDeclaredNodeCommands } from "./node-command-policy.js"; +import type { OpenClawConfig } from "../config/types.openclaw.js"; +import { + isNodeCommandAllowed, + normalizeDeclaredNodeCommands, + resolveNodeCommandAllowlist, +} from "./node-command-policy.js"; describe("gateway/node-command-policy", () => { it("normalizes declared node commands against the allowlist", () => { @@ -11,4 +16,43 @@ describe("gateway/node-command-policy", () => { }), ).toEqual(["canvas.snapshot", "system.run"]); }); + + it("allows declared push-to-talk commands on trusted talk-capable nodes", () => { + const cfg = {} as OpenClawConfig; + for (const platform of ["ios", "android", "macos", "other"]) { + const allowlist = resolveNodeCommandAllowlist(cfg, { platform, caps: ["talk"] }); + expect(allowlist.has("talk.ptt.start")).toBe(true); + expect(allowlist.has("talk.ptt.stop")).toBe(true); + expect(allowlist.has("talk.ptt.cancel")).toBe(true); + expect(allowlist.has("talk.ptt.once")).toBe(true); + expect( + isNodeCommandAllowed({ + command: "talk.ptt.start", + declaredCommands: ["talk.ptt.start"], + allowlist, + }), + ).toEqual({ ok: true }); + } + }); + + it("does not allow push-to-talk commands from platform label alone", () => { + const cfg = {} as OpenClawConfig; + const allowlist = resolveNodeCommandAllowlist(cfg, { + platform: "android", + caps: ["device"], + commands: [], + }); + + expect(allowlist.has("talk.ptt.start")).toBe(false); + }); + + it("allows push-to-talk commands when the node declares talk command support", () => { + const cfg = {} as OpenClawConfig; + const allowlist = resolveNodeCommandAllowlist(cfg, { + platform: 
"custom", + commands: ["talk.ptt.start"], + }); + + expect(allowlist.has("talk.ptt.start")).toBe(true); + }); }); diff --git a/src/gateway/node-command-policy.ts b/src/gateway/node-command-policy.ts index ded31eaae73..41c4ddf5055 100644 --- a/src/gateway/node-command-policy.ts +++ b/src/gateway/node-command-policy.ts @@ -5,6 +5,7 @@ import { NODE_SYSTEM_RUN_COMMANDS, } from "../infra/node-commands.js"; import { getActiveRuntimePluginRegistry } from "../plugins/active-runtime-registry.js"; +import { normalizeOptionalLowercaseString } from "../shared/string-coerce.js"; import { normalizeDeviceMetadataForPolicy } from "./device-metadata-normalization.js"; import type { NodeSession } from "./node-registry.js"; @@ -49,6 +50,8 @@ const MOTION_COMMANDS = ["motion.activity", "motion.pedometer"]; const SMS_DANGEROUS_COMMANDS = ["sms.send", "sms.search"]; +const TALK_PTT_COMMANDS = ["talk.ptt.start", "talk.ptt.stop", "talk.ptt.cancel", "talk.ptt.once"]; + // iOS nodes don't implement system.run/which, but they do support notifications. const IOS_SYSTEM_COMMANDS = [NODE_SYSTEM_NOTIFY_COMMAND]; @@ -197,17 +200,35 @@ export function listDangerousPluginNodeCommands(): string[] { return [...new Set(commands.map((command) => command.trim()).filter(Boolean))]; } +type NodeCommandPolicyNode = Pick & + Partial>; + +function hasTalkSurface(node?: NodeCommandPolicyNode): boolean { + if (!node) { + return false; + } + return ( + (node.caps ?? []).some( + (capability) => normalizeOptionalLowercaseString(capability) === "talk", + ) || + (node.commands ?? []).some((command) => + normalizeOptionalLowercaseString(command)?.startsWith("talk."), + ) + ); +} + export function resolveNodeCommandAllowlist( cfg: OpenClawConfig, - node?: Pick, + node?: NodeCommandPolicyNode, ): Set { const platformId = normalizePlatformId(node?.platform, node?.deviceFamily); const base = PLATFORM_DEFAULTS[platformId] ?? PLATFORM_DEFAULTS.unknown; + const talkCommands = hasTalkSurface(node) ? 
TALK_PTT_COMMANDS : []; const extra = cfg.gateway?.nodes?.allowCommands ?? []; const deny = new Set(cfg.gateway?.nodes?.denyCommands ?? []); const dangerousPluginCommands = new Set(listDangerousPluginNodeCommands()); const allow = new Set( - [...base, ...extra] + [...base, ...talkCommands, ...extra] .map((cmd) => cmd.trim()) .filter((cmd) => cmd && !dangerousPluginCommands.has(cmd)), ); diff --git a/src/gateway/node-connect-reconcile.ts b/src/gateway/node-connect-reconcile.ts index d8638f54ebc..359c48946fb 100644 --- a/src/gateway/node-connect-reconcile.ts +++ b/src/gateway/node-connect-reconcile.ts @@ -62,6 +62,8 @@ export async function reconcileNodePairingOnConnect(params: { const allowlist = resolveNodeCommandAllowlist(params.cfg, { platform: params.connectParams.client.platform, deviceFamily: params.connectParams.client.deviceFamily, + caps: params.connectParams.caps, + commands: params.connectParams.commands, }); const declared = normalizeDeclaredNodeCommands({ declaredCommands: Array.isArray(params.connectParams.commands) diff --git a/src/gateway/server-methods/nodes.invoke-wake.test.ts b/src/gateway/server-methods/nodes.invoke-wake.test.ts index e98b3b0f3b4..94cebed6814 100644 --- a/src/gateway/server-methods/nodes.invoke-wake.test.ts +++ b/src/gateway/server-methods/nodes.invoke-wake.test.ts @@ -405,6 +405,66 @@ describe("node.invoke APNs wake path", () => { expect(call?.[1]).toMatchObject({ ok: true, nodeId: "ios-node-reconnect" }); }); + it("broadcasts canonical Talk capture events for successful PTT node commands", async () => { + const respond = vi.fn(); + const broadcast = vi.fn(); + const nodeRegistry = { + get: vi.fn(() => ({ + nodeId: "android-talk-node", + commands: ["talk.ptt.start"], + capabilities: ["talk"], + platform: "android", + })), + invoke: vi.fn().mockResolvedValue({ + ok: true, + payloadJSON: '{"captureId":"capture-1"}', + }), + }; + + await nodeHandlers["node.invoke"]({ + params: { + nodeId: "android-talk-node", + command: 
"talk.ptt.start", + idempotencyKey: "idem-talk-ptt-start", + }, + respond: respond as never, + context: { + nodeRegistry, + execApprovalManager: undefined, + logGateway: { info: vi.fn(), warn: vi.fn() }, + getRuntimeConfig: () => mocks.getRuntimeConfig(), + broadcast, + } as never, + client: null, + req: { type: "req", id: "req-talk-ptt", method: "node.invoke" }, + isWebchatConnect: () => false, + }); + + expect(respond.mock.calls[0]?.[0]).toBe(true); + expect(broadcast).toHaveBeenCalledWith( + "talk.event", + expect.objectContaining({ + nodeId: "android-talk-node", + command: "talk.ptt.start", + talkEvent: expect.objectContaining({ + type: "capture.started", + sessionId: "node:android-talk-node:talk:capture-1", + captureId: "capture-1", + seq: expect.any(Number), + mode: "stt-tts", + transport: "managed-room", + brain: "agent-consult", + final: false, + payload: expect.objectContaining({ + nodeId: "android-talk-node", + command: "talk.ptt.start", + }), + }), + }), + { dropIfSlow: true }, + ); + }); + it("clears stale registrations after an invalid device token wake failure", async () => { const registration = directRegistration("ios-node-stale"); mocks.loadApnsRegistration.mockResolvedValue(registration); diff --git a/src/gateway/server-methods/nodes.ts b/src/gateway/server-methods/nodes.ts index aee2410b084..c7111af02a1 100644 --- a/src/gateway/server-methods/nodes.ts +++ b/src/gateway/server-methods/nodes.ts @@ -66,6 +66,7 @@ import { respondUnavailableOnThrow, safeParseJson, } from "./nodes.helpers.js"; +import type { GatewayRequestContext } from "./shared-types.js"; import type { GatewayRequestHandlers } from "./types.js"; export { @@ -78,6 +79,13 @@ const NODE_WAKE_THROTTLE_MS = 15_000; const NODE_WAKE_NUDGE_THROTTLE_MS = 10 * 60_000; const NODE_PENDING_ACTION_TTL_MS = 10 * 60_000; const NODE_PENDING_ACTION_MAX_PER_NODE = 64; +const TALK_PTT_COMMANDS = new Set([ + "talk.ptt.start", + "talk.ptt.stop", + "talk.ptt.cancel", + "talk.ptt.once", +]); +const 
talkPttEventSeqBySessionId = new Map(); type NodeWakeNudgeAttempt = { sent: boolean; @@ -259,6 +267,8 @@ function resolveAllowedPendingNodeActions(params: { const allowlist = resolveNodeCommandAllowlist(params.cfg, { platform: connect?.client?.platform, deviceFamily: connect?.client?.deviceFamily, + caps: connect?.caps, + commands: declaredCommands, }); const allowed = pending.filter((entry) => { const result = isNodeCommandAllowed({ @@ -304,6 +314,69 @@ function toPendingParamsJSON(params: unknown): string | undefined { } } +function emitTalkPttNodeEvent(params: { + context: Pick; + nodeId: string; + command: string; + payload: unknown; +}): void { + if (!TALK_PTT_COMMANDS.has(params.command)) { + return; + } + const payloadObj = + typeof params.payload === "object" && params.payload !== null + ? (params.payload as Record) + : {}; + const captureId = normalizeOptionalString(payloadObj.captureId) ?? randomUUID(); + const sessionId = `node:${params.nodeId}:talk:${captureId}`; + const seq = (talkPttEventSeqBySessionId.get(sessionId) ?? 0) + 1; + talkPttEventSeqBySessionId.set(sessionId, seq); + while (talkPttEventSeqBySessionId.size > 2048) { + const oldest = talkPttEventSeqBySessionId.keys().next().value; + if (oldest === undefined) { + break; + } + talkPttEventSeqBySessionId.delete(oldest); + } + + const type = + params.command === "talk.ptt.start" + ? "capture.started" + : params.command === "talk.ptt.cancel" + ? "capture.cancelled" + : params.command === "talk.ptt.once" + ? "capture.once" + : "capture.stopped"; + const final = params.command !== "talk.ptt.start"; + const talkEvent = { + id: `${sessionId}:${seq}`, + type, + sessionId, + captureId, + seq, + timestamp: new Date().toISOString(), + mode: "stt-tts", + transport: "managed-room", + brain: "agent-consult", + final, + payload: { + nodeId: params.nodeId, + command: params.command, + status: normalizeOptionalString(payloadObj.status) ?? 
undefined, + transcript: normalizeOptionalString(payloadObj.transcript) ?? undefined, + }, + }; + params.context.broadcast( + "talk.event", + { + nodeId: params.nodeId, + command: params.command, + talkEvent, + }, + { dropIfSlow: true }, + ); +} + export async function maybeWakeNodeWithApns( nodeId: string, opts?: { force?: boolean; wakeReason?: string; cfg?: OpenClawConfig }, @@ -1078,6 +1151,15 @@ export const nodeHandlers: GatewayRequestHandlers = { ); return; } + const payload = policyResult.payloadJSON + ? safeParseJson(policyResult.payloadJSON) + : policyResult.payload; + emitTalkPttNodeEvent({ + context, + nodeId, + command, + payload, + }); respond( true, { @@ -1151,6 +1233,12 @@ export const nodeHandlers: GatewayRequestHandlers = { return; } const payload = res.payloadJSON ? safeParseJson(res.payloadJSON) : res.payload; + emitTalkPttNodeEvent({ + context, + nodeId, + command, + payload, + }); respond( true, { @@ -1228,6 +1316,9 @@ function buildNodeCommandRejectionHint( return `node command not allowed: the node (platform: ${platform}) does not support "${command}"`; } if (reason === "command not allowlisted") { + if (command.startsWith("talk.")) { + return `node command not allowed: "${command}" requires a trusted Talk-capable node`; + } return `node command not allowed: "${command}" is not in the allowlist for platform "${platform}"`; } if (reason === "node did not declare commands") { diff --git a/src/gateway/server-talk-nodes.test.ts b/src/gateway/server-talk-nodes.test.ts new file mode 100644 index 00000000000..36d7d210571 --- /dev/null +++ b/src/gateway/server-talk-nodes.test.ts @@ -0,0 +1,32 @@ +import { describe, expect, it } from "vitest"; +import type { NodeRegistry, NodeSession } from "./node-registry.js"; +import { hasConnectedTalkNode } from "./server-talk-nodes.js"; + +function registryWith(nodes: Array>): NodeRegistry { + return { + listConnected: () => + nodes.map((node, index) => ({ + nodeId: `node-${index}`, + connId: `conn-${index}`, + 
caps: [], + commands: [], + connectedAtMs: 0, + ...node, + })), + } as NodeRegistry; +} + +describe("hasConnectedTalkNode", () => { + it("uses explicit talk capability instead of platform names", () => { + expect( + hasConnectedTalkNode(registryWith([{ platform: "android", caps: ["device"], commands: [] }])), + ).toBe(false); + expect(hasConnectedTalkNode(registryWith([{ platform: "linux", caps: ["talk"] }]))).toBe(true); + }); + + it("accepts nodes that declare talk command support", () => { + expect( + hasConnectedTalkNode(registryWith([{ platform: "custom", commands: ["talk.ptt.start"] }])), + ).toBe(true); + }); +}); diff --git a/src/gateway/server-talk-nodes.ts b/src/gateway/server-talk-nodes.ts new file mode 100644 index 00000000000..84f7b86d708 --- /dev/null +++ b/src/gateway/server-talk-nodes.ts @@ -0,0 +1,20 @@ +import { normalizeOptionalLowercaseString } from "../shared/string-coerce.js"; +import type { NodeRegistry, NodeSession } from "./node-registry.js"; + +const TALK_CAPABILITY = "talk"; +const TALK_COMMAND_PREFIX = "talk."; + +export function hasConnectedTalkNode(registry: NodeRegistry): boolean { + return registry.listConnected().some(isTalkCapableNode); +} + +function isTalkCapableNode(node: NodeSession): boolean { + return ( + node.caps.some( + (capability) => normalizeOptionalLowercaseString(capability) === TALK_CAPABILITY, + ) || + node.commands.some((command) => + normalizeOptionalLowercaseString(command)?.startsWith(TALK_COMMAND_PREFIX), + ) + ); +} diff --git a/src/security/audit-extra.sync.ts b/src/security/audit-extra.sync.ts index d99e879f1e9..4bd4bf8acf2 100644 --- a/src/security/audit-extra.sync.ts +++ b/src/security/audit-extra.sync.ts @@ -133,6 +133,12 @@ function listKnownNodeCommands(cfg: OpenClawConfig): Set { } } } + for (const cmd of resolveNodeCommandAllowlist(baseCfg, { caps: ["talk"] })) { + const normalized = normalizeNodeCommand(cmd); + if (normalized) { + out.add(normalized); + } + } for (const cmd of 
DEFAULT_DANGEROUS_NODE_COMMANDS) { const normalized = normalizeNodeCommand(cmd); if (normalized) {