feat: wire talk handoff into native nodes

This commit is contained in:
Peter Steinberger
2026-05-05 20:59:46 +01:00
parent c434d7720b
commit 466f718320
34 changed files with 2474 additions and 89 deletions

View File

@@ -36,6 +36,7 @@ import ai.openclaw.app.node.Quad
import ai.openclaw.app.node.SmsHandler
import ai.openclaw.app.node.SmsManager
import ai.openclaw.app.node.SystemHandler
import ai.openclaw.app.node.TalkHandler
import ai.openclaw.app.node.asObjectOrNull
import ai.openclaw.app.node.asStringOrNull
import ai.openclaw.app.node.invokeErrorFromThrowable
@@ -205,6 +206,16 @@ class NodeRuntime(
deviceHandler = deviceHandler,
notificationsHandler = notificationsHandler,
systemHandler = systemHandler,
talkHandler =
object : TalkHandler {
override suspend fun handlePttStart(paramsJson: String?): GatewaySession.InvokeResult = handleTalkPttStart()
override suspend fun handlePttStop(paramsJson: String?): GatewaySession.InvokeResult = handleTalkPttStop()
override suspend fun handlePttCancel(paramsJson: String?): GatewaySession.InvokeResult = handleTalkPttCancel()
override suspend fun handlePttOnce(paramsJson: String?): GatewaySession.InvokeResult = handleTalkPttOnce()
},
photosHandler = photosHandler,
contactsHandler = contactsHandler,
calendarHandler = calendarHandler,
@@ -881,6 +892,80 @@ class NodeRuntime(
setVoiceCaptureMode(if (value) VoiceCaptureMode.TalkMode else VoiceCaptureMode.Off)
}
/**
 * Starts a push-to-talk capture after preparing the talk capture pipeline.
 *
 * @return an ok result carrying the start payload as JSON, or an error result when
 * preparation or the start itself fails.
 */
private suspend fun handleTalkPttStart(): GatewaySession.InvokeResult {
    return runPreparedTalkPttCommand {
        val started = talkMode.beginPushToTalk()
        GatewaySession.InvokeResult.ok(started.toJson())
    }
}
/**
 * Ends the current push-to-talk capture and releases talk capture resources if nothing
 * else is using them.
 *
 * @return an ok result carrying the stop payload as JSON, or an error result on failure.
 */
private suspend fun handleTalkPttStop(): GatewaySession.InvokeResult {
    return runTalkPttCommand {
        // Idle cleanup runs after the capture ended but before the payload is serialized.
        val stopped = talkMode.endPushToTalk().also { finishTalkCaptureIfIdle() }
        GatewaySession.InvokeResult.ok(stopped.toJson())
    }
}
/**
 * Cancels the current push-to-talk capture and releases talk capture resources if nothing
 * else is using them.
 *
 * @return an ok result carrying the cancel payload as JSON, or an error result on failure.
 */
private suspend fun handleTalkPttCancel(): GatewaySession.InvokeResult {
    return runTalkPttCommand {
        // Idle cleanup runs after the cancel but before the payload is serialized.
        val cancelled = talkMode.cancelPushToTalk().also { finishTalkCaptureIfIdle() }
        GatewaySession.InvokeResult.ok(cancelled.toJson())
    }
}
/**
 * Runs one complete push-to-talk capture (prepare, capture, finish) in a single command.
 *
 * @return an ok result carrying the final capture payload as JSON, or an error result when
 * preparation or the capture fails.
 */
private suspend fun handleTalkPttOnce(): GatewaySession.InvokeResult {
    return runPreparedTalkPttCommand {
        // Idle cleanup runs after the capture finished but before the payload is serialized.
        val finished = talkMode.runPushToTalkOnce().also { finishTalkCaptureIfIdle() }
        GatewaySession.InvokeResult.ok(finished.toJson())
    }
}
/**
 * Prepares talk capture, then runs [block] under [runTalkPttCommand].
 *
 * If [block] throws after preparation succeeded, the prepared capture state is rolled back
 * via [cleanupFailedTalkCapture] and the failure is rethrown so it is reported as an error
 * result. A failure inside [prepareTalkCapture] itself does not trigger the rollback.
 */
private suspend fun runPreparedTalkPttCommand(block: suspend () -> GatewaySession.InvokeResult): GatewaySession.InvokeResult {
    val preparedBlock: suspend () -> GatewaySession.InvokeResult = {
        prepareTalkCapture()
        try {
            block()
        } catch (failure: Throwable) {
            cleanupFailedTalkCapture()
            throw failure
        }
    }
    return runTalkPttCommand(preparedBlock)
}
/**
 * Runs [block] and converts any failure into an error [GatewaySession.InvokeResult].
 *
 * Cancellation of the calling coroutine is not treated as a command failure: when the
 * caller's own job is no longer active the [kotlin.coroutines.cancellation.CancellationException]
 * is rethrown so structured concurrency keeps working. A cancellation exception raised while
 * the caller is still active (for example from awaiting something that was cancelled
 * elsewhere) is still reported as an error result.
 *
 * @param block the command body to execute.
 * @return the result of [block], or an error result built from the thrown exception.
 */
private suspend fun runTalkPttCommand(block: suspend () -> GatewaySession.InvokeResult): GatewaySession.InvokeResult =
    try {
        block()
    } catch (err: Throwable) {
        // Fully-qualified names keep this block independent of the file's import list.
        val callerJob = kotlinx.coroutines.currentCoroutineContext()[kotlinx.coroutines.Job]
        if (err is kotlin.coroutines.cancellation.CancellationException && callerJob?.isActive == false) {
            throw err
        }
        val (code, message) = invokeErrorFromThrowable(err)
        GatewaySession.InvokeResult.error(code = code, message = message)
    }
/**
 * Puts the node into talk capture mode ahead of a push-to-talk capture.
 *
 * Disables the regular mic capture, stops voice playback, switches the foreground service
 * to talk mode, enables TTS for all responses, applies the speaker setting, makes sure the
 * chat subscription exists and marks external audio capture as active.
 *
 * @throws IllegalStateException when the microphone permission has not been granted.
 */
private suspend fun prepareTalkCapture() {
    // Nothing is changed unless the microphone permission is already granted.
    check(hasRecordAudioPermission()) { "MIC_PERMISSION_REQUIRED: grant Microphone permission" }

    micCapture.setMicEnabled(false)
    stopVoicePlayback()
    NodeForegroundService.setVoiceCaptureMode(appContext, VoiceCaptureMode.TalkMode)

    talkMode.ttsOnAllResponses = true
    talkMode.setPlaybackEnabled(speakerEnabled.value)
    talkMode.ensureChatSubscribed()

    externalAudioCaptureActive.value = true
}
/**
 * Rolls back the state set up by [prepareTalkCapture] after a push-to-talk command failed.
 *
 * Cancelling the capture is best effort: any failure while cancelling is ignored so the
 * remaining state is always reset.
 */
private suspend fun cleanupFailedTalkCapture() {
    try {
        talkMode.cancelPushToTalk()
    } catch (ignored: Throwable) {
        // Best effort only; the resets below must still run.
    }
    talkMode.ttsOnAllResponses = false
    NodeForegroundService.setVoiceCaptureMode(appContext, VoiceCaptureMode.Off)
    externalAudioCaptureActive.value = false
}
/**
 * Leaves talk capture mode once talk mode is neither enabled, listening nor speaking.
 *
 * When any of those is still true the capture state is left untouched.
 */
private fun finishTalkCaptureIfIdle() {
    val talkStillBusy = talkMode.isEnabled.value || talkMode.isListening.value || talkMode.isSpeaking.value
    if (talkStillBusy) return

    talkMode.ttsOnAllResponses = false
    NodeForegroundService.setVoiceCaptureMode(appContext, VoiceCaptureMode.Off)
    externalAudioCaptureActive.value = false
}
/** Read-only view of `prefs.speakerEnabled`; re-read from [prefs] on every access. */
val speakerEnabled: StateFlow<Boolean>
get() = prefs.speakerEnabled

View File

@@ -278,14 +278,13 @@ class GatewayDiscovery(
return legacyHostAddress(resolved)
}
private fun legacyHostAddress(resolved: NsdServiceInfo): String? {
return try {
private fun legacyHostAddress(resolved: NsdServiceInfo): String? =
try {
val host = NsdServiceInfo::class.java.getMethod("getHost").invoke(resolved) as? InetAddress
host?.hostAddress
} catch (_: Throwable) {
null
}
}
private fun publish() {
_gateways.value =
@@ -529,20 +528,20 @@ class GatewayDiscovery(
val cm = connectivity ?: return null
// Prefer VPN (Tailscale) when present; otherwise use the active network.
trackedNetworks(cm).firstOrNull { n ->
val caps = cm.getNetworkCapabilities(n) ?: return@firstOrNull false
caps.hasTransport(NetworkCapabilities.TRANSPORT_VPN)
}?.let { return it }
trackedNetworks(cm)
.firstOrNull { n ->
val caps = cm.getNetworkCapabilities(n) ?: return@firstOrNull false
caps.hasTransport(NetworkCapabilities.TRANSPORT_VPN)
}?.let { return it }
return cm.activeNetwork
}
private fun trackedNetworks(cm: ConnectivityManager): List<Network> {
return buildList {
private fun trackedNetworks(cm: ConnectivityManager): List<Network> =
buildList {
cm.activeNetwork?.let(::add)
addAll(availableNetworks)
}.distinct()
}
private fun createDirectResolver(): Resolver? {
val cm = connectivity ?: return null

View File

@@ -14,6 +14,7 @@ import ai.openclaw.app.protocol.OpenClawNotificationsCommand
import ai.openclaw.app.protocol.OpenClawPhotosCommand
import ai.openclaw.app.protocol.OpenClawSmsCommand
import ai.openclaw.app.protocol.OpenClawSystemCommand
import ai.openclaw.app.protocol.OpenClawTalkCommand
data class NodeRuntimeFlags(
val cameraEnabled: Boolean,
@@ -81,6 +82,7 @@ object InvokeCommandRegistry {
name = OpenClawCapability.VoiceWake.rawValue,
availability = NodeCapabilityAvailability.VoiceWakeEnabled,
),
NodeCapabilitySpec(name = OpenClawCapability.Talk.rawValue),
NodeCapabilitySpec(
name = OpenClawCapability.Location.rawValue,
availability = NodeCapabilityAvailability.LocationEnabled,
@@ -135,6 +137,18 @@ object InvokeCommandRegistry {
InvokeCommandSpec(
name = OpenClawSystemCommand.Notify.rawValue,
),
InvokeCommandSpec(
name = OpenClawTalkCommand.PttStart.rawValue,
),
InvokeCommandSpec(
name = OpenClawTalkCommand.PttStop.rawValue,
),
InvokeCommandSpec(
name = OpenClawTalkCommand.PttCancel.rawValue,
),
InvokeCommandSpec(
name = OpenClawTalkCommand.PttOnce.rawValue,
),
InvokeCommandSpec(
name = OpenClawCameraCommand.List.rawValue,
requiresForeground = true,

View File

@@ -13,6 +13,7 @@ import ai.openclaw.app.protocol.OpenClawMotionCommand
import ai.openclaw.app.protocol.OpenClawNotificationsCommand
import ai.openclaw.app.protocol.OpenClawSmsCommand
import ai.openclaw.app.protocol.OpenClawSystemCommand
import ai.openclaw.app.protocol.OpenClawTalkCommand
internal enum class SmsSearchAvailabilityReason {
Available,
@@ -59,6 +60,7 @@ class InvokeDispatcher(
private val deviceHandler: DeviceHandler,
private val notificationsHandler: NotificationsHandler,
private val systemHandler: SystemHandler,
private val talkHandler: TalkHandler,
private val photosHandler: PhotosHandler,
private val contactsHandler: ContactsHandler,
private val calendarHandler: CalendarHandler,
@@ -188,6 +190,12 @@ class InvokeDispatcher(
// System command
OpenClawSystemCommand.Notify.rawValue -> systemHandler.handleSystemNotify(paramsJson)
// Talk commands
OpenClawTalkCommand.PttStart.rawValue -> talkHandler.handlePttStart(paramsJson)
OpenClawTalkCommand.PttStop.rawValue -> talkHandler.handlePttStop(paramsJson)
OpenClawTalkCommand.PttCancel.rawValue -> talkHandler.handlePttCancel(paramsJson)
OpenClawTalkCommand.PttOnce.rawValue -> talkHandler.handlePttOnce(paramsJson)
// Photos command
ai.openclaw.app.protocol.OpenClawPhotosCommand.Latest.rawValue ->
photosHandler.handlePhotosLatest(
@@ -336,3 +344,13 @@ class InvokeDispatcher(
}
}
}
/**
 * Handler for the push-to-talk invoke commands routed by [InvokeDispatcher].
 *
 * Every method receives the raw JSON parameters of the invocation (nullable) and returns
 * the [GatewaySession.InvokeResult] for that command.
 */
interface TalkHandler {
/** Handles the push-to-talk "start" command. */
suspend fun handlePttStart(paramsJson: String?): GatewaySession.InvokeResult
/** Handles the push-to-talk "stop" command. */
suspend fun handlePttStop(paramsJson: String?): GatewaySession.InvokeResult
/** Handles the push-to-talk "cancel" command. */
suspend fun handlePttCancel(paramsJson: String?): GatewaySession.InvokeResult
/** Handles the push-to-talk "once" command. */
suspend fun handlePttOnce(paramsJson: String?): GatewaySession.InvokeResult
}

View File

@@ -7,6 +7,7 @@ enum class OpenClawCapability(
Camera("camera"),
Sms("sms"),
VoiceWake("voiceWake"),
Talk("talk"),
Location("location"),
Device("device"),
Notifications("notifications"),
@@ -71,6 +72,20 @@ enum class OpenClawSmsCommand(
}
}
/**
 * Push-to-talk commands of the talk capability.
 *
 * @property rawValue the command name string for this entry.
 */
enum class OpenClawTalkCommand(
val rawValue: String,
) {
PttStart("talk.ptt.start"),
PttStop("talk.ptt.stop"),
PttCancel("talk.ptt.cancel"),
PttOnce("talk.ptt.once"),
;
companion object {
/** Prefix shared by every command name in this enum. */
const val NamespacePrefix: String = "talk."
}
}
enum class OpenClawLocationCommand(
val rawValue: String,
) {

View File

@@ -0,0 +1,45 @@
package ai.openclaw.app.voice
import kotlinx.serialization.json.JsonArray
import kotlinx.serialization.json.JsonElement
import kotlinx.serialization.json.JsonObject
import kotlinx.serialization.json.JsonPrimitive
/**
 * Extracts the text of an assistant message from chat event JSON.
 *
 * Message content may be a plain string or an array whose entries are either strings or
 * objects with a `text` member; array entries are trimmed and joined with newlines.
 */
internal object ChatEventText {
    /** Returns the assistant text found under the `message` member of [payload], or `null`. */
    fun assistantTextFromPayload(payload: JsonObject): String? {
        return assistantTextFromMessage(payload["message"])
    }

    /**
     * Returns the text of [messageEl] when it is an object whose `role` is `assistant` or
     * absent; returns `null` for any other role, for non-objects and for empty content.
     */
    fun assistantTextFromMessage(messageEl: JsonElement?): String? {
        val message = messageEl.asObjectOrNull() ?: return null
        return when (message["role"].asStringOrNull()) {
            null, "assistant" -> textFromContent(message["content"])
            else -> null
        }
    }

    /** Converts a `content` element (string or array of parts) into trimmed text. */
    private fun textFromContent(content: JsonElement?): String? {
        if (content is JsonPrimitive) {
            return content.asStringOrNull()?.trim()?.takeIf { it.isNotEmpty() }
        }
        if (content !is JsonArray) return null

        val parts = content.mapNotNull(::textFromContentPart).filter { it.isNotEmpty() }
        return parts.joinToString("\n").takeIf { it.isNotBlank() }
    }

    /**
     * Returns the trimmed text of one content part: the part itself when it is a non-empty
     * string, otherwise the `text` member of an object whose `type` is `text` or absent.
     */
    private fun textFromContentPart(part: JsonElement): String? {
        val literal = part.asStringOrNull()?.trim()
        if (!literal.isNullOrEmpty()) return literal

        val obj = part.asObjectOrNull() ?: return null
        val type = obj["type"].asStringOrNull()
        return if (type == null || type == "text") {
            obj["text"].asStringOrNull()?.trim()?.takeIf { it.isNotEmpty() }
        } else {
            null
        }
    }
}
/** Returns the receiver when it is a [JsonObject], otherwise `null`. */
private fun JsonElement?.asObjectOrNull(): JsonObject? = if (this is JsonObject) this else null
/** Returns the content of the receiver when it is a JSON string primitive, otherwise `null`. */
private fun JsonElement?.asStringOrNull(): String? = if (this is JsonPrimitive && isString) content else null

View File

@@ -21,7 +21,6 @@ import kotlinx.coroutines.flow.StateFlow
import kotlinx.coroutines.launch
import kotlinx.coroutines.withContext
import kotlinx.serialization.json.Json
import kotlinx.serialization.json.JsonArray
import kotlinx.serialization.json.JsonObject
import kotlinx.serialization.json.JsonPrimitive
import java.util.UUID
@@ -596,20 +595,7 @@ class MicCaptureManager(
PackageManager.PERMISSION_GRANTED
)
private fun parseAssistantText(payload: JsonObject): String? {
val message = payload["message"].asObjectOrNull() ?: return null
if (message["role"].asStringOrNull() != "assistant") return null
val content = message["content"] as? JsonArray ?: return null
val parts =
content.mapNotNull { item ->
val obj = item.asObjectOrNull() ?: return@mapNotNull null
if (obj["type"].asStringOrNull() != "text") return@mapNotNull null
obj["text"].asStringOrNull()?.trim()?.takeIf { it.isNotEmpty() }
}
if (parts.isEmpty()) return null
return parts.joinToString("\n")
}
private fun parseAssistantText(payload: JsonObject): String? = ChatEventText.assistantTextFromPayload(payload)
private val listener =
object : RecognitionListener {

View File

@@ -12,20 +12,26 @@ import kotlinx.coroutines.delay
import kotlinx.coroutines.withContext
import java.io.File
/** Abstraction over talk audio playback so the concrete player can be substituted. */
internal interface TalkAudioPlaying {
/** Plays [audio]. Presumably suspends until playback has finished — TODO confirm against implementations. */
suspend fun play(audio: TalkSpeakAudio)
/** Stops the playback that is currently in progress, if any. */
fun stop()
}
internal class TalkAudioPlayer(
private val context: Context,
) {
) : TalkAudioPlaying {
private val lock = Any()
private var active: ActivePlayback? = null
suspend fun play(audio: TalkSpeakAudio) {
override suspend fun play(audio: TalkSpeakAudio) {
when (val mode = resolvePlaybackMode(audio)) {
is TalkPlaybackMode.Pcm -> playPcm(audio.bytes, mode.sampleRate)
is TalkPlaybackMode.Compressed -> playCompressed(audio.bytes, mode.fileExtension)
}
}
fun stop() {
override fun stop() {
synchronized(lock) {
active?.cancel()
active = null

View File

@@ -41,7 +41,28 @@ import java.util.UUID
import java.util.concurrent.atomic.AtomicLong
import kotlin.coroutines.coroutineContext
class TalkModeManager(
/**
 * Result of starting a push-to-talk capture.
 *
 * @property captureId identifier of the capture that is now active.
 */
data class TalkPttStartPayload(
    val captureId: String,
) {
    /**
     * Serializes this payload as a JSON object with a single `captureId` member.
     *
     * Built with the JSON builder (like [TalkPttStopPayload]) rather than string
     * interpolation, so the id is always escaped correctly.
     */
    fun toJson(): String =
        buildJsonObject {
            put("captureId", JsonPrimitive(captureId))
        }.toString()
}
/**
 * Result of ending, cancelling or completing a push-to-talk capture.
 *
 * @property captureId identifier of the capture this result belongs to.
 * @property transcript recognized text, or `null` when there is none to report.
 * @property status outcome label of the capture.
 */
data class TalkPttStopPayload(
    val captureId: String,
    val transcript: String?,
    val status: String,
) {
    /**
     * Serializes this payload as a JSON object; the `transcript` member is omitted when
     * [transcript] is `null`.
     */
    fun toJson(): String {
        val body =
            buildJsonObject {
                put("captureId", JsonPrimitive(captureId))
                transcript?.let { put("transcript", JsonPrimitive(it)) }
                put("status", JsonPrimitive(status))
            }
        return body.toString()
    }
}
class TalkModeManager internal constructor(
private val context: Context,
private val scope: CoroutineScope,
private val session: GatewaySession,
@@ -49,6 +70,8 @@ class TalkModeManager(
private val isConnected: () -> Boolean,
private val onBeforeSpeak: suspend () -> Unit = {},
private val onAfterSpeak: suspend () -> Unit = {},
private val talkSpeakClient: TalkSpeechSynthesizing = TalkSpeakClient(session = session),
private val talkAudioPlayer: TalkAudioPlaying = TalkAudioPlayer(context),
) {
companion object {
private const val tag = "TalkMode"
@@ -60,9 +83,6 @@ class TalkModeManager(
private val mainHandler = Handler(Looper.getMainLooper())
private val json = Json { ignoreUnknownKeys = true }
private val talkSpeakClient = TalkSpeakClient(session = session, json = json)
private val talkAudioPlayer = TalkAudioPlayer(context)
private val _isEnabled = MutableStateFlow(false)
val isEnabled: StateFlow<Boolean> = _isEnabled
@@ -82,6 +102,10 @@ class TalkModeManager(
private var restartJob: Job? = null
private var stopRequested = false
private var listeningMode = false
private var activePttCaptureId: String? = null
private var pttAutoStopEnabled = false
private var pttTimeoutJob: Job? = null
private var pttCompletion: CompletableDeferred<TalkPttStopPayload>? = null
private var silenceJob: Job? = null
private var silenceWindowMs = TalkDefaults.defaultSilenceTimeoutMs
@@ -156,6 +180,127 @@ class TalkModeManager(
}
}
/**
 * Starts a push-to-talk capture and returns its capture id.
 *
 * When a capture is already active its id is returned and nothing else changes. Otherwise
 * current speech output is stopped, push-to-talk and silence-tracking state is reset, a
 * fresh [SpeechRecognizer] is created on the main dispatcher and listening starts.
 *
 * @return payload carrying the id of the active (pre-existing or newly created) capture.
 * @throws IllegalStateException when the gateway is not connected, the microphone
 * permission is missing, or speech recognition is unavailable.
 */
suspend fun beginPushToTalk(): TalkPttStartPayload {
// The connection check comes first, even when a capture is already active.
if (!isConnected()) {
_statusText.value = "Gateway not connected"
throw IllegalStateException("UNAVAILABLE: Gateway not connected")
}
// Re-entrant start: report the capture that is already running.
activePttCaptureId?.let { return TalkPttStartPayload(captureId = it) }
stopSpeaking(resetInterrupt = false)
// Drop leftovers of a previous one-shot capture (timeout, auto-stop, pending completion).
pttTimeoutJob?.cancel()
pttTimeoutJob = null
pttAutoStopEnabled = false
pttCompletion = null
silenceJob?.cancel()
silenceJob = null
listeningMode = false
finalizeInFlight = false
stopRequested = false
lastTranscript = ""
lastHeardAtMs = null
val micOk =
ContextCompat.checkSelfPermission(context, Manifest.permission.RECORD_AUDIO) ==
PackageManager.PERMISSION_GRANTED
if (!micOk) {
_statusText.value = "Microphone permission required"
throw IllegalStateException("MIC_PERMISSION_REQUIRED: grant Microphone permission")
}
if (!SpeechRecognizer.isRecognitionAvailable(context)) {
_statusText.value = "Speech recognizer unavailable"
throw IllegalStateException("UNAVAILABLE: Speech recognizer unavailable")
}
val captureId = UUID.randomUUID().toString()
// The id is recorded before the recognizer starts; it stays set if the start below throws.
activePttCaptureId = captureId
withContext(Dispatchers.Main) {
// Replace any existing recognizer with a fresh instance bound to this manager's listener.
recognizer?.cancel()
recognizer?.destroy()
recognizer = SpeechRecognizer.createSpeechRecognizer(context).also { it.setRecognitionListener(listener) }
startListeningInternal(markListening = true)
}
_statusText.value = "Listening (PTT)"
return TalkPttStartPayload(captureId = captureId)
}
/**
 * Ends the active push-to-talk capture and reports what happened to its transcript.
 *
 * Status of the returned payload:
 * - `idle`: no capture was active; a freshly generated id is reported.
 * - `empty`: the capture produced no transcript.
 * - `offline`: a transcript exists but the gateway is not connected.
 * - `queued`: the transcript was handed to [finalizeTranscript] on [scope].
 *
 * Every result is passed through [finishPushToTalk], which also completes a pending
 * [pttCompletion].
 */
suspend fun endPushToTalk(): TalkPttStopPayload {
val captureId = activePttCaptureId ?: UUID.randomUUID().toString()
if (activePttCaptureId == null) {
return finishPushToTalk(TalkPttStopPayload(captureId = captureId, transcript = null, status = "idle"))
}
// Tear down recognition first, then take the transcript collected so far.
clearPushToTalkRecognition()
val transcript = lastTranscript.trim()
lastTranscript = ""
lastHeardAtMs = null
if (transcript.isEmpty()) {
_statusText.value = if (_isEnabled.value) "Listening" else "Ready"
// start() is called again when talk mode itself is still enabled.
if (_isEnabled.value) {
start()
}
return finishPushToTalk(TalkPttStopPayload(captureId = captureId, transcript = null, status = "empty"))
}
if (!isConnected()) {
_statusText.value = "Gateway not connected"
if (_isEnabled.value) {
start()
}
return finishPushToTalk(TalkPttStopPayload(captureId = captureId, transcript = transcript, status = "offline"))
}
_statusText.value = "Thinking…"
// Finalization runs asynchronously; the caller gets "queued" without waiting for it.
scope.launch {
finalizeTranscript(transcript)
}
return finishPushToTalk(TalkPttStopPayload(captureId = captureId, transcript = transcript, status = "queued"))
}
/**
 * Abandons the active push-to-talk capture and discards its transcript.
 *
 * @return a payload with status `idle` (and a freshly generated id) when no capture was
 * active, otherwise a payload with status `cancelled` for the capture that was dropped.
 */
suspend fun cancelPushToTalk(): TalkPttStopPayload {
    val activeId = activePttCaptureId
    if (activeId == null) {
        val idle = TalkPttStopPayload(captureId = UUID.randomUUID().toString(), transcript = null, status = "idle")
        return finishPushToTalk(idle)
    }

    clearPushToTalkRecognition()
    lastTranscript = ""
    lastHeardAtMs = null

    // start() is called again when talk mode itself is still enabled.
    if (_isEnabled.value) {
        _statusText.value = "Listening"
        start()
    } else {
        _statusText.value = "Ready"
    }

    val cancelled = TalkPttStopPayload(captureId = activeId, transcript = null, status = "cancelled")
    return finishPushToTalk(cancelled)
}
/**
 * Runs one complete push-to-talk capture: starts listening, then suspends until the capture
 * ends through the silence monitor, the [maxDurationMs] timeout, or an explicit stop/cancel.
 *
 * A one-shot capture that is still pending is cancelled first. If a capture is nevertheless
 * active (one started through [beginPushToTalk]), a `busy` payload for that capture is
 * returned without touching it.
 *
 * @param maxDurationMs upper bound for the capture before it is ended automatically.
 * @return the payload produced when the capture ended.
 * @throws IllegalStateException when [beginPushToTalk] cannot start the capture.
 */
suspend fun runPushToTalkOnce(maxDurationMs: Long = 12_000L): TalkPttStopPayload {
    if (pttCompletion != null) {
        cancelPushToTalk()
    }
    activePttCaptureId?.let { busyId ->
        return TalkPttStopPayload(captureId = busyId, transcript = null, status = "busy")
    }
    beginPushToTalk()
    val completion = CompletableDeferred<TalkPttStopPayload>()
    pttCompletion = completion
    pttAutoStopEnabled = true
    startSilenceMonitor()
    pttTimeoutJob =
        scope.launch {
            delay(maxDurationMs)
            if (pttAutoStopEnabled && activePttCaptureId != null) {
                // endPushToTalk() cancels pttTimeoutJob while clearing recognition. Calling it
                // directly from this coroutine would cancel the very job it runs on, so the
                // recognizer teardown would be skipped and `completion` never completed.
                // Run it in its own coroutine, as the silence monitor does.
                scope.launch { endPushToTalk() }
            }
        }
    return completion.await()
}
/**
* Speak a wake-word command through TalkMode's full pipeline:
* chat.send → wait for final → read assistant text → TTS.
@@ -335,6 +480,12 @@ class TalkModeManager(
stopRequested = true
finalizeInFlight = false
listeningMode = false
activePttCaptureId = null
pttAutoStopEnabled = false
pttCompletion?.cancel()
pttCompletion = null
pttTimeoutJob?.cancel()
pttTimeoutJob = null
restartJob?.cancel()
restartJob = null
silenceJob?.cancel()
@@ -434,7 +585,7 @@ class TalkModeManager(
silenceJob?.cancel()
silenceJob =
scope.launch {
while (_isEnabled.value) {
while (_isEnabled.value || pttAutoStopEnabled) {
delay(200)
checkSilence()
}
@@ -448,6 +599,12 @@ class TalkModeManager(
val lastHeard = lastHeardAtMs ?: return
val elapsed = SystemClock.elapsedRealtime() - lastHeard
if (elapsed < silenceWindowMs) return
if (activePttCaptureId != null) {
if (pttAutoStopEnabled) {
scope.launch { endPushToTalk() }
}
return
}
if (finalizeInFlight) return
finalizeInFlight = true
scope.launch {
@@ -525,6 +682,27 @@ class TalkModeManager(
}
}
/**
 * Resets the push-to-talk bookkeeping and tears down the recognizer on the main dispatcher.
 *
 * NOTE(review): [pttTimeoutJob] is cancelled before the dispatcher switch. If the caller is
 * itself running inside that job, the `withContext` below would presumably fail with a
 * cancellation before the recognizer is destroyed — confirm no caller runs on that job.
 */
private suspend fun clearPushToTalkRecognition() {
pttTimeoutJob?.cancel()
pttTimeoutJob = null
pttAutoStopEnabled = false
activePttCaptureId = null
_isListening.value = false
listeningMode = false
clearListenWatchdog()
// The recognizer is cancelled and destroyed on the main dispatcher.
withContext(Dispatchers.Main) {
recognizer?.cancel()
recognizer?.destroy()
recognizer = null
}
}
/**
 * Delivers [payload] to a pending one-shot completion, if there is one, and clears it.
 *
 * @return [payload] unchanged, so callers can return the result of this call directly.
 */
private fun finishPushToTalk(payload: TalkPttStopPayload): TalkPttStopPayload =
    payload.also { result ->
        pttCompletion?.complete(result)
        pttCompletion = null
    }
private suspend fun subscribeChatIfNeeded(
session: GatewaySession,
sessionKey: String,
@@ -656,20 +834,7 @@ class TalkModeManager(
}
}
private fun extractTextFromChatEventMessage(messageEl: JsonElement?): String? {
val msg = messageEl?.asObjectOrNull() ?: return null
val content = msg["content"] as? JsonArray ?: return null
return content
.mapNotNull { entry ->
entry
.asObjectOrNull()
?.get("text")
?.asStringOrNull()
?.trim()
}.filter { it.isNotEmpty() }
.joinToString("\n")
.takeIf { it.isNotBlank() }
}
private fun extractTextFromChatEventMessage(messageEl: JsonElement?): String? = ChatEventText.assistantTextFromMessage(messageEl)
private suspend fun waitForAssistantText(
session: GatewaySession,
@@ -729,17 +894,16 @@ class TalkModeManager(
_lastAssistantText.value = cleaned
ensurePlaybackActive(playbackToken)
_statusText.value = "Speaking"
_isSpeaking.value = true
_statusText.value = "Generating voice"
_isSpeaking.value = false
lastSpokenText = cleaned
ensureInterruptListener()
requestAudioFocusForTts()
try {
val started = SystemClock.elapsedRealtime()
when (val result = talkSpeakClient.synthesize(text = cleaned, directive = directive)) {
is TalkSpeakResult.Success -> {
ensurePlaybackActive(playbackToken)
markAudioPlaybackStarting(playbackToken)
talkAudioPlayer.play(result.audio)
ensurePlaybackActive(playbackToken)
Log.d(tag, "talk.speak ok durMs=${SystemClock.elapsedRealtime() - started}")
@@ -789,8 +953,6 @@ class TalkModeManager(
shouldResumeAfterSpeak = true
onBeforeSpeak()
ensurePlaybackActive(playbackToken)
_isSpeaking.value = true
_statusText.value = "Speaking…"
block()
} finally {
synchronized(ttsJobLock) {
@@ -888,6 +1050,7 @@ class TalkModeManager(
}
},
)
markAudioPlaybackStarting(playbackToken)
val result = engine.speak(text, TextToSpeech.QUEUE_FLUSH, null, utteranceId)
if (result != TextToSpeech.SUCCESS) {
throw IllegalStateException("TextToSpeech start failed")
@@ -905,6 +1068,14 @@ class TalkModeManager(
}
}
/**
 * Switches the published state to "speaking" right before audio output begins.
 *
 * Sets the status text and the speaking flag, then arms the interrupt listener and requests
 * audio focus for TTS.
 *
 * @param playbackToken token of the playback being started; checked via
 * [ensurePlaybackActive] first (presumably aborting when the playback was superseded —
 * TODO confirm).
 */
private fun markAudioPlaybackStarting(playbackToken: Long) {
ensurePlaybackActive(playbackToken)
_statusText.value = "Speaking…"
_isSpeaking.value = true
ensureInterruptListener()
requestAudioFocusForTts()
}
fun stopTts() {
stopSpeaking(resetInterrupt = true)
_isSpeaking.value = false

View File

@@ -28,12 +28,19 @@ internal sealed interface TalkSpeakResult {
) : TalkSpeakResult
}
/** Abstraction over speech synthesis so the concrete client can be substituted. */
internal interface TalkSpeechSynthesizing {
/**
 * Synthesizes speech for [text].
 *
 * @param text the text to turn into audio.
 * @param directive optional talk directive for the request; may be `null`.
 * @return the synthesis outcome as a [TalkSpeakResult].
 */
suspend fun synthesize(
text: String,
directive: TalkDirective?,
): TalkSpeakResult
}
internal class TalkSpeakClient(
private val session: GatewaySession? = null,
private val json: Json = Json { ignoreUnknownKeys = true },
private val requestDetailed: (suspend (String, String, Long) -> GatewaySession.RpcResult)? = null,
) {
suspend fun synthesize(
) : TalkSpeechSynthesizing {
override suspend fun synthesize(
text: String,
directive: TalkDirective?,
): TalkSpeakResult {

View File

@@ -6,6 +6,11 @@ import ai.openclaw.app.gateway.GatewayEndpoint
import ai.openclaw.app.gateway.GatewaySession
import ai.openclaw.app.gateway.GatewayTlsProbeFailure
import ai.openclaw.app.gateway.GatewayTlsProbeResult
import ai.openclaw.app.node.InvokeDispatcher
import ai.openclaw.app.protocol.OpenClawTalkCommand
import ai.openclaw.app.voice.TalkModeManager
import android.Manifest
import kotlinx.coroutines.flow.MutableStateFlow
import kotlinx.coroutines.runBlocking
import org.junit.Assert.assertEquals
import org.junit.Assert.assertFalse
@@ -15,6 +20,7 @@ import org.junit.Test
import org.junit.runner.RunWith
import org.robolectric.RobolectricTestRunner
import org.robolectric.RuntimeEnvironment
import org.robolectric.Shadows.shadowOf
import org.robolectric.annotation.Config
import java.lang.reflect.Field
import java.util.UUID
@@ -221,6 +227,23 @@ class GatewayBootstrapAuthTest {
assertNull(authStore.loadToken(deviceId, "operator"))
}
/**
 * Invokes the push-to-talk start command on a fresh runtime with the microphone permission
 * granted, expects it to fail with `UNAVAILABLE`, and verifies the prepared capture state
 * was rolled back (capture mode off, external capture inactive, all-response TTS disabled).
 */
@Test
fun talkPttStart_cleansPreparedCaptureWhenBeginFails() =
runBlocking {
val app = RuntimeEnvironment.getApplication()
// Permission is granted so the failure comes from starting the capture, not from preparation.
shadowOf(app).grantPermissions(Manifest.permission.RECORD_AUDIO)
val runtime = NodeRuntime(app)
val dispatcher = readField<InvokeDispatcher>(runtime, "invokeDispatcher")
val result = dispatcher.handleInvoke(OpenClawTalkCommand.PttStart.rawValue, null)
assertEquals("UNAVAILABLE", result.error?.code)
assertEquals(VoiceCaptureMode.Off, runtime.voiceCaptureMode.value)
assertFalse(readField<MutableStateFlow<Boolean>>(runtime, "externalAudioCaptureActive").value)
// talkMode is a lazy property, so its backing field is the generated `$delegate` field.
val talkMode = readField<Lazy<TalkModeManager>>(runtime, "talkMode\$delegate").value
assertFalse(talkMode.ttsOnAllResponses)
}
private fun waitForGatewayTrustPrompt(runtime: NodeRuntime): NodeRuntime.GatewayTrustPrompt {
repeat(50) {
runtime.pendingGatewayTrust.value?.let { return it }

View File

@@ -12,6 +12,7 @@ import ai.openclaw.app.protocol.OpenClawNotificationsCommand
import ai.openclaw.app.protocol.OpenClawPhotosCommand
import ai.openclaw.app.protocol.OpenClawSmsCommand
import ai.openclaw.app.protocol.OpenClawSystemCommand
import ai.openclaw.app.protocol.OpenClawTalkCommand
import org.junit.Assert.assertEquals
import org.junit.Assert.assertFalse
import org.junit.Assert.assertNotNull
@@ -26,6 +27,7 @@ class InvokeCommandRegistryTest {
OpenClawCapability.Device.rawValue,
OpenClawCapability.Notifications.rawValue,
OpenClawCapability.System.rawValue,
OpenClawCapability.Talk.rawValue,
OpenClawCapability.Photos.rawValue,
OpenClawCapability.Contacts.rawValue,
OpenClawCapability.Calendar.rawValue,
@@ -50,6 +52,10 @@ class InvokeCommandRegistryTest {
OpenClawNotificationsCommand.List.rawValue,
OpenClawNotificationsCommand.Actions.rawValue,
OpenClawSystemCommand.Notify.rawValue,
OpenClawTalkCommand.PttStart.rawValue,
OpenClawTalkCommand.PttStop.rawValue,
OpenClawTalkCommand.PttCancel.rawValue,
OpenClawTalkCommand.PttOnce.rawValue,
OpenClawPhotosCommand.Latest.rawValue,
OpenClawContactsCommand.Search.rawValue,
OpenClawContactsCommand.Add.rawValue,

View File

@@ -1,11 +1,13 @@
package ai.openclaw.app.node
import ai.openclaw.app.gateway.DeviceIdentityStore
import ai.openclaw.app.gateway.GatewaySession
import ai.openclaw.app.protocol.OpenClawCallLogCommand
import ai.openclaw.app.protocol.OpenClawCameraCommand
import ai.openclaw.app.protocol.OpenClawLocationCommand
import ai.openclaw.app.protocol.OpenClawMotionCommand
import ai.openclaw.app.protocol.OpenClawSmsCommand
import ai.openclaw.app.protocol.OpenClawTalkCommand
import android.content.Context
import android.content.pm.PackageManager
import kotlinx.coroutines.flow.MutableStateFlow
@@ -208,6 +210,27 @@ class InvokeDispatcherTest {
assertEquals("INVALID_REQUEST: unknown command", result.error?.message)
}
/**
 * Verifies that each talk push-to-talk command is routed to the matching [TalkHandler]
 * method, that the handler's payload is returned, and that the calls arrive in order.
 */
@Test
fun handleInvoke_routesTalkPttCommands() =
runTest {
val talk = InvokeDispatcherFakeTalkHandler()
val dispatcher = newDispatcher(talkHandler = talk)
val start = dispatcher.handleInvoke(OpenClawTalkCommand.PttStart.rawValue, null)
val stop = dispatcher.handleInvoke(OpenClawTalkCommand.PttStop.rawValue, null)
val cancel = dispatcher.handleInvoke(OpenClawTalkCommand.PttCancel.rawValue, null)
val once = dispatcher.handleInvoke(OpenClawTalkCommand.PttOnce.rawValue, null)
// Each payload is the canned value the fake handler returns for that method.
assertEquals("""{"captureId":"start"}""", start.payloadJson)
assertEquals("""{"status":"stop"}""", stop.payloadJson)
assertEquals("""{"status":"cancel"}""", cancel.payloadJson)
assertEquals("""{"status":"once"}""", once.payloadJson)
assertEquals(
listOf("start", "stop", "cancel", "once"),
talk.calls,
)
}
private fun newDispatcher(
cameraEnabled: Boolean = false,
locationEnabled: Boolean = false,
@@ -219,6 +242,7 @@ class InvokeDispatcherTest {
debugBuild: Boolean = false,
motionActivityAvailable: Boolean = false,
motionPedometerAvailable: Boolean = false,
talkHandler: TalkHandler = InvokeDispatcherFakeTalkHandler(),
): InvokeDispatcher {
val appContext = RuntimeEnvironment.getApplication()
shadowOf(appContext.packageManager).setSystemFeature(PackageManager.FEATURE_TELEPHONY, smsTelephonyAvailable)
@@ -238,6 +262,7 @@ class InvokeDispatcherTest {
stateProvider = InvokeDispatcherFakeNotificationsStateProvider(),
),
systemHandler = SystemHandler.forTesting(InvokeDispatcherFakeSystemNotificationPoster()),
talkHandler = talkHandler,
photosHandler = PhotosHandler.forTesting(appContext, InvokeDispatcherFakePhotosDataSource()),
contactsHandler = ContactsHandler.forTesting(appContext, InvokeDispatcherFakeContactsDataSource()),
calendarHandler = CalendarHandler.forTesting(appContext, InvokeDispatcherFakeCalendarDataSource()),
@@ -312,6 +337,30 @@ private class InvokeDispatcherFakeSystemNotificationPoster : SystemNotificationP
override fun post(request: SystemNotifyRequest) = Unit
}
/**
 * Fake [TalkHandler] that records which handler methods were invoked, in order, and
 * answers each one with a fixed payload.
 */
private class InvokeDispatcherFakeTalkHandler : TalkHandler {
    /** Names of the handler methods called so far, in call order. */
    val calls = mutableListOf<String>()

    override suspend fun handlePttStart(paramsJson: String?): GatewaySession.InvokeResult =
        record(call = "start", payloadJson = """{"captureId":"start"}""")

    override suspend fun handlePttStop(paramsJson: String?): GatewaySession.InvokeResult =
        record(call = "stop", payloadJson = """{"status":"stop"}""")

    override suspend fun handlePttCancel(paramsJson: String?): GatewaySession.InvokeResult =
        record(call = "cancel", payloadJson = """{"status":"cancel"}""")

    override suspend fun handlePttOnce(paramsJson: String?): GatewaySession.InvokeResult =
        record(call = "once", payloadJson = """{"status":"once"}""")

    /** Logs [call] and returns an ok result carrying [payloadJson]. */
    private fun record(
        call: String,
        payloadJson: String,
    ): GatewaySession.InvokeResult {
        calls.add(call)
        return GatewaySession.InvokeResult.ok(payloadJson)
    }
}
private class InvokeDispatcherFakePhotosDataSource : PhotosDataSource {
override fun hasPermission(context: Context): Boolean = true

View File

@@ -25,6 +25,7 @@ class OpenClawProtocolConstantsTest {
assertEquals("canvas", OpenClawCapability.Canvas.rawValue)
assertEquals("camera", OpenClawCapability.Camera.rawValue)
assertEquals("voiceWake", OpenClawCapability.VoiceWake.rawValue)
assertEquals("talk", OpenClawCapability.Talk.rawValue)
assertEquals("location", OpenClawCapability.Location.rawValue)
assertEquals("sms", OpenClawCapability.Sms.rawValue)
assertEquals("device", OpenClawCapability.Device.rawValue)
@@ -92,6 +93,14 @@ class OpenClawProtocolConstantsTest {
assertEquals("sms.search", OpenClawSmsCommand.Search.rawValue)
}
/** Pins the raw string of every talk push-to-talk command. */
@Test
fun talkCommandsUseStableStrings() {
    val expectedRawValues =
        listOf(
            "talk.ptt.start" to OpenClawTalkCommand.PttStart,
            "talk.ptt.stop" to OpenClawTalkCommand.PttStop,
            "talk.ptt.cancel" to OpenClawTalkCommand.PttCancel,
            "talk.ptt.once" to OpenClawTalkCommand.PttOnce,
        )
    for ((expected, command) in expectedRawValues) {
        assertEquals(expected, command.rawValue)
    }
}
@Test
fun callLogCommandsUseStableStrings() {
assertEquals("callLog.search", OpenClawCallLogCommand.Search.rawValue)

View File

@@ -0,0 +1,69 @@
package ai.openclaw.app.voice
import kotlinx.serialization.json.Json
import kotlinx.serialization.json.JsonObject
import org.junit.Assert.assertEquals
import org.junit.Assert.assertNull
import org.junit.Test
/** Unit tests for [ChatEventText] covering array content, plain-string content and role filtering. */
class ChatEventTextTest {
private val json = Json { ignoreUnknownKeys = true }
/** Text parts of an assistant message are joined with newlines. */
@Test
fun extractsAssistantTextParts() {
val payload =
payload(
"""
{
"message": {
"role": "assistant",
"content": [
{ "type": "text", "text": "hello" },
{ "type": "text", "text": "world" }
]
}
}
""",
)
assertEquals("hello\nworld", ChatEventText.assistantTextFromPayload(payload))
}
/** Content given as a bare string is returned as is. */
@Test
fun extractsPlainStringContent() {
val payload =
payload(
"""
{
"message": {
"role": "assistant",
"content": "plain reply"
}
}
""",
)
assertEquals("plain reply", ChatEventText.assistantTextFromPayload(payload))
}
/** Messages whose role is not `assistant` yield no text. */
@Test
fun ignoresUserMessages() {
val payload =
payload(
"""
{
"message": {
"role": "user",
"content": [
{ "type": "text", "text": "do not speak" }
]
}
}
""",
)
assertNull(ChatEventText.assistantTextFromPayload(payload))
}
/** Parses [source] (after `trimIndent`) into a [JsonObject]; the cast fails for non-object JSON. */
private fun payload(source: String): JsonObject = json.parseToJsonElement(source.trimIndent()) as JsonObject
}

View File

@@ -9,7 +9,10 @@ import kotlinx.coroutines.CoroutineScope
import kotlinx.coroutines.Dispatchers
import kotlinx.coroutines.Job
import kotlinx.coroutines.SupervisorJob
import kotlinx.coroutines.launch
import kotlinx.coroutines.test.runTest
import org.junit.Assert.assertEquals
import org.junit.Assert.assertFalse
import org.junit.Assert.assertTrue
import org.junit.Test
import org.junit.runner.RunWith
@@ -78,7 +81,54 @@ class TalkModeManagerTest {
assertEquals(1L, playbackGeneration(manager).get())
}
private fun createManager(): TalkModeManager {
/**
 * With `ttsOnAllResponses` enabled, a final chat event whose message role is `user` must not
 * start playback: the playback generation counter stays at zero.
 */
@Test
fun nonPendingUserFinalDoesNotUseAllResponseTts() {
val manager = createManager()
manager.ttsOnAllResponses = true
manager.handleGatewayEvent("chat", chatFinalPayload(runId = "run-user", text = "do not speak", role = "user"))
assertEquals(0L, playbackGeneration(manager).get())
}
/**
 * While synthesis is still pending the manager reports "Generating voice…" and is not speaking;
 * it only switches to "Speaking…" once the audio player has actually been started.
 */
@Test
fun textReadyDoesNotEnterSpeakingUntilAudioPlaybackStarts() =
  runTest {
    val synthesizer = FakeTalkSpeechSynthesizer()
    val player = FakeTalkAudioPlayer()
    val manager = createManager(talkSpeakClient = synthesizer, talkAudioPlayer = player)
    val synthesizedAudio =
      TalkSpeakAudio(
        bytes = byteArrayOf(1, 2, 3),
        provider = "test",
        outputFormat = "mp3_44100_128",
        voiceCompatible = true,
        mimeType = "audio/mpeg",
        fileExtension = ".mp3",
      )

    val speakJob = launch { manager.speakAssistantReply("hello") }

    // Phase 1: synthesis requested but not yet resolved.
    synthesizer.requested.await()
    assertEquals("Generating voice…", manager.statusText.value)
    assertFalse(manager.isSpeaking.value)

    // Phase 2: synthesis resolves and the player reports that playback began.
    synthesizer.result.complete(TalkSpeakResult.Success(synthesizedAudio))
    player.started.await()
    assertEquals("Speaking…", manager.statusText.value)
    assertTrue(manager.isSpeaking.value)

    // Let the fake playback finish so the launched coroutine can complete.
    player.finished.complete(Unit)
    speakJob.join()
  }
private fun createManager(
talkSpeakClient: TalkSpeechSynthesizing = TalkSpeakClient(),
talkAudioPlayer: TalkAudioPlaying? = null,
): TalkModeManager {
val app = RuntimeEnvironment.getApplication()
val sessionJob = SupervisorJob()
val session =
@@ -96,6 +146,8 @@ class TalkModeManagerTest {
session = session,
supportsChatSubscribe = false,
isConnected = { true },
talkSpeakClient = talkSpeakClient,
talkAudioPlayer = talkAudioPlayer ?: TalkAudioPlayer(app),
)
}
@@ -124,6 +176,7 @@ class TalkModeManagerTest {
private fun chatFinalPayload(
runId: String,
text: String,
role: String = "assistant",
): String =
"""
{
@@ -131,7 +184,7 @@ class TalkModeManagerTest {
"sessionKey": "main",
"state": "final",
"message": {
"role": "assistant",
"role": "$role",
"content": [
{ "type": "text", "text": "$text" }
]
@@ -140,6 +193,34 @@ class TalkModeManagerTest {
""".trimIndent()
}
/**
 * Test double for [TalkSpeechSynthesizing] that lets a test control when synthesis resolves.
 *
 * [requested] completes as soon as [synthesize] is entered; [synthesize] then suspends until
 * the test completes [result].
 */
private class FakeTalkSpeechSynthesizer : TalkSpeechSynthesizing {
  val requested: CompletableDeferred<Unit> = CompletableDeferred()
  val result: CompletableDeferred<TalkSpeakResult> = CompletableDeferred()

  override suspend fun synthesize(
    text: String,
    directive: TalkDirective?,
  ): TalkSpeakResult {
    // Signal first so the caller can observe the "synthesis in flight" state.
    requested.complete(Unit)
    return result.await()
  }
}
/**
 * Test double for [TalkAudioPlaying] with test-controlled playback duration.
 *
 * [started] completes when [play] is entered; [play] suspends until the test completes
 * [finished]. [stopped] records whether [stop] was ever called.
 */
private class FakeTalkAudioPlayer : TalkAudioPlaying {
  val started: CompletableDeferred<Unit> = CompletableDeferred()
  val finished: CompletableDeferred<Unit> = CompletableDeferred()
  var stopped: Boolean = false

  override suspend fun play(audio: TalkSpeakAudio) {
    // Announce playback start, then block until the test releases it.
    started.complete(Unit)
    finished.await()
  }

  override fun stop() {
    stopped = true
  }
}
private class InMemoryDeviceAuthStore : DeviceAuthTokenStore {
override fun loadEntry(
deviceId: String,

View File

@@ -821,6 +821,7 @@ final class GatewayConnectionController {
if locationMode != .off { caps.append(OpenClawCapability.location.rawValue) }
caps.append(OpenClawCapability.device.rawValue)
caps.append(OpenClawCapability.talk.rawValue)
if WatchMessagingService.isSupportedOnDevice() {
caps.append(OpenClawCapability.watch.rawValue)
}

View File

@@ -800,11 +800,11 @@ final class TalkModeManager: NSObject {
}
}
let completion = await self.waitForChatCompletion(runId: runId, gateway: gateway, timeoutSeconds: 120)
if completion == .timeout {
if completion.state == .timeout {
self.logger.warning(
"chat completion timeout runId=\(runId, privacy: .public); attempting history fallback")
GatewayDiagnostics.log("talk: chat completion timeout runId=\(runId)")
} else if completion == .aborted {
} else if completion.state == .aborted {
self.statusText = "Aborted"
self.logger.warning("chat completion aborted runId=\(runId, privacy: .public)")
GatewayDiagnostics.log("talk: chat completion aborted runId=\(runId)")
@@ -812,7 +812,7 @@ final class TalkModeManager: NSObject {
await self.finishIncrementalSpeech()
await self.start()
return
} else if completion == .error {
} else if completion.state == .error {
self.statusText = "Chat error"
self.logger.warning("chat completion error runId=\(runId, privacy: .public)")
GatewayDiagnostics.log("talk: chat completion error runId=\(runId)")
@@ -822,16 +822,19 @@ final class TalkModeManager: NSObject {
return
}
var assistantText = try await self.waitForAssistantText(
gateway: gateway,
since: startedAt,
timeoutSeconds: completion == .final ? 12 : 25)
var assistantText = completion.assistantText
if assistantText == nil, shouldIncremental {
let fallback = self.incrementalSpeechBuffer.latestText
if !fallback.trimmingCharacters(in: .whitespacesAndNewlines).isEmpty {
assistantText = fallback
}
}
if assistantText == nil {
assistantText = try await self.waitForAssistantTextFromHistory(
gateway: gateway,
since: startedAt,
timeoutSeconds: completion.state == .final ? 12 : 25)
}
guard let assistantText else {
self.statusText = "No reply"
self.logger.warning("assistant text timeout runId=\(runId, privacy: .public)")
@@ -898,6 +901,11 @@ final class TalkModeManager: NSObject {
}
}
/// Outcome of waiting for one chat run: the state the wait ended in, plus any assistant
/// text captured from that run's chat events.
private struct ChatCompletionResult {
    /// State the wait ended in.
    var state: ChatCompletionState

    /// Assistant text taken from the run's chat events, or `nil` when none was captured.
    var assistantText: String?
}
private func sendChat(_ message: String, gateway: GatewayNodeSession) async throws -> String {
struct SendResponse: Decodable { let runId: String }
let payload: [String: Any] = [
@@ -922,40 +930,51 @@ final class TalkModeManager: NSObject {
/// Waits until the chat run `runId` reaches a terminal state or the timeout elapses.
///
/// Subscribes to gateway server events and tracks the most recent assistant text carried by
/// `chat` events of this run.
///
/// - Parameters:
///   - runId: Run whose `chat` events are observed; events for other runs are ignored.
///   - gateway: Session used to subscribe to server events.
///   - timeoutSeconds: Maximum wait in seconds. Negative values are treated as 0.
/// - Returns: `.final` with the latest assistant text seen, `.aborted` / `.error` with no text,
///   or `.timeout`. A timeout raised by the timer task carries no text; a timeout caused by the
///   event stream ending or the task being cancelled carries the latest text seen so far.
private func waitForChatCompletion(
    runId: String,
    gateway: GatewayNodeSession,
    timeoutSeconds: Int = 120) async -> ChatCompletionResult
{
    let stream = await gateway.subscribeServerEvents(bufferingNewest: 200)
    // `UInt64(_:)` traps on negative input, so clamp before converting.
    let timeoutNanoseconds = UInt64(max(0, timeoutSeconds)) * 1_000_000_000
    return await withTaskGroup(of: ChatCompletionResult.self) { group in
        group.addTask { [runId] in
            var latestAssistantText: String?
            for await evt in stream {
                if Task.isCancelled {
                    return ChatCompletionResult(state: .timeout, assistantText: latestAssistantText)
                }
                guard evt.event == "chat", let payload = evt.payload else { continue }
                guard let chatEvent = try? GatewayPayloadDecoding.decode(
                    payload,
                    as: OpenClawChatEventPayload.self)
                else {
                    continue
                }
                guard chatEvent.runId == runId else { continue }
                if let text = OpenClawChatEventText.assistantText(from: chatEvent) {
                    latestAssistantText = text
                }
                switch chatEvent.state {
                case "final":
                    return ChatCompletionResult(state: .final, assistantText: latestAssistantText)
                case "aborted":
                    return ChatCompletionResult(state: .aborted, assistantText: nil)
                case "error":
                    return ChatCompletionResult(state: .error, assistantText: nil)
                default:
                    break
                }
            }
            // Stream ended without a terminal event for this run.
            return ChatCompletionResult(state: .timeout, assistantText: latestAssistantText)
        }
        group.addTask {
            try? await Task.sleep(nanoseconds: timeoutNanoseconds)
            return ChatCompletionResult(state: .timeout, assistantText: nil)
        }
        // First child to finish wins; the other is cancelled.
        let result = await group.next() ?? ChatCompletionResult(state: .timeout, assistantText: nil)
        group.cancelAll()
        return result
    }
}
private func waitForAssistantText(
private func waitForAssistantTextFromHistory(
gateway: GatewayNodeSession,
since: Double,
timeoutSeconds: Int) async throws -> String?

View File

@@ -36,6 +36,7 @@ import UIKit
#expect(caps.contains(OpenClawCapability.camera.rawValue))
#expect(caps.contains(OpenClawCapability.location.rawValue))
#expect(caps.contains(OpenClawCapability.voiceWake.rawValue))
#expect(caps.contains(OpenClawCapability.talk.rawValue))
}
}

View File

@@ -395,10 +395,18 @@ actor TalkModeRuntime {
"talk chat.send ok runId=\(response.runId, privacy: .public) " +
"session=\(sessionKey, privacy: .public)")
guard let assistantText = await self.waitForAssistantText(
var assistantText = await self.waitForAssistantEventText(
sessionKey: sessionKey,
since: startedAt,
runId: response.runId,
timeoutSeconds: 45)
if assistantText == nil {
self.logger.warning("talk assistant event text missing; using history fallback")
assistantText = await self.waitForAssistantTextFromHistory(
sessionKey: sessionKey,
since: startedAt,
timeoutSeconds: 12)
}
guard let assistantText
else {
self.logger.warning("talk assistant text missing after timeout")
await self.startListening()
@@ -439,7 +447,67 @@ actor TalkModeRuntime {
return TalkPromptBuilder.build(transcript: transcript, interruptedAtSeconds: interrupted)
}
private func waitForAssistantText(
/// Waits for the `chat` events of run `runId` and returns the assistant text they carried.
///
/// Events are ignored when they belong to another run, or when they name a session key that
/// does not match `sessionKey` (events without a session key are accepted).
///
/// - Parameters:
///   - sessionKey: Session the run is expected to belong to.
///   - runId: Run whose events are observed.
///   - timeoutSeconds: Maximum wait in seconds. Negative values are treated as 0.
/// - Returns: The latest assistant text seen when the run reports `final` or the event stream
///   ends; `nil` when the run reports `aborted` / `error` or the timeout elapses first.
private func waitForAssistantEventText(
    sessionKey: String,
    runId: String,
    timeoutSeconds: Int) async -> String?
{
    let stream = await GatewayConnection.shared.subscribe(bufferingNewest: 200)
    // `UInt64(_:)` traps on negative input, so clamp before converting.
    let timeoutNanoseconds = UInt64(max(0, timeoutSeconds)) * 1_000_000_000
    return await withTaskGroup(of: String?.self) { group in
        // Whichever child finishes first decides the result; always cancel the other on exit.
        defer { group.cancelAll() }

        group.addTask { [runId, sessionKey] in
            var latestText: String?
            for await push in stream {
                if Task.isCancelled { return latestText }
                guard case let .event(evt) = push else { continue }
                guard evt.event == "chat", let payload = evt.payload else { continue }
                guard let chatEvent = try? GatewayPayloadDecoding.decode(
                    payload,
                    as: OpenClawChatEventPayload.self)
                else {
                    continue
                }
                guard chatEvent.runId == runId else { continue }
                if let eventSessionKey = chatEvent.sessionKey,
                   !Self.matchesSessionKey(eventSessionKey, sessionKey)
                {
                    continue
                }
                if let text = OpenClawChatEventText.assistantText(from: chatEvent) {
                    latestText = text
                }
                switch chatEvent.state {
                case "final":
                    return latestText
                case "aborted", "error":
                    return nil
                default:
                    break
                }
            }
            return latestText
        }
        group.addTask {
            try? await Task.sleep(nanoseconds: timeoutNanoseconds)
            return nil
        }

        guard let result = await group.next() else { return nil }
        return result
    }
}
/// Reports whether two session keys refer to the same session.
///
/// Keys are compared after trimming surrounding whitespace and lowercasing. In addition,
/// `"main"` and `"agent:main:main"` are treated as equal to each other.
private static func matchesSessionKey(_ incoming: String, _ current: String) -> Bool {
    func canonical(_ key: String) -> String {
        key.trimmingCharacters(in: .whitespacesAndNewlines).lowercased()
    }

    let lhs = canonical(incoming)
    let rhs = canonical(current)
    guard lhs != rhs else { return true }

    // Differing keys only match when they are the two spellings of the main session.
    let mainAliases: Set<String> = ["main", "agent:main:main"]
    return mainAliases.contains(lhs) && mainAliases.contains(rhs)
}
private func waitForAssistantTextFromHistory(
sessionKey: String,
since: Double,
timeoutSeconds: Int) async -> String?
@@ -1111,7 +1179,10 @@ extension TalkModeRuntime {
} else {
self.ttsLogger
.info(
"talk provider \(parsed.activeProvider, privacy: .public) uses gateway talk.speak with system voice fallback")
"""
talk provider \(parsed.activeProvider, privacy: .public) uses gateway talk.speak \
with system voice fallback
""")
}
return parsed
} catch {

View File

@@ -2630,6 +2630,116 @@ public struct TalkModeParams: Codable, Sendable {
}
}
/// Codable model of a talk event.
///
/// Property names are all-lowercase; `CodingKeys` maps them to the camelCase JSON keys
/// (`sessionId`, `turnId`, `captureId`, `callId`, `itemId`, `parentId`).
public struct TalkEvent: Codable, Sendable {
    public let id: String
    public let type: AnyCodable
    public let sessionid: String
    public let turnid: String?
    public let captureid: String?
    public let seq: Int
    public let timestamp: String
    public let mode: AnyCodable
    public let transport: AnyCodable
    public let brain: AnyCodable
    public let provider: String?
    public let final: Bool?
    public let callid: String?
    public let itemid: String?
    public let parentid: String?
    public let payload: AnyCodable

    /// Memberwise initializer; every field is stored unchanged.
    public init(
        id: String,
        type: AnyCodable,
        sessionid: String,
        turnid: String?,
        captureid: String?,
        seq: Int,
        timestamp: String,
        mode: AnyCodable,
        transport: AnyCodable,
        brain: AnyCodable,
        provider: String?,
        final: Bool?,
        callid: String?,
        itemid: String?,
        parentid: String?,
        payload: AnyCodable)
    {
        self.id = id
        self.type = type
        self.sessionid = sessionid
        self.turnid = turnid
        self.captureid = captureid
        self.seq = seq
        self.timestamp = timestamp
        self.mode = mode
        self.transport = transport
        self.brain = brain
        self.provider = provider
        self.final = final
        self.callid = callid
        self.itemid = itemid
        self.parentid = parentid
        self.payload = payload
    }

    /// JSON key names for each property.
    private enum CodingKeys: String, CodingKey {
        case id
        case type
        case sessionid = "sessionId"
        case turnid = "turnId"
        case captureid = "captureId"
        case seq
        case timestamp
        case mode
        case transport
        case brain
        case provider
        case final
        case callid = "callId"
        case itemid = "itemId"
        case parentid = "parentId"
        case payload
    }
}
/// Parameter payload with no fields; encodes to an empty JSON object.
public struct TalkCatalogParams: Codable, Sendable {}
/// Codable model of a talk catalog result: three lists (`modes`, `transports`, `brains`) and
/// three keyed dictionaries (`speech`, `transcription`, `realtime`) of untyped values.
public struct TalkCatalogResult: Codable, Sendable {
    public let modes: [AnyCodable]
    public let transports: [AnyCodable]
    public let brains: [AnyCodable]
    public let speech: [String: AnyCodable]
    public let transcription: [String: AnyCodable]
    public let realtime: [String: AnyCodable]

    /// Memberwise initializer; every field is stored unchanged.
    public init(
        modes: [AnyCodable],
        transports: [AnyCodable],
        brains: [AnyCodable],
        speech: [String: AnyCodable],
        transcription: [String: AnyCodable],
        realtime: [String: AnyCodable])
    {
        self.modes = modes
        self.transports = transports
        self.brains = brains
        self.speech = speech
        self.transcription = transcription
        self.realtime = realtime
    }

    /// JSON key names; identical to the property names.
    private enum CodingKeys: String, CodingKey {
        case modes
        case transports
        case brains
        case speech
        case transcription
        case realtime
    }
}
public struct TalkConfigParams: Codable, Sendable {
public let includesecrets: Bool?
@@ -2658,22 +2768,383 @@ public struct TalkConfigResult: Codable, Sendable {
}
}
/// Codable parameter model for creating a talk handoff.
///
/// Only `sessionkey` is required. `CodingKeys` maps the lowercase property names to the
/// camelCase JSON keys `sessionKey`, `sessionId` and `ttlMs`.
public struct TalkHandoffCreateParams: Codable, Sendable {
    public let sessionkey: String
    public let sessionid: String?
    public let channel: String?
    public let target: String?
    public let provider: String?
    public let model: String?
    public let voice: String?
    public let mode: AnyCodable?
    public let transport: AnyCodable?
    public let brain: AnyCodable?
    public let ttlms: Int?

    /// Memberwise initializer; every field is stored unchanged.
    public init(
        sessionkey: String,
        sessionid: String?,
        channel: String?,
        target: String?,
        provider: String?,
        model: String?,
        voice: String?,
        mode: AnyCodable?,
        transport: AnyCodable?,
        brain: AnyCodable?,
        ttlms: Int?)
    {
        self.sessionkey = sessionkey
        self.sessionid = sessionid
        self.channel = channel
        self.target = target
        self.provider = provider
        self.model = model
        self.voice = voice
        self.mode = mode
        self.transport = transport
        self.brain = brain
        self.ttlms = ttlms
    }

    /// JSON key names for each property.
    private enum CodingKeys: String, CodingKey {
        case sessionkey = "sessionKey"
        case sessionid = "sessionId"
        case channel
        case target
        case provider
        case model
        case voice
        case mode
        case transport
        case brain
        case ttlms = "ttlMs"
    }
}
/// Codable result model for creating a talk handoff.
///
/// Carries the same fields as `TalkHandoffJoinResult` plus `token`. `CodingKeys` maps the
/// lowercase property names to camelCase JSON keys (`roomId`, `roomUrl`, `sessionKey`,
/// `sessionId`, `createdAt`, `expiresAt`).
public struct TalkHandoffCreateResult: Codable, Sendable {
    public let id: String
    public let roomid: String
    public let roomurl: String
    public let token: String
    public let sessionkey: String
    public let sessionid: String?
    public let channel: String?
    public let target: String?
    public let provider: String?
    public let model: String?
    public let voice: String?
    public let mode: AnyCodable
    public let transport: AnyCodable
    public let brain: AnyCodable
    public let createdat: Double
    public let expiresat: Double
    public let room: [String: AnyCodable]

    /// Memberwise initializer; every field is stored unchanged.
    public init(
        id: String,
        roomid: String,
        roomurl: String,
        token: String,
        sessionkey: String,
        sessionid: String?,
        channel: String?,
        target: String?,
        provider: String?,
        model: String?,
        voice: String?,
        mode: AnyCodable,
        transport: AnyCodable,
        brain: AnyCodable,
        createdat: Double,
        expiresat: Double,
        room: [String: AnyCodable])
    {
        self.id = id
        self.roomid = roomid
        self.roomurl = roomurl
        self.token = token
        self.sessionkey = sessionkey
        self.sessionid = sessionid
        self.channel = channel
        self.target = target
        self.provider = provider
        self.model = model
        self.voice = voice
        self.mode = mode
        self.transport = transport
        self.brain = brain
        self.createdat = createdat
        self.expiresat = expiresat
        self.room = room
    }

    /// JSON key names for each property.
    private enum CodingKeys: String, CodingKey {
        case id
        case roomid = "roomId"
        case roomurl = "roomUrl"
        case token
        case sessionkey = "sessionKey"
        case sessionid = "sessionId"
        case channel
        case target
        case provider
        case model
        case voice
        case mode
        case transport
        case brain
        case createdat = "createdAt"
        case expiresat = "expiresAt"
        case room
    }
}
/// Codable parameter model for joining a talk handoff: the handoff `id` and its `token`.
public struct TalkHandoffJoinParams: Codable, Sendable {
    public let id: String
    public let token: String

    /// Memberwise initializer; both fields are stored unchanged.
    public init(id: String, token: String) {
        self.id = id
        self.token = token
    }

    /// JSON key names; identical to the property names.
    private enum CodingKeys: String, CodingKey {
        case id
        case token
    }
}
/// Codable result model for joining a talk handoff.
///
/// `CodingKeys` maps the lowercase property names to camelCase JSON keys (`roomId`, `roomUrl`,
/// `sessionKey`, `sessionId`, `createdAt`, `expiresAt`).
public struct TalkHandoffJoinResult: Codable, Sendable {
    public let id: String
    public let roomid: String
    public let roomurl: String
    public let sessionkey: String
    public let sessionid: String?
    public let channel: String?
    public let target: String?
    public let provider: String?
    public let model: String?
    public let voice: String?
    public let mode: AnyCodable
    public let transport: AnyCodable
    public let brain: AnyCodable
    public let createdat: Double
    public let expiresat: Double
    public let room: [String: AnyCodable]

    /// Memberwise initializer; every field is stored unchanged.
    public init(
        id: String,
        roomid: String,
        roomurl: String,
        sessionkey: String,
        sessionid: String?,
        channel: String?,
        target: String?,
        provider: String?,
        model: String?,
        voice: String?,
        mode: AnyCodable,
        transport: AnyCodable,
        brain: AnyCodable,
        createdat: Double,
        expiresat: Double,
        room: [String: AnyCodable])
    {
        self.id = id
        self.roomid = roomid
        self.roomurl = roomurl
        self.sessionkey = sessionkey
        self.sessionid = sessionid
        self.channel = channel
        self.target = target
        self.provider = provider
        self.model = model
        self.voice = voice
        self.mode = mode
        self.transport = transport
        self.brain = brain
        self.createdat = createdat
        self.expiresat = expiresat
        self.room = room
    }

    /// JSON key names for each property.
    private enum CodingKeys: String, CodingKey {
        case id
        case roomid = "roomId"
        case roomurl = "roomUrl"
        case sessionkey = "sessionKey"
        case sessionid = "sessionId"
        case channel
        case target
        case provider
        case model
        case voice
        case mode
        case transport
        case brain
        case createdat = "createdAt"
        case expiresat = "expiresAt"
        case room
    }
}
/// Codable parameter model for revoking a talk handoff, identified by `id`.
public struct TalkHandoffRevokeParams: Codable, Sendable {
    public let id: String

    /// Stores `id` unchanged.
    public init(id: String) {
        self.id = id
    }

    /// JSON key names; identical to the property names.
    private enum CodingKeys: String, CodingKey {
        case id
    }
}
/// Codable result model for revoking a talk handoff: two boolean flags, `ok` and `revoked`.
public struct TalkHandoffRevokeResult: Codable, Sendable {
    public let ok: Bool
    public let revoked: Bool

    /// Memberwise initializer; both flags are stored unchanged.
    public init(ok: Bool, revoked: Bool) {
        self.ok = ok
        self.revoked = revoked
    }

    /// JSON key names; identical to the property names.
    private enum CodingKeys: String, CodingKey {
        case ok
        case revoked
    }
}
/// Codable parameter model for starting a handoff turn: handoff `id`, `token`, and an
/// optional `turnid` (JSON key `turnId`).
public struct TalkHandoffTurnStartParams: Codable, Sendable {
    public let id: String
    public let token: String
    public let turnid: String?

    /// Memberwise initializer; every field is stored unchanged.
    public init(id: String, token: String, turnid: String?) {
        self.id = id
        self.token = token
        self.turnid = turnid
    }

    /// JSON key names for each property.
    private enum CodingKeys: String, CodingKey {
        case id
        case token
        case turnid = "turnId"
    }
}
/// Codable parameter model for ending a handoff turn: handoff `id`, `token`, and an
/// optional `turnid` (JSON key `turnId`).
public struct TalkHandoffTurnEndParams: Codable, Sendable {
    public let id: String
    public let token: String
    public let turnid: String?

    /// Memberwise initializer; every field is stored unchanged.
    public init(id: String, token: String, turnid: String?) {
        self.id = id
        self.token = token
        self.turnid = turnid
    }

    /// JSON key names for each property.
    private enum CodingKeys: String, CodingKey {
        case id
        case token
        case turnid = "turnId"
    }
}
/// Codable parameter model for cancelling a handoff turn: handoff `id`, `token`, an optional
/// `turnid` (JSON key `turnId`) and an optional `reason`.
public struct TalkHandoffTurnCancelParams: Codable, Sendable {
    public let id: String
    public let token: String
    public let turnid: String?
    public let reason: String?

    /// Memberwise initializer; every field is stored unchanged.
    public init(id: String, token: String, turnid: String?, reason: String?) {
        self.id = id
        self.token = token
        self.turnid = turnid
        self.reason = reason
    }

    /// JSON key names for each property.
    private enum CodingKeys: String, CodingKey {
        case id
        case token
        case turnid = "turnId"
        case reason
    }
}
/// Codable result model for handoff turn operations: an `ok` flag, the handoff `record`,
/// the `turnid` (JSON key `turnId`) and a list of `events`.
public struct TalkHandoffTurnResult: Codable, Sendable {
    public let ok: Bool
    public let record: TalkHandoffJoinResult
    public let turnid: String
    public let events: [TalkEvent]

    /// Memberwise initializer; every field is stored unchanged.
    public init(ok: Bool, record: TalkHandoffJoinResult, turnid: String, events: [TalkEvent]) {
        self.ok = ok
        self.record = record
        self.turnid = turnid
        self.events = events
    }

    /// JSON key names for each property.
    private enum CodingKeys: String, CodingKey {
        case ok
        case record
        case turnid = "turnId"
        case events
    }
}
public struct TalkRealtimeSessionParams: Codable, Sendable {
public let sessionkey: String?
public let provider: String?
public let model: String?
public let voice: String?
public let mode: AnyCodable?
public let transport: AnyCodable?
public let brain: AnyCodable?
public init(
sessionkey: String?,
provider: String?,
model: String?,
voice: String?)
voice: String?,
mode: AnyCodable?,
transport: AnyCodable?,
brain: AnyCodable?)
{
self.sessionkey = sessionkey
self.provider = provider
self.model = model
self.voice = voice
self.mode = mode
self.transport = transport
self.brain = brain
}
private enum CodingKeys: String, CodingKey {
@@ -2681,6 +3152,9 @@ public struct TalkRealtimeSessionParams: Codable, Sendable {
case provider
case model
case voice
case mode
case transport
case brain
}
}
@@ -2706,6 +3180,24 @@ public struct TalkRealtimeRelayAudioParams: Codable, Sendable {
}
}
/// Codable parameter model for cancelling a realtime relay: `relaysessionid` (JSON key
/// `relaySessionId`) and an optional `reason`.
public struct TalkRealtimeRelayCancelParams: Codable, Sendable {
    public let relaysessionid: String
    public let reason: String?

    /// Memberwise initializer; both fields are stored unchanged.
    public init(relaysessionid: String, reason: String?) {
        self.relaysessionid = relaysessionid
        self.reason = reason
    }

    /// JSON key names for each property.
    private enum CodingKeys: String, CodingKey {
        case relaysessionid = "relaySessionId"
        case reason
    }
}
public struct TalkRealtimeRelayMarkParams: Codable, Sendable {
public let relaysessionid: String
public let markname: String?
@@ -2774,6 +3266,166 @@ public struct TalkRealtimeRelayOkResult: Codable, Sendable {
}
}
/// Codable parameter model for a realtime tool call.
///
/// `CodingKeys` maps `sessionkey`, `callid` and `relaysessionid` to the JSON keys
/// `sessionKey`, `callId` and `relaySessionId`.
public struct TalkRealtimeToolCallParams: Codable, Sendable {
    public let sessionkey: String
    public let callid: String
    public let name: String
    public let args: AnyCodable?
    public let relaysessionid: String?

    /// Memberwise initializer; every field is stored unchanged.
    public init(
        sessionkey: String,
        callid: String,
        name: String,
        args: AnyCodable?,
        relaysessionid: String?)
    {
        self.sessionkey = sessionkey
        self.callid = callid
        self.name = name
        self.args = args
        self.relaysessionid = relaysessionid
    }

    /// JSON key names for each property.
    private enum CodingKeys: String, CodingKey {
        case sessionkey = "sessionKey"
        case callid = "callId"
        case name
        case args
        case relaysessionid = "relaySessionId"
    }
}
/// Codable result model for a realtime tool call: `runid` (JSON key `runId`) and
/// `idempotencykey` (JSON key `idempotencyKey`).
public struct TalkRealtimeToolCallResult: Codable, Sendable {
    public let runid: String
    public let idempotencykey: String

    /// Memberwise initializer; both fields are stored unchanged.
    public init(runid: String, idempotencykey: String) {
        self.runid = runid
        self.idempotencykey = idempotencykey
    }

    /// JSON key names for each property.
    private enum CodingKeys: String, CodingKey {
        case runid = "runId"
        case idempotencykey = "idempotencyKey"
    }
}
/// Codable parameter model for a transcription session; the only field is an optional `provider`.
public struct TalkTranscriptionSessionParams: Codable, Sendable {
    public let provider: String?

    /// Stores `provider` unchanged.
    public init(provider: String?) {
        self.provider = provider
    }

    /// JSON key names; identical to the property names.
    private enum CodingKeys: String, CodingKey {
        case provider
    }
}
/// Codable result model for a transcription session.
///
/// `CodingKeys` maps `transcriptionsessionid` and `expiresat` to the JSON keys
/// `transcriptionSessionId` and `expiresAt`.
public struct TalkTranscriptionSessionResult: Codable, Sendable {
    public let provider: String
    public let mode: String
    public let transport: String
    public let transcriptionsessionid: String
    public let audio: [String: AnyCodable]
    public let expiresat: Double

    /// Memberwise initializer; every field is stored unchanged.
    public init(
        provider: String,
        mode: String,
        transport: String,
        transcriptionsessionid: String,
        audio: [String: AnyCodable],
        expiresat: Double)
    {
        self.provider = provider
        self.mode = mode
        self.transport = transport
        self.transcriptionsessionid = transcriptionsessionid
        self.audio = audio
        self.expiresat = expiresat
    }

    /// JSON key names for each property.
    private enum CodingKeys: String, CodingKey {
        case provider
        case mode
        case transport
        case transcriptionsessionid = "transcriptionSessionId"
        case audio
        case expiresat = "expiresAt"
    }
}
/// Codable parameter model for relaying transcription audio: `transcriptionsessionid`
/// (JSON key `transcriptionSessionId`) and `audiobase64` (JSON key `audioBase64`).
public struct TalkTranscriptionRelayAudioParams: Codable, Sendable {
    public let transcriptionsessionid: String
    public let audiobase64: String

    /// Memberwise initializer; both fields are stored unchanged.
    public init(transcriptionsessionid: String, audiobase64: String) {
        self.transcriptionsessionid = transcriptionsessionid
        self.audiobase64 = audiobase64
    }

    /// JSON key names for each property.
    private enum CodingKeys: String, CodingKey {
        case transcriptionsessionid = "transcriptionSessionId"
        case audiobase64 = "audioBase64"
    }
}
/// Codable parameter model for cancelling a transcription relay: `transcriptionsessionid`
/// (JSON key `transcriptionSessionId`) and an optional `reason`.
public struct TalkTranscriptionRelayCancelParams: Codable, Sendable {
    public let transcriptionsessionid: String
    public let reason: String?

    /// Memberwise initializer; both fields are stored unchanged.
    public init(transcriptionsessionid: String, reason: String?) {
        self.transcriptionsessionid = transcriptionsessionid
        self.reason = reason
    }

    /// JSON key names for each property.
    private enum CodingKeys: String, CodingKey {
        case transcriptionsessionid = "transcriptionSessionId"
        case reason
    }
}
/// Codable parameter model for stopping a transcription relay, identified by
/// `transcriptionsessionid` (JSON key `transcriptionSessionId`).
public struct TalkTranscriptionRelayStopParams: Codable, Sendable {
    public let transcriptionsessionid: String

    /// Stores `transcriptionsessionid` unchanged.
    public init(transcriptionsessionid: String) {
        self.transcriptionsessionid = transcriptionsessionid
    }

    /// JSON key names for each property.
    private enum CodingKeys: String, CodingKey {
        case transcriptionsessionid = "transcriptionSessionId"
    }
}
/// Codable result model for transcription relay operations; the only field is the `ok` flag.
public struct TalkTranscriptionRelayOkResult: Codable, Sendable {
    public let ok: Bool

    /// Stores `ok` unchanged.
    public init(ok: Bool) {
        self.ok = ok
    }

    /// JSON key names; identical to the property names.
    private enum CodingKeys: String, CodingKey {
        case ok
    }
}
public struct TalkSpeakParams: Codable, Sendable {
public let text: String
public let voiceid: String?

View File

@@ -0,0 +1,78 @@
import OpenClawKit
/// Extracts assistant text from gateway chat event payloads.
///
/// A message is either a plain string or an object with optional `role` and `content`.
/// `content` may be a string, an array of parts, or a single part; a part is a string or an
/// object with a `text` field. All extracted text is trimmed of surrounding whitespace and
/// empty results are reported as `nil`.
public enum OpenClawChatEventText {
    /// Returns the assistant text carried by `event.message`, or `nil` when there is none.
    public static func assistantText(from event: OpenClawChatEventPayload) -> String? {
        self.assistantText(fromMessage: event.message)
    }

    /// Returns the assistant text carried by `message`, or `nil` when `message` is absent
    /// or carries no usable text.
    public static func assistantText(fromMessage message: AnyCodable?) -> String? {
        message.flatMap { self.assistantText(fromValue: $0.value) }
    }

    /// Extracts text from an unwrapped message value (string or object).
    private static func assistantText(fromValue value: Any) -> String? {
        if let plain = value as? String {
            return self.trimmed(plain)
        }
        guard let fields = self.dictionary(from: value) else { return nil }

        // A missing or blank role is accepted; any other role than "assistant" is rejected.
        let role = self.stringValue(fields["role"])?
            .trimmingCharacters(in: .whitespacesAndNewlines)
            .lowercased() ?? ""
        guard role.isEmpty || role == "assistant" else { return nil }

        return fields["content"].flatMap { self.textContent(from: $0) }
    }

    /// Joins the text of every content part with newlines; a non-array value is treated as
    /// a single part.
    private static func textContent(from value: Any) -> String? {
        if let plain = value as? String {
            return self.trimmed(plain)
        }

        let elements: [Any]
        if let wrapped = value as? [AnyCodable] {
            elements = wrapped.map(\.value)
        } else if let raw = value as? [Any] {
            elements = raw
        } else {
            elements = [value]
        }

        let pieces = elements.compactMap { self.textContentPart(from: $0) }
        return self.trimmed(pieces.joined(separator: "\n"))
    }

    /// Extracts the text of one content part: the string itself, or the object's `text` field.
    private static func textContentPart(from value: Any) -> String? {
        if let plain = value as? String {
            return self.trimmed(plain)
        }
        return self.dictionary(from: value)
            .flatMap { self.stringValue($0["text"]) }
            .flatMap { self.trimmed($0) }
    }

    /// Views `value` as a string-keyed dictionary, unwrapping `AnyCodable` values if needed.
    private static func dictionary(from value: Any) -> [String: Any]? {
        switch value {
        case let wrapped as [String: AnyCodable]:
            wrapped.mapValues(\.value)
        case let plain as [String: Any]:
            plain
        default:
            nil
        }
    }

    /// Returns `value` as a string, looking through any `AnyCodable` wrapper.
    private static func stringValue(_ value: Any?) -> String? {
        guard let value else { return nil }
        if let string = value as? String {
            return string
        }
        return (value as? AnyCodable).flatMap { self.stringValue($0.value) }
    }

    /// Trims surrounding whitespace and newlines; returns `nil` when nothing remains.
    private static func trimmed(_ text: String) -> String? {
        let stripped = text.trimmingCharacters(in: .whitespacesAndNewlines)
        guard !stripped.isEmpty else { return nil }
        return stripped
    }
}

View File

@@ -6,6 +6,7 @@ public enum OpenClawCapability: String, Codable, Sendable {
case camera
case screen
case voiceWake
case talk
case location
case device
case watch

View File

@@ -2630,6 +2630,116 @@ public struct TalkModeParams: Codable, Sendable {
}
}
/// Codable model of a talk event.
///
/// Property names are all-lowercase; `CodingKeys` maps them to the camelCase JSON keys
/// (`sessionId`, `turnId`, `captureId`, `callId`, `itemId`, `parentId`).
public struct TalkEvent: Codable, Sendable {
    public let id: String
    public let type: AnyCodable
    public let sessionid: String
    public let turnid: String?
    public let captureid: String?
    public let seq: Int
    public let timestamp: String
    public let mode: AnyCodable
    public let transport: AnyCodable
    public let brain: AnyCodable
    public let provider: String?
    public let final: Bool?
    public let callid: String?
    public let itemid: String?
    public let parentid: String?
    public let payload: AnyCodable

    /// Memberwise initializer; every field is stored unchanged.
    public init(
        id: String,
        type: AnyCodable,
        sessionid: String,
        turnid: String?,
        captureid: String?,
        seq: Int,
        timestamp: String,
        mode: AnyCodable,
        transport: AnyCodable,
        brain: AnyCodable,
        provider: String?,
        final: Bool?,
        callid: String?,
        itemid: String?,
        parentid: String?,
        payload: AnyCodable)
    {
        self.id = id
        self.type = type
        self.sessionid = sessionid
        self.turnid = turnid
        self.captureid = captureid
        self.seq = seq
        self.timestamp = timestamp
        self.mode = mode
        self.transport = transport
        self.brain = brain
        self.provider = provider
        self.final = final
        self.callid = callid
        self.itemid = itemid
        self.parentid = parentid
        self.payload = payload
    }

    /// JSON key names for each property.
    private enum CodingKeys: String, CodingKey {
        case id
        case type
        case sessionid = "sessionId"
        case turnid = "turnId"
        case captureid = "captureId"
        case seq
        case timestamp
        case mode
        case transport
        case brain
        case provider
        case final
        case callid = "callId"
        case itemid = "itemId"
        case parentid = "parentId"
        case payload
    }
}
/// Parameter payload with no fields; encodes to an empty JSON object.
public struct TalkCatalogParams: Codable, Sendable {}
/// Codable model of a talk catalog result: three lists (`modes`, `transports`, `brains`) and
/// three keyed dictionaries (`speech`, `transcription`, `realtime`) of untyped values.
public struct TalkCatalogResult: Codable, Sendable {
    public let modes: [AnyCodable]
    public let transports: [AnyCodable]
    public let brains: [AnyCodable]
    public let speech: [String: AnyCodable]
    public let transcription: [String: AnyCodable]
    public let realtime: [String: AnyCodable]

    /// Memberwise initializer; every field is stored unchanged.
    public init(
        modes: [AnyCodable],
        transports: [AnyCodable],
        brains: [AnyCodable],
        speech: [String: AnyCodable],
        transcription: [String: AnyCodable],
        realtime: [String: AnyCodable])
    {
        self.modes = modes
        self.transports = transports
        self.brains = brains
        self.speech = speech
        self.transcription = transcription
        self.realtime = realtime
    }

    /// JSON key names; identical to the property names.
    private enum CodingKeys: String, CodingKey {
        case modes
        case transports
        case brains
        case speech
        case transcription
        case realtime
    }
}
public struct TalkConfigParams: Codable, Sendable {
public let includesecrets: Bool?
@@ -2658,22 +2768,383 @@ public struct TalkConfigResult: Codable, Sendable {
}
}
/// Codable parameter model for creating a talk handoff.
///
/// Only `sessionkey` is required. `CodingKeys` maps the lowercase property names to the
/// camelCase JSON keys `sessionKey`, `sessionId` and `ttlMs`.
public struct TalkHandoffCreateParams: Codable, Sendable {
    public let sessionkey: String
    public let sessionid: String?
    public let channel: String?
    public let target: String?
    public let provider: String?
    public let model: String?
    public let voice: String?
    public let mode: AnyCodable?
    public let transport: AnyCodable?
    public let brain: AnyCodable?
    public let ttlms: Int?

    /// Memberwise initializer; every field is stored unchanged.
    public init(
        sessionkey: String,
        sessionid: String?,
        channel: String?,
        target: String?,
        provider: String?,
        model: String?,
        voice: String?,
        mode: AnyCodable?,
        transport: AnyCodable?,
        brain: AnyCodable?,
        ttlms: Int?)
    {
        self.sessionkey = sessionkey
        self.sessionid = sessionid
        self.channel = channel
        self.target = target
        self.provider = provider
        self.model = model
        self.voice = voice
        self.mode = mode
        self.transport = transport
        self.brain = brain
        self.ttlms = ttlms
    }

    /// JSON key names for each property.
    private enum CodingKeys: String, CodingKey {
        case sessionkey = "sessionKey"
        case sessionid = "sessionId"
        case channel
        case target
        case provider
        case model
        case voice
        case mode
        case transport
        case brain
        case ttlms = "ttlMs"
    }
}
/// Codable result model for creating a talk handoff.
///
/// Carries the same fields as `TalkHandoffJoinResult` plus `token`. `CodingKeys` maps the
/// lowercase property names to camelCase JSON keys (`roomId`, `roomUrl`, `sessionKey`,
/// `sessionId`, `createdAt`, `expiresAt`).
public struct TalkHandoffCreateResult: Codable, Sendable {
    public let id: String
    public let roomid: String
    public let roomurl: String
    public let token: String
    public let sessionkey: String
    public let sessionid: String?
    public let channel: String?
    public let target: String?
    public let provider: String?
    public let model: String?
    public let voice: String?
    public let mode: AnyCodable
    public let transport: AnyCodable
    public let brain: AnyCodable
    public let createdat: Double
    public let expiresat: Double
    public let room: [String: AnyCodable]

    /// Memberwise initializer; every field is stored unchanged.
    public init(
        id: String,
        roomid: String,
        roomurl: String,
        token: String,
        sessionkey: String,
        sessionid: String?,
        channel: String?,
        target: String?,
        provider: String?,
        model: String?,
        voice: String?,
        mode: AnyCodable,
        transport: AnyCodable,
        brain: AnyCodable,
        createdat: Double,
        expiresat: Double,
        room: [String: AnyCodable])
    {
        self.id = id
        self.roomid = roomid
        self.roomurl = roomurl
        self.token = token
        self.sessionkey = sessionkey
        self.sessionid = sessionid
        self.channel = channel
        self.target = target
        self.provider = provider
        self.model = model
        self.voice = voice
        self.mode = mode
        self.transport = transport
        self.brain = brain
        self.createdat = createdat
        self.expiresat = expiresat
        self.room = room
    }

    /// JSON key names for each property.
    private enum CodingKeys: String, CodingKey {
        case id
        case roomid = "roomId"
        case roomurl = "roomUrl"
        case token
        case sessionkey = "sessionKey"
        case sessionid = "sessionId"
        case channel
        case target
        case provider
        case model
        case voice
        case mode
        case transport
        case brain
        case createdat = "createdAt"
        case expiresat = "expiresAt"
        case room
    }
}
/// Codable parameter model for joining a talk handoff: the handoff `id` and its `token`.
public struct TalkHandoffJoinParams: Codable, Sendable {
    public let id: String
    public let token: String

    /// Memberwise initializer; both fields are stored unchanged.
    public init(id: String, token: String) {
        self.id = id
        self.token = token
    }

    /// JSON key names; identical to the property names.
    private enum CodingKeys: String, CodingKey {
        case id
        case token
    }
}
/// Codable result model for joining a talk handoff.
///
/// `CodingKeys` maps the lowercase property names to camelCase JSON keys (`roomId`, `roomUrl`,
/// `sessionKey`, `sessionId`, `createdAt`, `expiresAt`).
public struct TalkHandoffJoinResult: Codable, Sendable {
    public let id: String
    public let roomid: String
    public let roomurl: String
    public let sessionkey: String
    public let sessionid: String?
    public let channel: String?
    public let target: String?
    public let provider: String?
    public let model: String?
    public let voice: String?
    public let mode: AnyCodable
    public let transport: AnyCodable
    public let brain: AnyCodable
    public let createdat: Double
    public let expiresat: Double
    public let room: [String: AnyCodable]

    /// Memberwise initializer; every field is stored unchanged.
    public init(
        id: String,
        roomid: String,
        roomurl: String,
        sessionkey: String,
        sessionid: String?,
        channel: String?,
        target: String?,
        provider: String?,
        model: String?,
        voice: String?,
        mode: AnyCodable,
        transport: AnyCodable,
        brain: AnyCodable,
        createdat: Double,
        expiresat: Double,
        room: [String: AnyCodable])
    {
        self.id = id
        self.roomid = roomid
        self.roomurl = roomurl
        self.sessionkey = sessionkey
        self.sessionid = sessionid
        self.channel = channel
        self.target = target
        self.provider = provider
        self.model = model
        self.voice = voice
        self.mode = mode
        self.transport = transport
        self.brain = brain
        self.createdat = createdat
        self.expiresat = expiresat
        self.room = room
    }

    /// JSON key names for each property.
    private enum CodingKeys: String, CodingKey {
        case id
        case roomid = "roomId"
        case roomurl = "roomUrl"
        case sessionkey = "sessionKey"
        case sessionid = "sessionId"
        case channel
        case target
        case provider
        case model
        case voice
        case mode
        case transport
        case brain
        case createdat = "createdAt"
        case expiresat = "expiresAt"
        case room
    }
}
/// Codable, Sendable parameter type carrying only a handoff `id`.
/// NOTE(review): presumably the request body for revoking a talk handoff (inferred from the name).
public struct TalkHandoffRevokeParams: Codable, Sendable {
public let id: String
/// Stores the required `id` unchanged.
public init(
id: String)
{
self.id = id
}
// Encoded under the property name itself (no renaming).
private enum CodingKeys: String, CodingKey {
case id
}
}
/// Codable, Sendable result type with two required flags, `ok` and `revoked`.
/// NOTE(review): how `ok` and `revoked` differ is decided by the sender and is not visible here.
public struct TalkHandoffRevokeResult: Codable, Sendable {
public let ok: Bool
public let revoked: Bool
/// Memberwise initializer; both flags are stored unchanged.
public init(
ok: Bool,
revoked: Bool)
{
self.ok = ok
self.revoked = revoked
}
// Keys are encoded under the property names themselves (no renaming).
private enum CodingKeys: String, CodingKey {
case ok
case revoked
}
}
/// Codable, Sendable parameter type carrying a handoff `id`, its `token`, and an optional
/// `turnid`. Its fields are identical to `TalkHandoffTurnEndParams`.
/// NOTE(review): presumably the request body for starting a handoff turn (inferred from the name).
public struct TalkHandoffTurnStartParams: Codable, Sendable {
public let id: String
public let token: String
// Optional; what the receiver does when it is absent is not visible in this file.
public let turnid: String?
/// Memberwise initializer; each argument is stored unchanged.
public init(
id: String,
token: String,
turnid: String?)
{
self.id = id
self.token = token
self.turnid = turnid
}
// `turnid` is encoded under the camelCase key "turnId"; the other keys match their property names.
private enum CodingKeys: String, CodingKey {
case id
case token
case turnid = "turnId"
}
}
/// Codable, Sendable parameter type carrying a handoff `id`, its `token`, and an optional
/// `turnid`. Its fields are identical to `TalkHandoffTurnStartParams`.
/// NOTE(review): presumably the request body for ending a handoff turn (inferred from the name).
public struct TalkHandoffTurnEndParams: Codable, Sendable {
public let id: String
public let token: String
// Optional; what the receiver does when it is absent is not visible in this file.
public let turnid: String?
/// Memberwise initializer; each argument is stored unchanged.
public init(
id: String,
token: String,
turnid: String?)
{
self.id = id
self.token = token
self.turnid = turnid
}
// `turnid` is encoded under the camelCase key "turnId"; the other keys match their property names.
private enum CodingKeys: String, CodingKey {
case id
case token
case turnid = "turnId"
}
}
/// Codable, Sendable parameter type carrying a handoff `id`, its `token`, an optional `turnid`
/// and an optional free-text `reason`.
/// NOTE(review): presumably the request body for cancelling a handoff turn (inferred from the name).
public struct TalkHandoffTurnCancelParams: Codable, Sendable {
public let id: String
public let token: String
// Optional; what the receiver does when it is absent is not visible in this file.
public let turnid: String?
public let reason: String?
/// Memberwise initializer; each argument is stored unchanged.
public init(
id: String,
token: String,
turnid: String?,
reason: String?)
{
self.id = id
self.token = token
self.turnid = turnid
self.reason = reason
}
// `turnid` is encoded under the camelCase key "turnId"; the other keys match their property names.
private enum CodingKeys: String, CodingKey {
case id
case token
case turnid = "turnId"
case reason
}
}
/// Codable, Sendable result type for the handoff turn operations.
/// It holds an `ok` flag, the handoff `record` (a `TalkHandoffJoinResult`), the `turnid`
/// (non-optional here, unlike in the turn parameter types) and a list of `TalkEvent` values.
/// NOTE(review): which operations return this type is not visible in this section.
public struct TalkHandoffTurnResult: Codable, Sendable {
public let ok: Bool
public let record: TalkHandoffJoinResult
public let turnid: String
public let events: [TalkEvent]
/// Memberwise initializer; each argument is stored unchanged.
public init(
ok: Bool,
record: TalkHandoffJoinResult,
turnid: String,
events: [TalkEvent])
{
self.ok = ok
self.record = record
self.turnid = turnid
self.events = events
}
// `turnid` is encoded under the camelCase key "turnId"; the other keys match their property names.
private enum CodingKeys: String, CodingKey {
case ok
case record
case turnid = "turnId"
case events
}
}
public struct TalkRealtimeSessionParams: Codable, Sendable {
public let sessionkey: String?
public let provider: String?
public let model: String?
public let voice: String?
public let mode: AnyCodable?
public let transport: AnyCodable?
public let brain: AnyCodable?
public init(
sessionkey: String?,
provider: String?,
model: String?,
voice: String?)
voice: String?,
mode: AnyCodable?,
transport: AnyCodable?,
brain: AnyCodable?)
{
self.sessionkey = sessionkey
self.provider = provider
self.model = model
self.voice = voice
self.mode = mode
self.transport = transport
self.brain = brain
}
private enum CodingKeys: String, CodingKey {
@@ -2681,6 +3152,9 @@ public struct TalkRealtimeSessionParams: Codable, Sendable {
case provider
case model
case voice
case mode
case transport
case brain
}
}
@@ -2706,6 +3180,24 @@ public struct TalkRealtimeRelayAudioParams: Codable, Sendable {
}
}
/// Codable, Sendable parameter type carrying a required `relaysessionid` and an optional
/// free-text `reason`.
/// NOTE(review): presumably the request body for cancelling a realtime relay session
/// (inferred from the name).
public struct TalkRealtimeRelayCancelParams: Codable, Sendable {
public let relaysessionid: String
public let reason: String?
/// Memberwise initializer; both arguments are stored unchanged.
public init(
relaysessionid: String,
reason: String?)
{
self.relaysessionid = relaysessionid
self.reason = reason
}
// `relaysessionid` is encoded under the camelCase key "relaySessionId".
private enum CodingKeys: String, CodingKey {
case relaysessionid = "relaySessionId"
case reason
}
}
public struct TalkRealtimeRelayMarkParams: Codable, Sendable {
public let relaysessionid: String
public let markname: String?
@@ -2774,6 +3266,166 @@ public struct TalkRealtimeRelayOkResult: Codable, Sendable {
}
}
/// Codable, Sendable parameter type describing one tool call: the `sessionkey` it belongs to,
/// a `callid`, the tool `name`, optional type-erased `args`, and an optional `relaysessionid`.
/// NOTE(review): who issues the call and how `args` is interpreted is not visible here.
public struct TalkRealtimeToolCallParams: Codable, Sendable {
public let sessionkey: String
public let callid: String
public let name: String
// Type-erased and optional; this struct does not constrain its shape.
public let args: AnyCodable?
public let relaysessionid: String?
/// Memberwise initializer; each argument is stored unchanged.
public init(
sessionkey: String,
callid: String,
name: String,
args: AnyCodable?,
relaysessionid: String?)
{
self.sessionkey = sessionkey
self.callid = callid
self.name = name
self.args = args
self.relaysessionid = relaysessionid
}
// The Swift property names are all-lowercase; the raw values below restore the camelCase
// keys used when encoding and decoding.
private enum CodingKeys: String, CodingKey {
case sessionkey = "sessionKey"
case callid = "callId"
case name
case args
case relaysessionid = "relaySessionId"
}
}
/// Codable, Sendable result type carrying a `runid` and an `idempotencykey`, both required.
/// NOTE(review): presumably returned for a `TalkRealtimeToolCallParams` request (inferred from
/// the name); what the two identifiers refer to is not visible here.
public struct TalkRealtimeToolCallResult: Codable, Sendable {
public let runid: String
public let idempotencykey: String
/// Memberwise initializer; both arguments are stored unchanged.
public init(
runid: String,
idempotencykey: String)
{
self.runid = runid
self.idempotencykey = idempotencykey
}
// Encoded under the camelCase keys "runId" and "idempotencyKey".
private enum CodingKeys: String, CodingKey {
case runid = "runId"
case idempotencykey = "idempotencyKey"
}
}
/// Codable, Sendable parameter type with a single optional `provider`.
/// NOTE(review): presumably the request body for opening a transcription session; what the
/// receiver does when `provider` is nil is not visible here.
public struct TalkTranscriptionSessionParams: Codable, Sendable {
public let provider: String?
/// Stores the optional `provider` unchanged.
public init(
provider: String?)
{
self.provider = provider
}
// Encoded under the property name itself (no renaming).
private enum CodingKeys: String, CodingKey {
case provider
}
}
/// Codable, Sendable result type describing a transcription session: `provider`, `mode`,
/// `transport`, the `transcriptionsessionid`, an `audio` dictionary and `expiresat`.
/// All properties are required when decoding. `mode` and `transport` are plain strings here,
/// whereas the handoff result types declare them as `AnyCodable`.
public struct TalkTranscriptionSessionResult: Codable, Sendable {
public let provider: String
public let mode: String
public let transport: String
public let transcriptionsessionid: String
// Free-form keyed payload; its keys are not defined in this struct.
public let audio: [String: AnyCodable]
// Presumably a timestamp; the unit is not established here — TODO confirm against the schema.
public let expiresat: Double
/// Memberwise initializer; each argument is stored unchanged.
public init(
provider: String,
mode: String,
transport: String,
transcriptionsessionid: String,
audio: [String: AnyCodable],
expiresat: Double)
{
self.provider = provider
self.mode = mode
self.transport = transport
self.transcriptionsessionid = transcriptionsessionid
self.audio = audio
self.expiresat = expiresat
}
// The Swift property names are all-lowercase; the raw values below restore the camelCase
// keys used when encoding and decoding.
private enum CodingKeys: String, CodingKey {
case provider
case mode
case transport
case transcriptionsessionid = "transcriptionSessionId"
case audio
case expiresat = "expiresAt"
}
}
/// Codable, Sendable parameter type pairing a `transcriptionsessionid` with an `audiobase64`
/// string, both required.
/// NOTE(review): the property name suggests base64-encoded audio; the audio format is not
/// established in this struct.
public struct TalkTranscriptionRelayAudioParams: Codable, Sendable {
public let transcriptionsessionid: String
public let audiobase64: String
/// Memberwise initializer; both arguments are stored unchanged (no encoding is performed here).
public init(
transcriptionsessionid: String,
audiobase64: String)
{
self.transcriptionsessionid = transcriptionsessionid
self.audiobase64 = audiobase64
}
// Encoded under the camelCase keys "transcriptionSessionId" and "audioBase64".
private enum CodingKeys: String, CodingKey {
case transcriptionsessionid = "transcriptionSessionId"
case audiobase64 = "audioBase64"
}
}
/// Codable, Sendable parameter type carrying a required `transcriptionsessionid` and an
/// optional free-text `reason`.
/// NOTE(review): presumably the request body for cancelling a transcription relay session
/// (inferred from the name).
public struct TalkTranscriptionRelayCancelParams: Codable, Sendable {
public let transcriptionsessionid: String
public let reason: String?
/// Memberwise initializer; both arguments are stored unchanged.
public init(
transcriptionsessionid: String,
reason: String?)
{
self.transcriptionsessionid = transcriptionsessionid
self.reason = reason
}
// `transcriptionsessionid` is encoded under the camelCase key "transcriptionSessionId".
private enum CodingKeys: String, CodingKey {
case transcriptionsessionid = "transcriptionSessionId"
case reason
}
}
/// Codable, Sendable parameter type carrying only a required `transcriptionsessionid`.
/// NOTE(review): presumably the request body for stopping a transcription relay session
/// (inferred from the name).
public struct TalkTranscriptionRelayStopParams: Codable, Sendable {
public let transcriptionsessionid: String
/// Stores the required session id unchanged.
public init(
transcriptionsessionid: String)
{
self.transcriptionsessionid = transcriptionsessionid
}
// Encoded under the camelCase key "transcriptionSessionId".
private enum CodingKeys: String, CodingKey {
case transcriptionsessionid = "transcriptionSessionId"
}
}
/// Codable, Sendable acknowledgement type with a single required `ok` flag.
/// NOTE(review): which transcription relay calls return it is not visible in this section.
public struct TalkTranscriptionRelayOkResult: Codable, Sendable {
public let ok: Bool
/// Stores the `ok` flag unchanged.
public init(
ok: Bool)
{
self.ok = ok
}
// Encoded under the property name itself (no renaming).
private enum CodingKeys: String, CodingKey {
case ok
}
}
public struct TalkSpeakParams: Codable, Sendable {
public let text: String
public let voiceid: String?

View File

@@ -0,0 +1,50 @@
import OpenClawKit
import Testing
@testable import OpenClawChatUI
/// Exercises `OpenClawChatEventText.assistantText(from:)` against three message shapes:
/// an assistant message with several text parts, a user message, and an assistant message
/// whose content is a plain string.
struct ChatEventTextTests {
    /// Builds the event every test uses: run "run-1", session "main", no error message.
    /// Only the state and the message body vary between tests.
    private func makeEvent(state: String, message: AnyCodable) -> OpenClawChatEventPayload {
        OpenClawChatEventPayload(
            runId: "run-1",
            sessionKey: "main",
            state: state,
            message: message,
            errorMessage: nil)
    }

    @Test func `extracts assistant text from final chat event message`() {
        let multiPartReply = self.makeEvent(
            state: "final",
            message: AnyCodable([
                "role": "assistant",
                "content": [
                    ["type": "text", "text": "hello"],
                    ["type": "text", "text": "world"],
                ],
            ]))

        // The two text parts are expected to come back joined by a newline.
        let extracted = OpenClawChatEventText.assistantText(from: multiPartReply)
        #expect(extracted == "hello\nworld")
    }

    @Test func `ignores user messages`() {
        let userDelta = self.makeEvent(
            state: "delta",
            message: AnyCodable([
                "role": "user",
                "content": [["type": "text", "text": "ignore me"]],
            ]))

        let extracted = OpenClawChatEventText.assistantText(from: userDelta)
        #expect(extracted == nil)
    }

    @Test func `extracts plain string content`() {
        let plainReply = self.makeEvent(
            state: "final",
            message: AnyCodable([
                "role": "assistant",
                "content": "plain reply",
            ]))

        let extracted = OpenClawChatEventText.assistantText(from: plainReply)
        #expect(extracted == "plain reply")
    }
}

View File

@@ -534,6 +534,7 @@ describeLive("android node capability integration (preconditioned)", () => {
const allowlist = resolveNodeCommandAllowlist(cfg, {
platform: target.platform,
deviceFamily: target.deviceFamily,
commands,
});
commandsToRun = commands.filter(

View File

@@ -1,5 +1,10 @@
import { describe, expect, it } from "vitest";
import { normalizeDeclaredNodeCommands } from "./node-command-policy.js";
import type { OpenClawConfig } from "../config/types.openclaw.js";
import {
isNodeCommandAllowed,
normalizeDeclaredNodeCommands,
resolveNodeCommandAllowlist,
} from "./node-command-policy.js";
describe("gateway/node-command-policy", () => {
it("normalizes declared node commands against the allowlist", () => {
@@ -11,4 +16,43 @@ describe("gateway/node-command-policy", () => {
}),
).toEqual(["canvas.snapshot", "system.run"]);
});
it("allows declared push-to-talk commands on trusted talk-capable nodes", () => {
  // An empty config means only the built-in defaults plus the talk surface decide the allowlist.
  const emptyConfig = {} as OpenClawConfig;
  const pttCommands = ["talk.ptt.start", "talk.ptt.stop", "talk.ptt.cancel", "talk.ptt.once"];

  ["ios", "android", "macos", "other"].forEach((platform) => {
    const allowed = resolveNodeCommandAllowlist(emptyConfig, { platform, caps: ["talk"] });

    // Every push-to-talk command must be present for a node that advertises the talk capability.
    for (const pttCommand of pttCommands) {
      expect(allowed.has(pttCommand)).toBe(true);
    }

    // A declared and allowlisted command passes the policy check.
    const verdict = isNodeCommandAllowed({
      command: "talk.ptt.start",
      declaredCommands: ["talk.ptt.start"],
      allowlist: allowed,
    });
    expect(verdict).toEqual({ ok: true });
  });
});
it("does not allow push-to-talk commands from platform label alone", () => {
  // The node reports an Android platform but neither the talk capability nor any talk command.
  const nodeWithoutTalk = {
    platform: "android",
    caps: ["device"],
    commands: [],
  };
  const allowed = resolveNodeCommandAllowlist({} as OpenClawConfig, nodeWithoutTalk);

  expect(allowed.has("talk.ptt.start")).toBe(false);
});
it("allows push-to-talk commands when the node declares talk command support", () => {
  // No `caps` at all: declaring a talk.* command is enough to unlock the PTT commands.
  const nodeDeclaringTalkCommand = {
    platform: "custom",
    commands: ["talk.ptt.start"],
  };
  const allowed = resolveNodeCommandAllowlist({} as OpenClawConfig, nodeDeclaringTalkCommand);

  expect(allowed.has("talk.ptt.start")).toBe(true);
});
});

View File

@@ -5,6 +5,7 @@ import {
NODE_SYSTEM_RUN_COMMANDS,
} from "../infra/node-commands.js";
import { getActiveRuntimePluginRegistry } from "../plugins/active-runtime-registry.js";
import { normalizeOptionalLowercaseString } from "../shared/string-coerce.js";
import { normalizeDeviceMetadataForPolicy } from "./device-metadata-normalization.js";
import type { NodeSession } from "./node-registry.js";
@@ -49,6 +50,8 @@ const MOTION_COMMANDS = ["motion.activity", "motion.pedometer"];
const SMS_DANGEROUS_COMMANDS = ["sms.send", "sms.search"];
// Push-to-talk node commands. resolveNodeCommandAllowlist adds them to the allowlist only when
// hasTalkSurface(node) is true; they are not part of any platform default.
const TALK_PTT_COMMANDS = ["talk.ptt.start", "talk.ptt.stop", "talk.ptt.cancel", "talk.ptt.once"];
// iOS nodes don't implement system.run/which, but they do support notifications.
const IOS_SYSTEM_COMMANDS = [NODE_SYSTEM_NOTIFY_COMMAND];
@@ -197,17 +200,35 @@ export function listDangerousPluginNodeCommands(): string[] {
return [...new Set(commands.map((command) => command.trim()).filter(Boolean))];
}
// Node description accepted by the command policy: `platform` and `deviceFamily` keep whatever
// optionality NodeSession gives them, while `caps` and `commands` may always be omitted
// (callers that only know the platform can leave them out).
type NodeCommandPolicyNode = Pick<NodeSession, "platform" | "deviceFamily"> &
Partial<Pick<NodeSession, "caps" | "commands">>;
/**
 * Reports whether a node exposes a Talk surface.
 *
 * A node qualifies when it lists the `talk` capability or declares at least one command that
 * starts with `talk.`. Both checks compare against the value returned by
 * `normalizeOptionalLowercaseString`. A missing node, or missing `caps`/`commands`, counts as
 * "no talk surface".
 */
function hasTalkSurface(node?: NodeCommandPolicyNode): boolean {
  if (!node) {
    return false;
  }

  const capabilities = node.caps ?? [];
  if (capabilities.some((entry) => normalizeOptionalLowercaseString(entry) === "talk")) {
    return true;
  }

  const declaredCommands = node.commands ?? [];
  return declaredCommands.some((entry) =>
    normalizeOptionalLowercaseString(entry)?.startsWith("talk."),
  );
}
export function resolveNodeCommandAllowlist(
cfg: OpenClawConfig,
node?: Pick<NodeSession, "platform" | "deviceFamily">,
node?: NodeCommandPolicyNode,
): Set<string> {
const platformId = normalizePlatformId(node?.platform, node?.deviceFamily);
const base = PLATFORM_DEFAULTS[platformId] ?? PLATFORM_DEFAULTS.unknown;
const talkCommands = hasTalkSurface(node) ? TALK_PTT_COMMANDS : [];
const extra = cfg.gateway?.nodes?.allowCommands ?? [];
const deny = new Set(cfg.gateway?.nodes?.denyCommands ?? []);
const dangerousPluginCommands = new Set(listDangerousPluginNodeCommands());
const allow = new Set(
[...base, ...extra]
[...base, ...talkCommands, ...extra]
.map((cmd) => cmd.trim())
.filter((cmd) => cmd && !dangerousPluginCommands.has(cmd)),
);

View File

@@ -62,6 +62,8 @@ export async function reconcileNodePairingOnConnect(params: {
const allowlist = resolveNodeCommandAllowlist(params.cfg, {
platform: params.connectParams.client.platform,
deviceFamily: params.connectParams.client.deviceFamily,
caps: params.connectParams.caps,
commands: params.connectParams.commands,
});
const declared = normalizeDeclaredNodeCommands({
declaredCommands: Array.isArray(params.connectParams.commands)

View File

@@ -405,6 +405,66 @@ describe("node.invoke APNs wake path", () => {
expect(call?.[1]).toMatchObject({ ok: true, nodeId: "ios-node-reconnect" });
});
it("broadcasts canonical Talk capture events for successful PTT node commands", async () => {
  const respondSpy = vi.fn();
  const broadcastSpy = vi.fn();

  // Registry stub: the node declares talk.ptt.start and its invoke succeeds with a capture id.
  const nodeRegistry = {
    get: vi.fn(() => ({
      nodeId: "android-talk-node",
      commands: ["talk.ptt.start"],
      capabilities: ["talk"],
      platform: "android",
    })),
    invoke: vi.fn().mockResolvedValue({
      ok: true,
      payloadJSON: '{"captureId":"capture-1"}',
    }),
  };
  const handlerContext = {
    nodeRegistry,
    execApprovalManager: undefined,
    logGateway: { info: vi.fn(), warn: vi.fn() },
    getRuntimeConfig: () => mocks.getRuntimeConfig(),
    broadcast: broadcastSpy,
  };

  await nodeHandlers["node.invoke"]({
    params: {
      nodeId: "android-talk-node",
      command: "talk.ptt.start",
      idempotencyKey: "idem-talk-ptt-start",
    },
    respond: respondSpy as never,
    context: handlerContext as never,
    client: null,
    req: { type: "req", id: "req-talk-ptt", method: "node.invoke" },
    isWebchatConnect: () => false,
  });

  // The invoke itself must have been answered successfully.
  const [firstRespondCall] = respondSpy.mock.calls;
  expect(firstRespondCall?.[0]).toBe(true);

  // The capture id from the node payload is woven into the session id of the emitted event.
  const expectedTalkEvent = expect.objectContaining({
    type: "capture.started",
    sessionId: "node:android-talk-node:talk:capture-1",
    captureId: "capture-1",
    seq: expect.any(Number),
    mode: "stt-tts",
    transport: "managed-room",
    brain: "agent-consult",
    final: false,
    payload: expect.objectContaining({
      nodeId: "android-talk-node",
      command: "talk.ptt.start",
    }),
  });
  const expectedBroadcastBody = expect.objectContaining({
    nodeId: "android-talk-node",
    command: "talk.ptt.start",
    talkEvent: expectedTalkEvent,
  });
  expect(broadcastSpy).toHaveBeenCalledWith("talk.event", expectedBroadcastBody, {
    dropIfSlow: true,
  });
});
it("clears stale registrations after an invalid device token wake failure", async () => {
const registration = directRegistration("ios-node-stale");
mocks.loadApnsRegistration.mockResolvedValue(registration);

View File

@@ -66,6 +66,7 @@ import {
respondUnavailableOnThrow,
safeParseJson,
} from "./nodes.helpers.js";
import type { GatewayRequestContext } from "./shared-types.js";
import type { GatewayRequestHandlers } from "./types.js";
export {
@@ -78,6 +79,13 @@ const NODE_WAKE_THROTTLE_MS = 15_000;
const NODE_WAKE_NUDGE_THROTTLE_MS = 10 * 60_000;
const NODE_PENDING_ACTION_TTL_MS = 10 * 60_000;
const NODE_PENDING_ACTION_MAX_PER_NODE = 64;
// Node commands for which emitTalkPttNodeEvent broadcasts a `talk.event`; any other command
// makes that function return without emitting.
const TALK_PTT_COMMANDS = new Set([
"talk.ptt.start",
"talk.ptt.stop",
"talk.ptt.cancel",
"talk.ptt.once",
]);
// Last sequence number issued per synthetic talk session id (`node:<nodeId>:talk:<captureId>`).
// Module-level state; emitTalkPttNodeEvent trims it back to at most 2048 entries.
const talkPttEventSeqBySessionId = new Map<string, number>();
type NodeWakeNudgeAttempt = {
sent: boolean;
@@ -259,6 +267,8 @@ function resolveAllowedPendingNodeActions(params: {
const allowlist = resolveNodeCommandAllowlist(params.cfg, {
platform: connect?.client?.platform,
deviceFamily: connect?.client?.deviceFamily,
caps: connect?.caps,
commands: declaredCommands,
});
const allowed = pending.filter((entry) => {
const result = isNodeCommandAllowed({
@@ -304,6 +314,69 @@ function toPendingParamsJSON(params: unknown): string | undefined {
}
}
/**
 * Broadcasts a `talk.event` describing a push-to-talk node command.
 *
 * Commands outside `TALK_PTT_COMMANDS` are ignored. The event is keyed by a synthetic session
 * id, `node:<nodeId>:talk:<captureId>`. `captureId` comes from the node's payload when it is a
 * non-empty string; otherwise a random UUID is used, so such events cannot be correlated with
 * each other.
 *
 * @param params.context Gateway context; only `broadcast` is used.
 * @param params.nodeId  Id of the node the command was invoked on.
 * @param params.command The invoked node command.
 * @param params.payload Parsed payload returned by the node. Non-object values are treated as
 *                       an empty object.
 */
function emitTalkPttNodeEvent(params: {
  context: Pick<GatewayRequestContext, "broadcast">;
  nodeId: string;
  command: string;
  payload: unknown;
}): void {
  if (!TALK_PTT_COMMANDS.has(params.command)) {
    return;
  }
  const payloadObj =
    typeof params.payload === "object" && params.payload !== null
      ? (params.payload as Record<string, unknown>)
      : {};
  const captureId = normalizeOptionalString(payloadObj.captureId) ?? randomUUID();
  const sessionId = `node:${params.nodeId}:talk:${captureId}`;
  const seq = (talkPttEventSeqBySessionId.get(sessionId) ?? 0) + 1;
  // Map.set() on an existing key keeps the key's original insertion position. Delete first so
  // the session moves to the newest slot. Otherwise the eviction loop below would drop the
  // counter of a long-lived, still-active session before stale one-shot sessions; its `seq`
  // would restart at 1 and re-issue an event `id` that was already used.
  talkPttEventSeqBySessionId.delete(sessionId);
  talkPttEventSeqBySessionId.set(sessionId, seq);
  // Bound the module-level map: evict least-recently-used sessions beyond 2048 entries.
  while (talkPttEventSeqBySessionId.size > 2048) {
    const oldest = talkPttEventSeqBySessionId.keys().next().value;
    if (oldest === undefined) {
      break;
    }
    talkPttEventSeqBySessionId.delete(oldest);
  }
  // The guard above guarantees a PTT command, so the final branch is `talk.ptt.stop`.
  const type =
    params.command === "talk.ptt.start"
      ? "capture.started"
      : params.command === "talk.ptt.cancel"
        ? "capture.cancelled"
        : params.command === "talk.ptt.once"
          ? "capture.once"
          : "capture.stopped";
  // Only `start` leaves the capture open; stop, cancel and once all end it.
  const final = params.command !== "talk.ptt.start";
  const talkEvent = {
    id: `${sessionId}:${seq}`,
    type,
    sessionId,
    captureId,
    seq,
    timestamp: new Date().toISOString(),
    mode: "stt-tts",
    transport: "managed-room",
    brain: "agent-consult",
    final,
    payload: {
      nodeId: params.nodeId,
      command: params.command,
      status: normalizeOptionalString(payloadObj.status) ?? undefined,
      transcript: normalizeOptionalString(payloadObj.transcript) ?? undefined,
    },
  };
  // `dropIfSlow: true` is passed through unchanged as the broadcast option.
  params.context.broadcast(
    "talk.event",
    {
      nodeId: params.nodeId,
      command: params.command,
      talkEvent,
    },
    { dropIfSlow: true },
  );
}
export async function maybeWakeNodeWithApns(
nodeId: string,
opts?: { force?: boolean; wakeReason?: string; cfg?: OpenClawConfig },
@@ -1078,6 +1151,15 @@ export const nodeHandlers: GatewayRequestHandlers = {
);
return;
}
const payload = policyResult.payloadJSON
? safeParseJson(policyResult.payloadJSON)
: policyResult.payload;
emitTalkPttNodeEvent({
context,
nodeId,
command,
payload,
});
respond(
true,
{
@@ -1151,6 +1233,12 @@ export const nodeHandlers: GatewayRequestHandlers = {
return;
}
const payload = res.payloadJSON ? safeParseJson(res.payloadJSON) : res.payload;
emitTalkPttNodeEvent({
context,
nodeId,
command,
payload,
});
respond(
true,
{
@@ -1228,6 +1316,9 @@ function buildNodeCommandRejectionHint(
return `node command not allowed: the node (platform: ${platform}) does not support "${command}"`;
}
if (reason === "command not allowlisted") {
if (command.startsWith("talk.")) {
return `node command not allowed: "${command}" requires a trusted Talk-capable node`;
}
return `node command not allowed: "${command}" is not in the allowlist for platform "${platform}"`;
}
if (reason === "node did not declare commands") {

View File

@@ -0,0 +1,32 @@
import { describe, expect, it } from "vitest";
import type { NodeRegistry, NodeSession } from "./node-registry.js";
import { hasConnectedTalkNode } from "./server-talk-nodes.js";
/**
 * Builds a minimal NodeRegistry stand-in whose `listConnected()` returns the given nodes.
 * Each node gets index-based `nodeId`/`connId` values, empty `caps`/`commands` and a zero
 * `connectedAtMs`; fields supplied by the caller override those defaults.
 */
function registryWith(nodes: Array<Partial<NodeSession>>): NodeRegistry {
  function listConnected() {
    return nodes.map((overrides, position) => {
      return {
        nodeId: `node-${position}`,
        connId: `conn-${position}`,
        caps: [],
        commands: [],
        connectedAtMs: 0,
        ...overrides,
      };
    });
  }
  return { listConnected } as NodeRegistry;
}
describe("hasConnectedTalkNode", () => {
  it("uses explicit talk capability instead of platform names", () => {
    // An Android node without the talk capability or any talk command does not count.
    const androidWithoutTalk = registryWith([
      { platform: "android", caps: ["device"], commands: [] },
    ]);
    expect(hasConnectedTalkNode(androidWithoutTalk)).toBe(false);

    // Any platform counts once it advertises the talk capability.
    const linuxWithTalk = registryWith([{ platform: "linux", caps: ["talk"] }]);
    expect(hasConnectedTalkNode(linuxWithTalk)).toBe(true);
  });

  it("accepts nodes that declare talk command support", () => {
    // No talk capability here: declaring a talk.* command is sufficient.
    const customWithTalkCommand = registryWith([
      { platform: "custom", commands: ["talk.ptt.start"] },
    ]);
    expect(hasConnectedTalkNode(customWithTalkCommand)).toBe(true);
  });
});

View File

@@ -0,0 +1,20 @@
import { normalizeOptionalLowercaseString } from "../shared/string-coerce.js";
import type { NodeRegistry, NodeSession } from "./node-registry.js";
// Capability value that marks a node as talk-capable (compared after lowercase normalization).
const TALK_CAPABILITY = "talk";
// Any declared command starting with this prefix also marks a node as talk-capable.
const TALK_COMMAND_PREFIX = "talk.";
/**
 * Returns true when at least one node listed by `registry.listConnected()` is talk-capable,
 * as decided by `isTalkCapableNode`.
 */
export function hasConnectedTalkNode(registry: NodeRegistry): boolean {
  const connectedNodes = registry.listConnected();
  return connectedNodes.some((connected) => isTalkCapableNode(connected));
}
/**
 * Decides whether a node is talk-capable: it either lists the `talk` capability or declares a
 * command beginning with `talk.`. Both checks compare against the value returned by
 * `normalizeOptionalLowercaseString`; the node's platform is never consulted.
 */
function isTalkCapableNode(node: NodeSession): boolean {
  const advertisesTalk = node.caps.some(
    (entry) => normalizeOptionalLowercaseString(entry) === TALK_CAPABILITY,
  );
  if (advertisesTalk) {
    return true;
  }
  return node.commands.some((entry) =>
    normalizeOptionalLowercaseString(entry)?.startsWith(TALK_COMMAND_PREFIX),
  );
}

View File

@@ -133,6 +133,12 @@ function listKnownNodeCommands(cfg: OpenClawConfig): Set<string> {
}
}
}
for (const cmd of resolveNodeCommandAllowlist(baseCfg, { caps: ["talk"] })) {
const normalized = normalizeNodeCommand(cmd);
if (normalized) {
out.add(normalized);
}
}
for (const cmd of DEFAULT_DANGEROUS_NODE_COMMANDS) {
const normalized = normalizeNodeCommand(cmd);
if (normalized) {