diff --git a/CHANGELOG.md b/CHANGELOG.md index 4035bea08e9..5829546cc63 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -145,6 +145,7 @@ Docs: https://docs.openclaw.ai - Anthropic-compatible streams: recover text deltas that arrive before their matching content block, so Kimi Code and similar providers do not finish as empty `incomplete_result` replies. Fixes #76007. Thanks @vliuyt. - fix(infra): block workspace state-directory env override [AI]. (#75940) Thanks @pgondhi987. - MCP/OpenAI: normalize parameter-free tool schemas whose top-level object `properties` is missing, null, or invalid before sending tools to OpenAI, so MCP tools without params stay usable. Fixes #75362. Thanks @tolkonepiu and @SymbolStar. +- Control UI/WebChat: add server-side chat-draft microphone dictation via the existing audio transcription pipeline, avoiding browser Web Speech while keeping provider credentials on the Gateway. Fixes #47311. Thanks @jmomford. - TTS: honor explicit short `[[tts:text]]...[[/tts:text]]` blocks while keeping untagged short auto-TTS suppressed, so tagged voice replies are synthesized instead of being dropped as empty voice-only payloads. Fixes #73758. Thanks @yfge. - Hooks/doctor: warn when `hooks.transformsDir` points outside the canonical hooks transform directory, so invalid workspace skill paths get a direct recovery hint before the Gateway crash-loops. Fixes #75853. Thanks @midobk. - Proxy/audio: convert standard `FormData` bodies before proxy-backed undici fetches, so audio transcription and multipart uploads no longer send `[object FormData]` when `HTTP_PROXY` or `HTTPS_PROXY` is configured. Fixes #48554. Thanks @dco5. diff --git a/docs/nodes/audio.md b/docs/nodes/audio.md index 28e852787de..c9ace927ea6 100644 --- a/docs/nodes/audio.md +++ b/docs/nodes/audio.md @@ -17,6 +17,7 @@ title: "Audio and voice notes" 5. On success, it replaces `Body` with an `[Audio]` block and sets `{{Transcript}}`. - **Command parsing**: When transcription succeeds, `CommandBody`/`RawBody` are set to the transcript so slash commands still work. - **Verbose logging**: In `--verbose`, we log when transcription runs and when it replaces the body. +- **Control UI dictation**: The Chat composer can send a browser-recorded microphone clip to `chat.transcribeAudio`. That Gateway RPC writes the clip to a temporary local file, runs this same audio transcription pipeline, returns draft text to the browser, and deletes the temporary file. It does not create an agent run by itself. ## Auto-detection (default) diff --git a/docs/web/control-ui.md b/docs/web/control-ui.md index a4d9c27d4a7..c8771e52327 100644 --- a/docs/web/control-ui.md +++ b/docs/web/control-ui.md @@ -96,6 +96,7 @@ Imported themes are stored only in the current browser profile. They are not wri - Chat with the model via Gateway WS (`chat.history`, `chat.send`, `chat.abort`, `chat.inject`). + - Dictate into the Chat composer with server-side STT (`chat.transcribeAudio`). The browser records a short microphone clip and sends it to the Gateway, which runs the configured `tools.media.audio` transcription pipeline and returns draft text without exposing provider credentials to the browser. - Talk through browser realtime sessions. OpenAI uses direct WebRTC, Google Live uses a constrained one-use browser token over WebSocket, and backend-only realtime voice plugins use the Gateway relay transport. 
The relay keeps provider credentials on the Gateway while the browser streams microphone PCM through `talk.realtime.relay*` RPCs and sends `openclaw_agent_consult` tool calls back through `chat.send` for the larger configured OpenClaw model. - Stream tool calls + live tool output cards in Chat (agent events). @@ -149,6 +150,7 @@ Imported themes are stored only in the current browser profile. They are not wri - `chat.send` is **non-blocking**: it acks immediately with `{ runId, status: "started" }` and the response streams via `chat` events. + - `chat.transcribeAudio` is a one-shot dictation helper for Chat drafts. It accepts browser-recorded base64 audio, keeps uploads below the Gateway WebSocket frame limit, writes a temporary local file, runs media-understanding audio transcription with the active Gateway config, returns `{ text, provider, model }`, and removes the temporary file. It does not create an agent run and is separate from realtime Talk. - Chat uploads accept images plus non-video files. Images keep the native image path; other files are stored as managed media and shown in history as attachment links. - Re-sending with the same `idempotencyKey` returns `{ status: "in_flight" }` while running, and `{ status: "ok" }` after completion. - `chat.history` responses are size-bounded for UI safety. When transcript entries are too large, Gateway may truncate long text fields, omit heavy metadata blocks, and replace oversized messages with a placeholder (`[chat.history omitted: message too large]`). diff --git a/docs/web/webchat.md b/docs/web/webchat.md index 0499f607e09..5344acd14a6 100644 --- a/docs/web/webchat.md +++ b/docs/web/webchat.md @@ -22,7 +22,7 @@ Status: the macOS/iOS SwiftUI chat UI talks directly to the Gateway WebSocket. ## How it works (behavior) -- The UI connects to the Gateway WebSocket and uses `chat.history`, `chat.send`, and `chat.inject`. +- The UI connects to the Gateway WebSocket and uses `chat.history`, `chat.send`, `chat.inject`, and `chat.transcribeAudio`. - `chat.history` is bounded for stability: Gateway may truncate long text fields, omit heavy metadata, and replace oversized entries with `[chat.history omitted: message too large]`. - `chat.history` follows the active transcript branch for modern append-only session files, so abandoned rewrite branches and superseded prompt copies are not rendered in WebChat. - Control UI remembers the backing Gateway `sessionId` returned by `chat.history` and includes it on follow-up `chat.send` calls, so reconnects and page refreshes continue the same stored conversation unless the user starts or resets a session. @@ -37,6 +37,7 @@ Status: the macOS/iOS SwiftUI chat UI talks directly to the Gateway WebSocket. and assistant entries whose whole visible text is only the exact silent token `NO_REPLY` / `no_reply` are omitted. - Reasoning-flagged reply payloads (`isReasoning: true`) are excluded from WebChat assistant content, transcript replay text, and audio content blocks, so thinking-only payloads do not surface as visible assistant messages or playable audio. +- `chat.transcribeAudio` powers server-side dictation in the Control UI chat composer. The browser records microphone audio, sends it as base64 to the Gateway, and the Gateway runs the configured `tools.media.audio` pipeline. The returned transcript is inserted into the draft; no agent run is started until the user sends it. - `chat.inject` appends an assistant note directly to the transcript and broadcasts it to the UI (no agent run). 
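+
+  A minimal sketch of the `chat.transcribeAudio` dictation round-trip described above, assuming a connected Gateway WS client that exposes a `request(method, params)` helper (the helper shape and the base64 encoder here are illustrative; the RPC name, params, and "no agent run" behavior come from the bullets above):
+
+  ```ts
+  // Sketch: send a browser-recorded clip to the Gateway and merge the transcript into the draft.
+  async function dictateIntoDraft(
+    client: { request(method: string, params: unknown): Promise<unknown> }, // assumed WS client shape
+    clip: Blob,
+    draft: string,
+  ): Promise<string> {
+    const bytes = new Uint8Array(await clip.arrayBuffer());
+    let binary = "";
+    for (const b of bytes) binary += String.fromCharCode(b); // fine for short clips; use a chunked encoder for large buffers
+    const result = (await client.request("chat.transcribeAudio", {
+      audioBase64: btoa(binary),
+      mimeType: clip.type || "audio/webm",
+    })) as { text?: string };
+    // The Gateway returns draft text only; nothing runs until the user sends the message.
+    const text = result.text?.trim();
+    return text ? (draft.trimEnd() ? `${draft.trimEnd()} ${text}` : text) : draft;
+  }
+  ```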
- Aborted runs can keep partial assistant output visible in the UI. - Gateway persists aborted partial assistant text into transcript history when buffered output exists, and marks those entries with abort metadata. diff --git a/scripts/openclaw-npm-release-check.ts b/scripts/openclaw-npm-release-check.ts index b32574be439..4733141a491 100644 --- a/scripts/openclaw-npm-release-check.ts +++ b/scripts/openclaw-npm-release-check.ts @@ -427,16 +427,14 @@ function isNpmExecPath(value: string): boolean { return /^npm(?:-cli)?(?:\.(?:c?js|cmd|exe))?$/.test(basename(value).toLowerCase()); } -export function resolveNpmCommandInvocation( - params: { - npmExecPath?: string; - nodeExecPath?: string; - platform?: NodeJS.Platform; - } = {}, -): { command: string; args: string[] } { - const npmExecPath = params.npmExecPath ?? process.env.npm_execpath; - const nodeExecPath = params.nodeExecPath ?? process.execPath; - const npmCommand = (params.platform ?? process.platform) === "win32" ? "npm.cmd" : "npm"; +export function resolveNpmCommandInvocation(params?: { + npmExecPath?: string; + nodeExecPath?: string; + platform?: NodeJS.Platform; +}): { command: string; args: string[] } { + const npmExecPath = params === undefined ? process.env.npm_execpath : params.npmExecPath; + const nodeExecPath = params?.nodeExecPath ?? process.execPath; + const npmCommand = (params?.platform ?? process.platform) === "win32" ? "npm.cmd" : "npm"; if (typeof npmExecPath === "string" && npmExecPath.length > 0 && isNpmExecPath(npmExecPath)) { return { command: nodeExecPath, args: [npmExecPath] }; diff --git a/src/gateway/method-scopes.ts b/src/gateway/method-scopes.ts index af96bc0f09b..282707b7c94 100644 --- a/src/gateway/method-scopes.ts +++ b/src/gateway/method-scopes.ts @@ -151,6 +151,7 @@ const METHOD_SCOPE_GROUPS: Record = { "tools.invoke", "chat.send", "chat.abort", + "chat.transcribeAudio", "sessions.create", "sessions.send", "sessions.steer", diff --git a/src/gateway/server-methods-list.ts b/src/gateway/server-methods-list.ts index 2d3924f294e..264a77d62a9 100644 --- a/src/gateway/server-methods-list.ts +++ b/src/gateway/server-methods-list.ts @@ -155,6 +155,7 @@ const BASE_METHODS = [ "chat.history", "chat.abort", "chat.send", + "chat.transcribeAudio", ]; export function listGatewayMethods(): string[] { diff --git a/src/gateway/server-methods.ts b/src/gateway/server-methods.ts index 78a613a9188..9a87c51a8f5 100644 --- a/src/gateway/server-methods.ts +++ b/src/gateway/server-methods.ts @@ -12,6 +12,7 @@ import { agentHandlers } from "./server-methods/agent.js"; import { agentsHandlers } from "./server-methods/agents.js"; import { artifactsHandlers } from "./server-methods/artifacts.js"; import { channelsHandlers } from "./server-methods/channels.js"; +import { chatTranscribeAudioHandlers } from "./server-methods/chat-transcribe-audio.js"; import { chatHandlers } from "./server-methods/chat.js"; import { commandsHandlers } from "./server-methods/commands.js"; import { configHandlers } from "./server-methods/config.js"; @@ -85,6 +86,7 @@ export const coreGatewayHandlers: GatewayRequestHandlers = { ...healthHandlers, ...channelsHandlers, ...chatHandlers, + ...chatTranscribeAudioHandlers, ...commandsHandlers, ...cronHandlers, ...deviceHandlers, diff --git a/src/gateway/server-methods/chat-transcribe-audio.runtime.ts b/src/gateway/server-methods/chat-transcribe-audio.runtime.ts new file mode 100644 index 00000000000..647a8cab66d --- /dev/null +++ b/src/gateway/server-methods/chat-transcribe-audio.runtime.ts @@ -0,0 +1 @@ 
+export { transcribeAudioFile } from "../../media-understanding/runtime.js"; diff --git a/src/gateway/server-methods/chat-transcribe-audio.test.ts b/src/gateway/server-methods/chat-transcribe-audio.test.ts new file mode 100644 index 00000000000..416eeaf6057 --- /dev/null +++ b/src/gateway/server-methods/chat-transcribe-audio.test.ts @@ -0,0 +1,123 @@ +import fs from "node:fs/promises"; +import { afterEach, beforeEach, describe, expect, it, vi } from "vitest"; +import { ErrorCodes } from "../protocol/index.js"; +import { MAX_PAYLOAD_BYTES } from "../server-constants.js"; + +const mocks = vi.hoisted(() => ({ + transcribeAudioFile: vi.fn(async () => ({ + text: "hello from audio", + provider: "openai", + model: "gpt-4o-transcribe", + })), +})); + +vi.mock("../../media-understanding/runtime.js", () => ({ + transcribeAudioFile: + mocks.transcribeAudioFile as typeof import("../../media-understanding/runtime.js").transcribeAudioFile, +})); + +describe("chatTranscribeAudioHandlers", () => { + beforeEach(() => { + mocks.transcribeAudioFile.mockReset(); + mocks.transcribeAudioFile.mockResolvedValue({ + text: "hello from audio", + provider: "openai", + model: "gpt-4o-transcribe", + }); + }); + + afterEach(() => { + vi.restoreAllMocks(); + }); + + it("keeps the decoded audio cap below the base64 WebSocket frame limit", async () => { + const { MAX_CHAT_TRANSCRIBE_AUDIO_BYTES } = await import("./chat-transcribe-audio.js"); + const base64Bytes = Math.ceil(MAX_CHAT_TRANSCRIBE_AUDIO_BYTES / 3) * 4; + + expect(base64Bytes + 64 * 1024).toBeLessThanOrEqual(MAX_PAYLOAD_BYTES); + expect(MAX_CHAT_TRANSCRIBE_AUDIO_BYTES).toBeLessThan(20 * 1024 * 1024); + }); + + it("transcribes uploaded chat dictation audio through media understanding", async () => { + const { chatTranscribeAudioHandlers } = await import("./chat-transcribe-audio.js"); + const respond = vi.fn(); + + await chatTranscribeAudioHandlers["chat.transcribeAudio"]({ + params: { + audioDataUrl: `data:audio/webm;base64,${Buffer.from("audio").toString("base64")}`, + }, + respond, + context: { getRuntimeConfig: () => ({ tools: { media: {} } }) }, + } as never); + + expect(mocks.transcribeAudioFile).toHaveBeenCalledWith( + expect.objectContaining({ + cfg: { tools: { media: {} } }, + mime: "audio/webm", + }), + ); + const call = (mocks.transcribeAudioFile.mock.calls as unknown as Array<[{ filePath?: string }]>) + .at(0) + ?.at(0); + const filePath = call?.filePath; + expect(filePath).toMatch(/dictation\.webm$/); + await expect(fs.stat(filePath ?? 
"")).rejects.toMatchObject({ code: "ENOENT" }); + expect(respond).toHaveBeenCalledWith(true, { + text: "hello from audio", + provider: "openai", + model: "gpt-4o-transcribe", + }); + }); + + it("returns INVALID_REQUEST for missing audio payloads", async () => { + const { chatTranscribeAudioHandlers } = await import("./chat-transcribe-audio.js"); + const respond = vi.fn(); + + await chatTranscribeAudioHandlers["chat.transcribeAudio"]({ + params: {}, + respond, + context: { getRuntimeConfig: () => ({}) }, + } as never); + + expect(respond).toHaveBeenCalledWith( + false, + undefined, + expect.objectContaining({ + code: ErrorCodes.INVALID_REQUEST, + message: expect.stringContaining("requires audioDataUrl or audioBase64"), + }), + ); + expect(mocks.transcribeAudioFile).not.toHaveBeenCalled(); + }); + + it("returns UNAVAILABLE when no transcription provider is configured", async () => { + mocks.transcribeAudioFile.mockResolvedValue({ + text: undefined, + decision: { + capability: "audio", + outcome: "skipped", + attachments: [{ attempts: [] }], + }, + } as never); + const { chatTranscribeAudioHandlers } = await import("./chat-transcribe-audio.js"); + const respond = vi.fn(); + + await chatTranscribeAudioHandlers["chat.transcribeAudio"]({ + params: { + audioBase64: Buffer.from("audio").toString("base64"), + mimeType: "audio/ogg", + }, + respond, + context: { getRuntimeConfig: () => ({}) }, + } as never); + + expect(respond).toHaveBeenCalledWith( + false, + undefined, + expect.objectContaining({ + code: ErrorCodes.UNAVAILABLE, + message: expect.stringContaining("No audio transcription provider"), + }), + ); + }); +}); diff --git a/src/gateway/server-methods/chat-transcribe-audio.ts b/src/gateway/server-methods/chat-transcribe-audio.ts new file mode 100644 index 00000000000..13dbb2a84e3 --- /dev/null +++ b/src/gateway/server-methods/chat-transcribe-audio.ts @@ -0,0 +1,125 @@ +import fs from "node:fs/promises"; +import os from "node:os"; +import path from "node:path"; +import { extensionForMime, normalizeMimeType } from "../../media/mime.js"; +import { normalizeOptionalString } from "../../shared/string-coerce.js"; +import { ErrorCodes, errorShape } from "../protocol/index.js"; +import { MAX_PAYLOAD_BYTES } from "../server-constants.js"; +import { formatForLog } from "../ws-log.js"; +import type { GatewayRequestHandlers } from "./types.js"; + +type ChatTranscribeAudioRuntime = typeof import("./chat-transcribe-audio.runtime.js"); +type TranscribeAudioFileResult = Awaited< + ReturnType +>; + +let chatTranscribeAudioRuntimePromise: Promise | null = null; + +function loadChatTranscribeAudioRuntime(): Promise { + chatTranscribeAudioRuntimePromise ??= import("./chat-transcribe-audio.runtime.js"); + return chatTranscribeAudioRuntimePromise; +} + +const CHAT_TRANSCRIBE_AUDIO_WS_JSON_OVERHEAD_BYTES = 64 * 1024; +export const MAX_CHAT_TRANSCRIBE_AUDIO_BYTES = Math.floor( + ((MAX_PAYLOAD_BYTES - CHAT_TRANSCRIBE_AUDIO_WS_JSON_OVERHEAD_BYTES) * 3) / 4, +); + +function decodeAudioPayload(params: Record): { + data: Buffer; + mime?: string; +} { + const dataUrl = normalizeOptionalString(params.audioDataUrl); + const rawBase64 = normalizeOptionalString(params.audioBase64); + const explicitMime = normalizeMimeType(normalizeOptionalString(params.mimeType)); + + if (dataUrl) { + const match = /^data:([^;,]+)?(?:;[^,]*)?;base64,(.*)$/s.exec(dataUrl); + if (!match) { + throw new Error("chat.transcribeAudio requires a base64 data URL"); + } + const mime = normalizeMimeType(match[1]) ?? 
explicitMime; + return { data: Buffer.from(match[2] ?? "", "base64"), mime }; + } + + if (rawBase64) { + return { data: Buffer.from(rawBase64, "base64"), mime: explicitMime }; + } + + throw new Error("chat.transcribeAudio requires audioDataUrl or audioBase64"); +} + +function extensionForAudioMime(mime?: string): string { + if (mime === "audio/webm") { + return ".webm"; + } + return extensionForMime(mime) ?? ".audio"; +} + +function isMissingMediaUnderstandingProvider(result: TranscribeAudioFileResult) { + const decision = result.decision; + return ( + decision?.outcome === "skipped" && + decision.attachments.length > 0 && + decision.attachments.every((attachment) => attachment.attempts.length === 0) + ); +} + +export const chatTranscribeAudioHandlers: GatewayRequestHandlers = { + "chat.transcribeAudio": async ({ params, respond, context }) => { + let decoded: ReturnType; + try { + decoded = decodeAudioPayload(params); + } catch (err) { + respond(false, undefined, errorShape(ErrorCodes.INVALID_REQUEST, formatForLog(err))); + return; + } + + if (decoded.data.byteLength === 0) { + respond(false, undefined, errorShape(ErrorCodes.INVALID_REQUEST, "Audio payload is empty")); + return; + } + if (decoded.data.byteLength > MAX_CHAT_TRANSCRIBE_AUDIO_BYTES) { + respond( + false, + undefined, + errorShape( + ErrorCodes.INVALID_REQUEST, + `Audio payload exceeds ${MAX_CHAT_TRANSCRIBE_AUDIO_BYTES} bytes`, + ), + ); + return; + } + + const tmpDir = await fs.mkdtemp(path.join(os.tmpdir(), "openclaw-chat-stt-")); + const filePath = path.join(tmpDir, `dictation${extensionForAudioMime(decoded.mime)}`); + try { + await fs.writeFile(filePath, decoded.data); + const { transcribeAudioFile } = await loadChatTranscribeAudioRuntime(); + const result = await transcribeAudioFile({ + filePath, + cfg: context.getRuntimeConfig(), + mime: decoded.mime, + language: normalizeOptionalString(params.language), + prompt: normalizeOptionalString(params.prompt), + }); + const text = result.text?.trim(); + if (!text) { + const message = isMissingMediaUnderstandingProvider(result) + ? "No audio transcription provider is configured or ready. Configure tools.media.audio.models." + : "No transcript returned for audio"; + respond(false, undefined, errorShape(ErrorCodes.UNAVAILABLE, message)); + return; + } + respond(true, { + text, + provider: result.provider ?? null, + model: result.model ?? 
null, + }); + } catch (err) { + respond(false, undefined, errorShape(ErrorCodes.UNAVAILABLE, formatForLog(err))); + } finally { + await fs.rm(tmpDir, { recursive: true, force: true }); + } + }, +}; diff --git a/ui/src/styles/chat/layout.css b/ui/src/styles/chat/layout.css index 79813602a16..2e4fc2be528 100644 --- a/ui/src/styles/chat/layout.css +++ b/ui/src/styles/chat/layout.css @@ -663,10 +663,19 @@ background: color-mix(in srgb, var(--danger, #ef4444) 14%, transparent); } +.agent-chat__input-btn--dictating { + color: var(--danger, #ef4444); + background: color-mix(in srgb, var(--danger, #ef4444) 14%, transparent); +} + .agent-chat__talk-status { color: var(--text); } +.agent-chat__dictation-status { + color: var(--text); +} + .agent-chat__input-divider { width: 1px; height: 16px; diff --git a/ui/src/ui/app-chat.test.ts b/ui/src/ui/app-chat.test.ts index 12893fbd365..8b20ea2bbc2 100644 --- a/ui/src/ui/app-chat.test.ts +++ b/ui/src/ui/app-chat.test.ts @@ -44,6 +44,7 @@ let handleAbortChat: typeof import("./app-chat.ts").handleAbortChat; let refreshChatAvatar: typeof import("./app-chat.ts").refreshChatAvatar; let clearPendingQueueItemsForRun: typeof import("./app-chat.ts").clearPendingQueueItemsForRun; let removeQueuedMessage: typeof import("./app-chat.ts").removeQueuedMessage; +let transcribeChatAudio: typeof import("./app-chat.ts").transcribeChatAudio; async function loadChatHelpers(): Promise { ({ @@ -54,6 +55,7 @@ async function loadChatHelpers(): Promise { refreshChatAvatar, clearPendingQueueItemsForRun, removeQueuedMessage, + transcribeChatAudio, } = await import("./app-chat.ts")); } @@ -103,12 +105,73 @@ function makeHost(overrides?: Partial): ChatHost { toolStreamById: new Map(), toolStreamOrder: [], toolStreamSyncTimer: null, + chatDictationStatus: "idle", + chatDictationDetail: null, updateComplete: Promise.resolve(), ...overrides, }; return host as ChatHost; } +describe("transcribeChatAudio", () => { + beforeAll(async () => { + await loadChatHelpers(); + }); + + it("sends recorded audio to the gateway and appends the transcript to the draft", async () => { + const request = vi.fn(async () => ({ text: "new words" })); + const host = makeHost({ + client: { request } as never, + chatMessage: "existing", + }); + + await transcribeChatAudio(host, new Blob([new Uint8Array([1, 2, 3])], { type: "audio/webm" })); + + expect(request).toHaveBeenCalledWith("chat.transcribeAudio", { + audioBase64: "AQID", + mimeType: "audio/webm", + }); + expect(host.chatMessage).toBe("existing new words"); + expect(host.chatDictationStatus).toBe("idle"); + expect(host.chatDictationDetail).toBeNull(); + }); + + it("surfaces gateway transcription errors without changing the draft", async () => { + const request = vi.fn(async () => { + throw new Error("no provider"); + }); + const host = makeHost({ + client: { request } as never, + chatMessage: "existing", + }); + + await transcribeChatAudio(host, new Blob([new Uint8Array([1])], { type: "audio/ogg" })); + + expect(host.chatMessage).toBe("existing"); + expect(host.chatDictationStatus).toBe("error"); + expect(host.chatDictationDetail).toBe("no provider"); + expect(host.lastError).toBe("no provider"); + }); + + it("rejects oversized dictation before sending it over the gateway socket", async () => { + const request = vi.fn(); + const host = makeHost({ + client: { request } as never, + chatMessage: "existing", + }); + + await transcribeChatAudio( + host, + new Blob([new Uint8Array(18 * 1024 * 1024 + 1)], { type: "audio/webm" }), + ); + + 
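+    // Hedged note: the 18 MiB CHAT_TRANSCRIBE_AUDIO_MAX_BYTES client cap is meant to stay under the
+    // Gateway's decoded MAX_CHAT_TRANSCRIBE_AUDIO_BYTES, so oversized clips fail fast in the browser
+    // and are never base64-encoded onto the WebSocket.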
expect(request).not.toHaveBeenCalled(); + expect(host.chatMessage).toBe("existing"); + expect(host.chatDictationStatus).toBe("error"); + expect(host.chatDictationDetail).toContain("too large"); + }); +}); + function createSessionsResult(sessions: GatewaySessionRow[]): SessionsListResult { return { ts: 0, diff --git a/ui/src/ui/app-chat.ts b/ui/src/ui/app-chat.ts index 1c458005687..d2d8717d070 100644 --- a/ui/src/ui/app-chat.ts +++ b/ui/src/ui/app-chat.ts @@ -17,6 +17,7 @@ import { type ChatInputHistoryKeyResult, type ChatInputHistoryState, } from "./chat/input-history.ts"; +import { bytesToBase64 } from "./chat/realtime-talk-audio.ts"; import type { ChatSideResult } from "./chat/side-result.ts"; import { executeSlashCommand } from "./chat/slash-command-executor.ts"; import { parseSlashCommand, refreshSlashCommands } from "./chat/slash-commands.ts"; @@ -68,10 +69,22 @@ export type ChatHost = ChatInputHistoryState & { refreshSessionsAfterChat: Set; pendingAbort?: { runId?: string | null; sessionKey: string } | null; chatSubmitGuards?: Map>; + chatDictationStatus?: ChatDictationStatus; + chatDictationDetail?: string | null; /** Callback for slash-command side effects that need app-level access. */ onSlashAction?: (action: string) => void | Promise; }; +export type ChatDictationStatus = "idle" | "starting" | "recording" | "transcribing" | "error"; + +type ChatTranscribeAudioResult = { + text?: unknown; + provider?: unknown; + model?: unknown; +}; + +export const CHAT_TRANSCRIBE_AUDIO_MAX_BYTES = 18 * 1024 * 1024; + export type ChatSendOptions = { confirmReset?: boolean; restoreDraft?: boolean; @@ -123,6 +136,60 @@ export function isChatStopCommand(text: string) { ); } +function appendDictationText(draft: string, transcript: string): string { + const text = transcript.trim(); + if (!text) { + return draft; + } + const current = draft.trimEnd(); + return current ? `${current} ${text}` : text; +} + +export async function transcribeChatAudio(host: ChatHost, audio: Blob): Promise { + if (!host.client || !host.connected) { + host.chatDictationStatus = "error"; + host.chatDictationDetail = "Gateway not connected"; + host.lastError = host.chatDictationDetail; + return null; + } + if (audio.size <= 0) { + host.chatDictationStatus = "error"; + host.chatDictationDetail = "No audio captured"; + host.lastError = host.chatDictationDetail; + return null; + } + if (audio.size > CHAT_TRANSCRIBE_AUDIO_MAX_BYTES) { + host.chatDictationStatus = "error"; + host.chatDictationDetail = `Audio clip is too large for WebChat dictation. Keep recordings under ${CHAT_TRANSCRIBE_AUDIO_MAX_BYTES} bytes.`; + host.lastError = host.chatDictationDetail; + return null; + } + + host.chatDictationStatus = "transcribing"; + host.chatDictationDetail = "Transcribing dictation..."; + try { + const bytes = new Uint8Array(await audio.arrayBuffer()); + const mimeType = audio.type || "audio/webm"; + const result = await host.client.request("chat.transcribeAudio", { + audioBase64: bytesToBase64(bytes), + mimeType, + }); + const transcript = typeof result.text === "string" ? result.text.trim() : ""; + if (!transcript) { + throw new Error("No transcript returned"); + } + host.chatMessage = appendDictationText(host.chatMessage, transcript); + host.chatDictationStatus = "idle"; + host.chatDictationDetail = null; + return transcript; + } catch (err) { + host.chatDictationStatus = "error"; + host.chatDictationDetail = err instanceof Error ? 
err.message : String(err); + host.lastError = host.chatDictationDetail; + return null; + } +} + function isChatResetCommand(text: string) { const trimmed = text.trim(); if (!trimmed) { diff --git a/ui/src/ui/app-gateway.ts b/ui/src/ui/app-gateway.ts index ce19800fee7..ca7a18cfb60 100644 --- a/ui/src/ui/app-gateway.ts +++ b/ui/src/ui/app-gateway.ts @@ -103,6 +103,7 @@ type GatewayHost = { sessionKey: string; chatRunId: string | null; pendingAbort?: { runId?: string | null; sessionKey: string } | null; + cancelChatDictation?: () => void; refreshSessionsAfterChat: Set; execApprovalQueue: ExecApprovalRequest[]; execApprovalError: string | null; @@ -483,6 +484,7 @@ export function connectGateway(host: GatewayHost, options?: ConnectGatewayOption return; } host.connected = false; + host.cancelChatDictation?.(); // Code 1012 = Service Restart (expected during config saves, don't show as error) host.lastErrorCode = resolveGatewayErrorDetailCode(error) ?? diff --git a/ui/src/ui/app-lifecycle.node.test.ts b/ui/src/ui/app-lifecycle.node.test.ts index 23d3129d887..2dd84558bed 100644 --- a/ui/src/ui/app-lifecycle.node.test.ts +++ b/ui/src/ui/app-lifecycle.node.test.ts @@ -34,6 +34,8 @@ describe("handleDisconnected", () => { }); const removeSpy = vi.spyOn(window, "removeEventListener").mockImplementation(() => undefined); const host = createHost(); + const cancelChatDictation = vi.fn(); + Object.assign(host, { cancelChatDictation }); const disconnectSpy = ( host.topbarObserver as unknown as { disconnect: ReturnType } ).disconnect; @@ -42,6 +44,7 @@ describe("handleDisconnected", () => { expect(removeSpy).toHaveBeenCalledWith("popstate", host.popStateHandler); expect(host.connectGeneration).toBe(1); + expect(cancelChatDictation).toHaveBeenCalledTimes(1); expect(host.client).toBeNull(); expect(host.connected).toBe(false); expect(disconnectSpy).toHaveBeenCalledTimes(1); diff --git a/ui/src/ui/app-lifecycle.ts b/ui/src/ui/app-lifecycle.ts index 784b9101e59..0bd5b0db483 100644 --- a/ui/src/ui/app-lifecycle.ts +++ b/ui/src/ui/app-lifecycle.ts @@ -41,6 +41,7 @@ type LifecycleHost = { realtimeTalkStatus?: string; realtimeTalkDetail?: string | null; realtimeTalkTranscript?: string | null; + cancelChatDictation?: () => void; chatLoading: boolean; chatMessages: unknown[]; chatToolMessages: unknown[]; @@ -91,6 +92,7 @@ export function handleDisconnected(host: LifecycleHost) { host.realtimeTalkStatus = "idle"; host.realtimeTalkDetail = null; host.realtimeTalkTranscript = null; + host.cancelChatDictation?.(); host.client?.stop(); host.client = null; host.connected = false; diff --git a/ui/src/ui/app-render.ts b/ui/src/ui/app-render.ts index c73772b00e8..f8dfbd8e9d8 100644 --- a/ui/src/ui/app-render.ts +++ b/ui/src/ui/app-render.ts @@ -2342,6 +2342,8 @@ export function renderApp(state: AppViewState) { realtimeTalkStatus: state.realtimeTalkStatus, realtimeTalkDetail: state.realtimeTalkDetail, realtimeTalkTranscript: state.realtimeTalkTranscript, + chatDictationStatus: state.chatDictationStatus, + chatDictationDetail: state.chatDictationDetail, connected: state.connected, canSend: state.connected, disabledReason: chatDisabledReason, @@ -2373,6 +2375,7 @@ export function renderApp(state: AppViewState) { onSend: () => state.handleSendChat(), onCompact: () => state.handleSendChat("/compact", { restoreDraft: true }), onToggleRealtimeTalk: () => state.toggleRealtimeTalk(), + onToggleChatDictation: () => state.toggleChatDictation(), canAbort: hasAbortableSessionRun(state), onAbort: () => void state.handleAbortChat(), 
onQueueRemove: (id) => state.removeQueuedMessage(id), diff --git a/ui/src/ui/app-view-state.ts b/ui/src/ui/app-view-state.ts index f221065c8a8..4405dd878d4 100644 --- a/ui/src/ui/app-view-state.ts +++ b/ui/src/ui/app-view-state.ts @@ -1,4 +1,4 @@ -import type { ChatSendOptions } from "./app-chat.ts"; +import type { ChatDictationStatus, ChatSendOptions } from "./app-chat.ts"; import type { EventLogEntry } from "./app-events.ts"; import type { CompactionStatus, FallbackStatus } from "./app-tool-stream.ts"; import type { ChatInputHistoryKeyInput, ChatInputHistoryKeyResult } from "./chat/input-history.ts"; @@ -119,6 +119,8 @@ export type AppViewState = { realtimeTalkStatus: RealtimeTalkStatus; realtimeTalkDetail: string | null; realtimeTalkTranscript: string | null; + chatDictationStatus: ChatDictationStatus; + chatDictationDetail: string | null; chatManualRefreshInFlight: boolean; chatMobileControlsOpen: boolean; nodesLoading: boolean; @@ -470,6 +472,7 @@ export type AppViewState = { resetChatInputHistoryNavigation: () => void; handleSendChat: (messageOverride?: string, opts?: ChatSendOptions) => Promise; toggleRealtimeTalk: () => Promise; + toggleChatDictation: () => Promise; steerQueuedChatMessage: (id: string) => Promise; handleAbortChat: () => Promise; removeQueuedMessage: (id: string) => void; diff --git a/ui/src/ui/app.test.ts b/ui/src/ui/app.test.ts new file mode 100644 index 00000000000..72a57758f5d --- /dev/null +++ b/ui/src/ui/app.test.ts @@ -0,0 +1,205 @@ +/* @vitest-environment jsdom */ + +import { afterEach, beforeEach, describe, expect, it, vi } from "vitest"; + +const { transcribeChatAudioMock } = vi.hoisted(() => ({ + transcribeChatAudioMock: vi.fn(), +})); + +vi.mock("./app-chat.ts", async (importOriginal) => { + const actual = await importOriginal(); + return { + ...actual, + transcribeChatAudio: transcribeChatAudioMock, + }; +}); + +class MockMediaRecorder extends EventTarget { + static instances: MockMediaRecorder[] = []; + static isTypeSupported = vi.fn((mimeType: string) => mimeType === "audio/webm"); + + readonly mimeType: string; + state: RecordingState = "inactive"; + + constructor( + readonly stream: MediaStream, + options?: MediaRecorderOptions, + ) { + super(); + this.mimeType = options?.mimeType ?? 
""; + MockMediaRecorder.instances.push(this); + } + + start() { + this.state = "recording"; + } + + stop() { + this.state = "inactive"; + this.dispatchEvent(new Event("stop")); + } + + emitData(data: Blob) { + const event = new Event("dataavailable") as Event & { data: Blob }; + Object.defineProperty(event, "data", { value: data }); + this.dispatchEvent(event); + } + + emitError(message: string) { + const event = new Event("error") as Event & { error: Error; message: string }; + Object.defineProperty(event, "error", { value: new Error(message) }); + Object.defineProperty(event, "message", { value: message }); + this.dispatchEvent(event); + } +} + +type AppWithDictationInternals = { + client: unknown; + connected: boolean; + chatDictationStatus: string; + chatDictationDetail: string | null; + chatDictationChunks: Blob[]; + toggleChatDictation: () => Promise; + cancelChatDictation: () => void; +}; + +let originalMediaDevices: PropertyDescriptor | undefined; + +function createDeferred() { + let resolve!: (value: T) => void; + let reject!: (error: unknown) => void; + const promise = new Promise((resolvePromise, rejectPromise) => { + resolve = resolvePromise; + reject = rejectPromise; + }); + return { promise, resolve, reject }; +} + +function createMockStream(track = { stop: vi.fn() }) { + return { + getTracks: () => [track], + track, + } as unknown as MediaStream & { track: { stop: ReturnType } }; +} + +async function createRecordingApp() { + const { OpenClawApp } = await import("./app.ts"); + const app = new OpenClawApp(); + app.client = { request: vi.fn() } as never; + app.connected = true; + return app as unknown as AppWithDictationInternals; +} + +describe("OpenClawApp dictation recorder lifecycle", () => { + beforeEach(() => { + transcribeChatAudioMock.mockReset(); + transcribeChatAudioMock.mockResolvedValue(null); + MockMediaRecorder.instances = []; + MockMediaRecorder.isTypeSupported.mockClear(); + vi.stubGlobal("MediaRecorder", MockMediaRecorder); + originalMediaDevices = Object.getOwnPropertyDescriptor(globalThis.navigator, "mediaDevices"); + Object.defineProperty(globalThis.navigator, "mediaDevices", { + configurable: true, + value: { + getUserMedia: vi.fn(async () => createMockStream()), + }, + }); + }); + + afterEach(() => { + if (originalMediaDevices) { + Object.defineProperty(globalThis.navigator, "mediaDevices", originalMediaDevices); + } else { + Reflect.deleteProperty(globalThis.navigator, "mediaDevices"); + } + vi.unstubAllGlobals(); + }); + + it("does not submit collected audio after a recorder error and later stop", async () => { + const app = await createRecordingApp(); + await app.toggleChatDictation(); + const recorder = MockMediaRecorder.instances[0]; + + recorder.emitData(new Blob(["audio"], { type: "audio/webm" })); + recorder.emitError("microphone failed"); + recorder.emitData(new Blob(["late audio"], { type: "audio/webm" })); + recorder.stop(); + + expect(transcribeChatAudioMock).not.toHaveBeenCalled(); + expect(app.chatDictationStatus).toBe("error"); + expect(app.chatDictationDetail).toBe("microphone failed"); + expect(app.chatDictationChunks).toEqual([]); + }); + + it("releases recorded chunks after copying them for normal transcription", async () => { + const app = await createRecordingApp(); + await app.toggleChatDictation(); + const recorder = MockMediaRecorder.instances[0]; + recorder.emitData(new Blob(["audio"], { type: "audio/webm" })); + const transcription = createDeferred(); + transcribeChatAudioMock.mockReturnValueOnce(transcription.promise); + + await 
app.toggleChatDictation(); + + expect(app.chatDictationChunks).toEqual([]); + expect(transcribeChatAudioMock).toHaveBeenCalledTimes(1); + expect(transcribeChatAudioMock.mock.calls[0]?.[1]).toMatchObject({ + size: 5, + type: "audio/webm", + }); + transcription.resolve(null); + await transcription.promise; + }); + + it("ignores duplicate starts while microphone permission is pending", async () => { + const app = await createRecordingApp(); + const pendingUserMedia = createDeferred(); + const getUserMedia = vi.fn(() => pendingUserMedia.promise); + Object.defineProperty(globalThis.navigator, "mediaDevices", { + configurable: true, + value: { getUserMedia }, + }); + const stream = createMockStream(); + + const firstStart = app.toggleChatDictation(); + const secondStart = app.toggleChatDictation(); + + expect(getUserMedia).toHaveBeenCalledTimes(1); + await secondStart; + expect(app.chatDictationStatus).toBe("starting"); + + pendingUserMedia.resolve(stream); + await firstStart; + + expect(MockMediaRecorder.instances).toHaveLength(1); + expect(MockMediaRecorder.instances[0].state).toBe("recording"); + expect(stream.track.stop).not.toHaveBeenCalled(); + + MockMediaRecorder.instances[0].emitData(new Blob(["audio"], { type: "audio/webm" })); + MockMediaRecorder.instances[0].stop(); + + expect(stream.track.stop).toHaveBeenCalledTimes(1); + expect(transcribeChatAudioMock).toHaveBeenCalledTimes(1); + }); + + it("stops a microphone stream that resolves after pending dictation is canceled", async () => { + const app = await createRecordingApp(); + const pendingUserMedia = createDeferred(); + const getUserMedia = vi.fn(() => pendingUserMedia.promise); + Object.defineProperty(globalThis.navigator, "mediaDevices", { + configurable: true, + value: { getUserMedia }, + }); + const stream = createMockStream(); + + const start = app.toggleChatDictation(); + app.cancelChatDictation(); + pendingUserMedia.resolve(stream); + await start; + + expect(MockMediaRecorder.instances).toHaveLength(0); + expect(stream.track.stop).toHaveBeenCalledTimes(1); + expect(app.chatDictationStatus).toBe("idle"); + expect(transcribeChatAudioMock).not.toHaveBeenCalled(); + }); +}); diff --git a/ui/src/ui/app.ts b/ui/src/ui/app.ts index 1953c6f6c90..50da8be7577 100644 --- a/ui/src/ui/app.ts +++ b/ui/src/ui/app.ts @@ -22,8 +22,10 @@ import { removeQueuedMessage as removeQueuedMessageInternal, resetChatInputHistoryNavigation as resetChatInputHistoryNavigationInternal, steerQueuedChatMessage as steerQueuedChatMessageInternal, + transcribeChatAudio as transcribeChatAudioInternal, type ChatInputHistoryKeyInput, type ChatInputHistoryKeyResult, + type ChatDictationStatus, } from "./app-chat.ts"; import { DEFAULT_CRON_FORM, DEFAULT_LOG_LEVEL_FILTERS } from "./app-defaults.ts"; import type { EventLogEntry } from "./app-events.ts"; @@ -222,6 +224,13 @@ export class OpenClawApp extends LitElement { @state() realtimeTalkDetail: string | null = null; @state() realtimeTalkTranscript: string | null = null; private realtimeTalkSession: RealtimeTalkSession | null = null; + @state() chatDictationStatus: ChatDictationStatus = "idle"; + @state() chatDictationDetail: string | null = null; + private chatDictationRecorder: MediaRecorder | null = null; + private chatDictationStream: MediaStream | null = null; + private chatDictationChunks: Blob[] = []; + private chatDictationCancelNextStop = false; + private chatDictationStartToken = 0; @state() chatManualRefreshInFlight = false; @state() chatMobileControlsOpen = false; private chatMobileControlsTrigger: 
HTMLElement | null = null; @@ -944,6 +953,129 @@ export class OpenClawApp extends LitElement { } } + async toggleChatDictation() { + if (this.chatDictationRecorder && this.chatDictationStatus === "recording") { + this.chatDictationRecorder.stop(); + return; + } + if (this.chatDictationStatus === "starting" || this.chatDictationStatus === "transcribing") { + return; + } + if (!this.client || !this.connected) { + this.chatDictationStatus = "error"; + this.chatDictationDetail = "Gateway not connected"; + this.lastError = this.chatDictationDetail; + return; + } + if (!navigator.mediaDevices?.getUserMedia || typeof MediaRecorder === "undefined") { + this.chatDictationStatus = "error"; + this.chatDictationDetail = "Browser microphone recording is unavailable"; + this.lastError = this.chatDictationDetail; + return; + } + + const startToken = ++this.chatDictationStartToken; + this.chatDictationStatus = "starting"; + this.chatDictationDetail = "Starting dictation..."; + let stream: MediaStream | null = null; + try { + stream = await navigator.mediaDevices.getUserMedia({ audio: true }); + if (this.chatDictationStartToken !== startToken || this.chatDictationStatus !== "starting") { + this.stopMediaStream(stream); + return; + } + const mimeType = ["audio/webm;codecs=opus", "audio/webm", "audio/mp4"].find((candidate) => + MediaRecorder.isTypeSupported(candidate), + ); + const recorder = new MediaRecorder(stream, mimeType ? { mimeType } : undefined); + this.chatDictationStream = stream; + this.chatDictationRecorder = recorder; + this.chatDictationChunks = []; + recorder.addEventListener("dataavailable", (event) => { + if (this.chatDictationRecorder !== recorder || this.chatDictationCancelNextStop) { + return; + } + if (event.data.size > 0) { + this.chatDictationChunks.push(event.data); + } + }); + recorder.addEventListener("error", (event) => { + if (this.chatDictationRecorder !== recorder) { + return; + } + this.chatDictationRecorder = null; + this.chatDictationChunks = []; + this.chatDictationStatus = "error"; + this.chatDictationDetail = + event.message || event.error?.message || "Dictation recording failed"; + this.lastError = this.chatDictationDetail; + this.stopChatDictationStream(); + }); + recorder.addEventListener("stop", () => { + if (this.chatDictationRecorder !== recorder) { + return; + } + const chunks = this.chatDictationChunks.splice(0); + const canceledByRequest = this.chatDictationCancelNextStop; + this.chatDictationCancelNextStop = false; + this.chatDictationRecorder = null; + this.stopChatDictationStream(); + if (canceledByRequest) { + if (this.chatDictationStatus !== "error") { + this.chatDictationStatus = "idle"; + this.chatDictationDetail = null; + } + return; + } + const blob = new Blob(chunks, { + type: recorder.mimeType || chunks[0]?.type || "audio/webm", + }); + void transcribeChatAudioInternal( + this as unknown as Parameters[0], + blob, + ); + }); + this.chatDictationStatus = "recording"; + this.chatDictationDetail = "Recording dictation..."; + recorder.start(); + } catch (error) { + if (stream && this.chatDictationStream !== stream) { + this.stopMediaStream(stream); + } + if (this.chatDictationStartToken !== startToken) { + return; + } + this.chatDictationRecorder = null; + this.stopChatDictationStream(); + this.chatDictationStatus = "error"; + this.chatDictationDetail = error instanceof Error ? 
error.message : String(error); + this.lastError = this.chatDictationDetail; + } + } + + private stopChatDictationStream() { + this.stopMediaStream(this.chatDictationStream); + this.chatDictationStream = null; + } + + private stopMediaStream(stream: MediaStream | null) { + stream?.getTracks().forEach((track) => track.stop()); + } + + cancelChatDictation() { + this.chatDictationStartToken += 1; + if (this.chatDictationRecorder?.state === "recording") { + this.chatDictationCancelNextStop = true; + this.chatDictationRecorder.stop(); + } + this.chatDictationRecorder = null; + this.chatDictationChunks = []; + this.chatDictationCancelNextStop = false; + this.stopChatDictationStream(); + this.chatDictationStatus = "idle"; + this.chatDictationDetail = null; + } + async steerQueuedChatMessage(id: string) { await steerQueuedChatMessageInternal( this as unknown as Parameters[0], diff --git a/ui/src/ui/views/chat.test.ts b/ui/src/ui/views/chat.test.ts index eca9f273619..01d15840eef 100644 --- a/ui/src/ui/views/chat.test.ts +++ b/ui/src/ui/views/chat.test.ts @@ -334,6 +334,8 @@ function renderChatView(overrides: Partial[0]> = { realtimeTalkStatus: "idle", realtimeTalkDetail: null, realtimeTalkTranscript: null, + chatDictationStatus: "idle", + chatDictationDetail: null, connected: true, canSend: true, disabledReason: null, @@ -366,6 +368,7 @@ function renderChatView(overrides: Partial[0]> = { onSend: () => undefined, onCompact: () => undefined, onToggleRealtimeTalk: () => undefined, + onToggleChatDictation: () => undefined, onAbort: () => undefined, onQueueRemove: () => undefined, onQueueSteer: () => undefined, @@ -445,12 +448,48 @@ describe("chat loading skeleton", () => { }); describe("chat voice controls", () => { - it("keeps Talk visible without the stale browser dictation button", () => { + it("shows server dictation and Talk without the stale browser dictation button", () => { const container = renderChatView(); + expect(container.querySelector('[aria-label="Dictate with server STT"]')).not.toBeNull(); expect(container.querySelector('[aria-label="Start Talk"]')).not.toBeNull(); expect(container.querySelector('[aria-label="Voice input"]')).toBeNull(); }); + + it("shows dictation recording state", () => { + const container = renderChatView({ + chatDictationStatus: "recording", + chatDictationDetail: null, + }); + + expect(container.querySelector('[aria-label="Stop dictation"]')).not.toBeNull(); + expect(container.textContent).toContain("Recording dictation"); + }); + + it("disables duplicate dictation starts while microphone access is pending", () => { + const container = renderChatView({ + chatDictationStatus: "starting", + chatDictationDetail: null, + }); + + const button = container.querySelector( + '[aria-label="Dictate with server STT"]', + ); + expect(button).not.toBeNull(); + expect(button!.disabled).toBe(true); + expect(container.textContent).toContain("Starting dictation"); + }); + + it("keeps stop dictation enabled while recording after disconnect", () => { + const container = renderChatView({ + connected: false, + chatDictationStatus: "recording", + }); + + const button = container.querySelector('[aria-label="Stop dictation"]'); + expect(button).not.toBeNull(); + expect(button!.disabled).toBe(false); + }); }); describe("chat slash menu accessibility", () => { diff --git a/ui/src/ui/views/chat.ts b/ui/src/ui/views/chat.ts index 1553bc5b1b0..6790eab1a49 100644 --- a/ui/src/ui/views/chat.ts +++ b/ui/src/ui/views/chat.ts @@ -3,6 +3,7 @@ import { ifDefined } from "lit/directives/if-defined.js"; 
import { ref } from "lit/directives/ref.js"; import { repeat } from "lit/directives/repeat.js"; import { t } from "../../i18n/index.ts"; +import type { ChatDictationStatus } from "../app-chat.ts"; import type { CompactionStatus, FallbackStatus } from "../app-tool-stream.ts"; import { getChatAttachmentPreviewUrl, @@ -77,6 +78,8 @@ export type ChatProps = { realtimeTalkStatus?: RealtimeTalkStatus; realtimeTalkDetail?: string | null; realtimeTalkTranscript?: string | null; + chatDictationStatus?: ChatDictationStatus; + chatDictationDetail?: string | null; connected: boolean; canSend: boolean; disabledReason: string | null; @@ -110,6 +113,7 @@ export type ChatProps = { onSend: () => void; onCompact?: () => void | Promise; onToggleRealtimeTalk?: () => void; + onToggleChatDictation?: () => void; onAbort?: () => void; onQueueRemove: (id: string) => void; onQueueSteer?: (id: string) => void; @@ -1198,19 +1202,32 @@ export function renderChat(props: ChatProps) { @change=${(e: Event) => handleFileSelect(e, props)} /> - ${props.realtimeTalkActive || props.realtimeTalkDetail || props.realtimeTalkTranscript + ${props.chatDictationStatus && props.chatDictationStatus !== "idle" ? html` -
- ${props.realtimeTalkDetail ?? - props.realtimeTalkTranscript ?? - (props.realtimeTalkStatus === "thinking" - ? "Asking OpenClaw..." - : props.realtimeTalkStatus === "connecting" - ? "Connecting Talk..." - : "Talk live")} +
+ ${props.chatDictationDetail ?? + (props.chatDictationStatus === "starting" + ? "Starting dictation..." + : props.chatDictationStatus === "recording" + ? "Recording dictation..." + : props.chatDictationStatus === "transcribing" + ? "Transcribing dictation..." + : "Dictation unavailable")}
` - : nothing} + : props.realtimeTalkActive || props.realtimeTalkDetail || props.realtimeTalkTranscript + ? html` +
+ ${props.realtimeTalkDetail ?? + props.realtimeTalkTranscript ?? + (props.realtimeTalkStatus === "thinking" + ? "Asking OpenClaw..." + : props.realtimeTalkStatus === "connecting" + ? "Connecting Talk..." + : "Talk live")} +
+ ` + : nothing}