mirror of
https://github.com/openclaw/openclaw.git
synced 2026-05-06 16:50:43 +00:00
feat(webchat): add server-side dictation (#76021)
Summary: - This PR adds WebChat server-side dictation through a new authenticated `chat.transcribeAudio` Gateway RPC, MediaRecorder composer controls, docs/changelog updates, and focused gateway/UI tests. - Reproducibility: yes. Current main reproduces the missing feature by inspection: the Gateway method list, write scopes, docs, and WebChat voice-control test have no `chat.transcribeAudio` server-dictation path. ClawSweeper fixups: - Included follow-up commit: feat(webchat): add server-side dictation - Included follow-up commit: fix(clawsweeper): address review for automerge-openclaw-openclaw-7602… Validation: - ClawSweeper review passed for head 850571380a. - Required merge gates passed before the squash merge. Prepared head SHA: 850571380a. Review: https://github.com/openclaw/openclaw/pull/76021#issuecomment-4363514226 Co-authored-by: Peter Steinberger <steipete@gmail.com> Co-authored-by: clawsweeper <274271284+clawsweeper[bot]@users.noreply.github.com>
This commit is contained in:
committed by
GitHub
parent
15bbf4f2f3
commit
68359cacbf
@@ -663,10 +663,19 @@
|
||||
background: color-mix(in srgb, var(--danger, #ef4444) 14%, transparent);
|
||||
}
|
||||
|
||||
.agent-chat__input-btn--dictating {
|
||||
color: var(--danger, #ef4444);
|
||||
background: color-mix(in srgb, var(--danger, #ef4444) 14%, transparent);
|
||||
}
|
||||
|
||||
.agent-chat__talk-status {
|
||||
color: var(--text);
|
||||
}
|
||||
|
||||
.agent-chat__dictation-status {
|
||||
color: var(--text);
|
||||
}
|
||||
|
||||
.agent-chat__input-divider {
|
||||
width: 1px;
|
||||
height: 16px;
|
||||
|
||||
@@ -44,6 +44,7 @@ let handleAbortChat: typeof import("./app-chat.ts").handleAbortChat;
|
||||
let refreshChatAvatar: typeof import("./app-chat.ts").refreshChatAvatar;
|
||||
let clearPendingQueueItemsForRun: typeof import("./app-chat.ts").clearPendingQueueItemsForRun;
|
||||
let removeQueuedMessage: typeof import("./app-chat.ts").removeQueuedMessage;
|
||||
let transcribeChatAudio: typeof import("./app-chat.ts").transcribeChatAudio;
|
||||
|
||||
async function loadChatHelpers(): Promise<void> {
|
||||
({
|
||||
@@ -54,6 +55,7 @@ async function loadChatHelpers(): Promise<void> {
|
||||
refreshChatAvatar,
|
||||
clearPendingQueueItemsForRun,
|
||||
removeQueuedMessage,
|
||||
transcribeChatAudio,
|
||||
} = await import("./app-chat.ts"));
|
||||
}
|
||||
|
||||
@@ -103,12 +105,73 @@ function makeHost(overrides?: Partial<ChatHost>): ChatHost {
|
||||
toolStreamById: new Map(),
|
||||
toolStreamOrder: [],
|
||||
toolStreamSyncTimer: null,
|
||||
chatDictationStatus: "idle",
|
||||
chatDictationDetail: null,
|
||||
updateComplete: Promise.resolve(),
|
||||
...overrides,
|
||||
};
|
||||
return host as ChatHost;
|
||||
}
|
||||
|
||||
// Gateway-side dictation: transcribeChatAudio should encode the clip,
// call the chat.transcribeAudio RPC, and merge the transcript into the draft.
describe("transcribeChatAudio", () => {
  beforeAll(async () => {
    // Helpers are loaded lazily so module mocks are installed first.
    await loadChatHelpers();
  });

  it("sends recorded audio to the gateway and appends the transcript to the draft", async () => {
    const request = vi.fn(async () => ({ text: "new words" }));
    const host = makeHost({
      client: { request } as never,
      chatMessage: "existing",
    });

    await transcribeChatAudio(host, new Blob([new Uint8Array([1, 2, 3])], { type: "audio/webm" }));

    // "AQID" is base64 for the bytes [1, 2, 3].
    expect(request).toHaveBeenCalledWith("chat.transcribeAudio", {
      audioBase64: "AQID",
      mimeType: "audio/webm",
    });
    expect(host.chatMessage).toBe("existing new words");
    expect(host.chatDictationStatus).toBe("idle");
    expect(host.chatDictationDetail).toBeNull();
  });

  it("surfaces gateway transcription errors without changing the draft", async () => {
    const request = vi.fn(async () => {
      throw new Error("no provider");
    });
    const host = makeHost({
      client: { request } as never,
      chatMessage: "existing",
    });

    await transcribeChatAudio(host, new Blob([new Uint8Array([1])], { type: "audio/ogg" }));

    // The draft is untouched; the failure is reported through dictation state.
    expect(host.chatMessage).toBe("existing");
    expect(host.chatDictationStatus).toBe("error");
    expect(host.chatDictationDetail).toBe("no provider");
    expect(host.lastError).toBe("no provider");
  });

  it("rejects oversized dictation before sending it over the gateway socket", async () => {
    const request = vi.fn();
    const host = makeHost({
      client: { request } as never,
      chatMessage: "existing",
    });

    // One byte over the 18 MiB CHAT_TRANSCRIBE_AUDIO_MAX_BYTES limit.
    await transcribeChatAudio(
      host,
      new Blob([new Uint8Array(18 * 1024 * 1024 + 1)], { type: "audio/webm" }),
    );

    // Size validation must short-circuit before any RPC is made.
    expect(request).not.toHaveBeenCalled();
    expect(host.chatMessage).toBe("existing");
    expect(host.chatDictationStatus).toBe("error");
    expect(host.chatDictationDetail).toContain("too large");
  });
});
|
||||
|
||||
function createSessionsResult(sessions: GatewaySessionRow[]): SessionsListResult {
|
||||
return {
|
||||
ts: 0,
|
||||
|
||||
@@ -17,6 +17,7 @@ import {
|
||||
type ChatInputHistoryKeyResult,
|
||||
type ChatInputHistoryState,
|
||||
} from "./chat/input-history.ts";
|
||||
import { bytesToBase64 } from "./chat/realtime-talk-audio.ts";
|
||||
import type { ChatSideResult } from "./chat/side-result.ts";
|
||||
import { executeSlashCommand } from "./chat/slash-command-executor.ts";
|
||||
import { parseSlashCommand, refreshSlashCommands } from "./chat/slash-commands.ts";
|
||||
@@ -68,10 +69,22 @@ export type ChatHost = ChatInputHistoryState & {
|
||||
refreshSessionsAfterChat: Set<string>;
|
||||
pendingAbort?: { runId?: string | null; sessionKey: string } | null;
|
||||
chatSubmitGuards?: Map<string, Promise<void>>;
|
||||
chatDictationStatus?: ChatDictationStatus;
|
||||
chatDictationDetail?: string | null;
|
||||
/** Callback for slash-command side effects that need app-level access. */
|
||||
onSlashAction?: (action: string) => void | Promise<void>;
|
||||
};
|
||||
|
||||
/** UI lifecycle states for the WebChat server-side dictation flow. */
export type ChatDictationStatus = "idle" | "starting" | "recording" | "transcribing" | "error";

// Raw shape of the chat.transcribeAudio RPC response. Fields are typed as
// `unknown` because the gateway payload is validated at the call site.
type ChatTranscribeAudioResult = {
  text?: unknown;
  provider?: unknown;
  model?: unknown;
};

// Upper bound (18 MiB) on a dictation clip sent over the gateway socket.
export const CHAT_TRANSCRIBE_AUDIO_MAX_BYTES = 18 * 1024 * 1024;
|
||||
|
||||
export type ChatSendOptions = {
|
||||
confirmReset?: boolean;
|
||||
restoreDraft?: boolean;
|
||||
@@ -123,6 +136,60 @@ export function isChatStopCommand(text: string) {
|
||||
);
|
||||
}
|
||||
|
||||
function appendDictationText(draft: string, transcript: string): string {
|
||||
const text = transcript.trim();
|
||||
if (!text) {
|
||||
return draft;
|
||||
}
|
||||
const current = draft.trimEnd();
|
||||
return current ? `${current} ${text}` : text;
|
||||
}
|
||||
|
||||
export async function transcribeChatAudio(host: ChatHost, audio: Blob): Promise<string | null> {
|
||||
if (!host.client || !host.connected) {
|
||||
host.chatDictationStatus = "error";
|
||||
host.chatDictationDetail = "Gateway not connected";
|
||||
host.lastError = host.chatDictationDetail;
|
||||
return null;
|
||||
}
|
||||
if (audio.size <= 0) {
|
||||
host.chatDictationStatus = "error";
|
||||
host.chatDictationDetail = "No audio captured";
|
||||
host.lastError = host.chatDictationDetail;
|
||||
return null;
|
||||
}
|
||||
if (audio.size > CHAT_TRANSCRIBE_AUDIO_MAX_BYTES) {
|
||||
host.chatDictationStatus = "error";
|
||||
host.chatDictationDetail = `Audio clip is too large for WebChat dictation. Keep recordings under ${CHAT_TRANSCRIBE_AUDIO_MAX_BYTES} bytes.`;
|
||||
host.lastError = host.chatDictationDetail;
|
||||
return null;
|
||||
}
|
||||
|
||||
host.chatDictationStatus = "transcribing";
|
||||
host.chatDictationDetail = "Transcribing dictation...";
|
||||
try {
|
||||
const bytes = new Uint8Array(await audio.arrayBuffer());
|
||||
const mimeType = audio.type || "audio/webm";
|
||||
const result = await host.client.request<ChatTranscribeAudioResult>("chat.transcribeAudio", {
|
||||
audioBase64: bytesToBase64(bytes),
|
||||
mimeType,
|
||||
});
|
||||
const transcript = typeof result.text === "string" ? result.text.trim() : "";
|
||||
if (!transcript) {
|
||||
throw new Error("No transcript returned");
|
||||
}
|
||||
host.chatMessage = appendDictationText(host.chatMessage, transcript);
|
||||
host.chatDictationStatus = "idle";
|
||||
host.chatDictationDetail = null;
|
||||
return transcript;
|
||||
} catch (err) {
|
||||
host.chatDictationStatus = "error";
|
||||
host.chatDictationDetail = err instanceof Error ? err.message : String(err);
|
||||
host.lastError = host.chatDictationDetail;
|
||||
return null;
|
||||
}
|
||||
}
|
||||
|
||||
function isChatResetCommand(text: string) {
|
||||
const trimmed = text.trim();
|
||||
if (!trimmed) {
|
||||
|
||||
@@ -103,6 +103,7 @@ type GatewayHost = {
|
||||
sessionKey: string;
|
||||
chatRunId: string | null;
|
||||
pendingAbort?: { runId?: string | null; sessionKey: string } | null;
|
||||
cancelChatDictation?: () => void;
|
||||
refreshSessionsAfterChat: Set<string>;
|
||||
execApprovalQueue: ExecApprovalRequest[];
|
||||
execApprovalError: string | null;
|
||||
@@ -483,6 +484,7 @@ export function connectGateway(host: GatewayHost, options?: ConnectGatewayOption
|
||||
return;
|
||||
}
|
||||
host.connected = false;
|
||||
host.cancelChatDictation?.();
|
||||
// Code 1012 = Service Restart (expected during config saves, don't show as error)
|
||||
host.lastErrorCode =
|
||||
resolveGatewayErrorDetailCode(error) ??
|
||||
|
||||
@@ -34,6 +34,8 @@ describe("handleDisconnected", () => {
|
||||
});
|
||||
const removeSpy = vi.spyOn(window, "removeEventListener").mockImplementation(() => undefined);
|
||||
const host = createHost();
|
||||
const cancelChatDictation = vi.fn();
|
||||
Object.assign(host, { cancelChatDictation });
|
||||
const disconnectSpy = (
|
||||
host.topbarObserver as unknown as { disconnect: ReturnType<typeof vi.fn> }
|
||||
).disconnect;
|
||||
@@ -42,6 +44,7 @@ describe("handleDisconnected", () => {
|
||||
|
||||
expect(removeSpy).toHaveBeenCalledWith("popstate", host.popStateHandler);
|
||||
expect(host.connectGeneration).toBe(1);
|
||||
expect(cancelChatDictation).toHaveBeenCalledTimes(1);
|
||||
expect(host.client).toBeNull();
|
||||
expect(host.connected).toBe(false);
|
||||
expect(disconnectSpy).toHaveBeenCalledTimes(1);
|
||||
|
||||
@@ -41,6 +41,7 @@ type LifecycleHost = {
|
||||
realtimeTalkStatus?: string;
|
||||
realtimeTalkDetail?: string | null;
|
||||
realtimeTalkTranscript?: string | null;
|
||||
cancelChatDictation?: () => void;
|
||||
chatLoading: boolean;
|
||||
chatMessages: unknown[];
|
||||
chatToolMessages: unknown[];
|
||||
@@ -91,6 +92,7 @@ export function handleDisconnected(host: LifecycleHost) {
|
||||
host.realtimeTalkStatus = "idle";
|
||||
host.realtimeTalkDetail = null;
|
||||
host.realtimeTalkTranscript = null;
|
||||
host.cancelChatDictation?.();
|
||||
host.client?.stop();
|
||||
host.client = null;
|
||||
host.connected = false;
|
||||
|
||||
@@ -2342,6 +2342,8 @@ export function renderApp(state: AppViewState) {
|
||||
realtimeTalkStatus: state.realtimeTalkStatus,
|
||||
realtimeTalkDetail: state.realtimeTalkDetail,
|
||||
realtimeTalkTranscript: state.realtimeTalkTranscript,
|
||||
chatDictationStatus: state.chatDictationStatus,
|
||||
chatDictationDetail: state.chatDictationDetail,
|
||||
connected: state.connected,
|
||||
canSend: state.connected,
|
||||
disabledReason: chatDisabledReason,
|
||||
@@ -2373,6 +2375,7 @@ export function renderApp(state: AppViewState) {
|
||||
onSend: () => state.handleSendChat(),
|
||||
onCompact: () => state.handleSendChat("/compact", { restoreDraft: true }),
|
||||
onToggleRealtimeTalk: () => state.toggleRealtimeTalk(),
|
||||
onToggleChatDictation: () => state.toggleChatDictation(),
|
||||
canAbort: hasAbortableSessionRun(state),
|
||||
onAbort: () => void state.handleAbortChat(),
|
||||
onQueueRemove: (id) => state.removeQueuedMessage(id),
|
||||
|
||||
@@ -1,4 +1,4 @@
|
||||
import type { ChatSendOptions } from "./app-chat.ts";
|
||||
import type { ChatDictationStatus, ChatSendOptions } from "./app-chat.ts";
|
||||
import type { EventLogEntry } from "./app-events.ts";
|
||||
import type { CompactionStatus, FallbackStatus } from "./app-tool-stream.ts";
|
||||
import type { ChatInputHistoryKeyInput, ChatInputHistoryKeyResult } from "./chat/input-history.ts";
|
||||
@@ -119,6 +119,8 @@ export type AppViewState = {
|
||||
realtimeTalkStatus: RealtimeTalkStatus;
|
||||
realtimeTalkDetail: string | null;
|
||||
realtimeTalkTranscript: string | null;
|
||||
chatDictationStatus: ChatDictationStatus;
|
||||
chatDictationDetail: string | null;
|
||||
chatManualRefreshInFlight: boolean;
|
||||
chatMobileControlsOpen: boolean;
|
||||
nodesLoading: boolean;
|
||||
@@ -470,6 +472,7 @@ export type AppViewState = {
|
||||
resetChatInputHistoryNavigation: () => void;
|
||||
handleSendChat: (messageOverride?: string, opts?: ChatSendOptions) => Promise<void>;
|
||||
toggleRealtimeTalk: () => Promise<void>;
|
||||
toggleChatDictation: () => Promise<void>;
|
||||
steerQueuedChatMessage: (id: string) => Promise<void>;
|
||||
handleAbortChat: () => Promise<void>;
|
||||
removeQueuedMessage: (id: string) => void;
|
||||
|
||||
205
ui/src/ui/app.test.ts
Normal file
205
ui/src/ui/app.test.ts
Normal file
@@ -0,0 +1,205 @@
|
||||
/* @vitest-environment jsdom */
|
||||
|
||||
import { afterEach, beforeEach, describe, expect, it, vi } from "vitest";
|
||||
|
||||
// Hoisted so the spy exists before the vi.mock factory below runs.
const { transcribeChatAudioMock } = vi.hoisted(() => ({
  transcribeChatAudioMock: vi.fn(),
}));

// Replace only transcribeChatAudio; every other app-chat export stays real.
vi.mock("./app-chat.ts", async (importOriginal) => {
  const actual = await importOriginal<typeof import("./app-chat.ts")>();
  return {
    ...actual,
    transcribeChatAudio: transcribeChatAudioMock,
  };
});
|
||||
|
||||
class MockMediaRecorder extends EventTarget {
|
||||
static instances: MockMediaRecorder[] = [];
|
||||
static isTypeSupported = vi.fn((mimeType: string) => mimeType === "audio/webm");
|
||||
|
||||
readonly mimeType: string;
|
||||
state: RecordingState = "inactive";
|
||||
|
||||
constructor(
|
||||
readonly stream: MediaStream,
|
||||
options?: MediaRecorderOptions,
|
||||
) {
|
||||
super();
|
||||
this.mimeType = options?.mimeType ?? "";
|
||||
MockMediaRecorder.instances.push(this);
|
||||
}
|
||||
|
||||
start() {
|
||||
this.state = "recording";
|
||||
}
|
||||
|
||||
stop() {
|
||||
this.state = "inactive";
|
||||
this.dispatchEvent(new Event("stop"));
|
||||
}
|
||||
|
||||
emitData(data: Blob) {
|
||||
const event = new Event("dataavailable") as Event & { data: Blob };
|
||||
Object.defineProperty(event, "data", { value: data });
|
||||
this.dispatchEvent(event);
|
||||
}
|
||||
|
||||
emitError(message: string) {
|
||||
const event = new Event("error") as Event & { error: Error; message: string };
|
||||
Object.defineProperty(event, "error", { value: new Error(message) });
|
||||
Object.defineProperty(event, "message", { value: message });
|
||||
this.dispatchEvent(event);
|
||||
}
|
||||
}
|
||||
|
||||
// Structural view of the private dictation state on OpenClawApp that these
// tests poke at directly (the app instance is cast to this shape).
type AppWithDictationInternals = {
  client: unknown;
  connected: boolean;
  chatDictationStatus: string;
  chatDictationDetail: string | null;
  chatDictationChunks: Blob[];
  toggleChatDictation: () => Promise<void>;
  cancelChatDictation: () => void;
};

// Saved descriptor of navigator.mediaDevices so afterEach can restore it.
let originalMediaDevices: PropertyDescriptor | undefined;
|
||||
|
||||
function createDeferred<T>() {
|
||||
let resolve!: (value: T) => void;
|
||||
let reject!: (error: unknown) => void;
|
||||
const promise = new Promise<T>((resolvePromise, rejectPromise) => {
|
||||
resolve = resolvePromise;
|
||||
reject = rejectPromise;
|
||||
});
|
||||
return { promise, resolve, reject };
|
||||
}
|
||||
|
||||
function createMockStream(track = { stop: vi.fn() }) {
|
||||
return {
|
||||
getTracks: () => [track],
|
||||
track,
|
||||
} as unknown as MediaStream & { track: { stop: ReturnType<typeof vi.fn> } };
|
||||
}
|
||||
|
||||
async function createRecordingApp() {
|
||||
const { OpenClawApp } = await import("./app.ts");
|
||||
const app = new OpenClawApp();
|
||||
app.client = { request: vi.fn() } as never;
|
||||
app.connected = true;
|
||||
return app as unknown as AppWithDictationInternals;
|
||||
}
|
||||
|
||||
// Drives the MediaRecorder-based dictation lifecycle in OpenClawApp using
// the MockMediaRecorder double and a stubbed navigator.mediaDevices.
describe("OpenClawApp dictation recorder lifecycle", () => {
  beforeEach(() => {
    transcribeChatAudioMock.mockReset();
    transcribeChatAudioMock.mockResolvedValue(null);
    MockMediaRecorder.instances = [];
    MockMediaRecorder.isTypeSupported.mockClear();
    vi.stubGlobal("MediaRecorder", MockMediaRecorder);
    // Remember the real descriptor (if any) so afterEach can restore it.
    originalMediaDevices = Object.getOwnPropertyDescriptor(globalThis.navigator, "mediaDevices");
    Object.defineProperty(globalThis.navigator, "mediaDevices", {
      configurable: true,
      value: {
        getUserMedia: vi.fn(async () => createMockStream()),
      },
    });
  });

  afterEach(() => {
    if (originalMediaDevices) {
      Object.defineProperty(globalThis.navigator, "mediaDevices", originalMediaDevices);
    } else {
      Reflect.deleteProperty(globalThis.navigator, "mediaDevices");
    }
    vi.unstubAllGlobals();
  });

  it("does not submit collected audio after a recorder error and later stop", async () => {
    const app = await createRecordingApp();
    await app.toggleChatDictation();
    const recorder = MockMediaRecorder.instances[0];

    // Error arrives mid-recording; later data and the stop event must not
    // resurrect the clip.
    recorder.emitData(new Blob(["audio"], { type: "audio/webm" }));
    recorder.emitError("microphone failed");
    recorder.emitData(new Blob(["late audio"], { type: "audio/webm" }));
    recorder.stop();

    expect(transcribeChatAudioMock).not.toHaveBeenCalled();
    expect(app.chatDictationStatus).toBe("error");
    expect(app.chatDictationDetail).toBe("microphone failed");
    expect(app.chatDictationChunks).toEqual([]);
  });

  it("releases recorded chunks after copying them for normal transcription", async () => {
    const app = await createRecordingApp();
    await app.toggleChatDictation();
    const recorder = MockMediaRecorder.instances[0];
    recorder.emitData(new Blob(["audio"], { type: "audio/webm" }));
    const transcription = createDeferred<null>();
    transcribeChatAudioMock.mockReturnValueOnce(transcription.promise);

    // Second toggle stops the recorder and hands the clip to transcription.
    await app.toggleChatDictation();

    // Chunk buffer is drained even while transcription is still pending.
    expect(app.chatDictationChunks).toEqual([]);
    expect(transcribeChatAudioMock).toHaveBeenCalledTimes(1);
    expect(transcribeChatAudioMock.mock.calls[0]?.[1]).toMatchObject({
      size: 5,
      type: "audio/webm",
    });
    transcription.resolve(null);
    await transcription.promise;
  });

  it("ignores duplicate starts while microphone permission is pending", async () => {
    const app = await createRecordingApp();
    const pendingUserMedia = createDeferred<MediaStream>();
    const getUserMedia = vi.fn(() => pendingUserMedia.promise);
    Object.defineProperty(globalThis.navigator, "mediaDevices", {
      configurable: true,
      value: { getUserMedia },
    });
    const stream = createMockStream();

    const firstStart = app.toggleChatDictation();
    const secondStart = app.toggleChatDictation();

    // Only the first toggle may request the microphone.
    expect(getUserMedia).toHaveBeenCalledTimes(1);
    await secondStart;
    expect(app.chatDictationStatus).toBe("starting");

    pendingUserMedia.resolve(stream);
    await firstStart;

    expect(MockMediaRecorder.instances).toHaveLength(1);
    expect(MockMediaRecorder.instances[0].state).toBe("recording");
    expect(stream.track.stop).not.toHaveBeenCalled();

    MockMediaRecorder.instances[0].emitData(new Blob(["audio"], { type: "audio/webm" }));
    MockMediaRecorder.instances[0].stop();

    // Stopping the recorder releases the microphone and submits the clip.
    expect(stream.track.stop).toHaveBeenCalledTimes(1);
    expect(transcribeChatAudioMock).toHaveBeenCalledTimes(1);
  });

  it("stops a microphone stream that resolves after pending dictation is canceled", async () => {
    const app = await createRecordingApp();
    const pendingUserMedia = createDeferred<MediaStream>();
    const getUserMedia = vi.fn(() => pendingUserMedia.promise);
    Object.defineProperty(globalThis.navigator, "mediaDevices", {
      configurable: true,
      value: { getUserMedia },
    });
    const stream = createMockStream();

    // Cancel while getUserMedia is still pending; the late stream must be
    // stopped rather than recorded.
    const start = app.toggleChatDictation();
    app.cancelChatDictation();
    pendingUserMedia.resolve(stream);
    await start;

    expect(MockMediaRecorder.instances).toHaveLength(0);
    expect(stream.track.stop).toHaveBeenCalledTimes(1);
    expect(app.chatDictationStatus).toBe("idle");
    expect(transcribeChatAudioMock).not.toHaveBeenCalled();
  });
});
|
||||
132
ui/src/ui/app.ts
132
ui/src/ui/app.ts
@@ -22,8 +22,10 @@ import {
|
||||
removeQueuedMessage as removeQueuedMessageInternal,
|
||||
resetChatInputHistoryNavigation as resetChatInputHistoryNavigationInternal,
|
||||
steerQueuedChatMessage as steerQueuedChatMessageInternal,
|
||||
transcribeChatAudio as transcribeChatAudioInternal,
|
||||
type ChatInputHistoryKeyInput,
|
||||
type ChatInputHistoryKeyResult,
|
||||
type ChatDictationStatus,
|
||||
} from "./app-chat.ts";
|
||||
import { DEFAULT_CRON_FORM, DEFAULT_LOG_LEVEL_FILTERS } from "./app-defaults.ts";
|
||||
import type { EventLogEntry } from "./app-events.ts";
|
||||
@@ -222,6 +224,13 @@ export class OpenClawApp extends LitElement {
|
||||
@state() realtimeTalkDetail: string | null = null;
|
||||
@state() realtimeTalkTranscript: string | null = null;
|
||||
private realtimeTalkSession: RealtimeTalkSession | null = null;
|
||||
@state() chatDictationStatus: ChatDictationStatus = "idle";
|
||||
@state() chatDictationDetail: string | null = null;
|
||||
private chatDictationRecorder: MediaRecorder | null = null;
|
||||
private chatDictationStream: MediaStream | null = null;
|
||||
private chatDictationChunks: Blob[] = [];
|
||||
private chatDictationCancelNextStop = false;
|
||||
private chatDictationStartToken = 0;
|
||||
@state() chatManualRefreshInFlight = false;
|
||||
@state() chatMobileControlsOpen = false;
|
||||
private chatMobileControlsTrigger: HTMLElement | null = null;
|
||||
@@ -944,6 +953,129 @@ export class OpenClawApp extends LitElement {
|
||||
}
|
||||
}
|
||||
|
||||
/**
 * Start or stop microphone dictation for the chat composer.
 *
 * While recording, a toggle stops the active MediaRecorder (its "stop"
 * handler then submits the clip for transcription). While starting or
 * transcribing, toggles are ignored. Otherwise this requests microphone
 * access, wires up a MediaRecorder, and begins recording. All failures are
 * surfaced through chatDictationStatus/chatDictationDetail and lastError.
 */
async toggleChatDictation() {
  // Toggle-off: a second press while recording just stops the recorder.
  if (this.chatDictationRecorder && this.chatDictationStatus === "recording") {
    this.chatDictationRecorder.stop();
    return;
  }
  // Ignore duplicate starts while permission or transcription is pending.
  if (this.chatDictationStatus === "starting" || this.chatDictationStatus === "transcribing") {
    return;
  }
  if (!this.client || !this.connected) {
    this.chatDictationStatus = "error";
    this.chatDictationDetail = "Gateway not connected";
    this.lastError = this.chatDictationDetail;
    return;
  }
  if (!navigator.mediaDevices?.getUserMedia || typeof MediaRecorder === "undefined") {
    this.chatDictationStatus = "error";
    this.chatDictationDetail = "Browser microphone recording is unavailable";
    this.lastError = this.chatDictationDetail;
    return;
  }

  // Token identifies this particular start attempt; cancelChatDictation
  // bumps the counter so a late getUserMedia result can be discarded.
  const startToken = ++this.chatDictationStartToken;
  this.chatDictationStatus = "starting";
  this.chatDictationDetail = "Starting dictation...";
  let stream: MediaStream | null = null;
  try {
    stream = await navigator.mediaDevices.getUserMedia({ audio: true });
    // Start was canceled (or superseded) while awaiting permission:
    // release the microphone and bail out silently.
    if (this.chatDictationStartToken !== startToken || this.chatDictationStatus !== "starting") {
      this.stopMediaStream(stream);
      return;
    }
    // Pick the first container the browser supports; undefined lets the
    // browser choose its default.
    const mimeType = ["audio/webm;codecs=opus", "audio/webm", "audio/mp4"].find((candidate) =>
      MediaRecorder.isTypeSupported(candidate),
    );
    const recorder = new MediaRecorder(stream, mimeType ? { mimeType } : undefined);
    this.chatDictationStream = stream;
    this.chatDictationRecorder = recorder;
    this.chatDictationChunks = [];
    recorder.addEventListener("dataavailable", (event) => {
      // Drop data from a stale recorder or after a cancel request.
      if (this.chatDictationRecorder !== recorder || this.chatDictationCancelNextStop) {
        return;
      }
      if (event.data.size > 0) {
        this.chatDictationChunks.push(event.data);
      }
    });
    recorder.addEventListener("error", (event) => {
      if (this.chatDictationRecorder !== recorder) {
        return;
      }
      // Discard everything captured so far; a later "stop" must not submit.
      this.chatDictationRecorder = null;
      this.chatDictationChunks = [];
      this.chatDictationStatus = "error";
      this.chatDictationDetail =
        event.message || event.error?.message || "Dictation recording failed";
      this.lastError = this.chatDictationDetail;
      this.stopChatDictationStream();
    });
    recorder.addEventListener("stop", () => {
      if (this.chatDictationRecorder !== recorder) {
        return;
      }
      // splice(0) both copies and drains the chunk buffer.
      const chunks = this.chatDictationChunks.splice(0);
      const canceledByRequest = this.chatDictationCancelNextStop;
      this.chatDictationCancelNextStop = false;
      this.chatDictationRecorder = null;
      this.stopChatDictationStream();
      if (canceledByRequest) {
        // Reset to idle unless an error state was already recorded.
        if (this.chatDictationStatus !== "error") {
          this.chatDictationStatus = "idle";
          this.chatDictationDetail = null;
        }
        return;
      }
      const blob = new Blob(chunks, {
        type: recorder.mimeType || chunks[0]?.type || "audio/webm",
      });
      // Fire-and-forget: transcribeChatAudio manages status from here on.
      void transcribeChatAudioInternal(
        this as unknown as Parameters<typeof transcribeChatAudioInternal>[0],
        blob,
      );
    });
    this.chatDictationStatus = "recording";
    this.chatDictationDetail = "Recording dictation...";
    recorder.start();
  } catch (error) {
    // A stream we obtained but never adopted must still be released.
    if (stream && this.chatDictationStream !== stream) {
      this.stopMediaStream(stream);
    }
    // A canceled/superseded attempt must not clobber newer state.
    if (this.chatDictationStartToken !== startToken) {
      return;
    }
    this.chatDictationRecorder = null;
    this.stopChatDictationStream();
    this.chatDictationStatus = "error";
    this.chatDictationDetail = error instanceof Error ? error.message : String(error);
    this.lastError = this.chatDictationDetail;
  }
}
||||
|
||||
/** Release and forget the microphone stream backing the current dictation. */
private stopChatDictationStream() {
  const stream = this.chatDictationStream;
  this.chatDictationStream = null;
  this.stopMediaStream(stream);
}
|
||||
|
||||
/** Stop every track on the given media stream; a null stream is a no-op. */
private stopMediaStream(stream: MediaStream | null) {
  if (!stream) {
    return;
  }
  for (const track of stream.getTracks()) {
    track.stop();
  }
}
|
||||
|
||||
/**
 * Abort any dictation in progress — whether waiting on microphone
 * permission, actively recording, or in an error state — and reset all
 * dictation state back to idle.
 */
cancelChatDictation() {
  // Invalidate any in-flight getUserMedia so a late stream is discarded.
  this.chatDictationStartToken += 1;
  const recorder = this.chatDictationRecorder;
  if (recorder?.state === "recording") {
    // Flag the resulting "stop" event as a cancellation, not a submission.
    this.chatDictationCancelNextStop = true;
    recorder.stop();
  }
  this.chatDictationRecorder = null;
  this.chatDictationChunks = [];
  this.chatDictationCancelNextStop = false;
  this.stopChatDictationStream();
  this.chatDictationStatus = "idle";
  this.chatDictationDetail = null;
}
|
||||
|
||||
async steerQueuedChatMessage(id: string) {
|
||||
await steerQueuedChatMessageInternal(
|
||||
this as unknown as Parameters<typeof steerQueuedChatMessageInternal>[0],
|
||||
|
||||
@@ -334,6 +334,8 @@ function renderChatView(overrides: Partial<Parameters<typeof renderChat>[0]> = {
|
||||
realtimeTalkStatus: "idle",
|
||||
realtimeTalkDetail: null,
|
||||
realtimeTalkTranscript: null,
|
||||
chatDictationStatus: "idle",
|
||||
chatDictationDetail: null,
|
||||
connected: true,
|
||||
canSend: true,
|
||||
disabledReason: null,
|
||||
@@ -366,6 +368,7 @@ function renderChatView(overrides: Partial<Parameters<typeof renderChat>[0]> = {
|
||||
onSend: () => undefined,
|
||||
onCompact: () => undefined,
|
||||
onToggleRealtimeTalk: () => undefined,
|
||||
onToggleChatDictation: () => undefined,
|
||||
onAbort: () => undefined,
|
||||
onQueueRemove: () => undefined,
|
||||
onQueueSteer: () => undefined,
|
||||
@@ -445,12 +448,48 @@ describe("chat loading skeleton", () => {
|
||||
});
|
||||
|
||||
describe("chat voice controls", () => {
|
||||
it("keeps Talk visible without the stale browser dictation button", () => {
|
||||
it("shows server dictation and Talk without the stale browser dictation button", () => {
|
||||
const container = renderChatView();
|
||||
|
||||
expect(container.querySelector('[aria-label="Dictate with server STT"]')).not.toBeNull();
|
||||
expect(container.querySelector('[aria-label="Start Talk"]')).not.toBeNull();
|
||||
expect(container.querySelector('[aria-label="Voice input"]')).toBeNull();
|
||||
});
|
||||
|
||||
it("shows dictation recording state", () => {
|
||||
const container = renderChatView({
|
||||
chatDictationStatus: "recording",
|
||||
chatDictationDetail: null,
|
||||
});
|
||||
|
||||
expect(container.querySelector('[aria-label="Stop dictation"]')).not.toBeNull();
|
||||
expect(container.textContent).toContain("Recording dictation");
|
||||
});
|
||||
|
||||
it("disables duplicate dictation starts while microphone access is pending", () => {
|
||||
const container = renderChatView({
|
||||
chatDictationStatus: "starting",
|
||||
chatDictationDetail: null,
|
||||
});
|
||||
|
||||
const button = container.querySelector<HTMLButtonElement>(
|
||||
'[aria-label="Dictate with server STT"]',
|
||||
);
|
||||
expect(button).not.toBeNull();
|
||||
expect(button!.disabled).toBe(true);
|
||||
expect(container.textContent).toContain("Starting dictation");
|
||||
});
|
||||
|
||||
it("keeps stop dictation enabled while recording after disconnect", () => {
|
||||
const container = renderChatView({
|
||||
connected: false,
|
||||
chatDictationStatus: "recording",
|
||||
});
|
||||
|
||||
const button = container.querySelector<HTMLButtonElement>('[aria-label="Stop dictation"]');
|
||||
expect(button).not.toBeNull();
|
||||
expect(button!.disabled).toBe(false);
|
||||
});
|
||||
});
|
||||
|
||||
describe("chat slash menu accessibility", () => {
|
||||
|
||||
@@ -3,6 +3,7 @@ import { ifDefined } from "lit/directives/if-defined.js";
|
||||
import { ref } from "lit/directives/ref.js";
|
||||
import { repeat } from "lit/directives/repeat.js";
|
||||
import { t } from "../../i18n/index.ts";
|
||||
import type { ChatDictationStatus } from "../app-chat.ts";
|
||||
import type { CompactionStatus, FallbackStatus } from "../app-tool-stream.ts";
|
||||
import {
|
||||
getChatAttachmentPreviewUrl,
|
||||
@@ -77,6 +78,8 @@ export type ChatProps = {
|
||||
realtimeTalkStatus?: RealtimeTalkStatus;
|
||||
realtimeTalkDetail?: string | null;
|
||||
realtimeTalkTranscript?: string | null;
|
||||
chatDictationStatus?: ChatDictationStatus;
|
||||
chatDictationDetail?: string | null;
|
||||
connected: boolean;
|
||||
canSend: boolean;
|
||||
disabledReason: string | null;
|
||||
@@ -110,6 +113,7 @@ export type ChatProps = {
|
||||
onSend: () => void;
|
||||
onCompact?: () => void | Promise<void>;
|
||||
onToggleRealtimeTalk?: () => void;
|
||||
onToggleChatDictation?: () => void;
|
||||
onAbort?: () => void;
|
||||
onQueueRemove: (id: string) => void;
|
||||
onQueueSteer?: (id: string) => void;
|
||||
@@ -1198,19 +1202,32 @@ export function renderChat(props: ChatProps) {
|
||||
@change=${(e: Event) => handleFileSelect(e, props)}
|
||||
/>
|
||||
|
||||
${props.realtimeTalkActive || props.realtimeTalkDetail || props.realtimeTalkTranscript
|
||||
${props.chatDictationStatus && props.chatDictationStatus !== "idle"
|
||||
? html`
|
||||
<div class="agent-chat__stt-interim agent-chat__talk-status">
|
||||
${props.realtimeTalkDetail ??
|
||||
props.realtimeTalkTranscript ??
|
||||
(props.realtimeTalkStatus === "thinking"
|
||||
? "Asking OpenClaw..."
|
||||
: props.realtimeTalkStatus === "connecting"
|
||||
? "Connecting Talk..."
|
||||
: "Talk live")}
|
||||
<div class="agent-chat__stt-interim agent-chat__dictation-status">
|
||||
${props.chatDictationDetail ??
|
||||
(props.chatDictationStatus === "starting"
|
||||
? "Starting dictation..."
|
||||
: props.chatDictationStatus === "recording"
|
||||
? "Recording dictation..."
|
||||
: props.chatDictationStatus === "transcribing"
|
||||
? "Transcribing dictation..."
|
||||
: "Dictation unavailable")}
|
||||
</div>
|
||||
`
|
||||
: nothing}
|
||||
: props.realtimeTalkActive || props.realtimeTalkDetail || props.realtimeTalkTranscript
|
||||
? html`
|
||||
<div class="agent-chat__stt-interim agent-chat__talk-status">
|
||||
${props.realtimeTalkDetail ??
|
||||
props.realtimeTalkTranscript ??
|
||||
(props.realtimeTalkStatus === "thinking"
|
||||
? "Asking OpenClaw..."
|
||||
: props.realtimeTalkStatus === "connecting"
|
||||
? "Connecting Talk..."
|
||||
: "Talk live")}
|
||||
</div>
|
||||
`
|
||||
: nothing}
|
||||
|
||||
<div class="agent-chat__composer-combobox">
|
||||
<textarea
|
||||
@@ -1252,6 +1269,29 @@ export function renderChat(props: ChatProps) {
|
||||
${icons.paperclip}
|
||||
</button>
|
||||
|
||||
${props.onToggleChatDictation
|
||||
? html`
|
||||
<button
|
||||
class="agent-chat__input-btn ${props.chatDictationStatus === "recording"
|
||||
? "agent-chat__input-btn--dictating"
|
||||
: ""}"
|
||||
@click=${props.onToggleChatDictation}
|
||||
title=${props.chatDictationStatus === "recording"
|
||||
? "Stop dictation"
|
||||
: "Dictate with server STT"}
|
||||
aria-label=${props.chatDictationStatus === "recording"
|
||||
? "Stop dictation"
|
||||
: "Dictate with server STT"}
|
||||
?disabled=${props.chatDictationStatus === "recording"
|
||||
? false
|
||||
: !props.connected ||
|
||||
props.chatDictationStatus === "starting" ||
|
||||
props.chatDictationStatus === "transcribing"}
|
||||
>
|
||||
${props.chatDictationStatus === "recording" ? icons.stop : icons.mic}
|
||||
</button>
|
||||
`
|
||||
: nothing}
|
||||
${props.onToggleRealtimeTalk
|
||||
? html`
|
||||
<button
|
||||
|
||||
Reference in New Issue
Block a user