feat(webchat): add server-side dictation

This commit is contained in:
clawsweeper
2026-05-02 20:30:31 +00:00
parent 92b28bd80d
commit 2f53db94a8
2 changed files with 156 additions and 6 deletions

133
ui/src/ui/app.test.ts Normal file
View File

@@ -0,0 +1,133 @@
/* @vitest-environment jsdom */
import { afterEach, beforeEach, describe, expect, it, vi } from "vitest";
// vi.mock factories are hoisted above all imports, so the mock function must
// be created via vi.hoisted() to exist when the factory below runs.
const { transcribeChatAudioMock } = vi.hoisted(() => ({
  transcribeChatAudioMock: vi.fn(),
}));
// Partially mock app-chat.ts: keep every real export, but replace
// transcribeChatAudio so tests can observe calls and control its result.
vi.mock("./app-chat.ts", async (importOriginal) => {
  const actual = await importOriginal<typeof import("./app-chat.ts")>();
  return {
    ...actual,
    transcribeChatAudio: transcribeChatAudioMock,
  };
});
/**
 * Minimal stand-in for the DOM MediaRecorder used by the dictation tests.
 *
 * Every constructed instance is appended to the static `instances` list so a
 * test can reach the recorder the app created. `emitData` / `emitError` let
 * tests synthesize "dataavailable" and "error" events; `stop` flips the state
 * and fires "stop" (deliberately without the real API's state checks).
 */
class MockMediaRecorder extends EventTarget {
  static instances: MockMediaRecorder[] = [];
  // Mirrors MediaRecorder.isTypeSupported; only webm is reported as usable.
  static isTypeSupported = vi.fn((mimeType: string) => mimeType === "audio/webm");
  readonly mimeType: string;
  state: RecordingState = "inactive";
  constructor(
    readonly stream: MediaStream,
    options?: MediaRecorderOptions,
  ) {
    super();
    this.mimeType = options?.mimeType ?? "";
    MockMediaRecorder.instances.push(this);
  }
  start() {
    this.state = "recording";
  }
  stop() {
    this.state = "inactive";
    this.dispatchEvent(new Event("stop"));
  }
  // Fire a "dataavailable" event carrying the given blob, like a real
  // recorder flushing a chunk.
  emitData(data: Blob) {
    const dataEvent = Object.defineProperty(new Event("dataavailable"), "data", {
      value: data,
    }) as Event & { data: Blob };
    this.dispatchEvent(dataEvent);
  }
  // Fire an "error" event exposing both `error` and `message`, matching the
  // shape the app's error listener reads.
  emitError(message: string) {
    const errorEvent = new Event("error") as Event & { error: Error; message: string };
    Object.defineProperties(errorEvent, {
      error: { value: new Error(message) },
      message: { value: message },
    });
    this.dispatchEvent(errorEvent);
  }
}
// Structural view of OpenClawApp's dictation-related internals so tests can
// read private state without widening the public type. Field meanings are
// inferred from the assertions below — confirm against app.ts if they drift.
type AppWithDictationInternals = {
  client: unknown;
  connected: boolean;
  chatDictationStatus: string;
  chatDictationDetail: string | null;
  chatDictationChunks: Blob[];
  toggleChatDictation: () => Promise<void>;
};
// Saved property descriptor for navigator.mediaDevices, captured in
// beforeEach so afterEach can restore the real property (or delete the stub
// entirely when none existed).
let originalMediaDevices: PropertyDescriptor | undefined;
// Construct an OpenClawApp wired up just enough for dictation tests: a
// stubbed RPC client and connected=true so toggleChatDictation may start.
// app.ts is imported lazily so the vi.mock above is in effect first.
async function createRecordingApp() {
  const { OpenClawApp } = await import("./app.ts");
  const instance = new OpenClawApp();
  instance.client = { request: vi.fn() } as never;
  instance.connected = true;
  return instance as unknown as AppWithDictationInternals;
}
describe("OpenClawApp dictation recorder lifecycle", () => {
  beforeEach(() => {
    // Fresh mock state per test: no recorded calls, transcription resolves null.
    transcribeChatAudioMock.mockReset();
    transcribeChatAudioMock.mockResolvedValue(null);
    MockMediaRecorder.instances = [];
    MockMediaRecorder.isTypeSupported.mockClear();
    // Swap the global MediaRecorder for the test double.
    vi.stubGlobal("MediaRecorder", MockMediaRecorder);
    // Stub navigator.mediaDevices with a getUserMedia that yields a fake
    // stream whose tracks can be stopped; keep the original descriptor so
    // afterEach can put it back exactly as found.
    originalMediaDevices = Object.getOwnPropertyDescriptor(globalThis.navigator, "mediaDevices");
    Object.defineProperty(globalThis.navigator, "mediaDevices", {
      configurable: true,
      value: {
        getUserMedia: vi.fn(async () => ({
          getTracks: () => [{ stop: vi.fn() }],
        })),
      },
    });
  });
  afterEach(() => {
    // Restore navigator.mediaDevices: reinstate the saved descriptor, or
    // remove the stubbed property when the environment never had one.
    if (originalMediaDevices) {
      Object.defineProperty(globalThis.navigator, "mediaDevices", originalMediaDevices);
    } else {
      Reflect.deleteProperty(globalThis.navigator, "mediaDevices");
    }
    vi.unstubAllGlobals();
  });
  it("does not submit collected audio after a recorder error and later stop", async () => {
    const app = await createRecordingApp();
    // First toggle starts recording via the mocked MediaRecorder.
    await app.toggleChatDictation();
    const recorder = MockMediaRecorder.instances[0]!;
    recorder.emitData(new Blob(["audio"], { type: "audio/webm" }));
    recorder.emitError("microphone failed");
    // Data arriving after the error, and the eventual stop, must neither
    // trigger transcription nor clobber the error status/detail.
    recorder.emitData(new Blob(["late audio"], { type: "audio/webm" }));
    recorder.stop();
    expect(transcribeChatAudioMock).not.toHaveBeenCalled();
    expect(app.chatDictationStatus).toBe("error");
    expect(app.chatDictationDetail).toBe("microphone failed");
    expect(app.chatDictationChunks).toEqual([]);
  });
  it("releases recorded chunks after copying them for normal transcription", async () => {
    const app = await createRecordingApp();
    // Start recording, deliver one chunk, then toggle again to stop and submit.
    await app.toggleChatDictation();
    const recorder = MockMediaRecorder.instances[0]!;
    recorder.emitData(new Blob(["audio"], { type: "audio/webm" }));
    await app.toggleChatDictation();
    // The chunk buffer is cleared once the audio is handed off as one Blob.
    expect(app.chatDictationChunks).toEqual([]);
    expect(transcribeChatAudioMock).toHaveBeenCalledTimes(1);
    // Second argument to transcribeChatAudio is the assembled Blob:
    // 5 bytes ("audio"), webm mime type.
    expect(transcribeChatAudioMock.mock.calls[0]?.[1]).toMatchObject({
      size: 5,
      type: "audio/webm",
    });
  });
});

View File

@@ -984,11 +984,18 @@ export class OpenClawApp extends LitElement {
this.chatDictationRecorder = recorder;
this.chatDictationChunks = [];
recorder.addEventListener("dataavailable", (event) => {
if (this.chatDictationRecorder !== recorder || this.chatDictationCancelNextStop) {
return;
}
if (event.data.size > 0) {
this.chatDictationChunks.push(event.data);
}
});
recorder.addEventListener("error", (event) => {
if (this.chatDictationRecorder === recorder) {
this.chatDictationRecorder = null;
}
this.chatDictationChunks = [];
this.chatDictationStatus = "error";
this.chatDictationDetail =
event.message || event.error?.message || "Dictation recording failed";
@@ -996,14 +1003,23 @@ export class OpenClawApp extends LitElement {
this.stopChatDictationStream();
});
recorder.addEventListener("stop", () => {
const chunks = this.chatDictationChunks;
const canceled = this.chatDictationCancelNextStop;
const isCurrentRecorder = this.chatDictationRecorder === recorder;
const chunks = isCurrentRecorder ? this.chatDictationChunks : [];
if (isCurrentRecorder) {
this.chatDictationChunks = [];
}
const canceledByRequest = this.chatDictationCancelNextStop;
const canceled = canceledByRequest || !isCurrentRecorder;
this.chatDictationCancelNextStop = false;
this.chatDictationRecorder = null;
this.stopChatDictationStream();
if (isCurrentRecorder) {
this.chatDictationRecorder = null;
this.stopChatDictationStream();
}
if (canceled) {
this.chatDictationStatus = "idle";
this.chatDictationDetail = null;
if (canceledByRequest && this.chatDictationStatus !== "error") {
this.chatDictationStatus = "idle";
this.chatDictationDetail = null;
}
return;
}
const blob = new Blob(chunks, {
@@ -1038,6 +1054,7 @@ export class OpenClawApp extends LitElement {
}
this.chatDictationRecorder = null;
this.chatDictationChunks = [];
this.chatDictationCancelNextStop = false;
this.stopChatDictationStream();
this.chatDictationStatus = "idle";
this.chatDictationDetail = null;