openclaw/extensions/microsoft/speech-provider.test.ts

import { mkdtempSync, writeFileSync } from "node:fs";
import os from "node:os";
import path from "node:path";
import type { OpenClawConfig } from "openclaw/plugin-sdk/config-runtime";
import { afterEach, describe, expect, it, vi } from "vitest";
import {
  buildMicrosoftSpeechProvider,
  isCjkDominant,
  listMicrosoftVoices,
} from "./speech-provider.js";
import * as ttsModule from "./tts.js";

const TEST_CFG = {} as OpenClawConfig;

describe("listMicrosoftVoices", () => {
  const originalFetch = globalThis.fetch;
  const proxyEnvKeys = [
    "OPENCLAW_DEBUG_PROXY_ENABLED",
    "OPENCLAW_DEBUG_PROXY_DB_PATH",
    "OPENCLAW_DEBUG_PROXY_BLOB_DIR",
    "OPENCLAW_DEBUG_PROXY_SESSION_ID",
  ] as const;
  let priorProxyEnv: Partial<Record<(typeof proxyEnvKeys)[number], string | undefined>> = {};

  afterEach(() => {
    globalThis.fetch = originalFetch;
    vi.restoreAllMocks();
    for (const key of proxyEnvKeys) {
      const value = priorProxyEnv[key];
      if (value === undefined) {
        delete process.env[key];
      } else {
        process.env[key] = value;
      }
    }
    priorProxyEnv = {};
  });

  it("maps Microsoft voice metadata into speech voice options", async () => {
    globalThis.fetch = vi.fn().mockResolvedValue(
      new Response(
        JSON.stringify([
          {
            ShortName: "en-US-AvaNeural",
            FriendlyName: "Microsoft Ava Online (Natural) - English (United States)",
            Locale: "en-US",
            Gender: "Female",
            VoiceTag: {
              ContentCategories: ["General"],
              VoicePersonalities: ["Friendly", "Positive"],
            },
          },
        ]),
        { status: 200 },
      ),
    ) as unknown as typeof globalThis.fetch;

    const voices = await listMicrosoftVoices();

    expect(voices).toEqual([
      {
        id: "en-US-AvaNeural",
        name: "Microsoft Ava Online (Natural) - English (United States)",
        category: "General",
        description: "Friendly, Positive",
        locale: "en-US",
        gender: "Female",
        personalities: ["Friendly", "Positive"],
      },
    ]);
  });

  it("throws on Microsoft voice list failures", async () => {
    globalThis.fetch = vi
      .fn()
      .mockResolvedValue(
        new Response("nope", { status: 503 }),
      ) as unknown as typeof globalThis.fetch;

    await expect(listMicrosoftVoices()).rejects.toThrow("Microsoft voices API error (503)");
  });

  it("records voice discovery exchanges in debug proxy capture mode", async () => {
    const tempDir = mkdtempSync(path.join(os.tmpdir(), "microsoft-voices-capture-"));
    priorProxyEnv = Object.fromEntries(
      proxyEnvKeys.map((key) => [key, process.env[key]]),
    ) as typeof priorProxyEnv;
    process.env.OPENCLAW_DEBUG_PROXY_ENABLED = "1";
    process.env.OPENCLAW_DEBUG_PROXY_DB_PATH = path.join(tempDir, "capture.sqlite");
    process.env.OPENCLAW_DEBUG_PROXY_BLOB_DIR = path.join(tempDir, "blobs");
    process.env.OPENCLAW_DEBUG_PROXY_SESSION_ID = "ms-voices-session";

    globalThis.fetch = vi
      .fn()
      .mockResolvedValue(
        new Response(JSON.stringify([{ ShortName: "en-US-AvaNeural" }]), { status: 200 }),
      ) as unknown as typeof globalThis.fetch;

    const { getDebugProxyCaptureStore } = await import("../../src/proxy-capture/store.sqlite.js");
    const store = getDebugProxyCaptureStore(
      process.env.OPENCLAW_DEBUG_PROXY_DB_PATH,
      process.env.OPENCLAW_DEBUG_PROXY_BLOB_DIR,
    );
    store.upsertSession({
      id: "ms-voices-session",
      startedAt: Date.now(),
      mode: "test",
      sourceScope: "openclaw",
      sourceProcess: "openclaw",
      dbPath: process.env.OPENCLAW_DEBUG_PROXY_DB_PATH,
      blobDir: process.env.OPENCLAW_DEBUG_PROXY_BLOB_DIR,
    });

    await listMicrosoftVoices();
    await new Promise((resolve) => setTimeout(resolve, 0));

    const events = store.getSessionEvents("ms-voices-session", 10);
    expect(
      events.some((event) => event.kind === "request" && event.host === "speech.platform.bing.com"),
    ).toBe(true);
    expect(
      events.some(
        (event) => event.kind === "response" && event.host === "speech.platform.bing.com",
      ),
    ).toBe(true);
  });

  it("does not double-capture voice discovery when the global fetch patch is installed", async () => {
    const tempDir = mkdtempSync(path.join(os.tmpdir(), "microsoft-voices-global-"));
    priorProxyEnv = Object.fromEntries(
      proxyEnvKeys.map((key) => [key, process.env[key]]),
    ) as typeof priorProxyEnv;
    process.env.OPENCLAW_DEBUG_PROXY_ENABLED = "1";
    process.env.OPENCLAW_DEBUG_PROXY_DB_PATH = path.join(tempDir, "capture.sqlite");
    process.env.OPENCLAW_DEBUG_PROXY_BLOB_DIR = path.join(tempDir, "blobs");
    process.env.OPENCLAW_DEBUG_PROXY_SESSION_ID = "ms-voices-global-session";

    globalThis.fetch = vi.fn(
      async () => new Response(JSON.stringify([{ ShortName: "en-US-AvaNeural" }]), { status: 200 }),
    ) as unknown as typeof globalThis.fetch;

    const { getDebugProxyCaptureStore } = await import("../../src/proxy-capture/store.sqlite.js");
    const { finalizeDebugProxyCapture, initializeDebugProxyCapture } =
      await import("../../src/proxy-capture/runtime.js");
    const store = getDebugProxyCaptureStore(
      process.env.OPENCLAW_DEBUG_PROXY_DB_PATH,
      process.env.OPENCLAW_DEBUG_PROXY_BLOB_DIR,
    );
    store.upsertSession({
      id: "ms-voices-global-session",
      startedAt: Date.now(),
      mode: "test",
      sourceScope: "openclaw",
      sourceProcess: "openclaw",
      dbPath: process.env.OPENCLAW_DEBUG_PROXY_DB_PATH,
      blobDir: process.env.OPENCLAW_DEBUG_PROXY_BLOB_DIR,
    });
    initializeDebugProxyCapture("test");

    try {
      await listMicrosoftVoices();
      await new Promise((resolve) => setTimeout(resolve, 0));

      const events = store
        .getSessionEvents("ms-voices-global-session", 10)
        .filter((event) => event.host === "speech.platform.bing.com");
      expect(events).toHaveLength(2);
      const kinds = events.map((event) => String(event.kind)).toSorted();
      expect(kinds).toEqual(["request", "response"]);
    } finally {
      globalThis.fetch = originalFetch;
      finalizeDebugProxyCapture();
    }
  });
});

describe("isCjkDominant", () => {
  it("returns true for Chinese text", () => {
    expect(isCjkDominant("你好世界")).toBe(true);
  });

  it("returns true for mixed text with majority CJK", () => {
    expect(isCjkDominant("你好，这是一个测试 hello")).toBe(true);
  });

  it("returns false for English text", () => {
    expect(isCjkDominant("Hello, this is a test")).toBe(false);
  });

  it("returns false for empty string", () => {
    expect(isCjkDominant("")).toBe(false);
  });

  it("returns false for mostly English with a few CJK chars", () => {
    expect(isCjkDominant("This is a long English sentence with one 字")).toBe(false);
  });
});

describe("buildMicrosoftSpeechProvider", () => {
  afterEach(() => {
    vi.restoreAllMocks();
  });

  it("switches to a Chinese voice for CJK text when no explicit voice override is set", async () => {
    const provider = buildMicrosoftSpeechProvider();
    const edgeSpy = vi.spyOn(ttsModule, "edgeTTS").mockImplementation(async ({ outputPath }) => {
      writeFileSync(outputPath, Buffer.from([0xff, 0xfb, 0x90, 0x00]));
    });

    await provider.synthesize({
      text: "你好，这是一个测试 hello",
      cfg: TEST_CFG,
      providerConfig: {
        enabled: true,
        voice: "en-US-MichelleNeural",
        lang: "en-US",
        outputFormat: "audio-24khz-48kbitrate-mono-mp3",
        outputFormatConfigured: true,
        saveSubtitles: false,
      },
      providerOverrides: {},
      timeoutMs: 1000,
      target: "audio-file",
    });

    expect(edgeSpy).toHaveBeenCalledWith(
      expect.objectContaining({
        config: expect.objectContaining({
          voice: "zh-CN-XiaoxiaoNeural",
          lang: "zh-CN",
        }),
      }),
    );
  });

  it("preserves an explicitly configured English voice for CJK text", async () => {
    const provider = buildMicrosoftSpeechProvider();
    const edgeSpy = vi.spyOn(ttsModule, "edgeTTS").mockImplementation(async ({ outputPath }) => {
      writeFileSync(outputPath, Buffer.from([0xff, 0xfb, 0x90, 0x00]));
    });

    await provider.synthesize({
      text: "你好，这是一个测试 hello",
      cfg: TEST_CFG,
      providerConfig: {
        enabled: true,
        voice: "en-US-AvaNeural",
        lang: "en-US",
        outputFormat: "audio-24khz-48kbitrate-mono-mp3",
        outputFormatConfigured: true,
        saveSubtitles: false,
      },
      providerOverrides: {},
      timeoutMs: 1000,
      target: "audio-file",
    });

    expect(edgeSpy).toHaveBeenCalledWith(
      expect.objectContaining({
        config: expect.objectContaining({
          voice: "en-US-AvaNeural",
          lang: "en-US",
        }),
      }),
    );
  });
});