mirror of
https://github.com/openclaw/openclaw.git
synced 2026-06-11 00:52:59 +00:00
299 lines
9.5 KiB
TypeScript
299 lines
9.5 KiB
TypeScript
import { mkdtempSync, writeFileSync } from "node:fs";
|
||
import os from "node:os";
|
||
import path from "node:path";
|
||
import type { OpenClawConfig } from "openclaw/plugin-sdk/config-contracts";
|
||
import {
|
||
finalizeDebugProxyCapture,
|
||
getDebugProxyCaptureStore,
|
||
initializeDebugProxyCapture,
|
||
} from "openclaw/plugin-sdk/proxy-capture";
|
||
import { afterEach, describe, expect, it, vi } from "vitest";
|
||
import { installDebugProxyTestResetHooks } from "../test-support/debug-proxy-env-test-helpers.js";
|
||
|
||
vi.mock("node-edge-tts", () => ({
|
||
EdgeTTS: class {
|
||
async ttsPromise(): Promise<void> {}
|
||
},
|
||
}));
|
||
|
||
import {
|
||
buildMicrosoftSpeechProvider,
|
||
isCjkDominant,
|
||
listMicrosoftVoices,
|
||
} from "./speech-provider.js";
|
||
import * as ttsModule from "./tts.js";
|
||
|
||
const TEST_CFG = {} as OpenClawConfig;
|
||
|
||
function requireFirstEdgeTtsCall(edgeSpy: ReturnType<typeof vi.spyOn>): {
|
||
config?: unknown;
|
||
outputPath: string;
|
||
text?: string;
|
||
timeoutMs?: number;
|
||
} {
|
||
const [call] = edgeSpy.mock.calls;
|
||
if (!call) {
|
||
throw new Error("expected Microsoft Edge TTS call");
|
||
}
|
||
const [edgeCall] = call;
|
||
if (!edgeCall || typeof edgeCall !== "object" || Array.isArray(edgeCall)) {
|
||
throw new Error("expected Microsoft Edge TTS call");
|
||
}
|
||
return edgeCall as {
|
||
config?: unknown;
|
||
outputPath: string;
|
||
text?: string;
|
||
timeoutMs?: number;
|
||
};
|
||
}
|
||
|
||
describe("listMicrosoftVoices", () => {
|
||
const proxyReset = installDebugProxyTestResetHooks();
|
||
|
||
it("maps Microsoft voice metadata into speech voice options", async () => {
|
||
globalThis.fetch = vi.fn().mockResolvedValue(
|
||
new Response(
|
||
JSON.stringify([
|
||
{
|
||
ShortName: "en-US-AvaNeural",
|
||
FriendlyName: "Microsoft Ava Online (Natural) - English (United States)",
|
||
Locale: "en-US",
|
||
Gender: "Female",
|
||
VoiceTag: {
|
||
ContentCategories: ["General"],
|
||
VoicePersonalities: ["Friendly", "Positive"],
|
||
},
|
||
},
|
||
]),
|
||
{ status: 200 },
|
||
),
|
||
) as unknown as typeof globalThis.fetch;
|
||
|
||
const voices = await listMicrosoftVoices();
|
||
|
||
expect(voices).toEqual([
|
||
{
|
||
id: "en-US-AvaNeural",
|
||
name: "Microsoft Ava Online (Natural) - English (United States)",
|
||
category: "General",
|
||
description: "Friendly, Positive",
|
||
locale: "en-US",
|
||
gender: "Female",
|
||
personalities: ["Friendly", "Positive"],
|
||
},
|
||
]);
|
||
});
|
||
|
||
it("throws on Microsoft voice list failures", async () => {
|
||
globalThis.fetch = vi
|
||
.fn()
|
||
.mockResolvedValue(
|
||
new Response("nope", { status: 503 }),
|
||
) as unknown as typeof globalThis.fetch;
|
||
|
||
await expect(listMicrosoftVoices()).rejects.toThrow("Microsoft voices API error (503)");
|
||
});
|
||
|
||
it("records voice discovery exchanges in debug proxy capture mode", async () => {
|
||
const tempDir = mkdtempSync(path.join(os.tmpdir(), "microsoft-voices-capture-"));
|
||
proxyReset.captureProxyEnv();
|
||
process.env.OPENCLAW_DEBUG_PROXY_ENABLED = "1";
|
||
process.env.OPENCLAW_DEBUG_PROXY_DB_PATH = path.join(tempDir, "capture.sqlite");
|
||
process.env.OPENCLAW_DEBUG_PROXY_BLOB_DIR = path.join(tempDir, "blobs");
|
||
process.env.OPENCLAW_DEBUG_PROXY_SESSION_ID = "ms-voices-session";
|
||
|
||
globalThis.fetch = vi
|
||
.fn()
|
||
.mockResolvedValue(
|
||
new Response(JSON.stringify([{ ShortName: "en-US-AvaNeural" }]), { status: 200 }),
|
||
) as unknown as typeof globalThis.fetch;
|
||
|
||
const store = getDebugProxyCaptureStore(
|
||
process.env.OPENCLAW_DEBUG_PROXY_DB_PATH,
|
||
process.env.OPENCLAW_DEBUG_PROXY_BLOB_DIR,
|
||
);
|
||
store.upsertSession({
|
||
id: "ms-voices-session",
|
||
startedAt: Date.now(),
|
||
mode: "test",
|
||
sourceScope: "openclaw",
|
||
sourceProcess: "openclaw",
|
||
dbPath: process.env.OPENCLAW_DEBUG_PROXY_DB_PATH,
|
||
blobDir: process.env.OPENCLAW_DEBUG_PROXY_BLOB_DIR,
|
||
});
|
||
|
||
await listMicrosoftVoices();
|
||
|
||
await vi.waitFor(() => {
|
||
const events = store.getSessionEvents("ms-voices-session", 10);
|
||
expect(
|
||
events.some(
|
||
(event) => event.kind === "request" && event.host === "speech.platform.bing.com",
|
||
),
|
||
).toBe(true);
|
||
expect(
|
||
events.some(
|
||
(event) => event.kind === "response" && event.host === "speech.platform.bing.com",
|
||
),
|
||
).toBe(true);
|
||
});
|
||
});
|
||
|
||
it("does not double-capture voice discovery when the global fetch patch is installed", async () => {
|
||
const tempDir = mkdtempSync(path.join(os.tmpdir(), "microsoft-voices-global-"));
|
||
proxyReset.captureProxyEnv();
|
||
process.env.OPENCLAW_DEBUG_PROXY_ENABLED = "1";
|
||
process.env.OPENCLAW_DEBUG_PROXY_DB_PATH = path.join(tempDir, "capture.sqlite");
|
||
process.env.OPENCLAW_DEBUG_PROXY_BLOB_DIR = path.join(tempDir, "blobs");
|
||
process.env.OPENCLAW_DEBUG_PROXY_SESSION_ID = "ms-voices-global-session";
|
||
|
||
globalThis.fetch = vi.fn(
|
||
async () => new Response(JSON.stringify([{ ShortName: "en-US-AvaNeural" }]), { status: 200 }),
|
||
) as unknown as typeof globalThis.fetch;
|
||
|
||
const store = getDebugProxyCaptureStore(
|
||
process.env.OPENCLAW_DEBUG_PROXY_DB_PATH,
|
||
process.env.OPENCLAW_DEBUG_PROXY_BLOB_DIR,
|
||
);
|
||
store.upsertSession({
|
||
id: "ms-voices-global-session",
|
||
startedAt: Date.now(),
|
||
mode: "test",
|
||
sourceScope: "openclaw",
|
||
sourceProcess: "openclaw",
|
||
dbPath: process.env.OPENCLAW_DEBUG_PROXY_DB_PATH,
|
||
blobDir: process.env.OPENCLAW_DEBUG_PROXY_BLOB_DIR,
|
||
});
|
||
initializeDebugProxyCapture("test");
|
||
|
||
try {
|
||
await listMicrosoftVoices();
|
||
|
||
let events: Array<Record<string, unknown>> = [];
|
||
await vi.waitFor(() => {
|
||
events = store
|
||
.getSessionEvents("ms-voices-global-session", 10)
|
||
.filter((event) => event.host === "speech.platform.bing.com");
|
||
expect(events).toHaveLength(2);
|
||
});
|
||
const kinds = events.map((event) => String(event.kind)).toSorted();
|
||
expect(kinds).toEqual(["request", "response"]);
|
||
} finally {
|
||
globalThis.fetch = proxyReset.originalFetch;
|
||
finalizeDebugProxyCapture();
|
||
}
|
||
});
|
||
});
|
||
|
||
describe("isCjkDominant", () => {
|
||
it("returns true for Chinese text", () => {
|
||
expect(isCjkDominant("你好世界")).toBe(true);
|
||
});
|
||
|
||
it("returns true for mixed text with majority CJK", () => {
|
||
expect(isCjkDominant("你好,这是一个测试 hello")).toBe(true);
|
||
});
|
||
|
||
it("returns false for English text", () => {
|
||
expect(isCjkDominant("Hello, this is a test")).toBe(false);
|
||
});
|
||
|
||
it("returns false for empty string", () => {
|
||
expect(isCjkDominant("")).toBe(false);
|
||
});
|
||
|
||
it("returns false for mostly English with a few CJK chars", () => {
|
||
expect(isCjkDominant("This is a long English sentence with one 字")).toBe(false);
|
||
});
|
||
});
|
||
|
||
describe("buildMicrosoftSpeechProvider", () => {
|
||
afterEach(() => {
|
||
vi.restoreAllMocks();
|
||
});
|
||
|
||
it("switches to a Chinese voice for CJK text when no explicit voice override is set", async () => {
|
||
const provider = buildMicrosoftSpeechProvider();
|
||
const edgeSpy = vi.spyOn(ttsModule, "edgeTTS").mockImplementation(async ({ outputPath }) => {
|
||
writeFileSync(outputPath, Buffer.from([0xff, 0xfb, 0x90, 0x00]));
|
||
});
|
||
|
||
await provider.synthesize({
|
||
text: "你好,这是一个测试 hello",
|
||
cfg: TEST_CFG,
|
||
providerConfig: {
|
||
enabled: true,
|
||
voice: "en-US-MichelleNeural",
|
||
lang: "en-US",
|
||
outputFormat: "audio-24khz-48kbitrate-mono-mp3",
|
||
outputFormatConfigured: true,
|
||
saveSubtitles: false,
|
||
},
|
||
providerOverrides: {},
|
||
timeoutMs: 1000,
|
||
target: "audio-file",
|
||
});
|
||
|
||
expect(edgeSpy).toHaveBeenCalledOnce();
|
||
const edgeCall = requireFirstEdgeTtsCall(edgeSpy);
|
||
expect(edgeCall.text).toBe("你好,这是一个测试 hello");
|
||
expect(path.basename(edgeCall.outputPath)).toBe("speech.mp3");
|
||
expect(edgeCall.timeoutMs).toBe(1000);
|
||
expect(edgeCall.config).toEqual({
|
||
enabled: true,
|
||
voice: "zh-CN-XiaoxiaoNeural",
|
||
lang: "zh-CN",
|
||
outputFormat: "audio-24khz-48kbitrate-mono-mp3",
|
||
outputFormatConfigured: true,
|
||
pitch: undefined,
|
||
rate: undefined,
|
||
volume: undefined,
|
||
saveSubtitles: false,
|
||
proxy: undefined,
|
||
timeoutMs: undefined,
|
||
});
|
||
});
|
||
|
||
it("preserves an explicitly configured English voice for CJK text", async () => {
|
||
const provider = buildMicrosoftSpeechProvider();
|
||
const edgeSpy = vi.spyOn(ttsModule, "edgeTTS").mockImplementation(async ({ outputPath }) => {
|
||
writeFileSync(outputPath, Buffer.from([0xff, 0xfb, 0x90, 0x00]));
|
||
});
|
||
|
||
await provider.synthesize({
|
||
text: "你好,这是一个测试 hello",
|
||
cfg: TEST_CFG,
|
||
providerConfig: {
|
||
enabled: true,
|
||
voice: "en-US-AvaNeural",
|
||
lang: "en-US",
|
||
outputFormat: "audio-24khz-48kbitrate-mono-mp3",
|
||
outputFormatConfigured: true,
|
||
saveSubtitles: false,
|
||
},
|
||
providerOverrides: {},
|
||
timeoutMs: 1000,
|
||
target: "audio-file",
|
||
});
|
||
|
||
expect(edgeSpy).toHaveBeenCalledOnce();
|
||
const edgeCall = requireFirstEdgeTtsCall(edgeSpy);
|
||
expect(edgeCall.text).toBe("你好,这是一个测试 hello");
|
||
expect(path.basename(edgeCall.outputPath)).toBe("speech.mp3");
|
||
expect(edgeCall.timeoutMs).toBe(1000);
|
||
expect(edgeCall.config).toEqual({
|
||
enabled: true,
|
||
voice: "en-US-AvaNeural",
|
||
lang: "en-US",
|
||
outputFormat: "audio-24khz-48kbitrate-mono-mp3",
|
||
outputFormatConfigured: true,
|
||
pitch: undefined,
|
||
rate: undefined,
|
||
volume: undefined,
|
||
saveSubtitles: false,
|
||
proxy: undefined,
|
||
timeoutMs: undefined,
|
||
});
|
||
});
|
||
});
|