fix(providers): harden audio response schemas

This commit is contained in:
Vincent Koc
2026-05-16 11:03:36 +08:00
parent 639107b7db
commit 202dd7590d
14 changed files with 354 additions and 34 deletions

View File

@@ -41,6 +41,7 @@ Docs: https://docs.openclaw.ai
- Providers: reject malformed successful Runway, BytePlus, and Ollama embedding responses with provider-owned errors instead of raw parser/type failures, silent bad vectors, or long bogus polling.
- Providers/images: reject malformed successful OpenAI-compatible, OpenAI, Google, fal, and OpenRouter image responses with provider-owned errors instead of raw shape failures, silent invalid base64 skips, or empty image results.
- Providers/videos: reject malformed successful xAI, OpenRouter, and fal video create, poll, and result responses with provider-owned errors instead of raw parser failures or long bogus polling.
- Providers/audio: reject malformed successful OpenAI-compatible, ElevenLabs, and Deepgram speech responses with provider-owned errors instead of raw parser failures, wrong-shaped transcripts, or JSON/text bodies treated as audio.
- Trajectory export: skip and report malformed session/runtime JSONL rows in `manifest.json` instead of letting wrong-shaped session rows crash support bundle export.
- Voice calls: persist rejected inbound-call replay keys so duplicate carrier webhook retries stay ignored after a Gateway restart.
- Config/doctor: copy fallback-enabled channel `allowFrom` entries into explicit `groupAllowFrom` allowlists during `openclaw doctor --fix`, preserving current group access without adding runtime fallback-transition flags.

View File

@@ -3,7 +3,7 @@ import {
createRequestCaptureJsonFetch,
installPinnedHostnameTestHooks,
} from "openclaw/plugin-sdk/test-env";
import { describe, expect, it } from "vitest";
import { describe, expect, it, vi } from "vitest";
import { transcribeDeepgramAudio } from "./audio.js";
installPinnedHostnameTestHooks();
@@ -83,4 +83,64 @@ describe("transcribeDeepgramAudio", () => {
}),
).rejects.toThrow("Audio transcription response missing transcript");
});
it("wraps malformed successful transcription JSON with a stable provider error", async () => {
const fetchFn = vi.fn<typeof fetch>().mockResolvedValueOnce(new Response("{ nope"));
await expect(
transcribeDeepgramAudio({
buffer: Buffer.from("audio-bytes"),
fileName: "voice.wav",
apiKey: "test-key",
timeoutMs: 1234,
fetchFn,
}),
).rejects.toThrow("Audio transcription failed: malformed JSON response");
});
it("rejects non-object successful transcription JSON with a stable provider error", async () => {
const fetchFn = vi.fn<typeof fetch>().mockResolvedValueOnce(new Response(JSON.stringify([])));
await expect(
transcribeDeepgramAudio({
buffer: Buffer.from("audio-bytes"),
fileName: "voice.wav",
apiKey: "test-key",
timeoutMs: 1234,
fetchFn,
}),
).rejects.toThrow("Audio transcription failed: malformed JSON response");
});
it("rejects wrong nested transcript shapes with a stable provider error", async () => {
const { fetchFn } = createRequestCaptureJsonFetch({
results: { channels: { alternatives: [{ transcript: "hello" }] } },
});
await expect(
transcribeDeepgramAudio({
buffer: Buffer.from("audio-bytes"),
fileName: "voice.wav",
apiKey: "test-key",
timeoutMs: 1234,
fetchFn,
}),
).rejects.toThrow("Audio transcription failed: malformed JSON response");
});
it("rejects non-string transcript values with a stable provider error", async () => {
const { fetchFn } = createRequestCaptureJsonFetch({
results: { channels: [{ alternatives: [{ transcript: 123 }] }] },
});
await expect(
transcribeDeepgramAudio({
buffer: Buffer.from("audio-bytes"),
fileName: "voice.wav",
apiKey: "test-key",
timeoutMs: 1234,
fetchFn,
}),
).rejects.toThrow("Audio transcription failed: malformed JSON response");
});
});

View File

@@ -5,6 +5,7 @@ import type {
import {
assertOkOrThrowHttpError,
postTranscriptionRequest,
readProviderJsonObjectResponse,
resolveProviderHttpRequestConfig,
requireTranscriptionText,
} from "openclaw/plugin-sdk/provider-http";
@@ -17,15 +18,36 @@ function resolveModel(model?: string): string {
return trimmed || DEFAULT_DEEPGRAM_AUDIO_MODEL;
}
/**
 * Subset of the Deepgram transcription response that the transcriber reads:
 * the transcript of the first alternative of the first channel. Every level
 * is optional, so consumers must use optional chaining when walking it.
 */
type DeepgramTranscriptResponse = {
  results?: {
    channels?: {
      alternatives?: {
        /** Recognized text for this alternative, when present. */
        transcript?: string;
      }[];
    }[];
  };
};
/**
 * Narrows an arbitrary value to a keyed object.
 *
 * @param value - Any value, e.g. a fragment of parsed JSON.
 * @returns The same reference when `value` is a non-null, non-array object;
 *   otherwise `undefined`.
 */
function asRecord(value: unknown): Record<string, unknown> | undefined {
  // `typeof null` is "object" and arrays are objects too, so both are excluded explicitly.
  if (value === null || typeof value !== "object" || Array.isArray(value)) {
    return undefined;
  }
  return value as Record<string, unknown>;
}
/**
 * Pulls the transcript of the first alternative of the first channel out of a
 * parsed transcription payload, validating the container shapes on the way down.
 *
 * @param payload - Parsed top-level JSON object of the transcription response.
 * @returns The transcript string, or `undefined` when `results`, the first
 *   channel, or the first alternative is not a non-array object, or when the
 *   alternative carries no `transcript` field.
 * @throws Error with message "Audio transcription failed: malformed JSON response"
 *   when `results.channels` or the first channel's `alternatives` is not an
 *   array, or when `transcript` is present but not a string.
 */
function readDeepgramTranscript(payload: Record<string, unknown>): string | undefined {
  const malformedMessage = "Audio transcription failed: malformed JSON response";

  const resultsNode = asRecord(payload.results);
  if (resultsNode === undefined) {
    return undefined;
  }

  const channelList = resultsNode.channels;
  if (!Array.isArray(channelList)) {
    throw new Error(malformedMessage);
  }

  // Only the first channel is consulted; an empty array falls through to `undefined`.
  const firstChannel = asRecord(channelList[0]);
  if (firstChannel === undefined) {
    return undefined;
  }

  const alternativeList = firstChannel.alternatives;
  if (!Array.isArray(alternativeList)) {
    throw new Error(malformedMessage);
  }

  // Likewise only the first alternative is consulted.
  const firstAlternative = asRecord(alternativeList[0]);
  if (firstAlternative === undefined) {
    return undefined;
  }

  const transcript = firstAlternative.transcript;
  if (transcript === undefined || typeof transcript === "string") {
    return transcript;
  }
  throw new Error(malformedMessage);
}
export async function transcribeDeepgramAudio(
params: AudioTranscriptionRequest,
@@ -75,9 +97,9 @@ export async function transcribeDeepgramAudio(
try {
await assertOkOrThrowHttpError(res, "Audio transcription failed");
const payload = (await res.json()) as DeepgramTranscriptResponse;
const payload = await readProviderJsonObjectResponse(res, "Audio transcription failed");
const transcript = requireTranscriptionText(
payload.results?.channels?.[0]?.alternatives?.[0]?.transcript,
readDeepgramTranscript(payload),
"Audio transcription response missing transcript",
);
return { text: transcript, model };

View File

@@ -60,4 +60,36 @@ describe("elevenLabsMediaUnderstandingProvider", () => {
expect(form.get("language_code")).toBe("en");
expect(form.get("file")).toBeInstanceOf(Blob);
});
it("wraps malformed successful speech-to-text JSON with a stable provider error", async () => {
const fetchMock = vi.fn<typeof fetch>().mockResolvedValue(new Response("{ nope"));
await expect(
transcribeElevenLabsAudio({
buffer: Buffer.from("audio"),
fileName: "voice.mp3",
mime: "audio/mpeg",
apiKey: "eleven-key",
model: "scribe_v2",
timeoutMs: 1000,
fetchFn: fetchMock,
}),
).rejects.toThrow("ElevenLabs audio transcription failed: malformed JSON response");
});
it("rejects non-object successful speech-to-text JSON with a stable provider error", async () => {
const fetchMock = vi.fn<typeof fetch>().mockResolvedValue(new Response(JSON.stringify([])));
await expect(
transcribeElevenLabsAudio({
buffer: Buffer.from("audio"),
fileName: "voice.mp3",
mime: "audio/mpeg",
apiKey: "eleven-key",
model: "scribe_v2",
timeoutMs: 1000,
fetchFn: fetchMock,
}),
).rejects.toThrow("ElevenLabs audio transcription failed: malformed JSON response");
});
});

View File

@@ -7,6 +7,7 @@ import {
assertOkOrThrowHttpError,
buildAudioTranscriptionFormData,
postTranscriptionRequest,
readProviderJsonObjectResponse,
resolveProviderHttpRequestConfig,
requireTranscriptionText,
} from "openclaw/plugin-sdk/provider-http";
@@ -61,9 +62,12 @@ export async function transcribeElevenLabsAudio(
try {
await assertOkOrThrowHttpError(response, "ElevenLabs audio transcription failed");
const payload = (await response.json()) as { text?: string };
const payload = await readProviderJsonObjectResponse(
response,
"ElevenLabs audio transcription failed",
);
const text = requireTranscriptionText(
payload.text,
typeof payload.text === "string" ? payload.text : undefined,
"ElevenLabs audio transcription response missing text",
);
return { text, model };

View File

@@ -112,6 +112,25 @@ describe("elevenlabs tts diagnostics", () => {
expect(getHeadersFromFirstFetchCall(fetchMock).get("accept")).toBe("audio/mpeg");
});
it("rejects JSON success bodies as malformed audio", async () => {
const fetchMock = vi.fn(
async () =>
new Response(JSON.stringify({ error: "not audio" }), {
headers: { "content-type": "application/json" },
}),
);
globalThis.fetch = fetchMock as unknown as typeof fetch;
await expectDefaultTtsRequestToThrow("ElevenLabs API error: malformed audio response");
});
it("rejects empty successful audio bodies as malformed audio", async () => {
const fetchMock = vi.fn(async () => new Response(new Uint8Array()));
globalThis.fetch = fetchMock as unknown as typeof fetch;
await expectDefaultTtsRequestToThrow("ElevenLabs API error: malformed audio response");
});
it("omits the MPEG Accept header for PCM telephony output", async () => {
const fetchMock = vi.fn(async () => new Response(Buffer.from("pcm")));
globalThis.fetch = fetchMock as unknown as typeof fetch;
@@ -176,4 +195,18 @@ describe("elevenlabs tts diagnostics", () => {
expect(result.audioStream).toBeInstanceOf(ReadableStream);
await result.release();
});
it("rejects JSON success stream responses as malformed audio", async () => {
const fetchMock = vi.fn(
async () =>
new Response(JSON.stringify({ error: "not audio" }), {
headers: { "content-type": "application/json" },
}),
);
globalThis.fetch = fetchMock as unknown as typeof fetch;
await expect(elevenLabsTTSStream(createDefaultTtsRequest())).rejects.toThrow(
"ElevenLabs API error: malformed audio response",
);
});
});

View File

@@ -1,4 +1,8 @@
import { assertOkOrThrowProviderError } from "openclaw/plugin-sdk/provider-http";
import {
assertOkOrThrowProviderError,
assertProviderBinaryResponseContent,
readProviderBinaryResponse,
} from "openclaw/plugin-sdk/provider-http";
import {
normalizeApplyTextNormalization,
normalizeLanguageCode,
@@ -143,7 +147,7 @@ export async function elevenLabsTTS(params: ElevenLabsTtsRequestParams): Promise
try {
await assertOkOrThrowProviderError(response, "ElevenLabs API error");
return Buffer.from(await response.arrayBuffer());
return Buffer.from(await readProviderBinaryResponse(response, "ElevenLabs API error", "audio"));
} finally {
await release();
}
@@ -177,6 +181,7 @@ export async function elevenLabsTTSStream(params: ElevenLabsTtsRequestParams): P
let handedOff = false;
try {
await assertOkOrThrowProviderError(response, "ElevenLabs API error");
assertProviderBinaryResponseContent(response, "ElevenLabs API error", "audio");
if (!response.body) {
throw new Error("ElevenLabs API response missing audio stream");
}

View File

@@ -171,3 +171,51 @@ export async function readProviderJsonResponse<T>(response: Response, label: str
throw new Error(`${label}: malformed JSON response`, { cause });
}
}
/**
 * Reads a provider response body as JSON and requires the top-level value to
 * pass the `asObject` check before handing it back.
 *
 * @param response - Response whose body is parsed via `readProviderJsonResponse`.
 * @param label - Prefix used in the thrown error message.
 * @returns The parsed payload as a string-keyed record.
 * @throws Error `${label}: malformed JSON response` when `asObject` yields a
 *   falsy result for the parsed payload; errors raised by
 *   `readProviderJsonResponse` propagate unchanged.
 */
export async function readProviderJsonObjectResponse(
  response: Response,
  label: string,
): Promise<Record<string, unknown>> {
  const parsed = asObject(await readProviderJsonResponse<unknown>(response, label));
  if (parsed) {
    return parsed;
  }
  throw new Error(`${label}: malformed JSON response`);
}
/**
 * Extracts the bare media type from a response's `content-type` header.
 *
 * @param response - Response whose headers are inspected.
 * @returns The part before the first `;`, trimmed and lower-cased, or
 *   `undefined` when the header is missing or that part is empty.
 */
function normalizeContentType(response: Response): string | undefined {
  const rawHeader = response.headers.get("content-type");
  if (rawHeader == null) {
    return undefined;
  }
  // Drop parameters such as `; charset=utf-8` and keep only the media type.
  const [mediaType] = rawHeader.split(";");
  const normalized = mediaType?.trim().toLowerCase();
  return normalized ? normalized : undefined;
}
/**
 * Rejects responses whose declared content type is JSON or text where a
 * binary payload is expected. Only the header is inspected; the body is not read.
 *
 * @param response - Response whose `content-type` header is checked.
 * @param label - Prefix used in the thrown error message.
 * @param kind - Payload noun used in the error message (defaults to "binary").
 * @throws Error `${label}: malformed ${kind} response` when the media type is
 *   `application/json`, ends with `+json`, or starts with `text/`.
 */
export function assertProviderBinaryResponseContent(
  response: Response,
  label: string,
  kind = "binary",
): void {
  const mediaType = normalizeContentType(response);
  if (!mediaType) {
    // No declared content type: nothing to judge, so the response is accepted.
    return;
  }
  const declaresJson = mediaType === "application/json" || mediaType.endsWith("+json");
  const declaresText = mediaType.startsWith("text/");
  if (declaresJson || declaresText) {
    throw new Error(`${label}: malformed ${kind} response`);
  }
}
/**
 * Reads a provider response body as bytes after checking its declared content
 * type, and rejects empty bodies.
 *
 * @param response - Response whose body is consumed via `arrayBuffer()`.
 * @param label - Prefix used in the thrown error message.
 * @param kind - Payload noun used in the error message (defaults to "binary").
 * @returns The body as a `Uint8Array` with at least one byte.
 * @throws Error `${label}: malformed ${kind} response` when the content-type
 *   check fails or the body has zero length.
 */
export async function readProviderBinaryResponse(
  response: Response,
  label: string,
  kind = "binary",
): Promise<Uint8Array> {
  // Header check comes first so a JSON/text body is rejected before it is read.
  assertProviderBinaryResponseContent(response, label, kind);
  const payload = new Uint8Array(await response.arrayBuffer());
  if (payload.byteLength > 0) {
    return payload;
  }
  throw new Error(`${label}: malformed ${kind} response`);
}

View File

@@ -88,4 +88,21 @@ describe("transcribeOpenAiCompatibleAudio", () => {
}),
).rejects.toThrow("Audio transcription failed: malformed JSON response");
});
it("rejects non-object successful transcription JSON with a stable provider error", async () => {
const fetchFn = vi.fn<typeof fetch>().mockResolvedValueOnce(new Response(JSON.stringify([])));
await expect(
transcribeOpenAiCompatibleAudio({
buffer: Buffer.from("audio"),
fileName: "note.mp3",
apiKey: "test-key",
timeoutMs: 1000,
fetchFn,
provider: "openai",
defaultBaseUrl: "https://api.openai.com/v1",
defaultModel: "gpt-4o-transcribe",
}),
).rejects.toThrow("Audio transcription failed: malformed JSON response");
});
});

View File

@@ -2,7 +2,7 @@ import {
assertOkOrThrowHttpError,
buildAudioTranscriptionFormData,
postTranscriptionRequest,
readProviderJsonResponse,
readProviderJsonObjectResponse,
resolveProviderHttpRequestConfig,
requireTranscriptionText,
} from "./shared.js";
@@ -65,12 +65,9 @@ export async function transcribeOpenAiCompatibleAudio(
try {
await assertOkOrThrowHttpError(res, "Audio transcription failed");
const payload = await readProviderJsonResponse<{ text?: string }>(
res,
"Audio transcription failed",
);
const payload = await readProviderJsonObjectResponse(res, "Audio transcription failed");
const text = requireTranscriptionText(
payload.text,
typeof payload.text === "string" ? payload.text : undefined,
"Audio transcription response missing text",
);
return { text, model };

View File

@@ -6,6 +6,7 @@ import {
} from "../agents/provider-http-errors.js";
export {
assertOkOrThrowHttpError,
readProviderJsonObjectResponse,
readProviderJsonResponse,
} from "../agents/provider-http-errors.js";
import type {

View File

@@ -4,11 +4,14 @@
export {
assertOkOrThrowHttpError,
assertOkOrThrowProviderError,
assertProviderBinaryResponseContent,
createProviderHttpError,
extractProviderErrorDetail,
extractProviderRequestId,
formatProviderErrorPayload,
formatProviderHttpErrorMessage,
readProviderBinaryResponse,
readProviderJsonObjectResponse,
readProviderJsonResponse,
readResponseTextLimited,
truncateErrorDetail,

View File

@@ -1,21 +1,37 @@
import { afterEach, describe, expect, it, vi } from "vitest";
import { createOpenAiCompatibleSpeechProvider } from "./openai-compatible-speech-provider.js";
const { assertOkOrThrowHttpErrorMock, postJsonRequestMock, resolveProviderHttpRequestConfigMock } =
vi.hoisted(() => ({
assertOkOrThrowHttpErrorMock: vi.fn(async () => {}),
postJsonRequestMock: vi.fn(),
resolveProviderHttpRequestConfigMock: vi.fn((params: Record<string, unknown>) => ({
baseUrl: params.baseUrl ?? params.defaultBaseUrl ?? "https://example.test/v1",
allowPrivateNetwork: false,
headers: new Headers(params.defaultHeaders as HeadersInit | undefined),
dispatcherPolicy: undefined,
})),
}));
const {
assertOkOrThrowHttpErrorMock,
postJsonRequestMock,
readProviderBinaryResponseMock,
resolveProviderHttpRequestConfigMock,
} = vi.hoisted(() => ({
assertOkOrThrowHttpErrorMock: vi.fn(async () => {}),
postJsonRequestMock: vi.fn(),
readProviderBinaryResponseMock: vi.fn(async (response: Response, label: string) => {
const contentType = response.headers.get("content-type")?.split(";")[0]?.trim().toLowerCase();
if (contentType === "application/json" || contentType?.startsWith("text/")) {
throw new Error(`${label}: malformed audio response`);
}
const bytes = new Uint8Array(await response.arrayBuffer());
if (bytes.byteLength === 0) {
throw new Error(`${label}: malformed audio response`);
}
return bytes;
}),
resolveProviderHttpRequestConfigMock: vi.fn((params: Record<string, unknown>) => ({
baseUrl: params.baseUrl ?? params.defaultBaseUrl ?? "https://example.test/v1",
allowPrivateNetwork: false,
headers: new Headers(params.defaultHeaders as HeadersInit | undefined),
dispatcherPolicy: undefined,
})),
}));
vi.mock("openclaw/plugin-sdk/provider-http", () => ({
assertOkOrThrowHttpError: assertOkOrThrowHttpErrorMock,
postJsonRequest: postJsonRequestMock,
readProviderBinaryResponse: readProviderBinaryResponseMock,
resolveProviderHttpRequestConfig: resolveProviderHttpRequestConfigMock,
}));
@@ -35,6 +51,7 @@ describe("createOpenAiCompatibleSpeechProvider", () => {
afterEach(() => {
assertOkOrThrowHttpErrorMock.mockClear();
postJsonRequestMock.mockReset();
readProviderBinaryResponseMock.mockClear();
resolveProviderHttpRequestConfigMock.mockClear();
vi.unstubAllEnvs();
});
@@ -159,4 +176,77 @@ describe("createOpenAiCompatibleSpeechProvider", () => {
expect(result.voiceCompatible).toBe(true);
expect(release).toHaveBeenCalledOnce();
});
it("rejects JSON success bodies from TTS responses as malformed audio", async () => {
const release = vi.fn(async () => {});
postJsonRequestMock.mockResolvedValue({
response: new Response(JSON.stringify({ error: "not audio" }), {
status: 200,
headers: { "content-type": "application/json" },
}),
release,
});
vi.stubEnv("DEMO_API_KEY", "sk-env");
const provider = createOpenAiCompatibleSpeechProvider({
id: "demo",
label: "Demo",
autoSelectOrder: 40,
models: ["demo-tts"],
voices: ["alloy"],
defaultModel: "demo-tts",
defaultVoice: "alloy",
defaultBaseUrl: "https://example.test/v1",
envKey: "DEMO_API_KEY",
responseFormats: ["mp3"],
defaultResponseFormat: "mp3",
voiceCompatibleResponseFormats: ["mp3"],
});
await expect(
provider.synthesize({
text: "hello",
cfg: {} as never,
providerConfig: {},
target: "voice-note",
timeoutMs: 1234,
}),
).rejects.toThrow("Demo TTS API error: malformed audio response");
expect(release).toHaveBeenCalledOnce();
});
it("rejects empty successful TTS bodies as malformed audio", async () => {
const release = vi.fn(async () => {});
postJsonRequestMock.mockResolvedValue({
response: new Response(new Uint8Array(), { status: 200 }),
release,
});
vi.stubEnv("DEMO_API_KEY", "sk-env");
const provider = createOpenAiCompatibleSpeechProvider({
id: "demo",
label: "Demo",
autoSelectOrder: 40,
models: ["demo-tts"],
voices: ["alloy"],
defaultModel: "demo-tts",
defaultVoice: "alloy",
defaultBaseUrl: "https://example.test/v1",
envKey: "DEMO_API_KEY",
responseFormats: ["mp3"],
defaultResponseFormat: "mp3",
voiceCompatibleResponseFormats: ["mp3"],
});
await expect(
provider.synthesize({
text: "hello",
cfg: {} as never,
providerConfig: {},
target: "voice-note",
timeoutMs: 1234,
}),
).rejects.toThrow("Demo TTS API error: malformed audio response");
expect(release).toHaveBeenCalledOnce();
});
});

View File

@@ -1,6 +1,7 @@
import {
assertOkOrThrowHttpError,
postJsonRequest,
readProviderBinaryResponse,
resolveProviderHttpRequestConfig,
} from "openclaw/plugin-sdk/provider-http";
import { normalizeResolvedSecretInputString } from "openclaw/plugin-sdk/secret-input";
@@ -382,7 +383,13 @@ export function createOpenAiCompatibleSpeechProvider<
options.apiErrorLabel ?? `${options.label} TTS API error`,
);
return {
audioBuffer: Buffer.from(await response.arrayBuffer()),
audioBuffer: Buffer.from(
await readProviderBinaryResponse(
response,
options.apiErrorLabel ?? `${options.label} TTS API error`,
"audio",
),
),
outputFormat: responseFormat,
fileExtension: responseFormatToFileExtension(responseFormat),
voiceCompatible: options.voiceCompatibleResponseFormats.includes(responseFormat),