fix(providers): harden audio response schemas

2026-05-18 12:14:46 +00:00 · 2026-05-16 11:03:36 +08:00
parent 639107b7db
commit 202dd7590d
14 changed files with 354 additions and 34 deletions
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -41,6 +41,7 @@ Docs: https://docs.openclaw.ai
 - Providers: reject malformed successful Runway, BytePlus, and Ollama embedding responses with provider-owned errors instead of raw parser/type failures, silent bad vectors, or long bogus polling.
 - Providers/images: reject malformed successful OpenAI-compatible, OpenAI, Google, fal, and OpenRouter image responses with provider-owned errors instead of raw shape failures, silent invalid base64 skips, or empty image results.
 - Providers/videos: reject malformed successful xAI, OpenRouter, and fal video create, poll, and result responses with provider-owned errors instead of raw parser failures or long bogus polling.
+- Providers/audio: reject malformed successful OpenAI-compatible, ElevenLabs, and Deepgram speech responses with provider-owned errors instead of raw parser failures, wrong-shaped transcripts, or JSON/text bodies treated as audio.
 - Trajectory export: skip and report malformed session/runtime JSONL rows in `manifest.json` instead of letting wrong-shaped session rows crash support bundle export.
 - Voice calls: persist rejected inbound-call replay keys so duplicate carrier webhook retries stay ignored after a Gateway restart.
 - Config/doctor: copy fallback-enabled channel `allowFrom` entries into explicit `groupAllowFrom` allowlists during `openclaw doctor --fix`, preserving current group access without adding runtime fallback-transition flags.
--- a/extensions/deepgram/audio.test.ts
+++ b/extensions/deepgram/audio.test.ts
@@ -3,7 +3,7 @@ import {
  createRequestCaptureJsonFetch,
  installPinnedHostnameTestHooks,
 } from "openclaw/plugin-sdk/test-env";
-import { describe, expect, it } from "vitest";
+import { describe, expect, it, vi } from "vitest";
 import { transcribeDeepgramAudio } from "./audio.js";

 installPinnedHostnameTestHooks();
@@ -83,4 +83,64 @@ describe("transcribeDeepgramAudio", () => {
      }),
    ).rejects.toThrow("Audio transcription response missing transcript");
  });
+
+  it("wraps malformed successful transcription JSON with a stable provider error", async () => {
+    const fetchFn = vi.fn<typeof fetch>().mockResolvedValueOnce(new Response("{ nope"));
+
+    await expect(
+      transcribeDeepgramAudio({
+        buffer: Buffer.from("audio-bytes"),
+        fileName: "voice.wav",
+        apiKey: "test-key",
+        timeoutMs: 1234,
+        fetchFn,
+      }),
+    ).rejects.toThrow("Audio transcription failed: malformed JSON response");
+  });
+
+  it("rejects non-object successful transcription JSON with a stable provider error", async () => {
+    const fetchFn = vi.fn<typeof fetch>().mockResolvedValueOnce(new Response(JSON.stringify([])));
+
+    await expect(
+      transcribeDeepgramAudio({
+        buffer: Buffer.from("audio-bytes"),
+        fileName: "voice.wav",
+        apiKey: "test-key",
+        timeoutMs: 1234,
+        fetchFn,
+      }),
+    ).rejects.toThrow("Audio transcription failed: malformed JSON response");
+  });
+
+  it("rejects wrong nested transcript shapes with a stable provider error", async () => {
+    const { fetchFn } = createRequestCaptureJsonFetch({
+      results: { channels: { alternatives: [{ transcript: "hello" }] } },
+    });
+
+    await expect(
+      transcribeDeepgramAudio({
+        buffer: Buffer.from("audio-bytes"),
+        fileName: "voice.wav",
+        apiKey: "test-key",
+        timeoutMs: 1234,
+        fetchFn,
+      }),
+    ).rejects.toThrow("Audio transcription failed: malformed JSON response");
+  });
+
+  it("rejects non-string transcript values with a stable provider error", async () => {
+    const { fetchFn } = createRequestCaptureJsonFetch({
+      results: { channels: [{ alternatives: [{ transcript: 123 }] }] },
+    });
+
+    await expect(
+      transcribeDeepgramAudio({
+        buffer: Buffer.from("audio-bytes"),
+        fileName: "voice.wav",
+        apiKey: "test-key",
+        timeoutMs: 1234,
+        fetchFn,
+      }),
+    ).rejects.toThrow("Audio transcription failed: malformed JSON response");
+  });
 });
--- a/extensions/deepgram/audio.ts
+++ b/extensions/deepgram/audio.ts
@@ -5,6 +5,7 @@ import type {
 import {
  assertOkOrThrowHttpError,
  postTranscriptionRequest,
+  readProviderJsonObjectResponse,
  resolveProviderHttpRequestConfig,
  requireTranscriptionText,
 } from "openclaw/plugin-sdk/provider-http";
@@ -17,15 +18,36 @@ function resolveModel(model?: string): string {
  return trimmed || DEFAULT_DEEPGRAM_AUDIO_MODEL;
 }

-type DeepgramTranscriptResponse = {
-  results?: {
-    channels?: Array<{
-      alternatives?: Array<{
-        transcript?: string;
-      }>;
-    }>;
-  };
-};
+function asRecord(value: unknown): Record<string, unknown> | undefined {
+  return typeof value === "object" && value !== null && !Array.isArray(value)
+    ? (value as Record<string, unknown>)
+    : undefined;
+}
+
+function readDeepgramTranscript(payload: Record<string, unknown>): string | undefined {
+  const results = asRecord(payload.results);
+  if (!results) {
+    return undefined;
+  }
+  if (!Array.isArray(results.channels)) {
+    throw new Error("Audio transcription failed: malformed JSON response");
+  }
+  const channel = asRecord(results.channels[0]);
+  if (!channel) {
+    return undefined;
+  }
+  if (!Array.isArray(channel.alternatives)) {
+    throw new Error("Audio transcription failed: malformed JSON response");
+  }
+  const alternative = asRecord(channel.alternatives[0]);
+  if (!alternative) {
+    return undefined;
+  }
+  if (alternative.transcript !== undefined && typeof alternative.transcript !== "string") {
+    throw new Error("Audio transcription failed: malformed JSON response");
+  }
+  return alternative.transcript;
+}

 export async function transcribeDeepgramAudio(
  params: AudioTranscriptionRequest,
@@ -75,9 +97,9 @@ export async function transcribeDeepgramAudio(
  try {
    await assertOkOrThrowHttpError(res, "Audio transcription failed");

-    const payload = (await res.json()) as DeepgramTranscriptResponse;
+    const payload = await readProviderJsonObjectResponse(res, "Audio transcription failed");
    const transcript = requireTranscriptionText(
-      payload.results?.channels?.[0]?.alternatives?.[0]?.transcript,
+      readDeepgramTranscript(payload),
      "Audio transcription response missing transcript",
    );
    return { text: transcript, model };
--- a/extensions/elevenlabs/media-understanding-provider.test.ts
+++ b/extensions/elevenlabs/media-understanding-provider.test.ts
@@ -60,4 +60,36 @@ describe("elevenLabsMediaUnderstandingProvider", () => {
    expect(form.get("language_code")).toBe("en");
    expect(form.get("file")).toBeInstanceOf(Blob);
  });
+
+  it("wraps malformed successful speech-to-text JSON with a stable provider error", async () => {
+    const fetchMock = vi.fn<typeof fetch>().mockResolvedValue(new Response("{ nope"));
+
+    await expect(
+      transcribeElevenLabsAudio({
+        buffer: Buffer.from("audio"),
+        fileName: "voice.mp3",
+        mime: "audio/mpeg",
+        apiKey: "eleven-key",
+        model: "scribe_v2",
+        timeoutMs: 1000,
+        fetchFn: fetchMock,
+      }),
+    ).rejects.toThrow("ElevenLabs audio transcription failed: malformed JSON response");
+  });
+
+  it("rejects non-object successful speech-to-text JSON with a stable provider error", async () => {
+    const fetchMock = vi.fn<typeof fetch>().mockResolvedValue(new Response(JSON.stringify([])));
+
+    await expect(
+      transcribeElevenLabsAudio({
+        buffer: Buffer.from("audio"),
+        fileName: "voice.mp3",
+        mime: "audio/mpeg",
+        apiKey: "eleven-key",
+        model: "scribe_v2",
+        timeoutMs: 1000,
+        fetchFn: fetchMock,
+      }),
+    ).rejects.toThrow("ElevenLabs audio transcription failed: malformed JSON response");
+  });
 });
--- a/extensions/elevenlabs/media-understanding-provider.ts
+++ b/extensions/elevenlabs/media-understanding-provider.ts
@@ -7,6 +7,7 @@ import {
  assertOkOrThrowHttpError,
  buildAudioTranscriptionFormData,
  postTranscriptionRequest,
+  readProviderJsonObjectResponse,
  resolveProviderHttpRequestConfig,
  requireTranscriptionText,
 } from "openclaw/plugin-sdk/provider-http";
@@ -61,9 +62,12 @@ export async function transcribeElevenLabsAudio(

  try {
    await assertOkOrThrowHttpError(response, "ElevenLabs audio transcription failed");
-    const payload = (await response.json()) as { text?: string };
+    const payload = await readProviderJsonObjectResponse(
+      response,
+      "ElevenLabs audio transcription failed",
+    );
    const text = requireTranscriptionText(
-      payload.text,
+      typeof payload.text === "string" ? payload.text : undefined,
      "ElevenLabs audio transcription response missing text",
    );
    return { text, model };
--- a/extensions/elevenlabs/tts.test.ts
+++ b/extensions/elevenlabs/tts.test.ts
@@ -112,6 +112,25 @@ describe("elevenlabs tts diagnostics", () => {
    expect(getHeadersFromFirstFetchCall(fetchMock).get("accept")).toBe("audio/mpeg");
  });

+  it("rejects JSON success bodies as malformed audio", async () => {
+    const fetchMock = vi.fn(
+      async () =>
+        new Response(JSON.stringify({ error: "not audio" }), {
+          headers: { "content-type": "application/json" },
+        }),
+    );
+    globalThis.fetch = fetchMock as unknown as typeof fetch;
+
+    await expectDefaultTtsRequestToThrow("ElevenLabs API error: malformed audio response");
+  });
+
+  it("rejects empty successful audio bodies as malformed audio", async () => {
+    const fetchMock = vi.fn(async () => new Response(new Uint8Array()));
+    globalThis.fetch = fetchMock as unknown as typeof fetch;
+
+    await expectDefaultTtsRequestToThrow("ElevenLabs API error: malformed audio response");
+  });
+
  it("omits the MPEG Accept header for PCM telephony output", async () => {
    const fetchMock = vi.fn(async () => new Response(Buffer.from("pcm")));
    globalThis.fetch = fetchMock as unknown as typeof fetch;
@@ -176,4 +195,18 @@ describe("elevenlabs tts diagnostics", () => {
    expect(result.audioStream).toBeInstanceOf(ReadableStream);
    await result.release();
  });
+
+  it("rejects JSON success stream responses as malformed audio", async () => {
+    const fetchMock = vi.fn(
+      async () =>
+        new Response(JSON.stringify({ error: "not audio" }), {
+          headers: { "content-type": "application/json" },
+        }),
+    );
+    globalThis.fetch = fetchMock as unknown as typeof fetch;
+
+    await expect(elevenLabsTTSStream(createDefaultTtsRequest())).rejects.toThrow(
+      "ElevenLabs API error: malformed audio response",
+    );
+  });
 });
--- a/extensions/elevenlabs/tts.ts
+++ b/extensions/elevenlabs/tts.ts
@@ -1,4 +1,8 @@
-import { assertOkOrThrowProviderError } from "openclaw/plugin-sdk/provider-http";
+import {
+  assertOkOrThrowProviderError,
+  assertProviderBinaryResponseContent,
+  readProviderBinaryResponse,
+} from "openclaw/plugin-sdk/provider-http";
 import {
  normalizeApplyTextNormalization,
  normalizeLanguageCode,
@@ -143,7 +147,7 @@ export async function elevenLabsTTS(params: ElevenLabsTtsRequestParams): Promise
  try {
    await assertOkOrThrowProviderError(response, "ElevenLabs API error");

-    return Buffer.from(await response.arrayBuffer());
+    return Buffer.from(await readProviderBinaryResponse(response, "ElevenLabs API error", "audio"));
  } finally {
    await release();
  }
@@ -177,6 +181,7 @@ export async function elevenLabsTTSStream(params: ElevenLabsTtsRequestParams): P
  let handedOff = false;
  try {
    await assertOkOrThrowProviderError(response, "ElevenLabs API error");
+    assertProviderBinaryResponseContent(response, "ElevenLabs API error", "audio");
    if (!response.body) {
      throw new Error("ElevenLabs API response missing audio stream");
    }
--- a/src/agents/provider-http-errors.ts
+++ b/src/agents/provider-http-errors.ts
@@ -171,3 +171,51 @@ export async function readProviderJsonResponse<T>(response: Response, label: str
    throw new Error(`${label}: malformed JSON response`, { cause });
  }
 }
+
+export async function readProviderJsonObjectResponse(
+  response: Response,
+  label: string,
+): Promise<Record<string, unknown>> {
+  const payload = await readProviderJsonResponse<unknown>(response, label);
+  const object = asObject(payload);
+  if (!object) {
+    throw new Error(`${label}: malformed JSON response`);
+  }
+  return object;
+}
+
+function normalizeContentType(response: Response): string | undefined {
+  const contentType = response.headers.get("content-type")?.split(";")[0]?.trim().toLowerCase();
+  return contentType || undefined;
+}
+
+export function assertProviderBinaryResponseContent(
+  response: Response,
+  label: string,
+  kind = "binary",
+): void {
+  const contentType = normalizeContentType(response);
+  if (!contentType) {
+    return;
+  }
+  if (
+    contentType === "application/json" ||
+    contentType.endsWith("+json") ||
+    contentType.startsWith("text/")
+  ) {
+    throw new Error(`${label}: malformed ${kind} response`);
+  }
+}
+
+export async function readProviderBinaryResponse(
+  response: Response,
+  label: string,
+  kind = "binary",
+): Promise<Uint8Array> {
+  assertProviderBinaryResponseContent(response, label, kind);
+  const bytes = new Uint8Array(await response.arrayBuffer());
+  if (bytes.byteLength === 0) {
+    throw new Error(`${label}: malformed ${kind} response`);
+  }
+  return bytes;
+}
--- a/src/media-understanding/openai-compatible-audio.test.ts
+++ b/src/media-understanding/openai-compatible-audio.test.ts
@@ -88,4 +88,21 @@ describe("transcribeOpenAiCompatibleAudio", () => {
      }),
    ).rejects.toThrow("Audio transcription failed: malformed JSON response");
  });
+
+  it("rejects non-object successful transcription JSON with a stable provider error", async () => {
+    const fetchFn = vi.fn<typeof fetch>().mockResolvedValueOnce(new Response(JSON.stringify([])));
+
+    await expect(
+      transcribeOpenAiCompatibleAudio({
+        buffer: Buffer.from("audio"),
+        fileName: "note.mp3",
+        apiKey: "test-key",
+        timeoutMs: 1000,
+        fetchFn,
+        provider: "openai",
+        defaultBaseUrl: "https://api.openai.com/v1",
+        defaultModel: "gpt-4o-transcribe",
+      }),
+    ).rejects.toThrow("Audio transcription failed: malformed JSON response");
+  });
 });
--- a/src/media-understanding/openai-compatible-audio.ts
+++ b/src/media-understanding/openai-compatible-audio.ts
@@ -2,7 +2,7 @@ import {
  assertOkOrThrowHttpError,
  buildAudioTranscriptionFormData,
  postTranscriptionRequest,
-  readProviderJsonResponse,
+  readProviderJsonObjectResponse,
  resolveProviderHttpRequestConfig,
  requireTranscriptionText,
 } from "./shared.js";
@@ -65,12 +65,9 @@ export async function transcribeOpenAiCompatibleAudio(
  try {
    await assertOkOrThrowHttpError(res, "Audio transcription failed");

-    const payload = await readProviderJsonResponse<{ text?: string }>(
-      res,
-      "Audio transcription failed",
-    );
+    const payload = await readProviderJsonObjectResponse(res, "Audio transcription failed");
    const text = requireTranscriptionText(
-      payload.text,
+      typeof payload.text === "string" ? payload.text : undefined,
      "Audio transcription response missing text",
    );
    return { text, model };
--- a/src/media-understanding/shared.ts
+++ b/src/media-understanding/shared.ts
@@ -6,6 +6,7 @@ import {
 } from "../agents/provider-http-errors.js";
 export {
  assertOkOrThrowHttpError,
+  readProviderJsonObjectResponse,
  readProviderJsonResponse,
 } from "../agents/provider-http-errors.js";
 import type {
--- a/src/plugin-sdk/provider-http.ts
+++ b/src/plugin-sdk/provider-http.ts
@@ -4,11 +4,14 @@
 export {
  assertOkOrThrowHttpError,
  assertOkOrThrowProviderError,
+  assertProviderBinaryResponseContent,
  createProviderHttpError,
  extractProviderErrorDetail,
  extractProviderRequestId,
  formatProviderErrorPayload,
  formatProviderHttpErrorMessage,
+  readProviderBinaryResponse,
+  readProviderJsonObjectResponse,
  readProviderJsonResponse,
  readResponseTextLimited,
  truncateErrorDetail,
--- a/src/tts/openai-compatible-speech-provider.test.ts
+++ b/src/tts/openai-compatible-speech-provider.test.ts
@@ -1,21 +1,37 @@
 import { afterEach, describe, expect, it, vi } from "vitest";
 import { createOpenAiCompatibleSpeechProvider } from "./openai-compatible-speech-provider.js";

-const { assertOkOrThrowHttpErrorMock, postJsonRequestMock, resolveProviderHttpRequestConfigMock } =
-  vi.hoisted(() => ({
-    assertOkOrThrowHttpErrorMock: vi.fn(async () => {}),
-    postJsonRequestMock: vi.fn(),
-    resolveProviderHttpRequestConfigMock: vi.fn((params: Record<string, unknown>) => ({
-      baseUrl: params.baseUrl ?? params.defaultBaseUrl ?? "https://example.test/v1",
-      allowPrivateNetwork: false,
-      headers: new Headers(params.defaultHeaders as HeadersInit | undefined),
-      dispatcherPolicy: undefined,
-    })),
-  }));
+const {
+  assertOkOrThrowHttpErrorMock,
+  postJsonRequestMock,
+  readProviderBinaryResponseMock,
+  resolveProviderHttpRequestConfigMock,
+} = vi.hoisted(() => ({
+  assertOkOrThrowHttpErrorMock: vi.fn(async () => {}),
+  postJsonRequestMock: vi.fn(),
+  readProviderBinaryResponseMock: vi.fn(async (response: Response, label: string) => {
+    const contentType = response.headers.get("content-type")?.split(";")[0]?.trim().toLowerCase();
+    if (contentType === "application/json" || contentType?.startsWith("text/")) {
+      throw new Error(`${label}: malformed audio response`);
+    }
+    const bytes = new Uint8Array(await response.arrayBuffer());
+    if (bytes.byteLength === 0) {
+      throw new Error(`${label}: malformed audio response`);
+    }
+    return bytes;
+  }),
+  resolveProviderHttpRequestConfigMock: vi.fn((params: Record<string, unknown>) => ({
+    baseUrl: params.baseUrl ?? params.defaultBaseUrl ?? "https://example.test/v1",
+    allowPrivateNetwork: false,
+    headers: new Headers(params.defaultHeaders as HeadersInit | undefined),
+    dispatcherPolicy: undefined,
+  })),
+}));

 vi.mock("openclaw/plugin-sdk/provider-http", () => ({
  assertOkOrThrowHttpError: assertOkOrThrowHttpErrorMock,
  postJsonRequest: postJsonRequestMock,
+  readProviderBinaryResponse: readProviderBinaryResponseMock,
  resolveProviderHttpRequestConfig: resolveProviderHttpRequestConfigMock,
 }));

@@ -35,6 +51,7 @@ describe("createOpenAiCompatibleSpeechProvider", () => {
  afterEach(() => {
    assertOkOrThrowHttpErrorMock.mockClear();
    postJsonRequestMock.mockReset();
+    readProviderBinaryResponseMock.mockClear();
    resolveProviderHttpRequestConfigMock.mockClear();
    vi.unstubAllEnvs();
  });
@@ -159,4 +176,77 @@ describe("createOpenAiCompatibleSpeechProvider", () => {
    expect(result.voiceCompatible).toBe(true);
    expect(release).toHaveBeenCalledOnce();
  });
+
+  it("rejects JSON success bodies from TTS responses as malformed audio", async () => {
+    const release = vi.fn(async () => {});
+    postJsonRequestMock.mockResolvedValue({
+      response: new Response(JSON.stringify({ error: "not audio" }), {
+        status: 200,
+        headers: { "content-type": "application/json" },
+      }),
+      release,
+    });
+    vi.stubEnv("DEMO_API_KEY", "sk-env");
+
+    const provider = createOpenAiCompatibleSpeechProvider({
+      id: "demo",
+      label: "Demo",
+      autoSelectOrder: 40,
+      models: ["demo-tts"],
+      voices: ["alloy"],
+      defaultModel: "demo-tts",
+      defaultVoice: "alloy",
+      defaultBaseUrl: "https://example.test/v1",
+      envKey: "DEMO_API_KEY",
+      responseFormats: ["mp3"],
+      defaultResponseFormat: "mp3",
+      voiceCompatibleResponseFormats: ["mp3"],
+    });
+
+    await expect(
+      provider.synthesize({
+        text: "hello",
+        cfg: {} as never,
+        providerConfig: {},
+        target: "voice-note",
+        timeoutMs: 1234,
+      }),
+    ).rejects.toThrow("Demo TTS API error: malformed audio response");
+    expect(release).toHaveBeenCalledOnce();
+  });
+
+  it("rejects empty successful TTS bodies as malformed audio", async () => {
+    const release = vi.fn(async () => {});
+    postJsonRequestMock.mockResolvedValue({
+      response: new Response(new Uint8Array(), { status: 200 }),
+      release,
+    });
+    vi.stubEnv("DEMO_API_KEY", "sk-env");
+
+    const provider = createOpenAiCompatibleSpeechProvider({
+      id: "demo",
+      label: "Demo",
+      autoSelectOrder: 40,
+      models: ["demo-tts"],
+      voices: ["alloy"],
+      defaultModel: "demo-tts",
+      defaultVoice: "alloy",
+      defaultBaseUrl: "https://example.test/v1",
+      envKey: "DEMO_API_KEY",
+      responseFormats: ["mp3"],
+      defaultResponseFormat: "mp3",
+      voiceCompatibleResponseFormats: ["mp3"],
+    });
+
+    await expect(
+      provider.synthesize({
+        text: "hello",
+        cfg: {} as never,
+        providerConfig: {},
+        target: "voice-note",
+        timeoutMs: 1234,
+      }),
+    ).rejects.toThrow("Demo TTS API error: malformed audio response");
+    expect(release).toHaveBeenCalledOnce();
+  });
 });
--- a/src/tts/openai-compatible-speech-provider.ts
+++ b/src/tts/openai-compatible-speech-provider.ts
@@ -1,6 +1,7 @@
 import {
  assertOkOrThrowHttpError,
  postJsonRequest,
+  readProviderBinaryResponse,
  resolveProviderHttpRequestConfig,
 } from "openclaw/plugin-sdk/provider-http";
 import { normalizeResolvedSecretInputString } from "openclaw/plugin-sdk/secret-input";
@@ -382,7 +383,13 @@ export function createOpenAiCompatibleSpeechProvider<
          options.apiErrorLabel ?? `${options.label} TTS API error`,
        );
        return {
-          audioBuffer: Buffer.from(await response.arrayBuffer()),
+          audioBuffer: Buffer.from(
+            await readProviderBinaryResponse(
+              response,
+              options.apiErrorLabel ?? `${options.label} TTS API error`,
+              "audio",
+            ),
+          ),
          outputFormat: responseFormat,
          fileExtension: responseFormatToFileExtension(responseFormat),
          voiceCompatible: options.voiceCompatibleResponseFormats.includes(responseFormat),