From 202dd7590d1cbbb2ecf762bab31a7707f6da7900 Mon Sep 17 00:00:00 2001
From: Vincent Koc <vincentkoc@ieee.org>
Date: Sat, 16 May 2026 11:03:36 +0800
Subject: [PATCH] fix(providers): harden audio response schemas

---
 CHANGELOG.md                                  |   1 +
 extensions/deepgram/audio.test.ts             |  62 +++++++++-
 extensions/deepgram/audio.ts                  |  44 +++++--
 .../media-understanding-provider.test.ts      |  32 +++++
 .../media-understanding-provider.ts           |   8 +-
 extensions/elevenlabs/tts.test.ts             |  33 ++++++
 extensions/elevenlabs/tts.ts                  |   9 +-
 src/agents/provider-http-errors.ts            |  48 ++++++++
 .../openai-compatible-audio.test.ts           |  17 +++
 .../openai-compatible-audio.ts                |   9 +-
 src/media-understanding/shared.ts             |   1 +
 src/plugin-sdk/provider-http.ts               |   3 +
 .../openai-compatible-speech-provider.test.ts | 112 ++++++++++++++++--
 src/tts/openai-compatible-speech-provider.ts  |   9 +-
 14 files changed, 354 insertions(+), 34 deletions(-)
diff --git a/CHANGELOG.md b/CHANGELOG.md
index 365a314876c..8e3b143a126 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -41,6 +41,7 @@ Docs: https://docs.openclaw.ai
 - Providers: reject malformed successful Runway, BytePlus, and Ollama embedding responses with provider-owned errors instead of raw parser/type failures, silent bad vectors, or long bogus polling.
 - Providers/images: reject malformed successful OpenAI-compatible, OpenAI, Google, fal, and OpenRouter image responses with provider-owned errors instead of raw shape failures, silent invalid base64 skips, or empty image results.
 - Providers/videos: reject malformed successful xAI, OpenRouter, and fal video create, poll, and result responses with provider-owned errors instead of raw parser failures or long bogus polling.
+- Providers/audio: reject malformed successful OpenAI-compatible, ElevenLabs, and Deepgram transcription and speech responses with provider-owned errors instead of raw parser failures, wrong-shaped transcripts, or empty, JSON, or text bodies treated as audio.
 - Trajectory export: skip and report malformed session/runtime JSONL rows in `manifest.json` instead of letting wrong-shaped session rows crash support bundle export.
 - Voice calls: persist rejected inbound-call replay keys so duplicate carrier webhook retries stay ignored after a Gateway restart.
 - Config/doctor: copy fallback-enabled channel `allowFrom` entries into explicit `groupAllowFrom` allowlists during `openclaw doctor --fix`, preserving current group access without adding runtime fallback-transition flags.
diff --git a/extensions/deepgram/audio.test.ts b/extensions/deepgram/audio.test.ts
index 92f424d0cee..60ab395f776 100644
--- a/extensions/deepgram/audio.test.ts
+++ b/extensions/deepgram/audio.test.ts
@@ -3,7 +3,7 @@ import {
   createRequestCaptureJsonFetch,
   installPinnedHostnameTestHooks,
 } from "openclaw/plugin-sdk/test-env";
-import { describe, expect, it } from "vitest";
+import { describe, expect, it, vi } from "vitest";
 import { transcribeDeepgramAudio } from "./audio.js";
 
 installPinnedHostnameTestHooks();
@@ -83,4 +83,64 @@ describe("transcribeDeepgramAudio", () => {
       }),
     ).rejects.toThrow("Audio transcription response missing transcript");
   });
+
+  it("wraps malformed successful transcription JSON with a stable provider error", async () => {
+    const fetchFn = vi.fn<typeof fetch>().mockResolvedValueOnce(new Response("{ nope"));
+
+    await expect(
+      transcribeDeepgramAudio({
+        buffer: Buffer.from("audio-bytes"),
+        fileName: "voice.wav",
+        apiKey: "test-key",
+        timeoutMs: 1234,
+        fetchFn,
+      }),
+    ).rejects.toThrow("Audio transcription failed: malformed JSON response");
+  });
+
+  it("rejects non-object successful transcription JSON with a stable provider error", async () => {
+    const fetchFn = vi.fn<typeof fetch>().mockResolvedValueOnce(new Response(JSON.stringify([])));
+
+    await expect(
+      transcribeDeepgramAudio({
+        buffer: Buffer.from("audio-bytes"),
+        fileName: "voice.wav",
+        apiKey: "test-key",
+        timeoutMs: 1234,
+        fetchFn,
+      }),
+    ).rejects.toThrow("Audio transcription failed: malformed JSON response");
+  });
+
+  it("rejects wrong nested transcript shapes with a stable provider error", async () => {
+    const { fetchFn } = createRequestCaptureJsonFetch({
+      results: { channels: { alternatives: [{ transcript: "hello" }] } },
+    });
+
+    await expect(
+      transcribeDeepgramAudio({
+        buffer: Buffer.from("audio-bytes"),
+        fileName: "voice.wav",
+        apiKey: "test-key",
+        timeoutMs: 1234,
+        fetchFn,
+      }),
+    ).rejects.toThrow("Audio transcription failed: malformed JSON response");
+  });
+
+  it("rejects non-string transcript values with a stable provider error", async () => {
+    const { fetchFn } = createRequestCaptureJsonFetch({
+      results: { channels: [{ alternatives: [{ transcript: 123 }] }] },
+    });
+
+    await expect(
+      transcribeDeepgramAudio({
+        buffer: Buffer.from("audio-bytes"),
+        fileName: "voice.wav",
+        apiKey: "test-key",
+        timeoutMs: 1234,
+        fetchFn,
+      }),
+    ).rejects.toThrow("Audio transcription failed: malformed JSON response");
+  });
 });
diff --git a/extensions/deepgram/audio.ts b/extensions/deepgram/audio.ts
index 4c9a0a45516..bc797fb5e7f 100644
--- a/extensions/deepgram/audio.ts
+++ b/extensions/deepgram/audio.ts
@@ -5,6 +5,7 @@ import type {
 import {
   assertOkOrThrowHttpError,
   postTranscriptionRequest,
+  readProviderJsonObjectResponse,
   resolveProviderHttpRequestConfig,
   requireTranscriptionText,
 } from "openclaw/plugin-sdk/provider-http";
@@ -17,15 +18,36 @@ function resolveModel(model?: string): string {
   return trimmed || DEFAULT_DEEPGRAM_AUDIO_MODEL;
 }
 
-type DeepgramTranscriptResponse = {
-  results?: {
-    channels?: Array<{
-      alternatives?: Array<{
-        transcript?: string;
-      }>;
-    }>;
-  };
-};
+function asRecord(value: unknown): Record<string, unknown> | undefined {
+  return typeof value === "object" && value !== null && !Array.isArray(value)
+    ? (value as Record<string, unknown>)
+    : undefined;
+}
+
+function readDeepgramTranscript(payload: Record<string, unknown>): string | undefined {
+  const results = asRecord(payload.results);
+  if (!results) {
+    return undefined;
+  }
+  if (!Array.isArray(results.channels)) {
+    throw new Error("Audio transcription failed: malformed JSON response");
+  }
+  const channel = asRecord(results.channels[0]);
+  if (!channel) {
+    return undefined;
+  }
+  if (!Array.isArray(channel.alternatives)) {
+    throw new Error("Audio transcription failed: malformed JSON response");
+  }
+  const alternative = asRecord(channel.alternatives[0]);
+  if (!alternative) {
+    return undefined;
+  }
+  if (alternative.transcript !== undefined && typeof alternative.transcript !== "string") {
+    throw new Error("Audio transcription failed: malformed JSON response");
+  }
+  return alternative.transcript;
+}
 
 export async function transcribeDeepgramAudio(
   params: AudioTranscriptionRequest,
@@ -75,9 +97,9 @@ export async function transcribeDeepgramAudio(
   try {
     await assertOkOrThrowHttpError(res, "Audio transcription failed");
 
-    const payload = (await res.json()) as DeepgramTranscriptResponse;
+    const payload = await readProviderJsonObjectResponse(res, "Audio transcription failed");
     const transcript = requireTranscriptionText(
-      payload.results?.channels?.[0]?.alternatives?.[0]?.transcript,
+      readDeepgramTranscript(payload),
       "Audio transcription response missing transcript",
     );
     return { text: transcript, model };
diff --git a/extensions/elevenlabs/media-understanding-provider.test.ts b/extensions/elevenlabs/media-understanding-provider.test.ts
index b173bdc6b3c..24e449b5d1d 100644
--- a/extensions/elevenlabs/media-understanding-provider.test.ts
+++ b/extensions/elevenlabs/media-understanding-provider.test.ts
@@ -60,4 +60,36 @@ describe("elevenLabsMediaUnderstandingProvider", () => {
     expect(form.get("language_code")).toBe("en");
     expect(form.get("file")).toBeInstanceOf(Blob);
   });
+
+  it("wraps malformed successful speech-to-text JSON with a stable provider error", async () => {
+    const fetchMock = vi.fn<typeof fetch>().mockResolvedValue(new Response("{ nope"));
+
+    await expect(
+      transcribeElevenLabsAudio({
+        buffer: Buffer.from("audio"),
+        fileName: "voice.mp3",
+        mime: "audio/mpeg",
+        apiKey: "eleven-key",
+        model: "scribe_v2",
+        timeoutMs: 1000,
+        fetchFn: fetchMock,
+      }),
+    ).rejects.toThrow("ElevenLabs audio transcription failed: malformed JSON response");
+  });
+
+  it("rejects non-object successful speech-to-text JSON with a stable provider error", async () => {
+    const fetchMock = vi.fn<typeof fetch>().mockResolvedValue(new Response(JSON.stringify([])));
+
+    await expect(
+      transcribeElevenLabsAudio({
+        buffer: Buffer.from("audio"),
+        fileName: "voice.mp3",
+        mime: "audio/mpeg",
+        apiKey: "eleven-key",
+        model: "scribe_v2",
+        timeoutMs: 1000,
+        fetchFn: fetchMock,
+      }),
+    ).rejects.toThrow("ElevenLabs audio transcription failed: malformed JSON response");
+  });
 });
diff --git a/extensions/elevenlabs/media-understanding-provider.ts b/extensions/elevenlabs/media-understanding-provider.ts
index 2c415b41597..d16692646cf 100644
--- a/extensions/elevenlabs/media-understanding-provider.ts
+++ b/extensions/elevenlabs/media-understanding-provider.ts
@@ -7,6 +7,7 @@ import {
   assertOkOrThrowHttpError,
   buildAudioTranscriptionFormData,
   postTranscriptionRequest,
+  readProviderJsonObjectResponse,
   resolveProviderHttpRequestConfig,
   requireTranscriptionText,
 } from "openclaw/plugin-sdk/provider-http";
@@ -61,9 +62,12 @@ export async function transcribeElevenLabsAudio(
 
   try {
     await assertOkOrThrowHttpError(response, "ElevenLabs audio transcription failed");
-    const payload = (await response.json()) as { text?: string };
+    const payload = await readProviderJsonObjectResponse(
+      response,
+      "ElevenLabs audio transcription failed",
+    );
     const text = requireTranscriptionText(
-      payload.text,
+      typeof payload.text === "string" ? payload.text : undefined,
       "ElevenLabs audio transcription response missing text",
     );
     return { text, model };
diff --git a/extensions/elevenlabs/tts.test.ts b/extensions/elevenlabs/tts.test.ts
index ba00ad27011..0c090693093 100644
--- a/extensions/elevenlabs/tts.test.ts
+++ b/extensions/elevenlabs/tts.test.ts
@@ -112,6 +112,25 @@ describe("elevenlabs tts diagnostics", () => {
     expect(getHeadersFromFirstFetchCall(fetchMock).get("accept")).toBe("audio/mpeg");
   });
 
+  it("rejects JSON success bodies as malformed audio", async () => {
+    const fetchMock = vi.fn(
+      async () =>
+        new Response(JSON.stringify({ error: "not audio" }), {
+          headers: { "content-type": "application/json" },
+        }),
+    );
+    globalThis.fetch = fetchMock as unknown as typeof fetch;
+
+    await expectDefaultTtsRequestToThrow("ElevenLabs API error: malformed audio response");
+  });
+
+  it("rejects empty successful audio bodies as malformed audio", async () => {
+    const fetchMock = vi.fn(async () => new Response(new Uint8Array()));
+    globalThis.fetch = fetchMock as unknown as typeof fetch;
+
+    await expectDefaultTtsRequestToThrow("ElevenLabs API error: malformed audio response");
+  });
+
   it("omits the MPEG Accept header for PCM telephony output", async () => {
     const fetchMock = vi.fn(async () => new Response(Buffer.from("pcm")));
     globalThis.fetch = fetchMock as unknown as typeof fetch;
@@ -176,4 +195,18 @@ describe("elevenlabs tts diagnostics", () => {
     expect(result.audioStream).toBeInstanceOf(ReadableStream);
     await result.release();
   });
+
+  it("rejects JSON success stream responses as malformed audio", async () => {
+    const fetchMock = vi.fn(
+      async () =>
+        new Response(JSON.stringify({ error: "not audio" }), {
+          headers: { "content-type": "application/json" },
+        }),
+    );
+    globalThis.fetch = fetchMock as unknown as typeof fetch;
+
+    await expect(elevenLabsTTSStream(createDefaultTtsRequest())).rejects.toThrow(
+      "ElevenLabs API error: malformed audio response",
+    );
+  });
 });
diff --git a/extensions/elevenlabs/tts.ts b/extensions/elevenlabs/tts.ts
index ea20cea088f..01ab294b6af 100644
--- a/extensions/elevenlabs/tts.ts
+++ b/extensions/elevenlabs/tts.ts
@@ -1,4 +1,8 @@
-import { assertOkOrThrowProviderError } from "openclaw/plugin-sdk/provider-http";
+import {
+  assertOkOrThrowProviderError,
+  assertProviderBinaryResponseContent,
+  readProviderBinaryResponse,
+} from "openclaw/plugin-sdk/provider-http";
 import {
   normalizeApplyTextNormalization,
   normalizeLanguageCode,
@@ -143,7 +147,7 @@ export async function elevenLabsTTS(params: ElevenLabsTtsRequestParams): Promise
   try {
     await assertOkOrThrowProviderError(response, "ElevenLabs API error");
 
-    return Buffer.from(await response.arrayBuffer());
+    return Buffer.from(await readProviderBinaryResponse(response, "ElevenLabs API error", "audio"));
   } finally {
     await release();
   }
@@ -177,6 +181,7 @@ export async function elevenLabsTTSStream(params: ElevenLabsTtsRequestParams): P
   let handedOff = false;
   try {
     await assertOkOrThrowProviderError(response, "ElevenLabs API error");
+    assertProviderBinaryResponseContent(response, "ElevenLabs API error", "audio");
     if (!response.body) {
       throw new Error("ElevenLabs API response missing audio stream");
     }
diff --git a/src/agents/provider-http-errors.ts b/src/agents/provider-http-errors.ts
index 0c00ec8aa05..ef35b2eba7a 100644
--- a/src/agents/provider-http-errors.ts
+++ b/src/agents/provider-http-errors.ts
@@ -171,3 +171,51 @@ export async function readProviderJsonResponse<T>(response: Response, label: str
     throw new Error(`${label}: malformed JSON response`, { cause });
   }
 }
+
+export async function readProviderJsonObjectResponse(
+  response: Response,
+  label: string,
+): Promise<Record<string, unknown>> {
+  const payload = await readProviderJsonResponse<unknown>(response, label);
+  const object = asObject(payload);
+  if (!object) {
+    throw new Error(`${label}: malformed JSON response`);
+  }
+  return object;
+}
+
+function normalizeContentType(response: Response): string | undefined {
+  const contentType = response.headers.get("content-type")?.split(";")[0]?.trim().toLowerCase();
+  return contentType || undefined;
+}
+
+export function assertProviderBinaryResponseContent(
+  response: Response,
+  label: string,
+  kind = "binary",
+): void {
+  const contentType = normalizeContentType(response);
+  if (!contentType) {
+    return;
+  }
+  if (
+    contentType === "application/json" ||
+    contentType.endsWith("+json") ||
+    contentType.startsWith("text/")
+  ) {
+    throw new Error(`${label}: malformed ${kind} response`);
+  }
+}
+
+export async function readProviderBinaryResponse(
+  response: Response,
+  label: string,
+  kind = "binary",
+): Promise<Uint8Array> {
+  assertProviderBinaryResponseContent(response, label, kind);
+  const bytes = new Uint8Array(await response.arrayBuffer());
+  if (bytes.byteLength === 0) {
+    throw new Error(`${label}: malformed ${kind} response`);
+  }
+  return bytes;
+}
diff --git a/src/media-understanding/openai-compatible-audio.test.ts b/src/media-understanding/openai-compatible-audio.test.ts
index 4a3e9e2ff76..2029b1ec288 100644
--- a/src/media-understanding/openai-compatible-audio.test.ts
+++ b/src/media-understanding/openai-compatible-audio.test.ts
@@ -88,4 +88,21 @@ describe("transcribeOpenAiCompatibleAudio", () => {
       }),
     ).rejects.toThrow("Audio transcription failed: malformed JSON response");
   });
+
+  it("rejects non-object successful transcription JSON with a stable provider error", async () => {
+    const fetchFn = vi.fn<typeof fetch>().mockResolvedValueOnce(new Response(JSON.stringify([])));
+
+    await expect(
+      transcribeOpenAiCompatibleAudio({
+        buffer: Buffer.from("audio"),
+        fileName: "note.mp3",
+        apiKey: "test-key",
+        timeoutMs: 1000,
+        fetchFn,
+        provider: "openai",
+        defaultBaseUrl: "https://api.openai.com/v1",
+        defaultModel: "gpt-4o-transcribe",
+      }),
+    ).rejects.toThrow("Audio transcription failed: malformed JSON response");
+  });
 });
diff --git a/src/media-understanding/openai-compatible-audio.ts b/src/media-understanding/openai-compatible-audio.ts
index 6faebda7700..4cb93005f55 100644
--- a/src/media-understanding/openai-compatible-audio.ts
+++ b/src/media-understanding/openai-compatible-audio.ts
@@ -2,7 +2,7 @@ import {
   assertOkOrThrowHttpError,
   buildAudioTranscriptionFormData,
   postTranscriptionRequest,
-  readProviderJsonResponse,
+  readProviderJsonObjectResponse,
   resolveProviderHttpRequestConfig,
   requireTranscriptionText,
 } from "./shared.js";
@@ -65,12 +65,9 @@ export async function transcribeOpenAiCompatibleAudio(
   try {
     await assertOkOrThrowHttpError(res, "Audio transcription failed");
 
-    const payload = await readProviderJsonResponse<{ text?: string }>(
-      res,
-      "Audio transcription failed",
-    );
+    const payload = await readProviderJsonObjectResponse(res, "Audio transcription failed");
     const text = requireTranscriptionText(
-      payload.text,
+      typeof payload.text === "string" ? payload.text : undefined,
       "Audio transcription response missing text",
     );
     return { text, model };
diff --git a/src/media-understanding/shared.ts b/src/media-understanding/shared.ts
index a6e2b218771..ddab6e35752 100644
--- a/src/media-understanding/shared.ts
+++ b/src/media-understanding/shared.ts
@@ -6,6 +6,7 @@ import {
 } from "../agents/provider-http-errors.js";
 export {
   assertOkOrThrowHttpError,
+  readProviderJsonObjectResponse,
   readProviderJsonResponse,
 } from "../agents/provider-http-errors.js";
 import type {
diff --git a/src/plugin-sdk/provider-http.ts b/src/plugin-sdk/provider-http.ts
index 67abef98ea9..4ae5a6423fd 100644
--- a/src/plugin-sdk/provider-http.ts
+++ b/src/plugin-sdk/provider-http.ts
@@ -4,11 +4,14 @@
 export {
   assertOkOrThrowHttpError,
   assertOkOrThrowProviderError,
+  assertProviderBinaryResponseContent,
   createProviderHttpError,
   extractProviderErrorDetail,
   extractProviderRequestId,
   formatProviderErrorPayload,
   formatProviderHttpErrorMessage,
+  readProviderBinaryResponse,
+  readProviderJsonObjectResponse,
   readProviderJsonResponse,
   readResponseTextLimited,
   truncateErrorDetail,
diff --git a/src/tts/openai-compatible-speech-provider.test.ts b/src/tts/openai-compatible-speech-provider.test.ts
index bc354ccc9f0..c76caf8700e 100644
--- a/src/tts/openai-compatible-speech-provider.test.ts
+++ b/src/tts/openai-compatible-speech-provider.test.ts
@@ -1,21 +1,37 @@
 import { afterEach, describe, expect, it, vi } from "vitest";
 import { createOpenAiCompatibleSpeechProvider } from "./openai-compatible-speech-provider.js";
 
-const { assertOkOrThrowHttpErrorMock, postJsonRequestMock, resolveProviderHttpRequestConfigMock } =
-  vi.hoisted(() => ({
-    assertOkOrThrowHttpErrorMock: vi.fn(async () => {}),
-    postJsonRequestMock: vi.fn(),
-    resolveProviderHttpRequestConfigMock: vi.fn((params: Record<string, unknown>) => ({
-      baseUrl: params.baseUrl ?? params.defaultBaseUrl ?? "https://example.test/v1",
-      allowPrivateNetwork: false,
-      headers: new Headers(params.defaultHeaders as HeadersInit | undefined),
-      dispatcherPolicy: undefined,
-    })),
-  }));
+const {
+  assertOkOrThrowHttpErrorMock,
+  postJsonRequestMock,
+  readProviderBinaryResponseMock,
+  resolveProviderHttpRequestConfigMock,
+} = vi.hoisted(() => ({
+  assertOkOrThrowHttpErrorMock: vi.fn(async () => {}),
+  postJsonRequestMock: vi.fn(),
+  readProviderBinaryResponseMock: vi.fn(async (response: Response, label: string) => {
+    const contentType = response.headers.get("content-type")?.split(";")[0]?.trim().toLowerCase();
+    if (contentType === "application/json" || contentType?.startsWith("text/")) {
+      throw new Error(`${label}: malformed audio response`);
+    }
+    const bytes = new Uint8Array(await response.arrayBuffer());
+    if (bytes.byteLength === 0) {
+      throw new Error(`${label}: malformed audio response`);
+    }
+    return bytes;
+  }),
+  resolveProviderHttpRequestConfigMock: vi.fn((params: Record<string, unknown>) => ({
+    baseUrl: params.baseUrl ?? params.defaultBaseUrl ?? "https://example.test/v1",
+    allowPrivateNetwork: false,
+    headers: new Headers(params.defaultHeaders as HeadersInit | undefined),
+    dispatcherPolicy: undefined,
+  })),
+}));
 
 vi.mock("openclaw/plugin-sdk/provider-http", () => ({
   assertOkOrThrowHttpError: assertOkOrThrowHttpErrorMock,
   postJsonRequest: postJsonRequestMock,
+  readProviderBinaryResponse: readProviderBinaryResponseMock,
   resolveProviderHttpRequestConfig: resolveProviderHttpRequestConfigMock,
 }));
 
@@ -35,6 +51,7 @@ describe("createOpenAiCompatibleSpeechProvider", () => {
   afterEach(() => {
     assertOkOrThrowHttpErrorMock.mockClear();
     postJsonRequestMock.mockReset();
+    readProviderBinaryResponseMock.mockClear();
     resolveProviderHttpRequestConfigMock.mockClear();
     vi.unstubAllEnvs();
   });
@@ -159,4 +176,77 @@ describe("createOpenAiCompatibleSpeechProvider", () => {
     expect(result.voiceCompatible).toBe(true);
     expect(release).toHaveBeenCalledOnce();
   });
+
+  it("rejects JSON success bodies from TTS responses as malformed audio", async () => {
+    const release = vi.fn(async () => {});
+    postJsonRequestMock.mockResolvedValue({
+      response: new Response(JSON.stringify({ error: "not audio" }), {
+        status: 200,
+        headers: { "content-type": "application/json" },
+      }),
+      release,
+    });
+    vi.stubEnv("DEMO_API_KEY", "sk-env");
+
+    const provider = createOpenAiCompatibleSpeechProvider({
+      id: "demo",
+      label: "Demo",
+      autoSelectOrder: 40,
+      models: ["demo-tts"],
+      voices: ["alloy"],
+      defaultModel: "demo-tts",
+      defaultVoice: "alloy",
+      defaultBaseUrl: "https://example.test/v1",
+      envKey: "DEMO_API_KEY",
+      responseFormats: ["mp3"],
+      defaultResponseFormat: "mp3",
+      voiceCompatibleResponseFormats: ["mp3"],
+    });
+
+    await expect(
+      provider.synthesize({
+        text: "hello",
+        cfg: {} as never,
+        providerConfig: {},
+        target: "voice-note",
+        timeoutMs: 1234,
+      }),
+    ).rejects.toThrow("Demo TTS API error: malformed audio response");
+    expect(release).toHaveBeenCalledOnce();
+  });
+
+  it("rejects empty successful TTS bodies as malformed audio", async () => {
+    const release = vi.fn(async () => {});
+    postJsonRequestMock.mockResolvedValue({
+      response: new Response(new Uint8Array(), { status: 200 }),
+      release,
+    });
+    vi.stubEnv("DEMO_API_KEY", "sk-env");
+
+    const provider = createOpenAiCompatibleSpeechProvider({
+      id: "demo",
+      label: "Demo",
+      autoSelectOrder: 40,
+      models: ["demo-tts"],
+      voices: ["alloy"],
+      defaultModel: "demo-tts",
+      defaultVoice: "alloy",
+      defaultBaseUrl: "https://example.test/v1",
+      envKey: "DEMO_API_KEY",
+      responseFormats: ["mp3"],
+      defaultResponseFormat: "mp3",
+      voiceCompatibleResponseFormats: ["mp3"],
+    });
+
+    await expect(
+      provider.synthesize({
+        text: "hello",
+        cfg: {} as never,
+        providerConfig: {},
+        target: "voice-note",
+        timeoutMs: 1234,
+      }),
+    ).rejects.toThrow("Demo TTS API error: malformed audio response");
+    expect(release).toHaveBeenCalledOnce();
+  });
 });
diff --git a/src/tts/openai-compatible-speech-provider.ts b/src/tts/openai-compatible-speech-provider.ts
index af2d5a707f0..e7d65546135 100644
--- a/src/tts/openai-compatible-speech-provider.ts
+++ b/src/tts/openai-compatible-speech-provider.ts
@@ -1,6 +1,7 @@
 import {
   assertOkOrThrowHttpError,
   postJsonRequest,
+  readProviderBinaryResponse,
   resolveProviderHttpRequestConfig,
 } from "openclaw/plugin-sdk/provider-http";
 import { normalizeResolvedSecretInputString } from "openclaw/plugin-sdk/secret-input";
@@ -382,7 +383,13 @@ export function createOpenAiCompatibleSpeechProvider<
           options.apiErrorLabel ?? `${options.label} TTS API error`,
         );
         return {
-          audioBuffer: Buffer.from(await response.arrayBuffer()),
+          audioBuffer: Buffer.from(
+            await readProviderBinaryResponse(
+              response,
+              options.apiErrorLabel ?? `${options.label} TTS API error`,
+              "audio",
+            ),
+          ),
           outputFormat: responseFormat,
           fileExtension: responseFormatToFileExtension(responseFormat),
           voiceCompatible: options.voiceCompatibleResponseFormats.includes(responseFormat),