From 6545317a2cb9d8a239287cbb017a0f92d0027836 Mon Sep 17 00:00:00 2001
From: Peter Steinberger <steipete@gmail.com>
Date: Mon, 2 Mar 2026 22:00:46 +0000
Subject: [PATCH] refactor(media): split audio helpers and attachment cache

---
 src/config/media-audio-field-metadata.ts      |  54 ++
 src/config/schema.help.quality.test.ts        |  11 +-
 src/config/schema.help.ts                     |  24 +-
 src/config/schema.labels.ts                   |  13 +-
 .../apply.echo-transcript.test.ts             |  20 +-
 src/media-understanding/apply.test.ts         |  15 +-
 src/media-understanding/apply.ts              |  66 +--
 src/media-understanding/attachments.cache.ts  | 323 +++++++++++
 .../attachments.normalize.ts                  | 108 ++++
 src/media-understanding/attachments.select.ts |  89 +++
 src/media-understanding/attachments.ts        | 519 +-----------------
 src/media-understanding/audio-preflight.ts    |  36 +-
 .../audio-transcription-runner.ts             |  50 ++
 src/media-understanding/echo-transcript.ts    |  62 +++
 src/media-understanding/runner.entries.ts     |  24 +-
 src/media-understanding/runner.test-utils.ts  |   9 +-
 .../transcribe-audio.test.ts                  |  66 +--
 src/media-understanding/transcribe-audio.ts   |  36 +-
 18 files changed, 776 insertions(+), 749 deletions(-)
 create mode 100644 src/config/media-audio-field-metadata.ts
 create mode 100644 src/media-understanding/attachments.cache.ts
 create mode 100644 src/media-understanding/attachments.normalize.ts
 create mode 100644 src/media-understanding/attachments.select.ts
 create mode 100644 src/media-understanding/audio-transcription-runner.ts
 create mode 100644 src/media-understanding/echo-transcript.ts

diff --git a/src/config/media-audio-field-metadata.ts b/src/config/media-audio-field-metadata.ts
new file mode 100644
index 00000000000..8750059a87b
--- /dev/null
+++ b/src/config/media-audio-field-metadata.ts
@@ -0,0 +1,54 @@
+export const MEDIA_AUDIO_FIELD_KEYS = [
+  "tools.media.audio.enabled",
+  "tools.media.audio.maxBytes",
+  "tools.media.audio.maxChars",
+  "tools.media.audio.prompt",
+  "tools.media.audio.timeoutSeconds",
+  "tools.media.audio.language",
+  "tools.media.audio.attachments",
+  "tools.media.audio.models",
+  "tools.media.audio.scope",
+  "tools.media.audio.echoTranscript",
+  "tools.media.audio.echoFormat",
+] as const;
+
+type MediaAudioFieldKey = (typeof MEDIA_AUDIO_FIELD_KEYS)[number];
+
+export const MEDIA_AUDIO_FIELD_HELP: Record<MediaAudioFieldKey, string> = {
+  "tools.media.audio.enabled":
+    "Enable audio understanding so voice notes or audio clips can be transcribed/summarized for agent context. Disable when audio ingestion is outside policy or unnecessary for your workflows.",
+  "tools.media.audio.maxBytes":
+    "Maximum accepted audio payload size in bytes before processing is rejected or clipped by policy. Set this based on expected recording length and upstream provider limits.",
+  "tools.media.audio.maxChars":
+    "Maximum characters retained from audio understanding output to prevent oversized transcript injection. Increase for long-form dictation, or lower to keep conversational turns compact.",
+  "tools.media.audio.prompt":
+    "Instruction template guiding audio understanding output style, such as concise summary versus near-verbatim transcript. Keep wording consistent so downstream automations can rely on output format.",
+  "tools.media.audio.timeoutSeconds":
+    "Timeout in seconds for audio understanding execution before the operation is cancelled. Use longer timeouts for long recordings and tighter ones for interactive chat responsiveness.",
+  "tools.media.audio.language":
+    "Preferred language hint for audio understanding/transcription when provider support is available. Set this to improve recognition accuracy for known primary languages.",
+  "tools.media.audio.attachments":
+    "Attachment policy for audio inputs indicating which uploaded files are eligible for audio processing. Keep restrictive defaults in mixed-content channels to avoid unintended audio workloads.",
+  "tools.media.audio.models":
+    "Ordered model preferences specifically for audio understanding, used before shared media model fallback. Choose models optimized for transcription quality in your primary language/domain.",
+  "tools.media.audio.scope":
+    "Scope selector for when audio understanding runs across inbound messages and attachments. Keep focused scopes in high-volume channels to reduce cost and avoid accidental transcription.",
+  "tools.media.audio.echoTranscript":
+    "Echo the audio transcript back to the originating chat before agent processing. When enabled, users immediately see what was heard from their voice note, helping them verify transcription accuracy before the agent acts on it. Default: false.",
+  "tools.media.audio.echoFormat":
+    "Format string for the echoed transcript message. Use `{transcript}` as a placeholder for the transcribed text. Default: '📝 \"{transcript}\"'.",
+};
+
+export const MEDIA_AUDIO_FIELD_LABELS: Record<MediaAudioFieldKey, string> = {
+  "tools.media.audio.enabled": "Enable Audio Understanding",
+  "tools.media.audio.maxBytes": "Audio Understanding Max Bytes",
+  "tools.media.audio.maxChars": "Audio Understanding Max Chars",
+  "tools.media.audio.prompt": "Audio Understanding Prompt",
+  "tools.media.audio.timeoutSeconds": "Audio Understanding Timeout (sec)",
+  "tools.media.audio.language": "Audio Understanding Language",
+  "tools.media.audio.attachments": "Audio Understanding Attachment Policy",
+  "tools.media.audio.models": "Audio Understanding Models",
+  "tools.media.audio.scope": "Audio Understanding Scope",
+  "tools.media.audio.echoTranscript": "Echo Transcript to Chat",
+  "tools.media.audio.echoFormat": "Transcript Echo Format",
+};
diff --git a/src/config/schema.help.quality.test.ts b/src/config/schema.help.quality.test.ts
index 0bed7956d39..ec83273eb8a 100644
--- a/src/config/schema.help.quality.test.ts
+++ b/src/config/schema.help.quality.test.ts
@@ -1,4 +1,5 @@
 import { describe, expect, it } from "vitest";
+import { MEDIA_AUDIO_FIELD_KEYS } from "./media-audio-field-metadata.js";
 import { FIELD_HELP } from "./schema.help.js";
 import { FIELD_LABELS } from "./schema.labels.js";
 
@@ -457,15 +458,7 @@ const TOOLS_HOOKS_TARGET_KEYS = [
   "tools.links.models",
   "tools.links.scope",
   "tools.links.timeoutSeconds",
-  "tools.media.audio.attachments",
-  "tools.media.audio.enabled",
-  "tools.media.audio.language",
-  "tools.media.audio.maxBytes",
-  "tools.media.audio.maxChars",
-  "tools.media.audio.models",
-  "tools.media.audio.prompt",
-  "tools.media.audio.scope",
-  "tools.media.audio.timeoutSeconds",
+  ...MEDIA_AUDIO_FIELD_KEYS,
   "tools.media.concurrency",
   "tools.media.image.attachments",
   "tools.media.image.enabled",
diff --git a/src/config/schema.help.ts b/src/config/schema.help.ts
index f7b0c32587c..80b763213e4 100644
--- a/src/config/schema.help.ts
+++ b/src/config/schema.help.ts
@@ -1,3 +1,4 @@
+import { MEDIA_AUDIO_FIELD_HELP } from "./media-audio-field-metadata.js";
 import { IRC_FIELD_HELP } from "./schema.irc.js";
 
 export const FIELD_HELP: Record<string, string> = {
@@ -527,28 +528,7 @@ export const FIELD_HELP: Record<string, string> = {
     "Ordered model preferences specifically for image understanding when you want to override shared media models. Put the most reliable multimodal model first to reduce fallback attempts.",
   "tools.media.image.scope":
     "Scope selector for when image understanding is attempted (for example only explicit requests versus broader auto-detection). Keep narrow scope in busy channels to control token and API spend.",
-  "tools.media.audio.enabled":
-    "Enable audio understanding so voice notes or audio clips can be transcribed/summarized for agent context. Disable when audio ingestion is outside policy or unnecessary for your workflows.",
-  "tools.media.audio.maxBytes":
-    "Maximum accepted audio payload size in bytes before processing is rejected or clipped by policy. Set this based on expected recording length and upstream provider limits.",
-  "tools.media.audio.maxChars":
-    "Maximum characters retained from audio understanding output to prevent oversized transcript injection. Increase for long-form dictation, or lower to keep conversational turns compact.",
-  "tools.media.audio.prompt":
-    "Instruction template guiding audio understanding output style, such as concise summary versus near-verbatim transcript. Keep wording consistent so downstream automations can rely on output format.",
-  "tools.media.audio.timeoutSeconds":
-    "Timeout in seconds for audio understanding execution before the operation is cancelled. Use longer timeouts for long recordings and tighter ones for interactive chat responsiveness.",
-  "tools.media.audio.language":
-    "Preferred language hint for audio understanding/transcription when provider support is available. Set this to improve recognition accuracy for known primary languages.",
-  "tools.media.audio.attachments":
-    "Attachment policy for audio inputs indicating which uploaded files are eligible for audio processing. Keep restrictive defaults in mixed-content channels to avoid unintended audio workloads.",
-  "tools.media.audio.models":
-    "Ordered model preferences specifically for audio understanding, used before shared media model fallback. Choose models optimized for transcription quality in your primary language/domain.",
-  "tools.media.audio.scope":
-    "Scope selector for when audio understanding runs across inbound messages and attachments. Keep focused scopes in high-volume channels to reduce cost and avoid accidental transcription.",
-  "tools.media.audio.echoTranscript":
-    "Echo the audio transcript back to the originating chat before agent processing. When enabled, users immediately see what was heard from their voice note, helping them verify transcription accuracy before the agent acts on it. Default: false.",
-  "tools.media.audio.echoFormat":
-    "Format string for the echoed transcript message. Use `{transcript}` as a placeholder for the transcribed text. Default: '📝 \"{transcript}\"'.",
+  ...MEDIA_AUDIO_FIELD_HELP,
   "tools.media.video.enabled":
     "Enable video understanding so clips can be summarized into text for downstream reasoning and responses. Disable when processing video is out of policy or too expensive for your deployment.",
   "tools.media.video.maxBytes":
diff --git a/src/config/schema.labels.ts b/src/config/schema.labels.ts
index 8d2334f1a78..4edd050f3f6 100644
--- a/src/config/schema.labels.ts
+++ b/src/config/schema.labels.ts
@@ -1,3 +1,4 @@
+import { MEDIA_AUDIO_FIELD_LABELS } from "./media-audio-field-metadata.js";
 import { IRC_FIELD_LABELS } from "./schema.irc.js";
 
 export const FIELD_LABELS: Record<string, string> = {
@@ -128,17 +129,7 @@ export const FIELD_LABELS: Record<string, string> = {
   "tools.media.image.scope": "Image Understanding Scope",
   "tools.media.models": "Media Understanding Shared Models",
   "tools.media.concurrency": "Media Understanding Concurrency",
-  "tools.media.audio.enabled": "Enable Audio Understanding",
-  "tools.media.audio.maxBytes": "Audio Understanding Max Bytes",
-  "tools.media.audio.maxChars": "Audio Understanding Max Chars",
-  "tools.media.audio.prompt": "Audio Understanding Prompt",
-  "tools.media.audio.timeoutSeconds": "Audio Understanding Timeout (sec)",
-  "tools.media.audio.language": "Audio Understanding Language",
-  "tools.media.audio.attachments": "Audio Understanding Attachment Policy",
-  "tools.media.audio.models": "Audio Understanding Models",
-  "tools.media.audio.scope": "Audio Understanding Scope",
-  "tools.media.audio.echoTranscript": "Echo Transcript to Chat",
-  "tools.media.audio.echoFormat": "Transcript Echo Format",
+  ...MEDIA_AUDIO_FIELD_LABELS,
   "tools.media.video.enabled": "Enable Video Understanding",
   "tools.media.video.maxBytes": "Video Understanding Max Bytes",
   "tools.media.video.maxChars": "Video Understanding Max Chars",
diff --git a/src/media-understanding/apply.echo-transcript.test.ts b/src/media-understanding/apply.echo-transcript.test.ts
index a088525ae46..86330f5c092 100644
--- a/src/media-understanding/apply.echo-transcript.test.ts
+++ b/src/media-understanding/apply.echo-transcript.test.ts
@@ -4,6 +4,7 @@ import { afterAll, beforeAll, beforeEach, describe, expect, it, vi } from "vites
 import type { MsgContext } from "../auto-reply/templating.js";
 import type { OpenClawConfig } from "../config/config.js";
 import { resolvePreferredOpenClawTmpDir } from "../infra/tmp-openclaw-dir.js";
+import { createSafeAudioFixtureBuffer } from "./runner.test-utils.js";
 
 // ---------------------------------------------------------------------------
 // Module mocks
@@ -30,14 +31,17 @@ vi.mock("../agents/model-auth.js", () => ({
   resolveAuthProfileOrder: vi.fn(() => []),
 }));
 
-class MediaFetchErrorMock extends Error {
-  code: string;
-  constructor(message: string, code: string) {
-    super(message);
-    this.name = "MediaFetchError";
-    this.code = code;
+const { MediaFetchErrorMock } = vi.hoisted(() => {
+  class MediaFetchErrorMock extends Error {
+    code: string;
+    constructor(message: string, code: string) {
+      super(message);
+      this.name = "MediaFetchError";
+      this.code = code;
+    }
   }
-}
+  return { MediaFetchErrorMock };
+});
 
 vi.mock("../media/fetch.js", () => ({
   fetchRemoteMedia: vi.fn(),
@@ -68,7 +72,7 @@ let suiteTempMediaRootDir = "";
 async function createTempAudioFile(): Promise<string> {
   const dir = await fs.mkdtemp(path.join(suiteTempMediaRootDir, "case-"));
   const filePath = path.join(dir, "note.ogg");
-  await fs.writeFile(filePath, Buffer.alloc(2048, 0xab));
+  await fs.writeFile(filePath, createSafeAudioFixtureBuffer(2048));
   return filePath;
 }
 
diff --git a/src/media-understanding/apply.test.ts b/src/media-understanding/apply.test.ts
index df143c72da5..2b7f7f19360 100644
--- a/src/media-understanding/apply.test.ts
+++ b/src/media-understanding/apply.test.ts
@@ -10,6 +10,7 @@ import { fetchRemoteMedia } from "../media/fetch.js";
 import { runExec } from "../process/exec.js";
 import { withEnvAsync } from "../test-utils/env.js";
 import { clearMediaUnderstandingBinaryCacheForTests } from "./runner.js";
+import { createSafeAudioFixtureBuffer } from "./runner.test-utils.js";
 
 vi.mock("../agents/model-auth.js", () => ({
   resolveApiKeyForProvider: vi.fn(async () => ({
@@ -174,7 +175,7 @@ async function createAudioCtx(params?: {
 }): Promise<MsgContext> {
   const mediaPath = await createTempMediaFile({
     fileName: params?.fileName ?? "note.ogg",
-    content: params?.content ?? Buffer.alloc(2048, 0xab),
+    content: params?.content ?? createSafeAudioFixtureBuffer(2048),
   });
   return {
     Body: params?.body ?? "<media:audio>",
@@ -190,7 +191,7 @@ async function setupAudioAutoDetectCase(stdout: string): Promise<{
   const ctx = await createAudioCtx({
     fileName: "sample.wav",
     mediaType: "audio/wav",
-    content: Buffer.alloc(2048, 0xab),
+    content: createSafeAudioFixtureBuffer(2048),
   });
   const cfg: OpenClawConfig = { tools: { media: { audio: {} } } };
   mockedRunExec.mockResolvedValueOnce({
@@ -249,7 +250,7 @@ describe("applyMediaUnderstanding", () => {
     mockedFetchRemoteMedia.mockClear();
     mockedRunExec.mockReset();
     mockedFetchRemoteMedia.mockResolvedValue({
-      buffer: Buffer.alloc(2048, 0xab),
+      buffer: createSafeAudioFixtureBuffer(2048),
       contentType: "audio/ogg",
       fileName: "note.ogg",
     });
@@ -540,7 +541,7 @@ describe("applyMediaUnderstanding", () => {
     const ctx = await createAudioCtx({
       fileName: "sample.wav",
       mediaType: "audio/wav",
-      content: Buffer.alloc(2048, 0xab),
+      content: createSafeAudioFixtureBuffer(2048),
     });
     const cfg: OpenClawConfig = { tools: { media: { audio: {} } } };
     mockedResolveApiKey.mockResolvedValue({
@@ -654,7 +655,7 @@ describe("applyMediaUnderstanding", () => {
   it("uses active model when enabled and models are missing", async () => {
     const audioPath = await createTempMediaFile({
       fileName: "fallback.ogg",
-      content: Buffer.alloc(2048, 0xab),
+      content: createSafeAudioFixtureBuffer(2048),
     });
 
     const ctx: MsgContext = {
@@ -690,7 +691,7 @@ describe("applyMediaUnderstanding", () => {
 
   it("handles multiple audio attachments when attachment mode is all", async () => {
     const dir = await createTempMediaDir();
-    const audioBytes = Buffer.alloc(2048, 0xab);
+    const audioBytes = createSafeAudioFixtureBuffer(2048);
     const audioPathA = path.join(dir, "note-a.ogg");
     const audioPathB = path.join(dir, "note-b.ogg");
     await fs.writeFile(audioPathA, audioBytes);
@@ -737,7 +738,7 @@ describe("applyMediaUnderstanding", () => {
     const audioPath = path.join(dir, "note.ogg");
     const videoPath = path.join(dir, "clip.mp4");
     await fs.writeFile(imagePath, "image-bytes");
-    await fs.writeFile(audioPath, Buffer.alloc(2048, 0xab));
+    await fs.writeFile(audioPath, createSafeAudioFixtureBuffer(2048));
     await fs.writeFile(videoPath, "video-bytes");
 
     const ctx: MsgContext = {
diff --git a/src/media-understanding/apply.ts b/src/media-understanding/apply.ts
index bfe5dbc225b..4937658ca73 100644
--- a/src/media-understanding/apply.ts
+++ b/src/media-understanding/apply.ts
@@ -8,9 +8,9 @@ import {
   normalizeMimeType,
   resolveInputFileLimits,
 } from "../media/input-files.js";
-import { isDeliverableMessageChannel } from "../utils/message-channel.js";
 import { resolveAttachmentKind } from "./attachments.js";
 import { runWithConcurrency } from "./concurrency.js";
+import { DEFAULT_ECHO_TRANSCRIPT_FORMAT, sendTranscriptEcho } from "./echo-transcript.js";
 import {
   extractMediaUserText,
   formatAudioTranscripts,
@@ -463,68 +463,6 @@ async function extractFileBlocks(params: {
   return blocks;
 }
 
-const DEFAULT_ECHO_FORMAT = '📝 "{transcript}"';
-
-/**
- * Formats a transcript echo message using the configured format string.
- * Replaces `{transcript}` placeholder with the actual transcript text.
- */
-function formatEchoTranscript(transcript: string, format: string): string {
-  return format.replace("{transcript}", transcript);
-}
-
-/**
- * Sends the transcript echo back to the originating chat.
- * Best-effort: logs on failure, never throws.
- */
-async function sendTranscriptEcho(params: {
-  ctx: MsgContext;
-  cfg: OpenClawConfig;
-  transcript: string;
-  format: string;
-}): Promise<void> {
-  const { ctx, cfg, transcript, format } = params;
-  const channel = ctx.Provider ?? ctx.Surface ?? "";
-  const to = ctx.OriginatingTo ?? ctx.From ?? "";
-
-  if (!channel || !to) {
-    if (shouldLogVerbose()) {
-      logVerbose("media: echo-transcript skipped (no channel/to resolved from ctx)");
-    }
-    return;
-  }
-
-  const normalizedChannel = channel.trim().toLowerCase();
-  if (!isDeliverableMessageChannel(normalizedChannel)) {
-    if (shouldLogVerbose()) {
-      logVerbose(
-        `media: echo-transcript skipped (channel "${String(normalizedChannel)}" is not deliverable)`,
-      );
-    }
-    return;
-  }
-
-  const text = formatEchoTranscript(transcript, format);
-
-  try {
-    const { deliverOutboundPayloads } = await import("../infra/outbound/deliver.js");
-    await deliverOutboundPayloads({
-      cfg,
-      channel: normalizedChannel,
-      to,
-      accountId: ctx.AccountId ?? undefined,
-      threadId: ctx.MessageThreadId ?? undefined,
-      payloads: [{ text }],
-      bestEffort: true,
-    });
-    if (shouldLogVerbose()) {
-      logVerbose(`media: echo-transcript sent to ${normalizedChannel}/${to}`);
-    }
-  } catch (err) {
-    logVerbose(`media: echo-transcript delivery failed: ${String(err)}`);
-  }
-}
-
 export async function applyMediaUnderstanding(params: {
   ctx: MsgContext;
   cfg: OpenClawConfig;
@@ -598,7 +536,7 @@ export async function applyMediaUnderstanding(params: {
             ctx,
             cfg,
             transcript,
-            format: audioCfg.echoFormat ?? DEFAULT_ECHO_FORMAT,
+            format: audioCfg.echoFormat ?? DEFAULT_ECHO_TRANSCRIPT_FORMAT,
           });
         }
       } else if (originalUserText) {
diff --git a/src/media-understanding/attachments.cache.ts b/src/media-understanding/attachments.cache.ts
new file mode 100644
index 00000000000..f8e61265022
--- /dev/null
+++ b/src/media-understanding/attachments.cache.ts
@@ -0,0 +1,323 @@
+import fs from "node:fs/promises";
+import path from "node:path";
+import { logVerbose, shouldLogVerbose } from "../globals.js";
+import { isAbortError } from "../infra/unhandled-rejections.js";
+import { fetchRemoteMedia, MediaFetchError } from "../media/fetch.js";
+import {
+  DEFAULT_IMESSAGE_ATTACHMENT_ROOTS,
+  isInboundPathAllowed,
+  mergeInboundPathRoots,
+} from "../media/inbound-path-policy.js";
+import { getDefaultMediaLocalRoots } from "../media/local-roots.js";
+import { detectMime } from "../media/mime.js";
+import { buildRandomTempFilePath } from "../plugin-sdk/temp-path.js";
+import { normalizeAttachmentPath } from "./attachments.normalize.js";
+import { MediaUnderstandingSkipError } from "./errors.js";
+import { fetchWithTimeout } from "./providers/shared.js";
+import type { MediaAttachment } from "./types.js";
+
+type MediaBufferResult = {
+  buffer: Buffer;
+  mime?: string;
+  fileName: string;
+  size: number;
+};
+
+type MediaPathResult = {
+  path: string;
+  cleanup?: () => Promise<void> | void;
+};
+
+type AttachmentCacheEntry = {
+  attachment: MediaAttachment;
+  resolvedPath?: string;
+  statSize?: number;
+  buffer?: Buffer;
+  bufferMime?: string;
+  bufferFileName?: string;
+  tempPath?: string;
+  tempCleanup?: () => Promise<void>;
+};
+
+const DEFAULT_LOCAL_PATH_ROOTS = mergeInboundPathRoots(
+  getDefaultMediaLocalRoots(),
+  DEFAULT_IMESSAGE_ATTACHMENT_ROOTS,
+);
+
+export type MediaAttachmentCacheOptions = {
+  localPathRoots?: readonly string[];
+};
+
+function resolveRequestUrl(input: RequestInfo | URL): string {
+  if (typeof input === "string") {
+    return input;
+  }
+  if (input instanceof URL) {
+    return input.toString();
+  }
+  return input.url;
+}
+
+export class MediaAttachmentCache {
+  private readonly entries = new Map<number, AttachmentCacheEntry>();
+  private readonly attachments: MediaAttachment[];
+  private readonly localPathRoots: readonly string[];
+  private canonicalLocalPathRoots?: Promise<readonly string[]>;
+
+  constructor(attachments: MediaAttachment[], options?: MediaAttachmentCacheOptions) {
+    this.attachments = attachments;
+    this.localPathRoots = mergeInboundPathRoots(options?.localPathRoots, DEFAULT_LOCAL_PATH_ROOTS);
+    for (const attachment of attachments) {
+      this.entries.set(attachment.index, { attachment });
+    }
+  }
+
+  async getBuffer(params: {
+    attachmentIndex: number;
+    maxBytes: number;
+    timeoutMs: number;
+  }): Promise<MediaBufferResult> {
+    const entry = await this.ensureEntry(params.attachmentIndex);
+    if (entry.buffer) {
+      if (entry.buffer.length > params.maxBytes) {
+        throw new MediaUnderstandingSkipError(
+          "maxBytes",
+          `Attachment ${params.attachmentIndex + 1} exceeds maxBytes ${params.maxBytes}`,
+        );
+      }
+      return {
+        buffer: entry.buffer,
+        mime: entry.bufferMime,
+        fileName: entry.bufferFileName ?? `media-${params.attachmentIndex + 1}`,
+        size: entry.buffer.length,
+      };
+    }
+
+    if (entry.resolvedPath) {
+      const size = await this.ensureLocalStat(entry);
+      if (entry.resolvedPath) {
+        if (size !== undefined && size > params.maxBytes) {
+          throw new MediaUnderstandingSkipError(
+            "maxBytes",
+            `Attachment ${params.attachmentIndex + 1} exceeds maxBytes ${params.maxBytes}`,
+          );
+        }
+        const buffer = await fs.readFile(entry.resolvedPath);
+        entry.buffer = buffer;
+        entry.bufferMime =
+          entry.bufferMime ??
+          entry.attachment.mime ??
+          (await detectMime({
+            buffer,
+            filePath: entry.resolvedPath,
+          }));
+        entry.bufferFileName =
+          path.basename(entry.resolvedPath) || `media-${params.attachmentIndex + 1}`;
+        return {
+          buffer,
+          mime: entry.bufferMime,
+          fileName: entry.bufferFileName,
+          size: buffer.length,
+        };
+      }
+    }
+
+    const url = entry.attachment.url?.trim();
+    if (!url) {
+      throw new MediaUnderstandingSkipError(
+        "empty",
+        `Attachment ${params.attachmentIndex + 1} has no path or URL.`,
+      );
+    }
+
+    try {
+      const fetchImpl = (input: RequestInfo | URL, init?: RequestInit) =>
+        fetchWithTimeout(resolveRequestUrl(input), init ?? {}, params.timeoutMs, fetch);
+      const fetched = await fetchRemoteMedia({ url, fetchImpl, maxBytes: params.maxBytes });
+      entry.buffer = fetched.buffer;
+      entry.bufferMime =
+        entry.attachment.mime ??
+        fetched.contentType ??
+        (await detectMime({
+          buffer: fetched.buffer,
+          filePath: fetched.fileName ?? url,
+        }));
+      entry.bufferFileName = fetched.fileName ?? `media-${params.attachmentIndex + 1}`;
+      return {
+        buffer: fetched.buffer,
+        mime: entry.bufferMime,
+        fileName: entry.bufferFileName,
+        size: fetched.buffer.length,
+      };
+    } catch (err) {
+      if (err instanceof MediaFetchError && err.code === "max_bytes") {
+        throw new MediaUnderstandingSkipError(
+          "maxBytes",
+          `Attachment ${params.attachmentIndex + 1} exceeds maxBytes ${params.maxBytes}`,
+        );
+      }
+      if (isAbortError(err)) {
+        throw new MediaUnderstandingSkipError(
+          "timeout",
+          `Attachment ${params.attachmentIndex + 1} timed out while fetching.`,
+        );
+      }
+      throw err;
+    }
+  }
+
+  async getPath(params: {
+    attachmentIndex: number;
+    maxBytes?: number;
+    timeoutMs: number;
+  }): Promise<MediaPathResult> {
+    const entry = await this.ensureEntry(params.attachmentIndex);
+    if (entry.resolvedPath) {
+      if (params.maxBytes) {
+        const size = await this.ensureLocalStat(entry);
+        if (entry.resolvedPath) {
+          if (size !== undefined && size > params.maxBytes) {
+            throw new MediaUnderstandingSkipError(
+              "maxBytes",
+              `Attachment ${params.attachmentIndex + 1} exceeds maxBytes ${params.maxBytes}`,
+            );
+          }
+        }
+      }
+      if (entry.resolvedPath) {
+        return { path: entry.resolvedPath };
+      }
+    }
+
+    if (entry.tempPath) {
+      if (params.maxBytes && entry.buffer && entry.buffer.length > params.maxBytes) {
+        throw new MediaUnderstandingSkipError(
+          "maxBytes",
+          `Attachment ${params.attachmentIndex + 1} exceeds maxBytes ${params.maxBytes}`,
+        );
+      }
+      return { path: entry.tempPath, cleanup: entry.tempCleanup };
+    }
+
+    const maxBytes = params.maxBytes ?? Number.POSITIVE_INFINITY;
+    const bufferResult = await this.getBuffer({
+      attachmentIndex: params.attachmentIndex,
+      maxBytes,
+      timeoutMs: params.timeoutMs,
+    });
+    const extension = path.extname(bufferResult.fileName || "") || "";
+    const tmpPath = buildRandomTempFilePath({
+      prefix: "openclaw-media",
+      extension,
+    });
+    await fs.writeFile(tmpPath, bufferResult.buffer);
+    entry.tempPath = tmpPath;
+    entry.tempCleanup = async () => {
+      await fs.unlink(tmpPath).catch(() => {});
+    };
+    return { path: tmpPath, cleanup: entry.tempCleanup };
+  }
+
+  async cleanup(): Promise<void> {
+    const cleanups: Array<Promise<void> | void> = [];
+    for (const entry of this.entries.values()) {
+      if (entry.tempCleanup) {
+        cleanups.push(Promise.resolve(entry.tempCleanup()));
+        entry.tempCleanup = undefined;
+      }
+    }
+    await Promise.all(cleanups);
+  }
+
+  private async ensureEntry(attachmentIndex: number): Promise<AttachmentCacheEntry> {
+    const existing = this.entries.get(attachmentIndex);
+    if (existing) {
+      if (!existing.resolvedPath) {
+        existing.resolvedPath = this.resolveLocalPath(existing.attachment);
+      }
+      return existing;
+    }
+    const attachment = this.attachments.find((item) => item.index === attachmentIndex) ?? {
+      index: attachmentIndex,
+    };
+    const entry: AttachmentCacheEntry = {
+      attachment,
+      resolvedPath: this.resolveLocalPath(attachment),
+    };
+    this.entries.set(attachmentIndex, entry);
+    return entry;
+  }
+
+  private resolveLocalPath(attachment: MediaAttachment): string | undefined {
+    const rawPath = normalizeAttachmentPath(attachment.path);
+    if (!rawPath) {
+      return undefined;
+    }
+    return path.isAbsolute(rawPath) ? rawPath : path.resolve(rawPath);
+  }
+
+  private async ensureLocalStat(entry: AttachmentCacheEntry): Promise<number | undefined> {
+    if (!entry.resolvedPath) {
+      return undefined;
+    }
+    if (!isInboundPathAllowed({ filePath: entry.resolvedPath, roots: this.localPathRoots })) {
+      entry.resolvedPath = undefined;
+      if (shouldLogVerbose()) {
+        logVerbose(
+          `Blocked attachment path outside allowed roots: ${entry.attachment.path ?? entry.attachment.url ?? "(unknown)"}`,
+        );
+      }
+      return undefined;
+    }
+    if (entry.statSize !== undefined) {
+      return entry.statSize;
+    }
+    try {
+      const currentPath = entry.resolvedPath;
+      const stat = await fs.stat(currentPath);
+      if (!stat.isFile()) {
+        entry.resolvedPath = undefined;
+        return undefined;
+      }
+      const canonicalPath = await fs.realpath(currentPath).catch(() => currentPath);
+      const canonicalRoots = await this.getCanonicalLocalPathRoots();
+      if (!isInboundPathAllowed({ filePath: canonicalPath, roots: canonicalRoots })) {
+        entry.resolvedPath = undefined;
+        if (shouldLogVerbose()) {
+          logVerbose(
+            `Blocked canonicalized attachment path outside allowed roots: ${canonicalPath}`,
+          );
+        }
+        return undefined;
+      }
+      entry.resolvedPath = canonicalPath;
+      entry.statSize = stat.size;
+      return stat.size;
+    } catch (err) {
+      entry.resolvedPath = undefined;
+      if (shouldLogVerbose()) {
+        logVerbose(`Failed to read attachment ${entry.attachment.index + 1}: ${String(err)}`);
+      }
+      return undefined;
+    }
+  }
+
+  private async getCanonicalLocalPathRoots(): Promise<readonly string[]> {
+    if (this.canonicalLocalPathRoots) {
+      return await this.canonicalLocalPathRoots;
+    }
+    this.canonicalLocalPathRoots = (async () =>
+      mergeInboundPathRoots(
+        this.localPathRoots,
+        await Promise.all(
+          this.localPathRoots.map(async (root) => {
+            if (root.includes("*")) {
+              return root;
+            }
+            return await fs.realpath(root).catch(() => root);
+          }),
+        ),
+      ))();
+    return await this.canonicalLocalPathRoots;
+  }
+}
diff --git a/src/media-understanding/attachments.normalize.ts b/src/media-understanding/attachments.normalize.ts
new file mode 100644
index 00000000000..4c248c538f9
--- /dev/null
+++ b/src/media-understanding/attachments.normalize.ts
@@ -0,0 +1,108 @@
+import { fileURLToPath } from "node:url";
+import type { MsgContext } from "../auto-reply/templating.js";
+import { getFileExtension, isAudioFileName, kindFromMime } from "../media/mime.js";
+import type { MediaAttachment } from "./types.js";
+
+export function normalizeAttachmentPath(raw?: string | null): string | undefined {
+  const value = raw?.trim();
+  if (!value) {
+    return undefined;
+  }
+  if (value.startsWith("file://")) {
+    try {
+      return fileURLToPath(value);
+    } catch {
+      return undefined;
+    }
+  }
+  return value;
+}
+
+export function normalizeAttachments(ctx: MsgContext): MediaAttachment[] {
+  const pathsFromArray = Array.isArray(ctx.MediaPaths) ? ctx.MediaPaths : undefined;
+  const urlsFromArray = Array.isArray(ctx.MediaUrls) ? ctx.MediaUrls : undefined;
+  const typesFromArray = Array.isArray(ctx.MediaTypes) ? ctx.MediaTypes : undefined;
+  const resolveMime = (count: number, index: number) => {
+    const typeHint = typesFromArray?.[index];
+    const trimmed = typeof typeHint === "string" ? typeHint.trim() : "";
+    if (trimmed) {
+      return trimmed;
+    }
+    return count === 1 ? ctx.MediaType : undefined;
+  };
+
+  if (pathsFromArray && pathsFromArray.length > 0) {
+    const count = pathsFromArray.length;
+    const urls = urlsFromArray && urlsFromArray.length > 0 ? urlsFromArray : undefined;
+    return pathsFromArray
+      .map((value, index) => ({
+        path: value?.trim() || undefined,
+        url: urls?.[index] ?? ctx.MediaUrl,
+        mime: resolveMime(count, index),
+        index,
+      }))
+      .filter((entry) => Boolean(entry.path?.trim() || entry.url?.trim()));
+  }
+
+  if (urlsFromArray && urlsFromArray.length > 0) {
+    const count = urlsFromArray.length;
+    return urlsFromArray
+      .map((value, index) => ({
+        path: undefined,
+        url: value?.trim() || undefined,
+        mime: resolveMime(count, index),
+        index,
+      }))
+      .filter((entry) => Boolean(entry.url?.trim()));
+  }
+
+  const pathValue = ctx.MediaPath?.trim();
+  const url = ctx.MediaUrl?.trim();
+  if (!pathValue && !url) {
+    return [];
+  }
+  return [
+    {
+      path: pathValue || undefined,
+      url: url || undefined,
+      mime: ctx.MediaType,
+      index: 0,
+    },
+  ];
+}
+
+export function resolveAttachmentKind(
+  attachment: MediaAttachment,
+): "image" | "audio" | "video" | "document" | "unknown" {
+  const kind = kindFromMime(attachment.mime);
+  if (kind === "image" || kind === "audio" || kind === "video") {
+    return kind;
+  }
+
+  const ext = getFileExtension(attachment.path ?? attachment.url);
+  if (!ext) {
+    return "unknown";
+  }
+  if ([".mp4", ".mov", ".mkv", ".webm", ".avi", ".m4v"].includes(ext)) {
+    return "video";
+  }
+  if (isAudioFileName(attachment.path ?? attachment.url)) {
+    return "audio";
+  }
+  if ([".png", ".jpg", ".jpeg", ".webp", ".gif", ".bmp", ".tiff", ".tif"].includes(ext)) {
+    return "image";
+  }
+  return "unknown";
+}
+
+export function isVideoAttachment(attachment: MediaAttachment): boolean {
+  return resolveAttachmentKind(attachment) === "video";
+}
+
+export function isAudioAttachment(attachment: MediaAttachment): boolean {
+  return resolveAttachmentKind(attachment) === "audio";
+}
+
+export function isImageAttachment(attachment: MediaAttachment): boolean {
+  return resolveAttachmentKind(attachment) === "image";
+}
diff --git a/src/media-understanding/attachments.select.ts b/src/media-understanding/attachments.select.ts
new file mode 100644
index 00000000000..4d5a694fac6
--- /dev/null
+++ b/src/media-understanding/attachments.select.ts
@@ -0,0 +1,89 @@
+import type { MediaUnderstandingAttachmentsConfig } from "../config/types.tools.js";
+import {
+  isAudioAttachment,
+  isImageAttachment,
+  isVideoAttachment,
+} from "./attachments.normalize.js";
+import type { MediaAttachment, MediaUnderstandingCapability } from "./types.js";
+
+const DEFAULT_MAX_ATTACHMENTS = 1;
+
+function orderAttachments(
+  attachments: MediaAttachment[],
+  prefer?: MediaUnderstandingAttachmentsConfig["prefer"],
+): MediaAttachment[] {
+  const list = Array.isArray(attachments) ? attachments.filter(isAttachmentRecord) : [];
+  if (!prefer || prefer === "first") {
+    return list;
+  }
+  if (prefer === "last") {
+    return [...list].toReversed();
+  }
+  if (prefer === "path") {
+    const withPath = list.filter((item) => item.path);
+    const withoutPath = list.filter((item) => !item.path);
+    return [...withPath, ...withoutPath];
+  }
+  if (prefer === "url") {
+    const withUrl = list.filter((item) => item.url);
+    const withoutUrl = list.filter((item) => !item.url);
+    return [...withUrl, ...withoutUrl];
+  }
+  return list;
+}
+
+function isAttachmentRecord(value: unknown): value is MediaAttachment {
+  if (!value || typeof value !== "object") {
+    return false;
+  }
+  const entry = value as Record<string, unknown>;
+  if (typeof entry.index !== "number") {
+    return false;
+  }
+  if (entry.path !== undefined && typeof entry.path !== "string") {
+    return false;
+  }
+  if (entry.url !== undefined && typeof entry.url !== "string") {
+    return false;
+  }
+  if (entry.mime !== undefined && typeof entry.mime !== "string") {
+    return false;
+  }
+  if (entry.alreadyTranscribed !== undefined && typeof entry.alreadyTranscribed !== "boolean") {
+    return false;
+  }
+  return true;
+}
+
+export function selectAttachments(params: {
+  capability: MediaUnderstandingCapability;
+  attachments: MediaAttachment[];
+  policy?: MediaUnderstandingAttachmentsConfig;
+}): MediaAttachment[] {
+  const { capability, attachments, policy } = params;
+  const input = Array.isArray(attachments) ? attachments.filter(isAttachmentRecord) : [];
+  const matches = input.filter((item) => {
+    // Skip already-transcribed audio attachments from preflight
+    if (capability === "audio" && item.alreadyTranscribed) {
+      return false;
+    }
+    if (capability === "image") {
+      return isImageAttachment(item);
+    }
+    if (capability === "audio") {
+      return isAudioAttachment(item);
+    }
+    return isVideoAttachment(item);
+  });
+  if (matches.length === 0) {
+    return [];
+  }
+
+  const ordered = orderAttachments(matches, policy?.prefer);
+  const mode = policy?.mode ?? "first";
+  const maxAttachments = policy?.maxAttachments ?? DEFAULT_MAX_ATTACHMENTS;
+  if (mode === "all") {
+    return ordered.slice(0, Math.max(1, maxAttachments));
+  }
+  return ordered.slice(0, 1);
+}
diff --git a/src/media-understanding/attachments.ts b/src/media-understanding/attachments.ts
index 0bf6da818b0..4b19da17515 100644
--- a/src/media-understanding/attachments.ts
+++ b/src/media-understanding/attachments.ts
@@ -1,510 +1,9 @@
-import fs from "node:fs/promises";
-import path from "node:path";
-import { fileURLToPath } from "node:url";
-import type { MsgContext } from "../auto-reply/templating.js";
-import type { MediaUnderstandingAttachmentsConfig } from "../config/types.tools.js";
-import { logVerbose, shouldLogVerbose } from "../globals.js";
-import { isAbortError } from "../infra/unhandled-rejections.js";
-import { fetchRemoteMedia, MediaFetchError } from "../media/fetch.js";
-import {
-  DEFAULT_IMESSAGE_ATTACHMENT_ROOTS,
-  isInboundPathAllowed,
-  mergeInboundPathRoots,
-} from "../media/inbound-path-policy.js";
-import { getDefaultMediaLocalRoots } from "../media/local-roots.js";
-import { detectMime, getFileExtension, isAudioFileName, kindFromMime } from "../media/mime.js";
-import { buildRandomTempFilePath } from "../plugin-sdk/temp-path.js";
-import { MediaUnderstandingSkipError } from "./errors.js";
-import { fetchWithTimeout } from "./providers/shared.js";
-import type { MediaAttachment, MediaUnderstandingCapability } from "./types.js";
-
-type MediaBufferResult = {
-  buffer: Buffer;
-  mime?: string;
-  fileName: string;
-  size: number;
-};
-
-type MediaPathResult = {
-  path: string;
-  cleanup?: () => Promise<void> | void;
-};
-
-type AttachmentCacheEntry = {
-  attachment: MediaAttachment;
-  resolvedPath?: string;
-  statSize?: number;
-  buffer?: Buffer;
-  bufferMime?: string;
-  bufferFileName?: string;
-  tempPath?: string;
-  tempCleanup?: () => Promise<void>;
-};
-
-const DEFAULT_MAX_ATTACHMENTS = 1;
-const DEFAULT_LOCAL_PATH_ROOTS = mergeInboundPathRoots(
-  getDefaultMediaLocalRoots(),
-  DEFAULT_IMESSAGE_ATTACHMENT_ROOTS,
-);
-
-export type MediaAttachmentCacheOptions = {
-  localPathRoots?: readonly string[];
-};
-
-function normalizeAttachmentPath(raw?: string | null): string | undefined {
-  const value = raw?.trim();
-  if (!value) {
-    return undefined;
-  }
-  if (value.startsWith("file://")) {
-    try {
-      return fileURLToPath(value);
-    } catch {
-      return undefined;
-    }
-  }
-  return value;
-}
-
-export function normalizeAttachments(ctx: MsgContext): MediaAttachment[] {
-  const pathsFromArray = Array.isArray(ctx.MediaPaths) ? ctx.MediaPaths : undefined;
-  const urlsFromArray = Array.isArray(ctx.MediaUrls) ? ctx.MediaUrls : undefined;
-  const typesFromArray = Array.isArray(ctx.MediaTypes) ? ctx.MediaTypes : undefined;
-  const resolveMime = (count: number, index: number) => {
-    const typeHint = typesFromArray?.[index];
-    const trimmed = typeof typeHint === "string" ? typeHint.trim() : "";
-    if (trimmed) {
-      return trimmed;
-    }
-    return count === 1 ? ctx.MediaType : undefined;
-  };
-
-  if (pathsFromArray && pathsFromArray.length > 0) {
-    const count = pathsFromArray.length;
-    const urls = urlsFromArray && urlsFromArray.length > 0 ? urlsFromArray : undefined;
-    return pathsFromArray
-      .map((value, index) => ({
-        path: value?.trim() || undefined,
-        url: urls?.[index] ?? ctx.MediaUrl,
-        mime: resolveMime(count, index),
-        index,
-      }))
-      .filter((entry) => Boolean(entry.path?.trim() || entry.url?.trim()));
-  }
-
-  if (urlsFromArray && urlsFromArray.length > 0) {
-    const count = urlsFromArray.length;
-    return urlsFromArray
-      .map((value, index) => ({
-        path: undefined,
-        url: value?.trim() || undefined,
-        mime: resolveMime(count, index),
-        index,
-      }))
-      .filter((entry) => Boolean(entry.url?.trim()));
-  }
-
-  const pathValue = ctx.MediaPath?.trim();
-  const url = ctx.MediaUrl?.trim();
-  if (!pathValue && !url) {
-    return [];
-  }
-  return [
-    {
-      path: pathValue || undefined,
-      url: url || undefined,
-      mime: ctx.MediaType,
-      index: 0,
-    },
-  ];
-}
-
-export function resolveAttachmentKind(
-  attachment: MediaAttachment,
-): "image" | "audio" | "video" | "document" | "unknown" {
-  const kind = kindFromMime(attachment.mime);
-  if (kind === "image" || kind === "audio" || kind === "video") {
-    return kind;
-  }
-
-  const ext = getFileExtension(attachment.path ?? attachment.url);
-  if (!ext) {
-    return "unknown";
-  }
-  if ([".mp4", ".mov", ".mkv", ".webm", ".avi", ".m4v"].includes(ext)) {
-    return "video";
-  }
-  if (isAudioFileName(attachment.path ?? attachment.url)) {
-    return "audio";
-  }
-  if ([".png", ".jpg", ".jpeg", ".webp", ".gif", ".bmp", ".tiff", ".tif"].includes(ext)) {
-    return "image";
-  }
-  return "unknown";
-}
-
-export function isVideoAttachment(attachment: MediaAttachment): boolean {
-  return resolveAttachmentKind(attachment) === "video";
-}
-
-export function isAudioAttachment(attachment: MediaAttachment): boolean {
-  return resolveAttachmentKind(attachment) === "audio";
-}
-
-export function isImageAttachment(attachment: MediaAttachment): boolean {
-  return resolveAttachmentKind(attachment) === "image";
-}
-
-function resolveRequestUrl(input: RequestInfo | URL): string {
-  if (typeof input === "string") {
-    return input;
-  }
-  if (input instanceof URL) {
-    return input.toString();
-  }
-  return input.url;
-}
-
-function orderAttachments(
-  attachments: MediaAttachment[],
-  prefer?: MediaUnderstandingAttachmentsConfig["prefer"],
-): MediaAttachment[] {
-  const list = Array.isArray(attachments) ? attachments.filter(isAttachmentRecord) : [];
-  if (!prefer || prefer === "first") {
-    return list;
-  }
-  if (prefer === "last") {
-    return [...list].toReversed();
-  }
-  if (prefer === "path") {
-    const withPath = list.filter((item) => item.path);
-    const withoutPath = list.filter((item) => !item.path);
-    return [...withPath, ...withoutPath];
-  }
-  if (prefer === "url") {
-    const withUrl = list.filter((item) => item.url);
-    const withoutUrl = list.filter((item) => !item.url);
-    return [...withUrl, ...withoutUrl];
-  }
-  return list;
-}
-
-function isAttachmentRecord(value: unknown): value is MediaAttachment {
-  if (!value || typeof value !== "object") {
-    return false;
-  }
-  const entry = value as Record<string, unknown>;
-  if (typeof entry.index !== "number") {
-    return false;
-  }
-  if (entry.path !== undefined && typeof entry.path !== "string") {
-    return false;
-  }
-  if (entry.url !== undefined && typeof entry.url !== "string") {
-    return false;
-  }
-  if (entry.mime !== undefined && typeof entry.mime !== "string") {
-    return false;
-  }
-  if (entry.alreadyTranscribed !== undefined && typeof entry.alreadyTranscribed !== "boolean") {
-    return false;
-  }
-  return true;
-}
-
-export function selectAttachments(params: {
-  capability: MediaUnderstandingCapability;
-  attachments: MediaAttachment[];
-  policy?: MediaUnderstandingAttachmentsConfig;
-}): MediaAttachment[] {
-  const { capability, attachments, policy } = params;
-  const input = Array.isArray(attachments) ? attachments.filter(isAttachmentRecord) : [];
-  const matches = input.filter((item) => {
-    // Skip already-transcribed audio attachments from preflight
-    if (capability === "audio" && item.alreadyTranscribed) {
-      return false;
-    }
-    if (capability === "image") {
-      return isImageAttachment(item);
-    }
-    if (capability === "audio") {
-      return isAudioAttachment(item);
-    }
-    return isVideoAttachment(item);
-  });
-  if (matches.length === 0) {
-    return [];
-  }
-
-  const ordered = orderAttachments(matches, policy?.prefer);
-  const mode = policy?.mode ?? "first";
-  const maxAttachments = policy?.maxAttachments ?? DEFAULT_MAX_ATTACHMENTS;
-  if (mode === "all") {
-    return ordered.slice(0, Math.max(1, maxAttachments));
-  }
-  return ordered.slice(0, 1);
-}
-
-export class MediaAttachmentCache {
-  private readonly entries = new Map<number, AttachmentCacheEntry>();
-  private readonly attachments: MediaAttachment[];
-  private readonly localPathRoots: readonly string[];
-  private canonicalLocalPathRoots?: Promise<readonly string[]>;
-
-  constructor(attachments: MediaAttachment[], options?: MediaAttachmentCacheOptions) {
-    this.attachments = attachments;
-    this.localPathRoots = mergeInboundPathRoots(options?.localPathRoots, DEFAULT_LOCAL_PATH_ROOTS);
-    for (const attachment of attachments) {
-      this.entries.set(attachment.index, { attachment });
-    }
-  }
-
-  async getBuffer(params: {
-    attachmentIndex: number;
-    maxBytes: number;
-    timeoutMs: number;
-  }): Promise<MediaBufferResult> {
-    const entry = await this.ensureEntry(params.attachmentIndex);
-    if (entry.buffer) {
-      if (entry.buffer.length > params.maxBytes) {
-        throw new MediaUnderstandingSkipError(
-          "maxBytes",
-          `Attachment ${params.attachmentIndex + 1} exceeds maxBytes ${params.maxBytes}`,
-        );
-      }
-      return {
-        buffer: entry.buffer,
-        mime: entry.bufferMime,
-        fileName: entry.bufferFileName ?? `media-${params.attachmentIndex + 1}`,
-        size: entry.buffer.length,
-      };
-    }
-
-    if (entry.resolvedPath) {
-      const size = await this.ensureLocalStat(entry);
-      if (entry.resolvedPath) {
-        if (size !== undefined && size > params.maxBytes) {
-          throw new MediaUnderstandingSkipError(
-            "maxBytes",
-            `Attachment ${params.attachmentIndex + 1} exceeds maxBytes ${params.maxBytes}`,
-          );
-        }
-        const buffer = await fs.readFile(entry.resolvedPath);
-        entry.buffer = buffer;
-        entry.bufferMime =
-          entry.bufferMime ??
-          entry.attachment.mime ??
-          (await detectMime({
-            buffer,
-            filePath: entry.resolvedPath,
-          }));
-        entry.bufferFileName =
-          path.basename(entry.resolvedPath) || `media-${params.attachmentIndex + 1}`;
-        return {
-          buffer,
-          mime: entry.bufferMime,
-          fileName: entry.bufferFileName,
-          size: buffer.length,
-        };
-      }
-    }
-
-    const url = entry.attachment.url?.trim();
-    if (!url) {
-      throw new MediaUnderstandingSkipError(
-        "empty",
-        `Attachment ${params.attachmentIndex + 1} has no path or URL.`,
-      );
-    }
-
-    try {
-      const fetchImpl = (input: RequestInfo | URL, init?: RequestInit) =>
-        fetchWithTimeout(resolveRequestUrl(input), init ?? {}, params.timeoutMs, fetch);
-      const fetched = await fetchRemoteMedia({ url, fetchImpl, maxBytes: params.maxBytes });
-      entry.buffer = fetched.buffer;
-      entry.bufferMime =
-        entry.attachment.mime ??
-        fetched.contentType ??
-        (await detectMime({
-          buffer: fetched.buffer,
-          filePath: fetched.fileName ?? url,
-        }));
-      entry.bufferFileName = fetched.fileName ?? `media-${params.attachmentIndex + 1}`;
-      return {
-        buffer: fetched.buffer,
-        mime: entry.bufferMime,
-        fileName: entry.bufferFileName,
-        size: fetched.buffer.length,
-      };
-    } catch (err) {
-      if (err instanceof MediaFetchError && err.code === "max_bytes") {
-        throw new MediaUnderstandingSkipError(
-          "maxBytes",
-          `Attachment ${params.attachmentIndex + 1} exceeds maxBytes ${params.maxBytes}`,
-        );
-      }
-      if (isAbortError(err)) {
-        throw new MediaUnderstandingSkipError(
-          "timeout",
-          `Attachment ${params.attachmentIndex + 1} timed out while fetching.`,
-        );
-      }
-      throw err;
-    }
-  }
-
-  async getPath(params: {
-    attachmentIndex: number;
-    maxBytes?: number;
-    timeoutMs: number;
-  }): Promise<MediaPathResult> {
-    const entry = await this.ensureEntry(params.attachmentIndex);
-    if (entry.resolvedPath) {
-      if (params.maxBytes) {
-        const size = await this.ensureLocalStat(entry);
-        if (entry.resolvedPath) {
-          if (size !== undefined && size > params.maxBytes) {
-            throw new MediaUnderstandingSkipError(
-              "maxBytes",
-              `Attachment ${params.attachmentIndex + 1} exceeds maxBytes ${params.maxBytes}`,
-            );
-          }
-        }
-      }
-      if (entry.resolvedPath) {
-        return { path: entry.resolvedPath };
-      }
-    }
-
-    if (entry.tempPath) {
-      if (params.maxBytes && entry.buffer && entry.buffer.length > params.maxBytes) {
-        throw new MediaUnderstandingSkipError(
-          "maxBytes",
-          `Attachment ${params.attachmentIndex + 1} exceeds maxBytes ${params.maxBytes}`,
-        );
-      }
-      return { path: entry.tempPath, cleanup: entry.tempCleanup };
-    }
-
-    const maxBytes = params.maxBytes ?? Number.POSITIVE_INFINITY;
-    const bufferResult = await this.getBuffer({
-      attachmentIndex: params.attachmentIndex,
-      maxBytes,
-      timeoutMs: params.timeoutMs,
-    });
-    const extension = path.extname(bufferResult.fileName || "") || "";
-    const tmpPath = buildRandomTempFilePath({
-      prefix: "openclaw-media",
-      extension,
-    });
-    await fs.writeFile(tmpPath, bufferResult.buffer);
-    entry.tempPath = tmpPath;
-    entry.tempCleanup = async () => {
-      await fs.unlink(tmpPath).catch(() => {});
-    };
-    return { path: tmpPath, cleanup: entry.tempCleanup };
-  }
-
-  async cleanup(): Promise<void> {
-    const cleanups: Array<Promise<void> | void> = [];
-    for (const entry of this.entries.values()) {
-      if (entry.tempCleanup) {
-        cleanups.push(Promise.resolve(entry.tempCleanup()));
-        entry.tempCleanup = undefined;
-      }
-    }
-    await Promise.all(cleanups);
-  }
-
-  private async ensureEntry(attachmentIndex: number): Promise<AttachmentCacheEntry> {
-    const existing = this.entries.get(attachmentIndex);
-    if (existing) {
-      if (!existing.resolvedPath) {
-        existing.resolvedPath = this.resolveLocalPath(existing.attachment);
-      }
-      return existing;
-    }
-    const attachment = this.attachments.find((item) => item.index === attachmentIndex) ?? {
-      index: attachmentIndex,
-    };
-    const entry: AttachmentCacheEntry = {
-      attachment,
-      resolvedPath: this.resolveLocalPath(attachment),
-    };
-    this.entries.set(attachmentIndex, entry);
-    return entry;
-  }
-
-  private resolveLocalPath(attachment: MediaAttachment): string | undefined {
-    const rawPath = normalizeAttachmentPath(attachment.path);
-    if (!rawPath) {
-      return undefined;
-    }
-    return path.isAbsolute(rawPath) ? rawPath : path.resolve(rawPath);
-  }
-
-  private async ensureLocalStat(entry: AttachmentCacheEntry): Promise<number | undefined> {
-    if (!entry.resolvedPath) {
-      return undefined;
-    }
-    if (!isInboundPathAllowed({ filePath: entry.resolvedPath, roots: this.localPathRoots })) {
-      entry.resolvedPath = undefined;
-      if (shouldLogVerbose()) {
-        logVerbose(
-          `Blocked attachment path outside allowed roots: ${entry.attachment.path ?? entry.attachment.url ?? "(unknown)"}`,
-        );
-      }
-      return undefined;
-    }
-    if (entry.statSize !== undefined) {
-      return entry.statSize;
-    }
-    try {
-      const currentPath = entry.resolvedPath;
-      const stat = await fs.stat(currentPath);
-      if (!stat.isFile()) {
-        entry.resolvedPath = undefined;
-        return undefined;
-      }
-      const canonicalPath = await fs.realpath(currentPath).catch(() => currentPath);
-      const canonicalRoots = await this.getCanonicalLocalPathRoots();
-      if (!isInboundPathAllowed({ filePath: canonicalPath, roots: canonicalRoots })) {
-        entry.resolvedPath = undefined;
-        if (shouldLogVerbose()) {
-          logVerbose(
-            `Blocked canonicalized attachment path outside allowed roots: ${canonicalPath}`,
-          );
-        }
-        return undefined;
-      }
-      entry.resolvedPath = canonicalPath;
-      entry.statSize = stat.size;
-      return stat.size;
-    } catch (err) {
-      entry.resolvedPath = undefined;
-      if (shouldLogVerbose()) {
-        logVerbose(`Failed to read attachment ${entry.attachment.index + 1}: ${String(err)}`);
-      }
-      return undefined;
-    }
-  }
-
-  private async getCanonicalLocalPathRoots(): Promise<readonly string[]> {
-    if (this.canonicalLocalPathRoots) {
-      return await this.canonicalLocalPathRoots;
-    }
-    this.canonicalLocalPathRoots = (async () =>
-      mergeInboundPathRoots(
-        this.localPathRoots,
-        await Promise.all(
-          this.localPathRoots.map(async (root) => {
-            if (root.includes("*")) {
-              return root;
-            }
-            return await fs.realpath(root).catch(() => root);
-          }),
-        ),
-      ))();
-    return await this.canonicalLocalPathRoots;
-  }
-}
+export {
+  isAudioAttachment,
+  isImageAttachment,
+  isVideoAttachment,
+  normalizeAttachments,
+  resolveAttachmentKind,
+} from "./attachments.normalize.js";
+export { selectAttachments } from "./attachments.select.js";
+export { MediaAttachmentCache, type MediaAttachmentCacheOptions } from "./attachments.cache.js";
diff --git a/src/media-understanding/audio-preflight.ts b/src/media-understanding/audio-preflight.ts
index c01ac51f589..735f921510c 100644
--- a/src/media-understanding/audio-preflight.ts
+++ b/src/media-understanding/audio-preflight.ts
@@ -2,13 +2,11 @@ import type { MsgContext } from "../auto-reply/templating.js";
 import type { OpenClawConfig } from "../config/config.js";
 import { logVerbose, shouldLogVerbose } from "../globals.js";
 import { isAudioAttachment } from "./attachments.js";
+import { runAudioTranscription } from "./audio-transcription-runner.js";
 import {
   type ActiveMediaModel,
-  buildProviderRegistry,
-  createMediaAttachmentCache,
   normalizeMediaAttachments,
   resolveMediaAttachmentLocalRoots,
-  runCapability,
 } from "./runner.js";
 import type { MediaUnderstandingProvider } from "./types.js";
 
@@ -50,31 +48,17 @@ export async function transcribeFirstAudio(params: {
     logVerbose(`audio-preflight: transcribing attachment ${firstAudio.index} for mention check`);
   }
 
-  const providerRegistry = buildProviderRegistry(params.providers);
-  const cache = createMediaAttachmentCache(attachments, {
-    localPathRoots: resolveMediaAttachmentLocalRoots({ cfg, ctx }),
-  });
-
   try {
-    const result = await runCapability({
-      capability: "audio",
-      cfg,
+    const { transcript } = await runAudioTranscription({
       ctx,
-      attachments: cache,
-      media: attachments,
+      cfg,
+      attachments,
       agentDir: params.agentDir,
-      providerRegistry,
-      config: audioConfig,
+      providers: params.providers,
       activeModel: params.activeModel,
+      localPathRoots: resolveMediaAttachmentLocalRoots({ cfg, ctx }),
     });
-
-    if (!result || result.outputs.length === 0) {
-      return undefined;
-    }
-
-    // Extract transcript from first audio output
-    const audioOutput = result.outputs.find((output) => output.kind === "audio.transcription");
-    if (!audioOutput || !audioOutput.text) {
+    if (!transcript) {
       return undefined;
     }
 
@@ -83,18 +67,16 @@ export async function transcribeFirstAudio(params: {
 
     if (shouldLogVerbose()) {
       logVerbose(
-        `audio-preflight: transcribed ${audioOutput.text.length} chars from attachment ${firstAudio.index}`,
+        `audio-preflight: transcribed ${transcript.length} chars from attachment ${firstAudio.index}`,
       );
     }
 
-    return audioOutput.text;
+    return transcript;
   } catch (err) {
     // Log but don't throw - let the message proceed with text-only mention check
     if (shouldLogVerbose()) {
       logVerbose(`audio-preflight: transcription failed: ${String(err)}`);
     }
     return undefined;
-  } finally {
-    await cache.cleanup();
   }
 }
diff --git a/src/media-understanding/audio-transcription-runner.ts b/src/media-understanding/audio-transcription-runner.ts
new file mode 100644
index 00000000000..3ef2fdfa0fa
--- /dev/null
+++ b/src/media-understanding/audio-transcription-runner.ts
@@ -0,0 +1,50 @@
+import type { MsgContext } from "../auto-reply/templating.js";
+import type { OpenClawConfig } from "../config/config.js";
+import {
+  type ActiveMediaModel,
+  buildProviderRegistry,
+  createMediaAttachmentCache,
+  normalizeMediaAttachments,
+  runCapability,
+} from "./runner.js";
+import type { MediaAttachment, MediaUnderstandingProvider } from "./types.js";
+
+export async function runAudioTranscription(params: {
+  ctx: MsgContext;
+  cfg: OpenClawConfig;
+  attachments?: MediaAttachment[];
+  agentDir?: string;
+  providers?: Record<string, MediaUnderstandingProvider>;
+  activeModel?: ActiveMediaModel;
+  localPathRoots?: readonly string[];
+}): Promise<{ transcript: string | undefined; attachments: MediaAttachment[] }> {
+  const attachments = params.attachments ?? normalizeMediaAttachments(params.ctx);
+  if (attachments.length === 0) {
+    return { transcript: undefined, attachments };
+  }
+
+  const providerRegistry = buildProviderRegistry(params.providers);
+  const cache = createMediaAttachmentCache(
+    attachments,
+    params.localPathRoots ? { localPathRoots: params.localPathRoots } : undefined,
+  );
+
+  try {
+    const result = await runCapability({
+      capability: "audio",
+      cfg: params.cfg,
+      ctx: params.ctx,
+      attachments: cache,
+      media: attachments,
+      agentDir: params.agentDir,
+      providerRegistry,
+      config: params.cfg.tools?.media?.audio,
+      activeModel: params.activeModel,
+    });
+    const output = result.outputs.find((entry) => entry.kind === "audio.transcription");
+    const transcript = output?.text?.trim();
+    return { transcript: transcript || undefined, attachments };
+  } finally {
+    await cache.cleanup();
+  }
+}
diff --git a/src/media-understanding/echo-transcript.ts b/src/media-understanding/echo-transcript.ts
new file mode 100644
index 00000000000..88764066963
--- /dev/null
+++ b/src/media-understanding/echo-transcript.ts
@@ -0,0 +1,62 @@
+import type { MsgContext } from "../auto-reply/templating.js";
+import type { OpenClawConfig } from "../config/config.js";
+import { logVerbose, shouldLogVerbose } from "../globals.js";
+import { isDeliverableMessageChannel } from "../utils/message-channel.js";
+
+export const DEFAULT_ECHO_TRANSCRIPT_FORMAT = '📝 "{transcript}"';
+
+function formatEchoTranscript(transcript: string, format: string): string {
+  return format.replace("{transcript}", transcript);
+}
+
+/**
+ * Sends the transcript echo back to the originating chat.
+ * Best-effort: logs on failure, never throws.
+ */
+export async function sendTranscriptEcho(params: {
+  ctx: MsgContext;
+  cfg: OpenClawConfig;
+  transcript: string;
+  format?: string;
+}): Promise<void> {
+  const { ctx, cfg, transcript } = params;
+  const channel = ctx.Provider ?? ctx.Surface ?? "";
+  const to = ctx.OriginatingTo ?? ctx.From ?? "";
+
+  if (!channel || !to) {
+    if (shouldLogVerbose()) {
+      logVerbose("media: echo-transcript skipped (no channel/to resolved from ctx)");
+    }
+    return;
+  }
+
+  const normalizedChannel = channel.trim().toLowerCase();
+  if (!isDeliverableMessageChannel(normalizedChannel)) {
+    if (shouldLogVerbose()) {
+      logVerbose(
+        `media: echo-transcript skipped (channel "${String(normalizedChannel)}" is not deliverable)`,
+      );
+    }
+    return;
+  }
+
+  const text = formatEchoTranscript(transcript, params.format ?? DEFAULT_ECHO_TRANSCRIPT_FORMAT);
+
+  try {
+    const { deliverOutboundPayloads } = await import("../infra/outbound/deliver.js");
+    await deliverOutboundPayloads({
+      cfg,
+      channel: normalizedChannel,
+      to,
+      accountId: ctx.AccountId ?? undefined,
+      threadId: ctx.MessageThreadId ?? undefined,
+      payloads: [{ text }],
+      bestEffort: true,
+    });
+    if (shouldLogVerbose()) {
+      logVerbose(`media: echo-transcript sent to ${normalizedChannel}/${to}`);
+    }
+  } catch (err) {
+    logVerbose(`media: echo-transcript delivery failed: ${String(err)}`);
+  }
+}
diff --git a/src/media-understanding/runner.entries.ts b/src/media-understanding/runner.entries.ts
index f2b9be0c099..6b9f0d7922b 100644
--- a/src/media-understanding/runner.entries.ts
+++ b/src/media-understanding/runner.entries.ts
@@ -368,6 +368,16 @@ export function formatDecisionSummary(decision: MediaUnderstandingDecision): str
   return `${decision.capability}: ${decision.outcome}${countLabel}${viaLabel}${reasonLabel}`;
 }
 
+function assertMinAudioSize(params: { size: number; attachmentIndex: number }): void {
+  if (params.size >= MIN_AUDIO_FILE_BYTES) {
+    return;
+  }
+  throw new MediaUnderstandingSkipError(
+    "tooSmall",
+    `Audio attachment ${params.attachmentIndex + 1} is too small (${params.size} bytes, minimum ${MIN_AUDIO_FILE_BYTES})`,
+  );
+}
+
 export async function runProviderEntry(params: {
   capability: MediaUnderstandingCapability;
   entry: MediaUnderstandingModelConfig;
@@ -449,12 +459,7 @@ export async function runProviderEntry(params: {
       maxBytes,
       timeoutMs,
     });
-    if (media.size < MIN_AUDIO_FILE_BYTES) {
-      throw new MediaUnderstandingSkipError(
-        "tooSmall",
-        `Audio attachment ${params.attachmentIndex + 1} is too small (${media.size} bytes, minimum ${MIN_AUDIO_FILE_BYTES})`,
-      );
-    }
+    assertMinAudioSize({ size: media.size, attachmentIndex: params.attachmentIndex });
     const { apiKeys, baseUrl, headers } = await resolveProviderExecutionContext({
       providerId,
       cfg,
@@ -574,12 +579,7 @@ export async function runCliEntry(params: {
   });
   if (capability === "audio") {
     const stat = await fs.stat(pathResult.path);
-    if (stat.size < MIN_AUDIO_FILE_BYTES) {
-      throw new MediaUnderstandingSkipError(
-        "tooSmall",
-        `Audio attachment ${params.attachmentIndex + 1} is too small (${stat.size} bytes, minimum ${MIN_AUDIO_FILE_BYTES})`,
-      );
-    }
+    assertMinAudioSize({ size: stat.size, attachmentIndex: params.attachmentIndex });
   }
   const outputDir = await fs.mkdtemp(
     path.join(resolvePreferredOpenClawTmpDir(), "openclaw-media-cli-"),
diff --git a/src/media-understanding/runner.test-utils.ts b/src/media-understanding/runner.test-utils.ts
index c83d3178255..086418f049d 100644
--- a/src/media-understanding/runner.test-utils.ts
+++ b/src/media-understanding/runner.test-utils.ts
@@ -2,6 +2,7 @@ import fs from "node:fs/promises";
 import os from "node:os";
 import path from "node:path";
 import { withEnvAsync } from "../test-utils/env.js";
+import { MIN_AUDIO_FILE_BYTES } from "./defaults.js";
 import { createMediaAttachmentCache, normalizeMediaAttachments } from "./runner.js";
 
 type MediaFixtureParams = {
@@ -49,12 +50,18 @@ export async function withAudioFixture(
       filePrefix,
       extension: "wav",
       mediaType: "audio/wav",
-      fileContents: Buffer.alloc(2048, 0x52),
+      fileContents: createSafeAudioFixtureBuffer(2048, 0x52),
     },
     run,
   );
 }
 
+export function createSafeAudioFixtureBuffer(size?: number, fill = 0xab): Buffer {
+  const minSafeSize = MIN_AUDIO_FILE_BYTES + 1;
+  const finalSize = Math.max(size ?? minSafeSize, minSafeSize);
+  return Buffer.alloc(finalSize, fill);
+}
+
 export async function withVideoFixture(
   filePrefix: string,
   run: (params: MediaFixtureParams) => Promise<void>,
diff --git a/src/media-understanding/transcribe-audio.test.ts b/src/media-understanding/transcribe-audio.test.ts
index 2851b9d4a4e..8e76cb2b9d7 100644
--- a/src/media-understanding/transcribe-audio.test.ts
+++ b/src/media-understanding/transcribe-audio.test.ts
@@ -1,32 +1,13 @@
 import { beforeEach, describe, expect, it, vi } from "vitest";
 import type { OpenClawConfig } from "../config/config.js";
 
-const {
-  normalizeMediaAttachments,
-  createMediaAttachmentCache,
-  buildProviderRegistry,
-  runCapability,
-  cacheCleanup,
-} = vi.hoisted(() => {
-  const normalizeMediaAttachments = vi.fn();
-  const cacheCleanup = vi.fn(async () => {});
-  const createMediaAttachmentCache = vi.fn(() => ({ cleanup: cacheCleanup }));
-  const buildProviderRegistry = vi.fn(() => new Map());
-  const runCapability = vi.fn();
-  return {
-    normalizeMediaAttachments,
-    createMediaAttachmentCache,
-    buildProviderRegistry,
-    runCapability,
-    cacheCleanup,
-  };
+const { runAudioTranscription } = vi.hoisted(() => {
+  const runAudioTranscription = vi.fn();
+  return { runAudioTranscription };
 });
 
-vi.mock("./runner.js", () => ({
-  normalizeMediaAttachments,
-  createMediaAttachmentCache,
-  buildProviderRegistry,
-  runCapability,
+vi.mock("./audio-transcription-runner.js", () => ({
+  runAudioTranscription,
 }));
 
 import { transcribeAudioFile } from "./transcribe-audio.js";
@@ -34,30 +15,29 @@ import { transcribeAudioFile } from "./transcribe-audio.js";
 describe("transcribeAudioFile", () => {
   beforeEach(() => {
     vi.clearAllMocks();
-    cacheCleanup.mockResolvedValue(undefined);
   });
 
   it("does not force audio/wav when mime is omitted", async () => {
-    normalizeMediaAttachments.mockReturnValue([{ index: 0, path: "/tmp/note.mp3" }]);
-    runCapability.mockResolvedValue({
-      outputs: [{ kind: "audio.transcription", text: "  hello  " }],
-    });
+    runAudioTranscription.mockResolvedValue({ transcript: "hello", attachments: [] });
 
     const result = await transcribeAudioFile({
       filePath: "/tmp/note.mp3",
       cfg: {} as OpenClawConfig,
     });
 
-    expect(normalizeMediaAttachments).toHaveBeenCalledWith({
-      MediaPath: "/tmp/note.mp3",
-      MediaType: undefined,
+    expect(runAudioTranscription).toHaveBeenCalledWith({
+      ctx: {
+        MediaPath: "/tmp/note.mp3",
+        MediaType: undefined,
+      },
+      cfg: {} as OpenClawConfig,
+      agentDir: undefined,
     });
     expect(result).toEqual({ text: "hello" });
-    expect(cacheCleanup).toHaveBeenCalledTimes(1);
   });
 
-  it("returns undefined and skips cache when there are no attachments", async () => {
-    normalizeMediaAttachments.mockReturnValue([]);
+  it("returns undefined when helper returns no transcript", async () => {
+    runAudioTranscription.mockResolvedValue({ transcript: undefined, attachments: [] });
 
     const result = await transcribeAudioFile({
       filePath: "/tmp/missing.wav",
@@ -65,16 +45,13 @@ describe("transcribeAudioFile", () => {
     });
 
     expect(result).toEqual({ text: undefined });
-    expect(createMediaAttachmentCache).not.toHaveBeenCalled();
-    expect(runCapability).not.toHaveBeenCalled();
   });
 
-  it("always cleans up cache on errors", async () => {
+  it("propagates helper errors", async () => {
     const cfg = {
       tools: { media: { audio: { timeoutSeconds: 10 } } },
     } as unknown as OpenClawConfig;
-    normalizeMediaAttachments.mockReturnValue([{ index: 0, path: "/tmp/note.wav" }]);
-    runCapability.mockRejectedValue(new Error("boom"));
+    runAudioTranscription.mockRejectedValue(new Error("boom"));
 
     await expect(
       transcribeAudioFile({
@@ -82,14 +59,5 @@ describe("transcribeAudioFile", () => {
         cfg,
       }),
     ).rejects.toThrow("boom");
-
-    expect(runCapability).toHaveBeenCalledWith(
-      expect.objectContaining({
-        capability: "audio",
-        cfg,
-        config: cfg.tools?.media?.audio,
-      }),
-    );
-    expect(cacheCleanup).toHaveBeenCalledTimes(1);
   });
 });
diff --git a/src/media-understanding/transcribe-audio.ts b/src/media-understanding/transcribe-audio.ts
index 463e90608fa..b2840c80ea3 100644
--- a/src/media-understanding/transcribe-audio.ts
+++ b/src/media-understanding/transcribe-audio.ts
@@ -1,10 +1,5 @@
 import type { OpenClawConfig } from "../config/config.js";
-import {
-  buildProviderRegistry,
-  createMediaAttachmentCache,
-  normalizeMediaAttachments,
-  runCapability,
-} from "./runner.js";
+import { runAudioTranscription } from "./audio-transcription-runner.js";
 
 /**
  * Transcribe an audio file using the configured media-understanding provider.
@@ -25,27 +20,10 @@ export async function transcribeAudioFile(params: {
     MediaPath: params.filePath,
     MediaType: params.mime,
   };
-  const attachments = normalizeMediaAttachments(ctx);
-  if (attachments.length === 0) {
-    return { text: undefined };
-  }
-  const cache = createMediaAttachmentCache(attachments);
-  const providerRegistry = buildProviderRegistry();
-  try {
-    const result = await runCapability({
-      capability: "audio",
-      cfg: params.cfg,
-      ctx,
-      attachments: cache,
-      media: attachments,
-      agentDir: params.agentDir,
-      providerRegistry,
-      config: params.cfg.tools?.media?.audio,
-    });
-    const output = result.outputs.find((entry) => entry.kind === "audio.transcription");
-    const text = output?.text?.trim();
-    return { text: text || undefined };
-  } finally {
-    await cache.cleanup();
-  }
+  const { transcript } = await runAudioTranscription({
+    ctx,
+    cfg: params.cfg,
+    agentDir: params.agentDir,
+  });
+  return { text: transcript };
 }