fix: deliver tool result media when verbose is off (#16679)

Merged via /review-pr -> /prepare-pr -> /merge-pr. Prepared head SHA: 6e16feb164 Co-authored-by: christianklotz <69443+christianklotz@users.noreply.github.com> Co-authored-by: christianklotz <69443+christianklotz@users.noreply.github.com> Reviewed-by: @christianklotz
2026-05-06 05:20:43 +00:00 · 2026-02-15 02:18:57 +00:00
parent 906c32da12
commit 68c78c4b43
6 changed files with 393 additions and 0 deletions
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -14,6 +14,7 @@ Docs: https://docs.openclaw.ai

 ### Fixes

+- Agents: deliver tool result media (screenshots, images, audio) to channels regardless of verbose level. (#11735) Thanks @strelov1.
 - Telegram: when `channels.telegram.commands.native` is `false`, exclude plugin commands from `setMyCommands` menu registration while keeping plugin slash handlers callable. (#15132) Thanks @Glucksberg.
 - LINE: return 200 OK for Developers Console "Verify" requests (`{"events":[]}`) without `X-Line-Signature`, while still requiring signatures for real deliveries. (#16582) Thanks @arosstale.
 - Cron: deliver text-only output directly when `delivery.to` is set so cron recipients get full output instead of summaries. (#16360) Thanks @thewilloftheshadow.
--- a/src/agents/pi-embedded-subscribe.handlers.tools.media.test.ts
+++ b/src/agents/pi-embedded-subscribe.handlers.tools.media.test.ts
@@ -0,0 +1,174 @@
+import { describe, expect, it, vi } from "vitest";
+import type { EmbeddedPiSubscribeContext } from "./pi-embedded-subscribe.handlers.types.js";
+import { handleToolExecutionEnd } from "./pi-embedded-subscribe.handlers.tools.js";
+
+// Minimal mock context factory. Only the fields needed for the media emission path.
+function createMockContext(overrides?: {
+  shouldEmitToolOutput?: boolean;
+  onToolResult?: ReturnType<typeof vi.fn>;
+}): EmbeddedPiSubscribeContext {
+  const onToolResult = overrides?.onToolResult ?? vi.fn();
+  return {
+    params: {
+      runId: "test-run",
+      onToolResult,
+      onAgentEvent: vi.fn(),
+    },
+    state: {
+      toolMetaById: new Map(),
+      toolMetas: [],
+      toolSummaryById: new Set(),
+      pendingMessagingTexts: new Map(),
+      pendingMessagingTargets: new Map(),
+      messagingToolSentTexts: [],
+      messagingToolSentTextsNormalized: [],
+      messagingToolSentTargets: [],
+    },
+    log: { debug: vi.fn(), warn: vi.fn() },
+    shouldEmitToolResult: vi.fn(() => false),
+    shouldEmitToolOutput: vi.fn(() => overrides?.shouldEmitToolOutput ?? false),
+    emitToolSummary: vi.fn(),
+    emitToolOutput: vi.fn(),
+    trimMessagingToolSent: vi.fn(),
+    hookRunner: undefined,
+    // Fill in remaining required fields with no-ops.
+    blockChunker: null,
+    noteLastAssistant: vi.fn(),
+    stripBlockTags: vi.fn((t: string) => t),
+    emitBlockChunk: vi.fn(),
+    flushBlockReplyBuffer: vi.fn(),
+    emitReasoningStream: vi.fn(),
+    consumeReplyDirectives: vi.fn(() => null),
+    consumePartialReplyDirectives: vi.fn(() => null),
+    resetAssistantMessageState: vi.fn(),
+    resetForCompactionRetry: vi.fn(),
+    finalizeAssistantTexts: vi.fn(),
+    ensureCompactionPromise: vi.fn(),
+    noteCompactionRetry: vi.fn(),
+    resolveCompactionRetry: vi.fn(),
+    maybeResolveCompactionWait: vi.fn(),
+    recordAssistantUsage: vi.fn(),
+    incrementCompactionCount: vi.fn(),
+    getUsageTotals: vi.fn(() => undefined),
+    getCompactionCount: vi.fn(() => 0),
+  } as unknown as EmbeddedPiSubscribeContext;
+}
+
+describe("handleToolExecutionEnd media emission", () => {
+  it("emits media when verbose is off and tool result has MEDIA: path", async () => {
+    const onToolResult = vi.fn();
+    const ctx = createMockContext({ shouldEmitToolOutput: false, onToolResult });
+
+    await handleToolExecutionEnd(ctx, {
+      type: "tool_execution_end",
+      toolName: "browser",
+      toolCallId: "tc-1",
+      isError: false,
+      result: {
+        content: [
+          { type: "text", text: "MEDIA:/tmp/screenshot.png" },
+          { type: "image", data: "base64", mimeType: "image/png" },
+        ],
+        details: { path: "/tmp/screenshot.png" },
+      },
+    });
+
+    expect(onToolResult).toHaveBeenCalledWith({
+      mediaUrls: ["/tmp/screenshot.png"],
+    });
+  });
+
+  it("does NOT emit media when verbose is full (emitToolOutput handles it)", async () => {
+    const onToolResult = vi.fn();
+    const ctx = createMockContext({ shouldEmitToolOutput: true, onToolResult });
+
+    await handleToolExecutionEnd(ctx, {
+      type: "tool_execution_end",
+      toolName: "browser",
+      toolCallId: "tc-1",
+      isError: false,
+      result: {
+        content: [
+          { type: "text", text: "MEDIA:/tmp/screenshot.png" },
+          { type: "image", data: "base64", mimeType: "image/png" },
+        ],
+        details: { path: "/tmp/screenshot.png" },
+      },
+    });
+
+    // onToolResult should NOT be called by the new media path (emitToolOutput handles it).
+    // It may be called by emitToolOutput, but the new block should not fire.
+    // Verify emitToolOutput was called instead.
+    expect(ctx.emitToolOutput).toHaveBeenCalled();
+    // The direct media emission should not have been called with just mediaUrls.
+    const directMediaCalls = onToolResult.mock.calls.filter(
+      (call: unknown[]) =>
+        call[0] &&
+        typeof call[0] === "object" &&
+        "mediaUrls" in (call[0] as Record<string, unknown>) &&
+        !("text" in (call[0] as Record<string, unknown>)),
+    );
+    expect(directMediaCalls).toHaveLength(0);
+  });
+
+  it("does NOT emit media for error results", async () => {
+    const onToolResult = vi.fn();
+    const ctx = createMockContext({ shouldEmitToolOutput: false, onToolResult });
+
+    await handleToolExecutionEnd(ctx, {
+      type: "tool_execution_end",
+      toolName: "browser",
+      toolCallId: "tc-1",
+      isError: true,
+      result: {
+        content: [
+          { type: "text", text: "MEDIA:/tmp/screenshot.png" },
+          { type: "image", data: "base64", mimeType: "image/png" },
+        ],
+        details: { path: "/tmp/screenshot.png" },
+      },
+    });
+
+    expect(onToolResult).not.toHaveBeenCalled();
+  });
+
+  it("does NOT emit when tool result has no media", async () => {
+    const onToolResult = vi.fn();
+    const ctx = createMockContext({ shouldEmitToolOutput: false, onToolResult });
+
+    await handleToolExecutionEnd(ctx, {
+      type: "tool_execution_end",
+      toolName: "bash",
+      toolCallId: "tc-1",
+      isError: false,
+      result: {
+        content: [{ type: "text", text: "Command executed successfully" }],
+      },
+    });
+
+    expect(onToolResult).not.toHaveBeenCalled();
+  });
+
+  it("emits media from details.path fallback when no MEDIA: text", async () => {
+    const onToolResult = vi.fn();
+    const ctx = createMockContext({ shouldEmitToolOutput: false, onToolResult });
+
+    await handleToolExecutionEnd(ctx, {
+      type: "tool_execution_end",
+      toolName: "canvas",
+      toolCallId: "tc-1",
+      isError: false,
+      result: {
+        content: [
+          { type: "text", text: "Rendered canvas" },
+          { type: "image", data: "base64", mimeType: "image/png" },
+        ],
+        details: { path: "/tmp/canvas-output.png" },
+      },
+    });
+
+    expect(onToolResult).toHaveBeenCalledWith({
+      mediaUrls: ["/tmp/canvas-output.png"],
+    });
+  });
+});
--- a/src/agents/pi-embedded-subscribe.handlers.tools.ts
+++ b/src/agents/pi-embedded-subscribe.handlers.tools.ts
@@ -10,6 +10,7 @@ import { normalizeTextForComparison } from "./pi-embedded-helpers.js";
 import { isMessagingTool, isMessagingToolSendAction } from "./pi-embedded-messaging.js";
 import {
  extractToolErrorMessage,
+  extractToolResultMediaPaths,
  extractToolResultText,
  extractMessagingToolSend,
  isToolResultError,
@@ -266,6 +267,20 @@ export async function handleToolExecutionEnd(
    }
  }

+  // Deliver media from tool results when the verbose emitToolOutput path is off.
+  // When shouldEmitToolOutput() is true, emitToolOutput already delivers media
+  // via parseReplyDirectives (MEDIA: text extraction), so skip to avoid duplicates.
+  if (ctx.params.onToolResult && !isToolError && !ctx.shouldEmitToolOutput()) {
+    const mediaPaths = extractToolResultMediaPaths(result);
+    if (mediaPaths.length > 0) {
+      try {
+        void ctx.params.onToolResult({ mediaUrls: mediaPaths });
+      } catch {
+        // ignore delivery failures
+      }
+    }
+  }
+
  // Run after_tool_call plugin hook (fire-and-forget)
  const hookRunnerAfter = ctx.hookRunner ?? getGlobalHookRunner();
  if (hookRunnerAfter?.hasHooks("after_tool_call")) {
--- a/src/agents/pi-embedded-subscribe.tools.media.test.ts
+++ b/src/agents/pi-embedded-subscribe.tools.media.test.ts
@@ -0,0 +1,132 @@
+import { describe, expect, it } from "vitest";
+import { extractToolResultMediaPaths } from "./pi-embedded-subscribe.tools.js";
+
+describe("extractToolResultMediaPaths", () => {
+  it("returns empty array for null/undefined", () => {
+    expect(extractToolResultMediaPaths(null)).toEqual([]);
+    expect(extractToolResultMediaPaths(undefined)).toEqual([]);
+  });
+
+  it("returns empty array for non-object", () => {
+    expect(extractToolResultMediaPaths("hello")).toEqual([]);
+    expect(extractToolResultMediaPaths(42)).toEqual([]);
+  });
+
+  it("returns empty array when content is missing", () => {
+    expect(extractToolResultMediaPaths({ details: { path: "/tmp/img.png" } })).toEqual([]);
+  });
+
+  it("returns empty array when content has no text or image blocks", () => {
+    expect(extractToolResultMediaPaths({ content: [{ type: "other" }] })).toEqual([]);
+  });
+
+  it("extracts MEDIA: path from text content block", () => {
+    const result = {
+      content: [
+        { type: "text", text: "MEDIA:/tmp/screenshot.png" },
+        { type: "image", data: "base64data", mimeType: "image/png" },
+      ],
+      details: { path: "/tmp/screenshot.png" },
+    };
+    expect(extractToolResultMediaPaths(result)).toEqual(["/tmp/screenshot.png"]);
+  });
+
+  it("extracts MEDIA: path with extra text in the block", () => {
+    const result = {
+      content: [{ type: "text", text: "Here is the image\nMEDIA:/tmp/output.jpg\nDone" }],
+    };
+    expect(extractToolResultMediaPaths(result)).toEqual(["/tmp/output.jpg"]);
+  });
+
+  it("extracts multiple MEDIA: paths from different text blocks", () => {
+    const result = {
+      content: [
+        { type: "text", text: "MEDIA:/tmp/page1.png" },
+        { type: "text", text: "MEDIA:/tmp/page2.png" },
+      ],
+    };
+    expect(extractToolResultMediaPaths(result)).toEqual(["/tmp/page1.png", "/tmp/page2.png"]);
+  });
+
+  it("falls back to details.path when image content exists but no MEDIA: text", () => {
+    // Pi SDK read tool doesn't include MEDIA: but OpenClaw imageResult
+    // sets details.path as fallback.
+    const result = {
+      content: [
+        { type: "text", text: "Read image file [image/png]" },
+        { type: "image", data: "base64data", mimeType: "image/png" },
+      ],
+      details: { path: "/tmp/generated.png" },
+    };
+    expect(extractToolResultMediaPaths(result)).toEqual(["/tmp/generated.png"]);
+  });
+
+  it("returns empty array when image content exists but no MEDIA: and no details.path", () => {
+    // Pi SDK read tool: has image content but no path anywhere in the result.
+    const result = {
+      content: [
+        { type: "text", text: "Read image file [image/png]" },
+        { type: "image", data: "base64data", mimeType: "image/png" },
+      ],
+    };
+    expect(extractToolResultMediaPaths(result)).toEqual([]);
+  });
+
+  it("does not fall back to details.path when MEDIA: paths are found", () => {
+    const result = {
+      content: [
+        { type: "text", text: "MEDIA:/tmp/from-text.png" },
+        { type: "image", data: "base64data", mimeType: "image/png" },
+      ],
+      details: { path: "/tmp/from-details.png" },
+    };
+    // MEDIA: text takes priority; details.path is NOT also included.
+    expect(extractToolResultMediaPaths(result)).toEqual(["/tmp/from-text.png"]);
+  });
+
+  it("handles backtick-wrapped MEDIA: paths", () => {
+    const result = {
+      content: [{ type: "text", text: "MEDIA: `/tmp/screenshot.png`" }],
+    };
+    expect(extractToolResultMediaPaths(result)).toEqual(["/tmp/screenshot.png"]);
+  });
+
+  it("ignores null/undefined items in content array", () => {
+    const result = {
+      content: [null, undefined, { type: "text", text: "MEDIA:/tmp/ok.png" }],
+    };
+    expect(extractToolResultMediaPaths(result)).toEqual(["/tmp/ok.png"]);
+  });
+
+  it("returns empty array for text-only results without MEDIA:", () => {
+    const result = {
+      content: [{ type: "text", text: "Command executed successfully" }],
+    };
+    expect(extractToolResultMediaPaths(result)).toEqual([]);
+  });
+
+  it("ignores details.path when no image content exists", () => {
+    // details.path without image content is not media.
+    const result = {
+      content: [{ type: "text", text: "File saved" }],
+      details: { path: "/tmp/data.json" },
+    };
+    expect(extractToolResultMediaPaths(result)).toEqual([]);
+  });
+
+  it("handles details.path with whitespace", () => {
+    const result = {
+      content: [{ type: "image", data: "base64", mimeType: "image/png" }],
+      details: { path: "  /tmp/image.png  " },
+    };
+    expect(extractToolResultMediaPaths(result)).toEqual(["/tmp/image.png"]);
+  });
+
+  it("skips empty details.path", () => {
+    const result = {
+      content: [{ type: "image", data: "base64", mimeType: "image/png" }],
+      details: { path: "   " },
+    };
+    expect(extractToolResultMediaPaths(result)).toEqual([]);
+  });
+});
--- a/src/agents/pi-embedded-subscribe.tools.ts
+++ b/src/agents/pi-embedded-subscribe.tools.ts
@@ -1,5 +1,6 @@
 import { getChannelPlugin, normalizeChannelId } from "../channels/plugins/index.js";
 import { normalizeTargetForProvider } from "../infra/outbound/target-normalization.js";
+import { MEDIA_TOKEN_RE } from "../media/parse.js";
 import { truncateUtf16Safe } from "../utils.js";
 import { type MessagingToolSend } from "./pi-embedded-messaging.js";

@@ -118,6 +119,72 @@ export function extractToolResultText(result: unknown): string | undefined {
  return texts.join("\n");
 }

+/**
+ * Extract media file paths from a tool result.
+ *
+ * Strategy (first match wins):
+ * 1. Parse `MEDIA:` tokens from text content blocks (all OpenClaw tools).
+ * 2. Fall back to `details.path` when image content exists (OpenClaw imageResult).
+ *
+ * Returns an empty array when no media is found (e.g. Pi SDK `read` tool
+ * returns base64 image data but no file path; those need a different delivery
+ * path like saving to a temp file).
+ */
+export function extractToolResultMediaPaths(result: unknown): string[] {
+  if (!result || typeof result !== "object") {
+    return [];
+  }
+  const record = result as Record<string, unknown>;
+  const content = Array.isArray(record.content) ? record.content : null;
+  if (!content) {
+    return [];
+  }
+
+  // Extract MEDIA: paths from text content blocks.
+  const paths: string[] = [];
+  let hasImageContent = false;
+  for (const item of content) {
+    if (!item || typeof item !== "object") {
+      continue;
+    }
+    const entry = item as Record<string, unknown>;
+    if (entry.type === "image") {
+      hasImageContent = true;
+      continue;
+    }
+    if (entry.type === "text" && typeof entry.text === "string") {
+      // Reset lastIndex since MEDIA_TOKEN_RE is global.
+      MEDIA_TOKEN_RE.lastIndex = 0;
+      let match: RegExpExecArray | null;
+      while ((match = MEDIA_TOKEN_RE.exec(entry.text)) !== null) {
+        // Strip surrounding quotes/backticks and whitespace (mirrors cleanCandidate in media/parse).
+        const p = match[1]
+          ?.replace(/^[`"'[{(]+/, "")
+          .replace(/[`"'\]})\\,]+$/, "")
+          .trim();
+        if (p && p.length <= 4096) {
+          paths.push(p);
+        }
+      }
+    }
+  }
+
+  if (paths.length > 0) {
+    return paths;
+  }
+
+  // Fall back to details.path when image content exists but no MEDIA: text.
+  if (hasImageContent) {
+    const details = record.details as Record<string, unknown> | undefined;
+    const p = typeof details?.path === "string" ? details.path.trim() : "";
+    if (p) {
+      return [p];
+    }
+  }
+
+  return [];
+}
+
 export function isToolResultError(result: unknown): boolean {
  if (!result || typeof result !== "object") {
    return false;
--- a/src/auto-reply/reply/agent-runner-execution.ts
+++ b/src/auto-reply/reply/agent-runner-execution.ts
@@ -128,6 +128,10 @@ export async function runAgentTurnWithFallback(params: {
          return { skip: true };
        }
        if (!text) {
+          // Allow media-only payloads (e.g. tool result screenshots) through.
+          if ((payload.mediaUrls?.length ?? 0) > 0) {
+            return { text: undefined, skip: false };
+          }
          return { skip: true };
        }
        const sanitized = sanitizeUserFacingText(text, {