diff --git a/CHANGELOG.md b/CHANGELOG.md index 3b5a4bcce05..bfe77f9b087 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -14,6 +14,7 @@ Docs: https://docs.openclaw.ai ### Fixes +- Agents: deliver tool result media (screenshots, images, audio) to channels regardless of verbose level. (#11735) Thanks @strelov1. - Telegram: when `channels.telegram.commands.native` is `false`, exclude plugin commands from `setMyCommands` menu registration while keeping plugin slash handlers callable. (#15132) Thanks @Glucksberg. - LINE: return 200 OK for Developers Console "Verify" requests (`{"events":[]}`) without `X-Line-Signature`, while still requiring signatures for real deliveries. (#16582) Thanks @arosstale. - Cron: deliver text-only output directly when `delivery.to` is set so cron recipients get full output instead of summaries. (#16360) Thanks @thewilloftheshadow. diff --git a/src/agents/pi-embedded-subscribe.handlers.tools.media.test.ts b/src/agents/pi-embedded-subscribe.handlers.tools.media.test.ts new file mode 100644 index 00000000000..51ff0038ca4 --- /dev/null +++ b/src/agents/pi-embedded-subscribe.handlers.tools.media.test.ts @@ -0,0 +1,174 @@ +import { describe, expect, it, vi } from "vitest"; +import type { EmbeddedPiSubscribeContext } from "./pi-embedded-subscribe.handlers.types.js"; +import { handleToolExecutionEnd } from "./pi-embedded-subscribe.handlers.tools.js"; + +// Minimal mock context factory. Only the fields needed for the media emission path. +function createMockContext(overrides?: { + shouldEmitToolOutput?: boolean; + onToolResult?: ReturnType; +}): EmbeddedPiSubscribeContext { + const onToolResult = overrides?.onToolResult ?? vi.fn(); + return { + params: { + runId: "test-run", + onToolResult, + onAgentEvent: vi.fn(), + }, + state: { + toolMetaById: new Map(), + toolMetas: [], + toolSummaryById: new Set(), + pendingMessagingTexts: new Map(), + pendingMessagingTargets: new Map(), + messagingToolSentTexts: [], + messagingToolSentTextsNormalized: [], + messagingToolSentTargets: [], + }, + log: { debug: vi.fn(), warn: vi.fn() }, + shouldEmitToolResult: vi.fn(() => false), + shouldEmitToolOutput: vi.fn(() => overrides?.shouldEmitToolOutput ?? false), + emitToolSummary: vi.fn(), + emitToolOutput: vi.fn(), + trimMessagingToolSent: vi.fn(), + hookRunner: undefined, + // Fill in remaining required fields with no-ops. + blockChunker: null, + noteLastAssistant: vi.fn(), + stripBlockTags: vi.fn((t: string) => t), + emitBlockChunk: vi.fn(), + flushBlockReplyBuffer: vi.fn(), + emitReasoningStream: vi.fn(), + consumeReplyDirectives: vi.fn(() => null), + consumePartialReplyDirectives: vi.fn(() => null), + resetAssistantMessageState: vi.fn(), + resetForCompactionRetry: vi.fn(), + finalizeAssistantTexts: vi.fn(), + ensureCompactionPromise: vi.fn(), + noteCompactionRetry: vi.fn(), + resolveCompactionRetry: vi.fn(), + maybeResolveCompactionWait: vi.fn(), + recordAssistantUsage: vi.fn(), + incrementCompactionCount: vi.fn(), + getUsageTotals: vi.fn(() => undefined), + getCompactionCount: vi.fn(() => 0), + } as unknown as EmbeddedPiSubscribeContext; +} + +describe("handleToolExecutionEnd media emission", () => { + it("emits media when verbose is off and tool result has MEDIA: path", async () => { + const onToolResult = vi.fn(); + const ctx = createMockContext({ shouldEmitToolOutput: false, onToolResult }); + + await handleToolExecutionEnd(ctx, { + type: "tool_execution_end", + toolName: "browser", + toolCallId: "tc-1", + isError: false, + result: { + content: [ + { type: "text", text: "MEDIA:/tmp/screenshot.png" }, + { type: "image", data: "base64", mimeType: "image/png" }, + ], + details: { path: "/tmp/screenshot.png" }, + }, + }); + + expect(onToolResult).toHaveBeenCalledWith({ + mediaUrls: ["/tmp/screenshot.png"], + }); + }); + + it("does NOT emit media when verbose is full (emitToolOutput handles it)", async () => { + const onToolResult = vi.fn(); + const ctx = createMockContext({ shouldEmitToolOutput: true, onToolResult }); + + await handleToolExecutionEnd(ctx, { + type: "tool_execution_end", + toolName: "browser", + toolCallId: "tc-1", + isError: false, + result: { + content: [ + { type: "text", text: "MEDIA:/tmp/screenshot.png" }, + { type: "image", data: "base64", mimeType: "image/png" }, + ], + details: { path: "/tmp/screenshot.png" }, + }, + }); + + // onToolResult should NOT be called by the new media path (emitToolOutput handles it). + // It may be called by emitToolOutput, but the new block should not fire. + // Verify emitToolOutput was called instead. + expect(ctx.emitToolOutput).toHaveBeenCalled(); + // The direct media emission should not have been called with just mediaUrls. + const directMediaCalls = onToolResult.mock.calls.filter( + (call: unknown[]) => + call[0] && + typeof call[0] === "object" && + "mediaUrls" in (call[0] as Record) && + !("text" in (call[0] as Record)), + ); + expect(directMediaCalls).toHaveLength(0); + }); + + it("does NOT emit media for error results", async () => { + const onToolResult = vi.fn(); + const ctx = createMockContext({ shouldEmitToolOutput: false, onToolResult }); + + await handleToolExecutionEnd(ctx, { + type: "tool_execution_end", + toolName: "browser", + toolCallId: "tc-1", + isError: true, + result: { + content: [ + { type: "text", text: "MEDIA:/tmp/screenshot.png" }, + { type: "image", data: "base64", mimeType: "image/png" }, + ], + details: { path: "/tmp/screenshot.png" }, + }, + }); + + expect(onToolResult).not.toHaveBeenCalled(); + }); + + it("does NOT emit when tool result has no media", async () => { + const onToolResult = vi.fn(); + const ctx = createMockContext({ shouldEmitToolOutput: false, onToolResult }); + + await handleToolExecutionEnd(ctx, { + type: "tool_execution_end", + toolName: "bash", + toolCallId: "tc-1", + isError: false, + result: { + content: [{ type: "text", text: "Command executed successfully" }], + }, + }); + + expect(onToolResult).not.toHaveBeenCalled(); + }); + + it("emits media from details.path fallback when no MEDIA: text", async () => { + const onToolResult = vi.fn(); + const ctx = createMockContext({ shouldEmitToolOutput: false, onToolResult }); + + await handleToolExecutionEnd(ctx, { + type: "tool_execution_end", + toolName: "canvas", + toolCallId: "tc-1", + isError: false, + result: { + content: [ + { type: "text", text: "Rendered canvas" }, + { type: "image", data: "base64", mimeType: "image/png" }, + ], + details: { path: "/tmp/canvas-output.png" }, + }, + }); + + expect(onToolResult).toHaveBeenCalledWith({ + mediaUrls: ["/tmp/canvas-output.png"], + }); + }); +}); diff --git a/src/agents/pi-embedded-subscribe.handlers.tools.ts b/src/agents/pi-embedded-subscribe.handlers.tools.ts index a733e4f147b..07155d79170 100644 --- a/src/agents/pi-embedded-subscribe.handlers.tools.ts +++ b/src/agents/pi-embedded-subscribe.handlers.tools.ts @@ -10,6 +10,7 @@ import { normalizeTextForComparison } from "./pi-embedded-helpers.js"; import { isMessagingTool, isMessagingToolSendAction } from "./pi-embedded-messaging.js"; import { extractToolErrorMessage, + extractToolResultMediaPaths, extractToolResultText, extractMessagingToolSend, isToolResultError, @@ -266,6 +267,20 @@ export async function handleToolExecutionEnd( } } + // Deliver media from tool results when the verbose emitToolOutput path is off. + // When shouldEmitToolOutput() is true, emitToolOutput already delivers media + // via parseReplyDirectives (MEDIA: text extraction), so skip to avoid duplicates. + if (ctx.params.onToolResult && !isToolError && !ctx.shouldEmitToolOutput()) { + const mediaPaths = extractToolResultMediaPaths(result); + if (mediaPaths.length > 0) { + try { + void ctx.params.onToolResult({ mediaUrls: mediaPaths }); + } catch { + // ignore delivery failures + } + } + } + // Run after_tool_call plugin hook (fire-and-forget) const hookRunnerAfter = ctx.hookRunner ?? getGlobalHookRunner(); if (hookRunnerAfter?.hasHooks("after_tool_call")) { diff --git a/src/agents/pi-embedded-subscribe.tools.media.test.ts b/src/agents/pi-embedded-subscribe.tools.media.test.ts new file mode 100644 index 00000000000..f51e1e14521 --- /dev/null +++ b/src/agents/pi-embedded-subscribe.tools.media.test.ts @@ -0,0 +1,132 @@ +import { describe, expect, it } from "vitest"; +import { extractToolResultMediaPaths } from "./pi-embedded-subscribe.tools.js"; + +describe("extractToolResultMediaPaths", () => { + it("returns empty array for null/undefined", () => { + expect(extractToolResultMediaPaths(null)).toEqual([]); + expect(extractToolResultMediaPaths(undefined)).toEqual([]); + }); + + it("returns empty array for non-object", () => { + expect(extractToolResultMediaPaths("hello")).toEqual([]); + expect(extractToolResultMediaPaths(42)).toEqual([]); + }); + + it("returns empty array when content is missing", () => { + expect(extractToolResultMediaPaths({ details: { path: "/tmp/img.png" } })).toEqual([]); + }); + + it("returns empty array when content has no text or image blocks", () => { + expect(extractToolResultMediaPaths({ content: [{ type: "other" }] })).toEqual([]); + }); + + it("extracts MEDIA: path from text content block", () => { + const result = { + content: [ + { type: "text", text: "MEDIA:/tmp/screenshot.png" }, + { type: "image", data: "base64data", mimeType: "image/png" }, + ], + details: { path: "/tmp/screenshot.png" }, + }; + expect(extractToolResultMediaPaths(result)).toEqual(["/tmp/screenshot.png"]); + }); + + it("extracts MEDIA: path with extra text in the block", () => { + const result = { + content: [{ type: "text", text: "Here is the image\nMEDIA:/tmp/output.jpg\nDone" }], + }; + expect(extractToolResultMediaPaths(result)).toEqual(["/tmp/output.jpg"]); + }); + + it("extracts multiple MEDIA: paths from different text blocks", () => { + const result = { + content: [ + { type: "text", text: "MEDIA:/tmp/page1.png" }, + { type: "text", text: "MEDIA:/tmp/page2.png" }, + ], + }; + expect(extractToolResultMediaPaths(result)).toEqual(["/tmp/page1.png", "/tmp/page2.png"]); + }); + + it("falls back to details.path when image content exists but no MEDIA: text", () => { + // Pi SDK read tool doesn't include MEDIA: but OpenClaw imageResult + // sets details.path as fallback. + const result = { + content: [ + { type: "text", text: "Read image file [image/png]" }, + { type: "image", data: "base64data", mimeType: "image/png" }, + ], + details: { path: "/tmp/generated.png" }, + }; + expect(extractToolResultMediaPaths(result)).toEqual(["/tmp/generated.png"]); + }); + + it("returns empty array when image content exists but no MEDIA: and no details.path", () => { + // Pi SDK read tool: has image content but no path anywhere in the result. + const result = { + content: [ + { type: "text", text: "Read image file [image/png]" }, + { type: "image", data: "base64data", mimeType: "image/png" }, + ], + }; + expect(extractToolResultMediaPaths(result)).toEqual([]); + }); + + it("does not fall back to details.path when MEDIA: paths are found", () => { + const result = { + content: [ + { type: "text", text: "MEDIA:/tmp/from-text.png" }, + { type: "image", data: "base64data", mimeType: "image/png" }, + ], + details: { path: "/tmp/from-details.png" }, + }; + // MEDIA: text takes priority; details.path is NOT also included. + expect(extractToolResultMediaPaths(result)).toEqual(["/tmp/from-text.png"]); + }); + + it("handles backtick-wrapped MEDIA: paths", () => { + const result = { + content: [{ type: "text", text: "MEDIA: `/tmp/screenshot.png`" }], + }; + expect(extractToolResultMediaPaths(result)).toEqual(["/tmp/screenshot.png"]); + }); + + it("ignores null/undefined items in content array", () => { + const result = { + content: [null, undefined, { type: "text", text: "MEDIA:/tmp/ok.png" }], + }; + expect(extractToolResultMediaPaths(result)).toEqual(["/tmp/ok.png"]); + }); + + it("returns empty array for text-only results without MEDIA:", () => { + const result = { + content: [{ type: "text", text: "Command executed successfully" }], + }; + expect(extractToolResultMediaPaths(result)).toEqual([]); + }); + + it("ignores details.path when no image content exists", () => { + // details.path without image content is not media. + const result = { + content: [{ type: "text", text: "File saved" }], + details: { path: "/tmp/data.json" }, + }; + expect(extractToolResultMediaPaths(result)).toEqual([]); + }); + + it("handles details.path with whitespace", () => { + const result = { + content: [{ type: "image", data: "base64", mimeType: "image/png" }], + details: { path: " /tmp/image.png " }, + }; + expect(extractToolResultMediaPaths(result)).toEqual(["/tmp/image.png"]); + }); + + it("skips empty details.path", () => { + const result = { + content: [{ type: "image", data: "base64", mimeType: "image/png" }], + details: { path: " " }, + }; + expect(extractToolResultMediaPaths(result)).toEqual([]); + }); +}); diff --git a/src/agents/pi-embedded-subscribe.tools.ts b/src/agents/pi-embedded-subscribe.tools.ts index d5fe8aaf9ea..a4679183544 100644 --- a/src/agents/pi-embedded-subscribe.tools.ts +++ b/src/agents/pi-embedded-subscribe.tools.ts @@ -1,5 +1,6 @@ import { getChannelPlugin, normalizeChannelId } from "../channels/plugins/index.js"; import { normalizeTargetForProvider } from "../infra/outbound/target-normalization.js"; +import { MEDIA_TOKEN_RE } from "../media/parse.js"; import { truncateUtf16Safe } from "../utils.js"; import { type MessagingToolSend } from "./pi-embedded-messaging.js"; @@ -118,6 +119,72 @@ export function extractToolResultText(result: unknown): string | undefined { return texts.join("\n"); } +/** + * Extract media file paths from a tool result. + * + * Strategy (first match wins): + * 1. Parse `MEDIA:` tokens from text content blocks (all OpenClaw tools). + * 2. Fall back to `details.path` when image content exists (OpenClaw imageResult). + * + * Returns an empty array when no media is found (e.g. Pi SDK `read` tool + * returns base64 image data but no file path; those need a different delivery + * path like saving to a temp file). + */ +export function extractToolResultMediaPaths(result: unknown): string[] { + if (!result || typeof result !== "object") { + return []; + } + const record = result as Record; + const content = Array.isArray(record.content) ? record.content : null; + if (!content) { + return []; + } + + // Extract MEDIA: paths from text content blocks. + const paths: string[] = []; + let hasImageContent = false; + for (const item of content) { + if (!item || typeof item !== "object") { + continue; + } + const entry = item as Record; + if (entry.type === "image") { + hasImageContent = true; + continue; + } + if (entry.type === "text" && typeof entry.text === "string") { + // Reset lastIndex since MEDIA_TOKEN_RE is global. + MEDIA_TOKEN_RE.lastIndex = 0; + let match: RegExpExecArray | null; + while ((match = MEDIA_TOKEN_RE.exec(entry.text)) !== null) { + // Strip surrounding quotes/backticks and whitespace (mirrors cleanCandidate in media/parse). + const p = match[1] + ?.replace(/^[`"'[{(]+/, "") + .replace(/[`"'\]})\\,]+$/, "") + .trim(); + if (p && p.length <= 4096) { + paths.push(p); + } + } + } + } + + if (paths.length > 0) { + return paths; + } + + // Fall back to details.path when image content exists but no MEDIA: text. + if (hasImageContent) { + const details = record.details as Record | undefined; + const p = typeof details?.path === "string" ? details.path.trim() : ""; + if (p) { + return [p]; + } + } + + return []; +} + export function isToolResultError(result: unknown): boolean { if (!result || typeof result !== "object") { return false; diff --git a/src/auto-reply/reply/agent-runner-execution.ts b/src/auto-reply/reply/agent-runner-execution.ts index 9da0713dc18..e1cdaa0aed7 100644 --- a/src/auto-reply/reply/agent-runner-execution.ts +++ b/src/auto-reply/reply/agent-runner-execution.ts @@ -128,6 +128,10 @@ export async function runAgentTurnWithFallback(params: { return { skip: true }; } if (!text) { + // Allow media-only payloads (e.g. tool result screenshots) through. + if ((payload.mediaUrls?.length ?? 0) > 0) { + return { text: undefined, skip: false }; + } return { skip: true }; } const sanitized = sanitizeUserFacingText(text, {