diff --git a/docs/tools/tts.md b/docs/tools/tts.md index 4fe0da77e0a..a527d49cc21 100644 --- a/docs/tools/tts.md +++ b/docs/tools/tts.md @@ -390,9 +390,9 @@ Notes: ## Agent tool -The `tts` tool converts text to speech and returns a `MEDIA:` path. When the -result is Telegram-compatible, the tool includes `[[audio_as_voice]]` so -Telegram sends a voice bubble. +The `tts` tool converts text to speech and returns an audio attachment for +reply delivery. When the result is Telegram-compatible, OpenClaw marks it for +voice-bubble delivery. ## Gateway RPC diff --git a/docs/tts.md b/docs/tts.md index 4fe0da77e0a..a527d49cc21 100644 --- a/docs/tts.md +++ b/docs/tts.md @@ -390,9 +390,9 @@ Notes: ## Agent tool -The `tts` tool converts text to speech and returns a `MEDIA:` path. When the -result is Telegram-compatible, the tool includes `[[audio_as_voice]]` so -Telegram sends a voice bubble. +The `tts` tool converts text to speech and returns an audio attachment for +reply delivery. When the result is Telegram-compatible, OpenClaw marks it for +voice-bubble delivery. ## Gateway RPC diff --git a/src/agents/pi-embedded-subscribe.handlers.lifecycle.test.ts b/src/agents/pi-embedded-subscribe.handlers.lifecycle.test.ts index 9ffd7a53a72..e104c1ef895 100644 --- a/src/agents/pi-embedded-subscribe.handlers.lifecycle.test.ts +++ b/src/agents/pi-embedded-subscribe.handlers.lifecycle.test.ts @@ -11,16 +11,20 @@ function createContext( lastAssistant: unknown, overrides?: { onAgentEvent?: (event: unknown) => void }, ): EmbeddedPiSubscribeContext { + const onBlockReply = vi.fn(); return { params: { runId: "run-1", config: {}, sessionKey: "agent:main:main", onAgentEvent: overrides?.onAgentEvent, + onBlockReply, }, state: { lastAssistant: lastAssistant as EmbeddedPiSubscribeContext["state"]["lastAssistant"], pendingCompactionRetry: 0, + pendingToolMediaUrls: [], + pendingToolAudioAsVoice: false, blockState: { thinking: true, final: true, @@ -32,6 +36,7 @@ function createContext( warn: vi.fn(), }, flushBlockReplyBuffer: vi.fn(), + emitBlockReply: onBlockReply, resolveCompactionRetry: vi.fn(), maybeResolveCompactionWait: vi.fn(), } as unknown as EmbeddedPiSubscribeContext; @@ -159,4 +164,19 @@ describe("handleAgentEnd", () => { expect(ctx.log.warn).not.toHaveBeenCalled(); expect(ctx.log.debug).toHaveBeenCalledWith("embedded run agent end: runId=run-1 isError=false"); }); + + it("flushes orphaned tool media as a media-only block reply", () => { + const ctx = createContext(undefined); + ctx.state.pendingToolMediaUrls = ["/tmp/reply.opus"]; + ctx.state.pendingToolAudioAsVoice = true; + + handleAgentEnd(ctx); + + expect(ctx.emitBlockReply).toHaveBeenCalledWith({ + mediaUrls: ["/tmp/reply.opus"], + audioAsVoice: true, + }); + expect(ctx.state.pendingToolMediaUrls).toEqual([]); + expect(ctx.state.pendingToolAudioAsVoice).toBe(false); + }); }); diff --git a/src/agents/pi-embedded-subscribe.handlers.lifecycle.ts b/src/agents/pi-embedded-subscribe.handlers.lifecycle.ts index 7edc299460c..4d8debf430e 100644 --- a/src/agents/pi-embedded-subscribe.handlers.lifecycle.ts +++ b/src/agents/pi-embedded-subscribe.handlers.lifecycle.ts @@ -6,6 +6,10 @@ import { sanitizeForConsole, } from "./pi-embedded-error-observation.js"; import { classifyFailoverReason, formatAssistantErrorText } from "./pi-embedded-helpers.js"; +import { + consumePendingToolMediaReply, + hasAssistantVisibleReply, +} from "./pi-embedded-subscribe.handlers.messages.js"; import type { EmbeddedPiSubscribeContext } from "./pi-embedded-subscribe.handlers.types.js"; import { isAssistantMessage } from "./pi-embedded-utils.js"; @@ -97,6 +101,10 @@ export function handleAgentEnd(ctx: EmbeddedPiSubscribeContext) { } ctx.flushBlockReplyBuffer(); + const pendingToolMediaReply = consumePendingToolMediaReply(ctx.state); + if (pendingToolMediaReply && hasAssistantVisibleReply(pendingToolMediaReply)) { + ctx.emitBlockReply(pendingToolMediaReply); + } // Flush the reply pipeline so the response reaches the channel before // compaction wait blocks the run. This mirrors the pattern used by // handleToolExecutionStart and ensures delivery is not held hostage to diff --git a/src/agents/pi-embedded-subscribe.handlers.messages.test.ts b/src/agents/pi-embedded-subscribe.handlers.messages.test.ts index 1ecdd45f9af..843856b53d7 100644 --- a/src/agents/pi-embedded-subscribe.handlers.messages.test.ts +++ b/src/agents/pi-embedded-subscribe.handlers.messages.test.ts @@ -1,6 +1,8 @@ import { describe, expect, it } from "vitest"; import { buildAssistantStreamData, + consumePendingToolMediaIntoReply, + consumePendingToolMediaReply, hasAssistantVisibleReply, resolveSilentReplyFallbackText, } from "./pi-embedded-subscribe.handlers.messages.js"; @@ -61,3 +63,58 @@ describe("buildAssistantStreamData", () => { }); }); }); + +describe("consumePendingToolMediaIntoReply", () => { + it("attaches queued tool media to the next assistant reply", () => { + const state = { + pendingToolMediaUrls: ["/tmp/a.png", "/tmp/b.png"], + pendingToolAudioAsVoice: false, + }; + + expect( + consumePendingToolMediaIntoReply(state, { + text: "done", + }), + ).toEqual({ + text: "done", + mediaUrls: ["/tmp/a.png", "/tmp/b.png"], + audioAsVoice: undefined, + }); + expect(state.pendingToolMediaUrls).toEqual([]); + }); + + it("preserves reasoning replies without consuming queued media", () => { + const state = { + pendingToolMediaUrls: ["/tmp/a.png"], + pendingToolAudioAsVoice: true, + }; + + expect( + consumePendingToolMediaIntoReply(state, { + text: "thinking", + isReasoning: true, + }), + ).toEqual({ + text: "thinking", + isReasoning: true, + }); + expect(state.pendingToolMediaUrls).toEqual(["/tmp/a.png"]); + expect(state.pendingToolAudioAsVoice).toBe(true); + }); +}); + +describe("consumePendingToolMediaReply", () => { + it("builds a media-only reply for orphaned tool media", () => { + const state = { + pendingToolMediaUrls: ["/tmp/reply.opus"], + pendingToolAudioAsVoice: true, + }; + + expect(consumePendingToolMediaReply(state)).toEqual({ + mediaUrls: ["/tmp/reply.opus"], + audioAsVoice: true, + }); + expect(state.pendingToolMediaUrls).toEqual([]); + expect(state.pendingToolAudioAsVoice).toBe(false); + }); +}); diff --git a/src/agents/pi-embedded-subscribe.handlers.messages.ts b/src/agents/pi-embedded-subscribe.handlers.messages.ts index c3b4e92ba61..283457ffc0e 100644 --- a/src/agents/pi-embedded-subscribe.handlers.messages.ts +++ b/src/agents/pi-embedded-subscribe.handlers.messages.ts @@ -8,7 +8,11 @@ import { isMessagingToolDuplicateNormalized, normalizeTextForComparison, } from "./pi-embedded-helpers.js"; -import type { EmbeddedPiSubscribeContext } from "./pi-embedded-subscribe.handlers.types.js"; +import type { BlockReplyPayload } from "./pi-embedded-payloads.js"; +import type { + EmbeddedPiSubscribeContext, + EmbeddedPiSubscribeState, +} from "./pi-embedded-subscribe.handlers.types.js"; import { appendRawStream } from "./pi-embedded-subscribe.raw-stream.js"; import { extractAssistantText, @@ -57,6 +61,51 @@ export function resolveSilentReplyFallbackText(params: { return fallback; } +function clearPendingToolMedia( + state: Pick, +) { + state.pendingToolMediaUrls = []; + state.pendingToolAudioAsVoice = false; +} + +export function consumePendingToolMediaIntoReply( + state: Pick, + payload: BlockReplyPayload, +): BlockReplyPayload { + if (payload.isReasoning) { + return payload; + } + if (state.pendingToolMediaUrls.length === 0 && !state.pendingToolAudioAsVoice) { + return payload; + } + const mergedMediaUrls = Array.from( + new Set([...(payload.mediaUrls ?? []), ...state.pendingToolMediaUrls]), + ); + const mergedPayload: BlockReplyPayload = { + ...payload, + mediaUrls: mergedMediaUrls.length ? mergedMediaUrls : undefined, + audioAsVoice: payload.audioAsVoice || state.pendingToolAudioAsVoice || undefined, + }; + clearPendingToolMedia(state); + return mergedPayload; +} + +export function consumePendingToolMediaReply( + state: Pick, +): BlockReplyPayload | null { + if (state.pendingToolMediaUrls.length === 0 && !state.pendingToolAudioAsVoice) { + return null; + } + const payload: BlockReplyPayload = { + mediaUrls: state.pendingToolMediaUrls.length + ? Array.from(new Set(state.pendingToolMediaUrls)) + : undefined, + audioAsVoice: state.pendingToolAudioAsVoice || undefined, + }; + clearPendingToolMedia(state); + return payload; +} + export function hasAssistantVisibleReply(params: { text?: string; mediaUrls?: string[]; @@ -390,7 +439,7 @@ export function handleMessageEnd( } = splitResult; // Emit if there's content OR audioAsVoice flag (to propagate the flag). if (hasAssistantVisibleReply({ text: cleanedText, mediaUrls, audioAsVoice })) { - emitBlockReplySafely({ + ctx.emitBlockReply({ text: cleanedText, mediaUrls: mediaUrls?.length ? mediaUrls : undefined, audioAsVoice, diff --git a/src/agents/pi-embedded-subscribe.handlers.tools.media.test.ts b/src/agents/pi-embedded-subscribe.handlers.tools.media.test.ts index 66685f04036..a60986ec48f 100644 --- a/src/agents/pi-embedded-subscribe.handlers.tools.media.test.ts +++ b/src/agents/pi-embedded-subscribe.handlers.tools.media.test.ts @@ -24,6 +24,8 @@ function createMockContext(overrides?: { pendingMessagingTexts: new Map(), pendingMessagingTargets: new Map(), pendingMessagingMediaUrls: new Map(), + pendingToolMediaUrls: [], + pendingToolAudioAsVoice: false, messagingToolSentTexts: [], messagingToolSentTextsNormalized: [], messagingToolSentMediaUrls: [], @@ -36,6 +38,7 @@ function createMockContext(overrides?: { emitToolSummary: vi.fn(), emitToolOutput: vi.fn(), trimMessagingToolSent: vi.fn(), + emitBlockReply: vi.fn(), hookRunner: undefined, // Fill in remaining required fields with no-ops. blockChunker: null, @@ -114,9 +117,8 @@ describe("handleToolExecutionEnd media emission", () => { await emitPngMediaToolResult(ctx); - expect(onToolResult).toHaveBeenCalledWith({ - mediaUrls: ["/tmp/screenshot.png"], - }); + expect(onToolResult).not.toHaveBeenCalled(); + expect(ctx.state.pendingToolMediaUrls).toEqual(["/tmp/screenshot.png"]); }); it("does NOT emit local media for untrusted tools", async () => { @@ -126,6 +128,7 @@ describe("handleToolExecutionEnd media emission", () => { await emitUntrustedToolMediaResult(ctx, "/tmp/secret.png"); expect(onToolResult).not.toHaveBeenCalled(); + expect(ctx.state.pendingToolMediaUrls).toEqual([]); }); it("emits remote media for untrusted tools", async () => { @@ -134,12 +137,11 @@ describe("handleToolExecutionEnd media emission", () => { await emitUntrustedToolMediaResult(ctx, "https://example.com/file.png"); - expect(onToolResult).toHaveBeenCalledWith({ - mediaUrls: ["https://example.com/file.png"], - }); + expect(onToolResult).not.toHaveBeenCalled(); + expect(ctx.state.pendingToolMediaUrls).toEqual(["https://example.com/file.png"]); }); - it("does NOT emit media when verbose is full (emitToolOutput handles it)", async () => { + it("does NOT queue legacy MEDIA paths when verbose is full", async () => { const onToolResult = vi.fn(); const ctx = createMockContext({ shouldEmitToolOutput: true, onToolResult }); @@ -149,15 +151,31 @@ describe("handleToolExecutionEnd media emission", () => { // It may be called by emitToolOutput, but the new block should not fire. // Verify emitToolOutput was called instead. expect(ctx.emitToolOutput).toHaveBeenCalled(); - // The direct media emission should not have been called with just mediaUrls. - const directMediaCalls = onToolResult.mock.calls.filter( - (call: unknown[]) => - call[0] && - typeof call[0] === "object" && - "mediaUrls" in (call[0] as Record) && - !("text" in (call[0] as Record)), - ); - expect(directMediaCalls).toHaveLength(0); + expect(ctx.state.pendingToolMediaUrls).toEqual([]); + }); + + it("still queues structured media when verbose is full", async () => { + const ctx = createMockContext({ shouldEmitToolOutput: true, onToolResult: vi.fn() }); + + await handleToolExecutionEnd(ctx, { + type: "tool_execution_end", + toolName: "tts", + toolCallId: "tc-1", + isError: false, + result: { + content: [{ type: "text", text: "Generated audio reply." }], + details: { + media: { + mediaUrl: "/tmp/reply.opus", + audioAsVoice: true, + }, + }, + }, + }); + + expect(ctx.emitToolOutput).toHaveBeenCalled(); + expect(ctx.state.pendingToolMediaUrls).toEqual(["/tmp/reply.opus"]); + expect(ctx.state.pendingToolAudioAsVoice).toBe(true); }); it("does NOT emit media for error results", async () => { @@ -167,6 +185,7 @@ describe("handleToolExecutionEnd media emission", () => { await emitPngMediaToolResult(ctx, { isError: true }); expect(onToolResult).not.toHaveBeenCalled(); + expect(ctx.state.pendingToolMediaUrls).toEqual([]); }); it("does NOT emit when tool result has no media", async () => { @@ -184,6 +203,7 @@ describe("handleToolExecutionEnd media emission", () => { }); expect(onToolResult).not.toHaveBeenCalled(); + expect(ctx.state.pendingToolMediaUrls).toEqual([]); }); it("does NOT emit media for placeholder text", async () => { @@ -206,6 +226,7 @@ describe("handleToolExecutionEnd media emission", () => { }); expect(onToolResult).not.toHaveBeenCalled(); + expect(ctx.state.pendingToolMediaUrls).toEqual([]); }); it("does NOT emit media for malformed MEDIA:-prefixed prose", async () => { @@ -228,9 +249,10 @@ describe("handleToolExecutionEnd media emission", () => { }); expect(onToolResult).not.toHaveBeenCalled(); + expect(ctx.state.pendingToolMediaUrls).toEqual([]); }); - it("emits media from details.path fallback when no MEDIA: text", async () => { + it("queues media from details.path fallback when no MEDIA: text", async () => { const onToolResult = vi.fn(); const ctx = createMockContext({ shouldEmitToolOutput: false, onToolResult }); @@ -248,8 +270,29 @@ describe("handleToolExecutionEnd media emission", () => { }, }); - expect(onToolResult).toHaveBeenCalledWith({ - mediaUrls: ["/tmp/canvas-output.png"], + expect(onToolResult).not.toHaveBeenCalled(); + expect(ctx.state.pendingToolMediaUrls).toEqual(["/tmp/canvas-output.png"]); + }); + + it("queues structured details.media and voice metadata", async () => { + const ctx = createMockContext({ shouldEmitToolOutput: false, onToolResult: vi.fn() }); + + await handleToolExecutionEnd(ctx, { + type: "tool_execution_end", + toolName: "tts", + toolCallId: "tc-1", + isError: false, + result: { + details: { + media: { + mediaUrl: "/tmp/reply.opus", + audioAsVoice: true, + }, + }, + }, }); + + expect(ctx.state.pendingToolMediaUrls).toEqual(["/tmp/reply.opus"]); + expect(ctx.state.pendingToolAudioAsVoice).toBe(true); }); }); diff --git a/src/agents/pi-embedded-subscribe.handlers.tools.test.ts b/src/agents/pi-embedded-subscribe.handlers.tools.test.ts index 3cf7935a8a2..ec9b52aa6c5 100644 --- a/src/agents/pi-embedded-subscribe.handlers.tools.test.ts +++ b/src/agents/pi-embedded-subscribe.handlers.tools.test.ts @@ -40,6 +40,8 @@ function createTestContext(): { pendingMessagingTargets: new Map(), pendingMessagingTexts: new Map(), pendingMessagingMediaUrls: new Map(), + pendingToolMediaUrls: [], + pendingToolAudioAsVoice: false, messagingToolSentTexts: [], messagingToolSentTextsNormalized: [], messagingToolSentMediaUrls: [], diff --git a/src/agents/pi-embedded-subscribe.handlers.tools.ts b/src/agents/pi-embedded-subscribe.handlers.tools.ts index 70f6b54639c..42f2ea33953 100644 --- a/src/agents/pi-embedded-subscribe.handlers.tools.ts +++ b/src/agents/pi-embedded-subscribe.handlers.tools.ts @@ -13,9 +13,9 @@ import type { ToolHandlerContext, } from "./pi-embedded-subscribe.handlers.types.js"; import { + extractToolResultMediaArtifact, extractMessagingToolSend, extractToolErrorMessage, - extractToolResultMediaPaths, extractToolResultText, filterToolResultMediaUrls, isToolResultError, @@ -143,6 +143,23 @@ function collectMessagingMediaUrlsFromToolResult(result: unknown): string[] { return urls; } +function queuePendingToolMedia( + ctx: ToolHandlerContext, + mediaReply: { mediaUrls: string[]; audioAsVoice?: boolean }, +) { + const seen = new Set(ctx.state.pendingToolMediaUrls); + for (const mediaUrl of mediaReply.mediaUrls) { + if (seen.has(mediaUrl)) { + continue; + } + seen.add(mediaUrl); + ctx.state.pendingToolMediaUrls.push(mediaUrl); + } + if (mediaReply.audioAsVoice) { + ctx.state.pendingToolAudioAsVoice = true; + } +} + function readExecApprovalPendingDetails(result: unknown): { approvalId: string; approvalSlug: string; @@ -226,12 +243,20 @@ async function emitToolResultOutput(params: { sanitizedResult: unknown; }) { const { ctx, toolName, meta, isToolError, result, sanitizedResult } = params; - if (!ctx.params.onToolResult) { - return; - } - + const hasStructuredMedia = + result && + typeof result === "object" && + (result as { details?: unknown }).details && + typeof (result as { details?: unknown }).details === "object" && + !Array.isArray((result as { details?: unknown }).details) && + typeof ((result as { details?: { media?: unknown } }).details?.media ?? undefined) === + "object" && + !Array.isArray((result as { details?: { media?: unknown } }).details?.media); const approvalPending = readExecApprovalPendingDetails(result); if (!isToolError && approvalPending) { + if (!ctx.params.onToolResult) { + return; + } try { await ctx.params.onToolResult( buildExecApprovalPendingReplyPayload({ @@ -254,6 +279,9 @@ async function emitToolResultOutput(params: { const approvalUnavailable = readExecApprovalUnavailableDetails(result); if (!isToolError && approvalUnavailable) { + if (!ctx.params.onToolResult) { + return; + } try { await ctx.params.onToolResult?.( buildExecApprovalUnavailableReplyPayload({ @@ -275,24 +303,27 @@ async function emitToolResultOutput(params: { if (outputText) { ctx.emitToolOutput(toolName, meta, outputText); } - return; + if (!hasStructuredMedia) { + return; + } } if (isToolError) { return; } - // emitToolOutput() already handles MEDIA: directives when enabled; this path - // only sends raw media URLs for non-verbose delivery mode. - const mediaPaths = filterToolResultMediaUrls(toolName, extractToolResultMediaPaths(result)); - if (mediaPaths.length === 0) { + const mediaReply = extractToolResultMediaArtifact(result); + if (!mediaReply) { return; } - try { - void ctx.params.onToolResult({ mediaUrls: mediaPaths }); - } catch { - // ignore delivery failures + const mediaUrls = filterToolResultMediaUrls(toolName, mediaReply.mediaUrls); + if (mediaUrls.length === 0) { + return; } + queuePendingToolMedia(ctx, { + mediaUrls, + ...(mediaReply.audioAsVoice ? { audioAsVoice: true } : {}), + }); } export async function handleToolExecutionStart( diff --git a/src/agents/pi-embedded-subscribe.handlers.types.ts b/src/agents/pi-embedded-subscribe.handlers.types.ts index 4436e6f6aa3..d7c07cce79e 100644 --- a/src/agents/pi-embedded-subscribe.handlers.types.ts +++ b/src/agents/pi-embedded-subscribe.handlers.types.ts @@ -5,6 +5,7 @@ import type { InlineCodeState } from "../markdown/code-spans.js"; import type { HookRunner } from "../plugins/hooks.js"; import type { EmbeddedBlockChunker } from "./pi-embedded-block-chunker.js"; import type { MessagingToolSend } from "./pi-embedded-messaging.js"; +import type { BlockReplyPayload } from "./pi-embedded-payloads.js"; import type { BlockReplyChunking, SubscribeEmbeddedPiSessionParams, @@ -76,6 +77,8 @@ export type EmbeddedPiSubscribeState = { pendingMessagingTargets: Map; successfulCronAdds: number; pendingMessagingMediaUrls: Map; + pendingToolMediaUrls: string[]; + pendingToolAudioAsVoice: boolean; deterministicApprovalPromptSent: boolean; lastAssistant?: AgentMessage; }; @@ -124,6 +127,7 @@ export type EmbeddedPiSubscribeContext = { incrementCompactionCount: () => void; getUsageTotals: () => NormalizedUsage | undefined; getCompactionCount: () => number; + emitBlockReply: (payload: BlockReplyPayload) => void; }; /** @@ -151,6 +155,8 @@ export type ToolHandlerState = Pick< | "pendingMessagingTargets" | "pendingMessagingTexts" | "pendingMessagingMediaUrls" + | "pendingToolMediaUrls" + | "pendingToolAudioAsVoice" | "messagingToolSentTexts" | "messagingToolSentTextsNormalized" | "messagingToolSentMediaUrls" diff --git a/src/agents/pi-embedded-subscribe.tools.media.test.ts b/src/agents/pi-embedded-subscribe.tools.media.test.ts index 7cf51bb7c1c..cb50f6e8a40 100644 --- a/src/agents/pi-embedded-subscribe.tools.media.test.ts +++ b/src/agents/pi-embedded-subscribe.tools.media.test.ts @@ -1,5 +1,6 @@ import { describe, expect, it } from "vitest"; import { + extractToolResultMediaArtifact, extractToolResultMediaPaths, isToolResultMediaTrusted, } from "./pi-embedded-subscribe.tools.js"; @@ -15,14 +16,40 @@ describe("extractToolResultMediaPaths", () => { expect(extractToolResultMediaPaths(42)).toEqual([]); }); - it("returns empty array when content is missing", () => { - expect(extractToolResultMediaPaths({ details: { path: "/tmp/img.png" } })).toEqual([]); + it("extracts structured details.media without content blocks", () => { + expect( + extractToolResultMediaArtifact({ + details: { + media: { + mediaUrls: ["/tmp/img.png", "/tmp/img-2.png"], + }, + }, + }), + ).toEqual({ + mediaUrls: ["/tmp/img.png", "/tmp/img-2.png"], + }); }); it("returns empty array when content has no text or image blocks", () => { expect(extractToolResultMediaPaths({ content: [{ type: "other" }] })).toEqual([]); }); + it("extracts structured media with audioAsVoice", () => { + expect( + extractToolResultMediaArtifact({ + details: { + media: { + mediaUrl: "/tmp/reply.opus", + audioAsVoice: true, + }, + }, + }), + ).toEqual({ + mediaUrls: ["/tmp/reply.opus"], + audioAsVoice: true, + }); + }); + it("extracts MEDIA: path from text content block", () => { const result = { content: [ diff --git a/src/agents/pi-embedded-subscribe.tools.ts b/src/agents/pi-embedded-subscribe.tools.ts index 925f56fa6ee..20f0f417531 100644 --- a/src/agents/pi-embedded-subscribe.tools.ts +++ b/src/agents/pi-embedded-subscribe.tools.ts @@ -187,25 +187,75 @@ export function filterToolResultMediaUrls( * Extract media file paths from a tool result. * * Strategy (first match wins): - * 1. Parse `MEDIA:` tokens from text content blocks (all OpenClaw tools). - * 2. Fall back to `details.path` when image content exists (OpenClaw imageResult). + * 1. Read structured `details.media` attachments from tool details. + * 2. Parse legacy `MEDIA:` tokens from text content blocks. + * 3. Fall back to `details.path` when image content exists (legacy imageResult). * * Returns an empty array when no media is found (e.g. Pi SDK `read` tool * returns base64 image data but no file path; those need a different delivery * path like saving to a temp file). */ -export function extractToolResultMediaPaths(result: unknown): string[] { +export type ToolResultMediaArtifact = { + mediaUrls: string[]; + audioAsVoice?: boolean; +}; + +function readToolResultDetailsMedia( + result: Record, +): Record | undefined { + const details = + result.details && typeof result.details === "object" && !Array.isArray(result.details) + ? (result.details as Record) + : undefined; + const media = + details?.media && typeof details.media === "object" && !Array.isArray(details.media) + ? (details.media as Record) + : undefined; + return media; +} + +function collectStructuredMediaUrls(media: Record): string[] { + const urls: string[] = []; + if (typeof media.mediaUrl === "string" && media.mediaUrl.trim()) { + urls.push(media.mediaUrl.trim()); + } + if (Array.isArray(media.mediaUrls)) { + urls.push( + ...media.mediaUrls + .filter((value): value is string => typeof value === "string") + .map((value) => value.trim()) + .filter(Boolean), + ); + } + return Array.from(new Set(urls)); +} + +export function extractToolResultMediaArtifact( + result: unknown, +): ToolResultMediaArtifact | undefined { if (!result || typeof result !== "object") { - return []; + return undefined; } const record = result as Record; - const content = Array.isArray(record.content) ? record.content : null; - if (!content) { - return []; + const detailsMedia = readToolResultDetailsMedia(record); + if (detailsMedia) { + const mediaUrls = collectStructuredMediaUrls(detailsMedia); + if (mediaUrls.length > 0) { + return { + mediaUrls, + ...(detailsMedia.audioAsVoice === true ? { audioAsVoice: true } : {}), + }; + } } - // Extract MEDIA: paths from text content blocks using the shared parser so - // directive matching and validation stay in sync with outbound reply parsing. + const content = Array.isArray(record.content) ? record.content : null; + if (!content) { + return undefined; + } + + // Extract legacy MEDIA: paths from text content blocks using the shared + // parser so directive matching and validation stay in sync with outbound + // reply parsing. const paths: string[] = []; let hasImageContent = false; for (const item of content) { @@ -226,19 +276,24 @@ export function extractToolResultMediaPaths(result: unknown): string[] { } if (paths.length > 0) { - return paths; + return { mediaUrls: paths }; } - // Fall back to details.path when image content exists but no MEDIA: text. + // Fall back to legacy details.path when image content exists but no + // structured media details or MEDIA: text. if (hasImageContent) { const details = record.details as Record | undefined; const p = typeof details?.path === "string" ? details.path.trim() : ""; if (p) { - return [p]; + return { mediaUrls: [p] }; } } - return []; + return undefined; +} + +export function extractToolResultMediaPaths(result: unknown): string[] { + return extractToolResultMediaArtifact(result)?.mediaUrls ?? []; } export function isToolResultError(result: unknown): boolean { diff --git a/src/agents/pi-embedded-subscribe.ts b/src/agents/pi-embedded-subscribe.ts index 83592372e80..e2fd18cab70 100644 --- a/src/agents/pi-embedded-subscribe.ts +++ b/src/agents/pi-embedded-subscribe.ts @@ -11,7 +11,9 @@ import { isMessagingToolDuplicateNormalized, normalizeTextForComparison, } from "./pi-embedded-helpers.js"; +import type { BlockReplyPayload } from "./pi-embedded-payloads.js"; import { createEmbeddedPiSessionEventHandler } from "./pi-embedded-subscribe.handlers.js"; +import { consumePendingToolMediaIntoReply } from "./pi-embedded-subscribe.handlers.messages.js"; import type { EmbeddedPiSubscribeContext, EmbeddedPiSubscribeState, @@ -78,6 +80,8 @@ export function subscribeEmbeddedPiSession(params: SubscribeEmbeddedPiSessionPar pendingMessagingTargets: new Map(), successfulCronAdds: 0, pendingMessagingMediaUrls: new Map(), + pendingToolMediaUrls: [], + pendingToolAudioAsVoice: false, deterministicApprovalPromptSent: false, }; const usageTotals = { @@ -113,6 +117,9 @@ export function subscribeEmbeddedPiSession(params: SubscribeEmbeddedPiSessionPar log.warn(`block reply callback failed: ${String(err)}`); }); }; + const emitBlockReply = (payload: BlockReplyPayload) => { + emitBlockReplySafely(consumePendingToolMediaIntoReply(state, payload)); + }; const resetAssistantMessageState = (nextAssistantTextBaseline: number) => { state.deltaBuffer = ""; @@ -523,7 +530,7 @@ export function subscribeEmbeddedPiSession(params: SubscribeEmbeddedPiSessionPar if (!cleanedText && (!mediaUrls || mediaUrls.length === 0) && !audioAsVoice) { return; } - emitBlockReplySafely({ + emitBlockReply({ text: cleanedText, mediaUrls: mediaUrls?.length ? mediaUrls : undefined, audioAsVoice, @@ -599,6 +606,8 @@ export function subscribeEmbeddedPiSession(params: SubscribeEmbeddedPiSessionPar pendingMessagingTargets.clear(); state.successfulCronAdds = 0; state.pendingMessagingMediaUrls.clear(); + state.pendingToolMediaUrls = []; + state.pendingToolAudioAsVoice = false; state.deterministicApprovalPromptSent = false; resetAssistantMessageState(0); }; @@ -624,6 +633,7 @@ export function subscribeEmbeddedPiSession(params: SubscribeEmbeddedPiSessionPar stripBlockTags, emitBlockChunk, flushBlockReplyBuffer, + emitBlockReply, emitReasoningStream, consumeReplyDirectives, consumePartialReplyDirectives, diff --git a/src/agents/pi-tool-handler-state.test-helpers.ts b/src/agents/pi-tool-handler-state.test-helpers.ts index cfb559b9884..7cccdbdf3bf 100644 --- a/src/agents/pi-tool-handler-state.test-helpers.ts +++ b/src/agents/pi-tool-handler-state.test-helpers.ts @@ -6,6 +6,8 @@ export function createBaseToolHandlerState() { pendingMessagingTexts: new Map(), pendingMessagingTargets: new Map(), pendingMessagingMediaUrls: new Map(), + pendingToolMediaUrls: [] as string[], + pendingToolAudioAsVoice: false, messagingToolSentTexts: [] as string[], messagingToolSentTextsNormalized: [] as string[], messagingToolSentMediaUrls: [] as string[], diff --git a/src/agents/tools/image-generate-tool.test.ts b/src/agents/tools/image-generate-tool.test.ts index 83583d2c2ef..dfd782b597c 100644 --- a/src/agents/tools/image-generate-tool.test.ts +++ b/src/agents/tools/image-generate-tool.test.ts @@ -110,7 +110,7 @@ describe("createImageGenerateTool", () => { }); }); - it("generates images and returns MEDIA paths", async () => { + it("generates images and returns details.media paths", async () => { const generateImage = vi.spyOn(imageGenerationRuntime, "generateImage").mockResolvedValue({ provider: "openai", model: "gpt-image-1", @@ -215,14 +215,16 @@ describe("createImageGenerateTool", () => { provider: "openai", model: "gpt-image-1", count: 2, + media: { + mediaUrls: ["/tmp/generated-1.png", "/tmp/generated-2.png"], + }, paths: ["/tmp/generated-1.png", "/tmp/generated-2.png"], filename: "cats/output.png", revisedPrompts: ["A more cinematic cat"], }, }); const text = (result.content?.[0] as { text: string } | undefined)?.text ?? ""; - expect(text).toContain("MEDIA:/tmp/generated-1.png"); - expect(text).toContain("MEDIA:/tmp/generated-2.png"); + expect(text).not.toContain("MEDIA:"); }); it("rejects counts outside the supported range", async () => { diff --git a/src/agents/tools/image-generate-tool.ts b/src/agents/tools/image-generate-tool.ts index d0708842cf9..9a40f482981 100644 --- a/src/agents/tools/image-generate-tool.ts +++ b/src/agents/tools/image-generate-tool.ts @@ -610,7 +610,6 @@ export function createImageGenerateTool(options?: { .filter((entry): entry is string => Boolean(entry)); const lines = [ `Generated ${savedImages.length} image${savedImages.length === 1 ? "" : "s"} with ${result.provider}/${result.model}.`, - ...savedImages.map((image) => `MEDIA:${image.path}`), ]; return { @@ -619,6 +618,9 @@ export function createImageGenerateTool(options?: { provider: result.provider, model: result.model, count: savedImages.length, + media: { + mediaUrls: savedImages.map((image) => image.path), + }, paths: savedImages.map((image) => image.path), ...(imageInputs.length === 1 ? { diff --git a/src/agents/tools/nodes-tool.ts b/src/agents/tools/nodes-tool.ts index 5e28b235791..2c5d0835228 100644 --- a/src/agents/tools/nodes-tool.ts +++ b/src/agents/tools/nodes-tool.ts @@ -310,7 +310,6 @@ export function createNodesTool(options?: { expectedHost: resolvedNode.remoteIp, invalidPayloadMessage: "invalid camera.snap payload", }); - content.push({ type: "text", text: `MEDIA:${filePath}` }); if (options?.modelHasVision && payload.base64) { content.push({ type: "image", diff --git a/src/agents/tools/tts-tool.test.ts b/src/agents/tools/tts-tool.test.ts index fe9a6c1def9..2fc192a934d 100644 --- a/src/agents/tools/tts-tool.test.ts +++ b/src/agents/tools/tts-tool.test.ts @@ -4,7 +4,12 @@ vi.mock("../../auto-reply/tokens.js", () => ({ SILENT_REPLY_TOKEN: "QUIET_TOKEN", })); +vi.mock("../../tts/tts.js", () => ({ + textToSpeech: vi.fn(), +})); + const { createTtsTool } = await import("./tts-tool.js"); +const { textToSpeech } = await import("../../tts/tts.js"); describe("createTtsTool", () => { it("uses SILENT_REPLY_TOKEN in guidance text", () => { @@ -13,4 +18,29 @@ describe("createTtsTool", () => { expect(tool.description).toContain("QUIET_TOKEN"); expect(tool.description).not.toContain("NO_REPLY"); }); + + it("stores audio delivery in details.media", async () => { + vi.mocked(textToSpeech).mockResolvedValue({ + success: true, + audioPath: "/tmp/reply.opus", + provider: "test", + voiceCompatible: true, + }); + + const tool = createTtsTool(); + const result = await tool.execute("call-1", { text: "hello" }); + + expect(result).toMatchObject({ + content: [{ type: "text", text: "Generated audio reply." }], + details: { + audioPath: "/tmp/reply.opus", + provider: "test", + media: { + mediaUrl: "/tmp/reply.opus", + audioAsVoice: true, + }, + }, + }); + expect(JSON.stringify(result.content)).not.toContain("MEDIA:"); + }); }); diff --git a/src/agents/tools/tts-tool.ts b/src/agents/tools/tts-tool.ts index 03ed3cd9a04..e4ebd302ef4 100644 --- a/src/agents/tools/tts-tool.ts +++ b/src/agents/tools/tts-tool.ts @@ -35,15 +35,16 @@ export function createTtsTool(opts?: { }); if (result.success && result.audioPath) { - const lines: string[] = []; - // Tag Telegram Opus output as a voice bubble instead of a file attachment. - if (result.voiceCompatible) { - lines.push("[[audio_as_voice]]"); - } - lines.push(`MEDIA:${result.audioPath}`); return { - content: [{ type: "text", text: lines.join("\n") }], - details: { audioPath: result.audioPath, provider: result.provider }, + content: [{ type: "text", text: "Generated audio reply." }], + details: { + audioPath: result.audioPath, + provider: result.provider, + media: { + mediaUrl: result.audioPath, + ...(result.voiceCompatible ? { audioAsVoice: true } : {}), + }, + }, }; }