mirror of
https://github.com/openclaw/openclaw.git
synced 2026-03-22 23:41:07 +00:00
refactor(agents): unify tool media reply delivery
This commit is contained in:
@@ -390,9 +390,9 @@ Notes:
|
||||
|
||||
## Agent tool
|
||||
|
||||
The `tts` tool converts text to speech and returns a `MEDIA:` path. When the
|
||||
result is Telegram-compatible, the tool includes `[[audio_as_voice]]` so
|
||||
Telegram sends a voice bubble.
|
||||
The `tts` tool converts text to speech and returns an audio attachment for
|
||||
reply delivery. When the result is Telegram-compatible, OpenClaw marks it for
|
||||
voice-bubble delivery.
|
||||
|
||||
## Gateway RPC
|
||||
|
||||
|
||||
@@ -390,9 +390,9 @@ Notes:
|
||||
|
||||
## Agent tool
|
||||
|
||||
The `tts` tool converts text to speech and returns a `MEDIA:` path. When the
|
||||
result is Telegram-compatible, the tool includes `[[audio_as_voice]]` so
|
||||
Telegram sends a voice bubble.
|
||||
The `tts` tool converts text to speech and returns an audio attachment for
|
||||
reply delivery. When the result is Telegram-compatible, OpenClaw marks it for
|
||||
voice-bubble delivery.
|
||||
|
||||
## Gateway RPC
|
||||
|
||||
|
||||
@@ -11,16 +11,20 @@ function createContext(
|
||||
lastAssistant: unknown,
|
||||
overrides?: { onAgentEvent?: (event: unknown) => void },
|
||||
): EmbeddedPiSubscribeContext {
|
||||
const onBlockReply = vi.fn();
|
||||
return {
|
||||
params: {
|
||||
runId: "run-1",
|
||||
config: {},
|
||||
sessionKey: "agent:main:main",
|
||||
onAgentEvent: overrides?.onAgentEvent,
|
||||
onBlockReply,
|
||||
},
|
||||
state: {
|
||||
lastAssistant: lastAssistant as EmbeddedPiSubscribeContext["state"]["lastAssistant"],
|
||||
pendingCompactionRetry: 0,
|
||||
pendingToolMediaUrls: [],
|
||||
pendingToolAudioAsVoice: false,
|
||||
blockState: {
|
||||
thinking: true,
|
||||
final: true,
|
||||
@@ -32,6 +36,7 @@ function createContext(
|
||||
warn: vi.fn(),
|
||||
},
|
||||
flushBlockReplyBuffer: vi.fn(),
|
||||
emitBlockReply: onBlockReply,
|
||||
resolveCompactionRetry: vi.fn(),
|
||||
maybeResolveCompactionWait: vi.fn(),
|
||||
} as unknown as EmbeddedPiSubscribeContext;
|
||||
@@ -159,4 +164,19 @@ describe("handleAgentEnd", () => {
|
||||
expect(ctx.log.warn).not.toHaveBeenCalled();
|
||||
expect(ctx.log.debug).toHaveBeenCalledWith("embedded run agent end: runId=run-1 isError=false");
|
||||
});
|
||||
|
||||
it("flushes orphaned tool media as a media-only block reply", () => {
|
||||
const ctx = createContext(undefined);
|
||||
ctx.state.pendingToolMediaUrls = ["/tmp/reply.opus"];
|
||||
ctx.state.pendingToolAudioAsVoice = true;
|
||||
|
||||
handleAgentEnd(ctx);
|
||||
|
||||
expect(ctx.emitBlockReply).toHaveBeenCalledWith({
|
||||
mediaUrls: ["/tmp/reply.opus"],
|
||||
audioAsVoice: true,
|
||||
});
|
||||
expect(ctx.state.pendingToolMediaUrls).toEqual([]);
|
||||
expect(ctx.state.pendingToolAudioAsVoice).toBe(false);
|
||||
});
|
||||
});
|
||||
|
||||
@@ -6,6 +6,10 @@ import {
|
||||
sanitizeForConsole,
|
||||
} from "./pi-embedded-error-observation.js";
|
||||
import { classifyFailoverReason, formatAssistantErrorText } from "./pi-embedded-helpers.js";
|
||||
import {
|
||||
consumePendingToolMediaReply,
|
||||
hasAssistantVisibleReply,
|
||||
} from "./pi-embedded-subscribe.handlers.messages.js";
|
||||
import type { EmbeddedPiSubscribeContext } from "./pi-embedded-subscribe.handlers.types.js";
|
||||
import { isAssistantMessage } from "./pi-embedded-utils.js";
|
||||
|
||||
@@ -97,6 +101,10 @@ export function handleAgentEnd(ctx: EmbeddedPiSubscribeContext) {
|
||||
}
|
||||
|
||||
ctx.flushBlockReplyBuffer();
|
||||
const pendingToolMediaReply = consumePendingToolMediaReply(ctx.state);
|
||||
if (pendingToolMediaReply && hasAssistantVisibleReply(pendingToolMediaReply)) {
|
||||
ctx.emitBlockReply(pendingToolMediaReply);
|
||||
}
|
||||
// Flush the reply pipeline so the response reaches the channel before
|
||||
// compaction wait blocks the run. This mirrors the pattern used by
|
||||
// handleToolExecutionStart and ensures delivery is not held hostage to
|
||||
|
||||
@@ -1,6 +1,8 @@
|
||||
import { describe, expect, it } from "vitest";
|
||||
import {
|
||||
buildAssistantStreamData,
|
||||
consumePendingToolMediaIntoReply,
|
||||
consumePendingToolMediaReply,
|
||||
hasAssistantVisibleReply,
|
||||
resolveSilentReplyFallbackText,
|
||||
} from "./pi-embedded-subscribe.handlers.messages.js";
|
||||
@@ -61,3 +63,58 @@ describe("buildAssistantStreamData", () => {
|
||||
});
|
||||
});
|
||||
});
|
||||
|
||||
// Covers how queued tool media is merged into the next outgoing block reply.
describe("consumePendingToolMediaIntoReply", () => {
  it("attaches queued tool media to the next assistant reply", () => {
    const state = {
      pendingToolMediaUrls: ["/tmp/a.png", "/tmp/b.png"],
      pendingToolAudioAsVoice: false,
    };

    expect(
      consumePendingToolMediaIntoReply(state, {
        text: "done",
      }),
    ).toEqual({
      text: "done",
      mediaUrls: ["/tmp/a.png", "/tmp/b.png"],
      audioAsVoice: undefined,
    });
    // The queue must be drained once its media has been attached.
    expect(state.pendingToolMediaUrls).toEqual([]);
  });

  it("preserves reasoning replies without consuming queued media", () => {
    const state = {
      pendingToolMediaUrls: ["/tmp/a.png"],
      pendingToolAudioAsVoice: true,
    };

    expect(
      consumePendingToolMediaIntoReply(state, {
        text: "thinking",
        isReasoning: true,
      }),
    ).toEqual({
      text: "thinking",
      isReasoning: true,
    });
    // Reasoning payloads leave the queue untouched for a later reply.
    expect(state.pendingToolMediaUrls).toEqual(["/tmp/a.png"]);
    expect(state.pendingToolAudioAsVoice).toBe(true);
  });
});
|
||||
|
||||
// Covers the media-only reply built from tool media that no assistant reply picked up.
describe("consumePendingToolMediaReply", () => {
  it("builds a media-only reply for orphaned tool media", () => {
    const state = {
      pendingToolMediaUrls: ["/tmp/reply.opus"],
      pendingToolAudioAsVoice: true,
    };

    expect(consumePendingToolMediaReply(state)).toEqual({
      mediaUrls: ["/tmp/reply.opus"],
      audioAsVoice: true,
    });
    // Both queue fields are reset after the reply has been built.
    expect(state.pendingToolMediaUrls).toEqual([]);
    expect(state.pendingToolAudioAsVoice).toBe(false);
  });
});
|
||||
|
||||
@@ -8,7 +8,11 @@ import {
|
||||
isMessagingToolDuplicateNormalized,
|
||||
normalizeTextForComparison,
|
||||
} from "./pi-embedded-helpers.js";
|
||||
import type { EmbeddedPiSubscribeContext } from "./pi-embedded-subscribe.handlers.types.js";
|
||||
import type { BlockReplyPayload } from "./pi-embedded-payloads.js";
|
||||
import type {
|
||||
EmbeddedPiSubscribeContext,
|
||||
EmbeddedPiSubscribeState,
|
||||
} from "./pi-embedded-subscribe.handlers.types.js";
|
||||
import { appendRawStream } from "./pi-embedded-subscribe.raw-stream.js";
|
||||
import {
|
||||
extractAssistantText,
|
||||
@@ -57,6 +61,51 @@ export function resolveSilentReplyFallbackText(params: {
|
||||
return fallback;
|
||||
}
|
||||
|
||||
/**
 * Resets the queued tool-media state.
 *
 * Replaces the pending URL list with a fresh empty array and clears the
 * voice-delivery flag.
 *
 * @param state - Object holding the pending tool media fields; mutated in place.
 */
function clearPendingToolMedia(
  state: Pick<EmbeddedPiSubscribeState, "pendingToolMediaUrls" | "pendingToolAudioAsVoice">,
) {
  state.pendingToolMediaUrls = [];
  state.pendingToolAudioAsVoice = false;
}
|
||||
|
||||
/**
 * Attaches any queued tool media to an outgoing block reply.
 *
 * Reasoning payloads are returned untouched and leave the queue intact. When
 * nothing is queued (no URLs and the voice flag is unset) the payload is also
 * returned as-is. Otherwise a merged copy is returned whose `mediaUrls` holds
 * the payload's own URLs followed by the queued ones (de-duplicated) and whose
 * `audioAsVoice` is set if either side requested it; the queue is then cleared.
 *
 * @param state - Holder of the pending tool media queue; cleared when consumed.
 * @param payload - The reply about to be emitted.
 * @returns The original payload, or a merged copy carrying the queued media.
 */
export function consumePendingToolMediaIntoReply(
  state: Pick<EmbeddedPiSubscribeState, "pendingToolMediaUrls" | "pendingToolAudioAsVoice">,
  payload: BlockReplyPayload,
): BlockReplyPayload {
  if (payload.isReasoning) {
    return payload;
  }
  if (state.pendingToolMediaUrls.length === 0 && !state.pendingToolAudioAsVoice) {
    return payload;
  }
  // Set preserves first-seen order, so payload URLs stay ahead of queued ones.
  const mergedMediaUrls = Array.from(
    new Set([...(payload.mediaUrls ?? []), ...state.pendingToolMediaUrls]),
  );
  const mergedPayload: BlockReplyPayload = {
    ...payload,
    mediaUrls: mergedMediaUrls.length ? mergedMediaUrls : undefined,
    // Collapse a falsy result to undefined rather than emitting `false`.
    audioAsVoice: payload.audioAsVoice || state.pendingToolAudioAsVoice || undefined,
  };
  clearPendingToolMedia(state);
  return mergedPayload;
}
|
||||
|
||||
/**
 * Builds a media-only reply from queued tool media and clears the queue.
 *
 * @param state - Holder of the pending tool media queue; cleared when consumed.
 * @returns `null` when nothing is queued (no URLs and the voice flag is unset);
 *   otherwise a payload with the de-duplicated queued URLs (or `undefined` when
 *   the list is empty) and `audioAsVoice` set only when the flag was raised.
 */
export function consumePendingToolMediaReply(
  state: Pick<EmbeddedPiSubscribeState, "pendingToolMediaUrls" | "pendingToolAudioAsVoice">,
): BlockReplyPayload | null {
  if (state.pendingToolMediaUrls.length === 0 && !state.pendingToolAudioAsVoice) {
    return null;
  }
  const payload: BlockReplyPayload = {
    mediaUrls: state.pendingToolMediaUrls.length
      ? Array.from(new Set(state.pendingToolMediaUrls))
      : undefined,
    audioAsVoice: state.pendingToolAudioAsVoice || undefined,
  };
  clearPendingToolMedia(state);
  return payload;
}
|
||||
|
||||
export function hasAssistantVisibleReply(params: {
|
||||
text?: string;
|
||||
mediaUrls?: string[];
|
||||
@@ -390,7 +439,7 @@ export function handleMessageEnd(
|
||||
} = splitResult;
|
||||
// Emit if there's content OR audioAsVoice flag (to propagate the flag).
|
||||
if (hasAssistantVisibleReply({ text: cleanedText, mediaUrls, audioAsVoice })) {
|
||||
emitBlockReplySafely({
|
||||
ctx.emitBlockReply({
|
||||
text: cleanedText,
|
||||
mediaUrls: mediaUrls?.length ? mediaUrls : undefined,
|
||||
audioAsVoice,
|
||||
|
||||
@@ -24,6 +24,8 @@ function createMockContext(overrides?: {
|
||||
pendingMessagingTexts: new Map(),
|
||||
pendingMessagingTargets: new Map(),
|
||||
pendingMessagingMediaUrls: new Map(),
|
||||
pendingToolMediaUrls: [],
|
||||
pendingToolAudioAsVoice: false,
|
||||
messagingToolSentTexts: [],
|
||||
messagingToolSentTextsNormalized: [],
|
||||
messagingToolSentMediaUrls: [],
|
||||
@@ -36,6 +38,7 @@ function createMockContext(overrides?: {
|
||||
emitToolSummary: vi.fn(),
|
||||
emitToolOutput: vi.fn(),
|
||||
trimMessagingToolSent: vi.fn(),
|
||||
emitBlockReply: vi.fn(),
|
||||
hookRunner: undefined,
|
||||
// Fill in remaining required fields with no-ops.
|
||||
blockChunker: null,
|
||||
@@ -114,9 +117,8 @@ describe("handleToolExecutionEnd media emission", () => {
|
||||
|
||||
await emitPngMediaToolResult(ctx);
|
||||
|
||||
expect(onToolResult).toHaveBeenCalledWith({
|
||||
mediaUrls: ["/tmp/screenshot.png"],
|
||||
});
|
||||
expect(onToolResult).not.toHaveBeenCalled();
|
||||
expect(ctx.state.pendingToolMediaUrls).toEqual(["/tmp/screenshot.png"]);
|
||||
});
|
||||
|
||||
it("does NOT emit local media for untrusted tools", async () => {
|
||||
@@ -126,6 +128,7 @@ describe("handleToolExecutionEnd media emission", () => {
|
||||
await emitUntrustedToolMediaResult(ctx, "/tmp/secret.png");
|
||||
|
||||
expect(onToolResult).not.toHaveBeenCalled();
|
||||
expect(ctx.state.pendingToolMediaUrls).toEqual([]);
|
||||
});
|
||||
|
||||
it("emits remote media for untrusted tools", async () => {
|
||||
@@ -134,12 +137,11 @@ describe("handleToolExecutionEnd media emission", () => {
|
||||
|
||||
await emitUntrustedToolMediaResult(ctx, "https://example.com/file.png");
|
||||
|
||||
expect(onToolResult).toHaveBeenCalledWith({
|
||||
mediaUrls: ["https://example.com/file.png"],
|
||||
});
|
||||
expect(onToolResult).not.toHaveBeenCalled();
|
||||
expect(ctx.state.pendingToolMediaUrls).toEqual(["https://example.com/file.png"]);
|
||||
});
|
||||
|
||||
it("does NOT emit media when verbose is full (emitToolOutput handles it)", async () => {
|
||||
it("does NOT queue legacy MEDIA paths when verbose is full", async () => {
|
||||
const onToolResult = vi.fn();
|
||||
const ctx = createMockContext({ shouldEmitToolOutput: true, onToolResult });
|
||||
|
||||
@@ -149,15 +151,31 @@ describe("handleToolExecutionEnd media emission", () => {
|
||||
// It may be called by emitToolOutput, but the new block should not fire.
|
||||
// Verify emitToolOutput was called instead.
|
||||
expect(ctx.emitToolOutput).toHaveBeenCalled();
|
||||
// The direct media emission should not have been called with just mediaUrls.
|
||||
const directMediaCalls = onToolResult.mock.calls.filter(
|
||||
(call: unknown[]) =>
|
||||
call[0] &&
|
||||
typeof call[0] === "object" &&
|
||||
"mediaUrls" in (call[0] as Record<string, unknown>) &&
|
||||
!("text" in (call[0] as Record<string, unknown>)),
|
||||
);
|
||||
expect(directMediaCalls).toHaveLength(0);
|
||||
expect(ctx.state.pendingToolMediaUrls).toEqual([]);
|
||||
});
|
||||
|
||||
it("still queues structured media when verbose is full", async () => {
|
||||
const ctx = createMockContext({ shouldEmitToolOutput: true, onToolResult: vi.fn() });
|
||||
|
||||
await handleToolExecutionEnd(ctx, {
|
||||
type: "tool_execution_end",
|
||||
toolName: "tts",
|
||||
toolCallId: "tc-1",
|
||||
isError: false,
|
||||
result: {
|
||||
content: [{ type: "text", text: "Generated audio reply." }],
|
||||
details: {
|
||||
media: {
|
||||
mediaUrl: "/tmp/reply.opus",
|
||||
audioAsVoice: true,
|
||||
},
|
||||
},
|
||||
},
|
||||
});
|
||||
|
||||
expect(ctx.emitToolOutput).toHaveBeenCalled();
|
||||
expect(ctx.state.pendingToolMediaUrls).toEqual(["/tmp/reply.opus"]);
|
||||
expect(ctx.state.pendingToolAudioAsVoice).toBe(true);
|
||||
});
|
||||
|
||||
it("does NOT emit media for error results", async () => {
|
||||
@@ -167,6 +185,7 @@ describe("handleToolExecutionEnd media emission", () => {
|
||||
await emitPngMediaToolResult(ctx, { isError: true });
|
||||
|
||||
expect(onToolResult).not.toHaveBeenCalled();
|
||||
expect(ctx.state.pendingToolMediaUrls).toEqual([]);
|
||||
});
|
||||
|
||||
it("does NOT emit when tool result has no media", async () => {
|
||||
@@ -184,6 +203,7 @@ describe("handleToolExecutionEnd media emission", () => {
|
||||
});
|
||||
|
||||
expect(onToolResult).not.toHaveBeenCalled();
|
||||
expect(ctx.state.pendingToolMediaUrls).toEqual([]);
|
||||
});
|
||||
|
||||
it("does NOT emit media for <media:audio> placeholder text", async () => {
|
||||
@@ -206,6 +226,7 @@ describe("handleToolExecutionEnd media emission", () => {
|
||||
});
|
||||
|
||||
expect(onToolResult).not.toHaveBeenCalled();
|
||||
expect(ctx.state.pendingToolMediaUrls).toEqual([]);
|
||||
});
|
||||
|
||||
it("does NOT emit media for malformed MEDIA:-prefixed prose", async () => {
|
||||
@@ -228,9 +249,10 @@ describe("handleToolExecutionEnd media emission", () => {
|
||||
});
|
||||
|
||||
expect(onToolResult).not.toHaveBeenCalled();
|
||||
expect(ctx.state.pendingToolMediaUrls).toEqual([]);
|
||||
});
|
||||
|
||||
it("emits media from details.path fallback when no MEDIA: text", async () => {
|
||||
it("queues media from details.path fallback when no MEDIA: text", async () => {
|
||||
const onToolResult = vi.fn();
|
||||
const ctx = createMockContext({ shouldEmitToolOutput: false, onToolResult });
|
||||
|
||||
@@ -248,8 +270,29 @@ describe("handleToolExecutionEnd media emission", () => {
|
||||
},
|
||||
});
|
||||
|
||||
expect(onToolResult).toHaveBeenCalledWith({
|
||||
mediaUrls: ["/tmp/canvas-output.png"],
|
||||
expect(onToolResult).not.toHaveBeenCalled();
|
||||
expect(ctx.state.pendingToolMediaUrls).toEqual(["/tmp/canvas-output.png"]);
|
||||
});
|
||||
|
||||
it("queues structured details.media and voice metadata", async () => {
|
||||
const ctx = createMockContext({ shouldEmitToolOutput: false, onToolResult: vi.fn() });
|
||||
|
||||
await handleToolExecutionEnd(ctx, {
|
||||
type: "tool_execution_end",
|
||||
toolName: "tts",
|
||||
toolCallId: "tc-1",
|
||||
isError: false,
|
||||
result: {
|
||||
details: {
|
||||
media: {
|
||||
mediaUrl: "/tmp/reply.opus",
|
||||
audioAsVoice: true,
|
||||
},
|
||||
},
|
||||
},
|
||||
});
|
||||
|
||||
expect(ctx.state.pendingToolMediaUrls).toEqual(["/tmp/reply.opus"]);
|
||||
expect(ctx.state.pendingToolAudioAsVoice).toBe(true);
|
||||
});
|
||||
});
|
||||
|
||||
@@ -40,6 +40,8 @@ function createTestContext(): {
|
||||
pendingMessagingTargets: new Map<string, MessagingToolSend>(),
|
||||
pendingMessagingTexts: new Map<string, string>(),
|
||||
pendingMessagingMediaUrls: new Map<string, string[]>(),
|
||||
pendingToolMediaUrls: [],
|
||||
pendingToolAudioAsVoice: false,
|
||||
messagingToolSentTexts: [],
|
||||
messagingToolSentTextsNormalized: [],
|
||||
messagingToolSentMediaUrls: [],
|
||||
|
||||
@@ -13,9 +13,9 @@ import type {
|
||||
ToolHandlerContext,
|
||||
} from "./pi-embedded-subscribe.handlers.types.js";
|
||||
import {
|
||||
extractToolResultMediaArtifact,
|
||||
extractMessagingToolSend,
|
||||
extractToolErrorMessage,
|
||||
extractToolResultMediaPaths,
|
||||
extractToolResultText,
|
||||
filterToolResultMediaUrls,
|
||||
isToolResultError,
|
||||
@@ -143,6 +143,23 @@ function collectMessagingMediaUrlsFromToolResult(result: unknown): string[] {
|
||||
return urls;
|
||||
}
|
||||
|
||||
/**
 * Queues tool-produced media on the handler state.
 *
 * URLs already present in `ctx.state.pendingToolMediaUrls` (or repeated within
 * `mediaReply.mediaUrls`) are skipped; new ones are appended in order. The
 * voice flag is raised when `mediaReply.audioAsVoice` is truthy and is never
 * lowered here.
 *
 * @param ctx - Tool handler context whose state receives the queued media.
 * @param mediaReply - Media URLs to queue plus the optional voice-delivery hint.
 */
function queuePendingToolMedia(
  ctx: ToolHandlerContext,
  mediaReply: { mediaUrls: string[]; audioAsVoice?: boolean },
) {
  const seen = new Set(ctx.state.pendingToolMediaUrls);
  for (const mediaUrl of mediaReply.mediaUrls) {
    if (seen.has(mediaUrl)) {
      continue;
    }
    seen.add(mediaUrl);
    ctx.state.pendingToolMediaUrls.push(mediaUrl);
  }
  if (mediaReply.audioAsVoice) {
    ctx.state.pendingToolAudioAsVoice = true;
  }
}
|
||||
|
||||
function readExecApprovalPendingDetails(result: unknown): {
|
||||
approvalId: string;
|
||||
approvalSlug: string;
|
||||
@@ -226,12 +243,20 @@ async function emitToolResultOutput(params: {
|
||||
sanitizedResult: unknown;
|
||||
}) {
|
||||
const { ctx, toolName, meta, isToolError, result, sanitizedResult } = params;
|
||||
if (!ctx.params.onToolResult) {
|
||||
return;
|
||||
}
|
||||
|
||||
const hasStructuredMedia =
|
||||
result &&
|
||||
typeof result === "object" &&
|
||||
(result as { details?: unknown }).details &&
|
||||
typeof (result as { details?: unknown }).details === "object" &&
|
||||
!Array.isArray((result as { details?: unknown }).details) &&
|
||||
typeof ((result as { details?: { media?: unknown } }).details?.media ?? undefined) ===
|
||||
"object" &&
|
||||
!Array.isArray((result as { details?: { media?: unknown } }).details?.media);
|
||||
const approvalPending = readExecApprovalPendingDetails(result);
|
||||
if (!isToolError && approvalPending) {
|
||||
if (!ctx.params.onToolResult) {
|
||||
return;
|
||||
}
|
||||
try {
|
||||
await ctx.params.onToolResult(
|
||||
buildExecApprovalPendingReplyPayload({
|
||||
@@ -254,6 +279,9 @@ async function emitToolResultOutput(params: {
|
||||
|
||||
const approvalUnavailable = readExecApprovalUnavailableDetails(result);
|
||||
if (!isToolError && approvalUnavailable) {
|
||||
if (!ctx.params.onToolResult) {
|
||||
return;
|
||||
}
|
||||
try {
|
||||
await ctx.params.onToolResult?.(
|
||||
buildExecApprovalUnavailableReplyPayload({
|
||||
@@ -275,24 +303,27 @@ async function emitToolResultOutput(params: {
|
||||
if (outputText) {
|
||||
ctx.emitToolOutput(toolName, meta, outputText);
|
||||
}
|
||||
return;
|
||||
if (!hasStructuredMedia) {
|
||||
return;
|
||||
}
|
||||
}
|
||||
|
||||
if (isToolError) {
|
||||
return;
|
||||
}
|
||||
|
||||
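// emitToolOutput() already handles legacy MEDIA: directives when verbose output
// is enabled. This path queues tool media for delivery with a later reply:
// structured details.media in any mode, legacy media only when verbose output
// is off.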
|
||||
const mediaPaths = filterToolResultMediaUrls(toolName, extractToolResultMediaPaths(result));
|
||||
if (mediaPaths.length === 0) {
|
||||
const mediaReply = extractToolResultMediaArtifact(result);
|
||||
if (!mediaReply) {
|
||||
return;
|
||||
}
|
||||
try {
|
||||
void ctx.params.onToolResult({ mediaUrls: mediaPaths });
|
||||
} catch {
|
||||
// ignore delivery failures
|
||||
const mediaUrls = filterToolResultMediaUrls(toolName, mediaReply.mediaUrls);
|
||||
if (mediaUrls.length === 0) {
|
||||
return;
|
||||
}
|
||||
queuePendingToolMedia(ctx, {
|
||||
mediaUrls,
|
||||
...(mediaReply.audioAsVoice ? { audioAsVoice: true } : {}),
|
||||
});
|
||||
}
|
||||
|
||||
export async function handleToolExecutionStart(
|
||||
|
||||
@@ -5,6 +5,7 @@ import type { InlineCodeState } from "../markdown/code-spans.js";
|
||||
import type { HookRunner } from "../plugins/hooks.js";
|
||||
import type { EmbeddedBlockChunker } from "./pi-embedded-block-chunker.js";
|
||||
import type { MessagingToolSend } from "./pi-embedded-messaging.js";
|
||||
import type { BlockReplyPayload } from "./pi-embedded-payloads.js";
|
||||
import type {
|
||||
BlockReplyChunking,
|
||||
SubscribeEmbeddedPiSessionParams,
|
||||
@@ -76,6 +77,8 @@ export type EmbeddedPiSubscribeState = {
|
||||
pendingMessagingTargets: Map<string, MessagingToolSend>;
|
||||
successfulCronAdds: number;
|
||||
pendingMessagingMediaUrls: Map<string, string[]>;
|
||||
pendingToolMediaUrls: string[];
|
||||
pendingToolAudioAsVoice: boolean;
|
||||
deterministicApprovalPromptSent: boolean;
|
||||
lastAssistant?: AgentMessage;
|
||||
};
|
||||
@@ -124,6 +127,7 @@ export type EmbeddedPiSubscribeContext = {
|
||||
incrementCompactionCount: () => void;
|
||||
getUsageTotals: () => NormalizedUsage | undefined;
|
||||
getCompactionCount: () => number;
|
||||
emitBlockReply: (payload: BlockReplyPayload) => void;
|
||||
};
|
||||
|
||||
/**
|
||||
@@ -151,6 +155,8 @@ export type ToolHandlerState = Pick<
|
||||
| "pendingMessagingTargets"
|
||||
| "pendingMessagingTexts"
|
||||
| "pendingMessagingMediaUrls"
|
||||
| "pendingToolMediaUrls"
|
||||
| "pendingToolAudioAsVoice"
|
||||
| "messagingToolSentTexts"
|
||||
| "messagingToolSentTextsNormalized"
|
||||
| "messagingToolSentMediaUrls"
|
||||
|
||||
@@ -1,5 +1,6 @@
|
||||
import { describe, expect, it } from "vitest";
|
||||
import {
|
||||
extractToolResultMediaArtifact,
|
||||
extractToolResultMediaPaths,
|
||||
isToolResultMediaTrusted,
|
||||
} from "./pi-embedded-subscribe.tools.js";
|
||||
@@ -15,14 +16,40 @@ describe("extractToolResultMediaPaths", () => {
|
||||
expect(extractToolResultMediaPaths(42)).toEqual([]);
|
||||
});
|
||||
|
||||
it("returns empty array when content is missing", () => {
|
||||
expect(extractToolResultMediaPaths({ details: { path: "/tmp/img.png" } })).toEqual([]);
|
||||
it("extracts structured details.media without content blocks", () => {
|
||||
expect(
|
||||
extractToolResultMediaArtifact({
|
||||
details: {
|
||||
media: {
|
||||
mediaUrls: ["/tmp/img.png", "/tmp/img-2.png"],
|
||||
},
|
||||
},
|
||||
}),
|
||||
).toEqual({
|
||||
mediaUrls: ["/tmp/img.png", "/tmp/img-2.png"],
|
||||
});
|
||||
});
|
||||
|
||||
it("returns empty array when content has no text or image blocks", () => {
|
||||
expect(extractToolResultMediaPaths({ content: [{ type: "other" }] })).toEqual([]);
|
||||
});
|
||||
|
||||
it("extracts structured media with audioAsVoice", () => {
|
||||
expect(
|
||||
extractToolResultMediaArtifact({
|
||||
details: {
|
||||
media: {
|
||||
mediaUrl: "/tmp/reply.opus",
|
||||
audioAsVoice: true,
|
||||
},
|
||||
},
|
||||
}),
|
||||
).toEqual({
|
||||
mediaUrls: ["/tmp/reply.opus"],
|
||||
audioAsVoice: true,
|
||||
});
|
||||
});
|
||||
|
||||
it("extracts MEDIA: path from text content block", () => {
|
||||
const result = {
|
||||
content: [
|
||||
|
||||
@@ -187,25 +187,75 @@ export function filterToolResultMediaUrls(
|
||||
* Extract media file paths from a tool result.
|
||||
*
|
||||
* Strategy (first match wins):
|
||||
* 1. Parse `MEDIA:` tokens from text content blocks (all OpenClaw tools).
|
||||
* 2. Fall back to `details.path` when image content exists (OpenClaw imageResult).
|
||||
* 1. Read structured `details.media` attachments from tool details.
|
||||
* 2. Parse legacy `MEDIA:` tokens from text content blocks.
|
||||
* 3. Fall back to `details.path` when image content exists (legacy imageResult).
|
||||
*
|
||||
* Returns an empty array when no media is found (e.g. Pi SDK `read` tool
|
||||
* returns base64 image data but no file path; those need a different delivery
|
||||
* path like saving to a temp file).
|
||||
*/
|
||||
export function extractToolResultMediaPaths(result: unknown): string[] {
|
||||
/** Media extracted from a tool result, plus an optional voice-delivery hint. */
export type ToolResultMediaArtifact = {
  /** Media URLs or file paths found in the tool result. */
  mediaUrls: string[];
  /** Present when the tool result asked for voice delivery of its audio. */
  audioAsVoice?: boolean;
};
|
||||
|
||||
/**
 * Reads the structured `details.media` object from a tool result record.
 *
 * @param result - Tool result already narrowed to a plain record.
 * @returns `details.media` when both `details` and `details.media` are
 *   non-null, non-array objects; otherwise `undefined`.
 */
function readToolResultDetailsMedia(
  result: Record<string, unknown>,
): Record<string, unknown> | undefined {
  const details =
    result.details && typeof result.details === "object" && !Array.isArray(result.details)
      ? (result.details as Record<string, unknown>)
      : undefined;
  const media =
    details?.media && typeof details.media === "object" && !Array.isArray(details.media)
      ? (details.media as Record<string, unknown>)
      : undefined;
  return media;
}
|
||||
|
||||
/**
 * Collects media URLs from a structured media object.
 *
 * Reads the singular `mediaUrl` first, then every string entry of `mediaUrls`.
 * Values are trimmed, blank and non-string entries are dropped, and duplicates
 * are removed while keeping first-seen order.
 *
 * @param media - The structured media record (e.g. a tool result's `details.media`).
 * @returns The de-duplicated list of trimmed URLs; empty when none are present.
 */
function collectStructuredMediaUrls(media: Record<string, unknown>): string[] {
  const urls: string[] = [];
  if (typeof media.mediaUrl === "string" && media.mediaUrl.trim()) {
    urls.push(media.mediaUrl.trim());
  }
  if (Array.isArray(media.mediaUrls)) {
    urls.push(
      ...media.mediaUrls
        .filter((value): value is string => typeof value === "string")
        .map((value) => value.trim())
        .filter(Boolean),
    );
  }
  return Array.from(new Set(urls));
}
|
||||
|
||||
export function extractToolResultMediaArtifact(
|
||||
result: unknown,
|
||||
): ToolResultMediaArtifact | undefined {
|
||||
if (!result || typeof result !== "object") {
|
||||
return [];
|
||||
return undefined;
|
||||
}
|
||||
const record = result as Record<string, unknown>;
|
||||
const content = Array.isArray(record.content) ? record.content : null;
|
||||
if (!content) {
|
||||
return [];
|
||||
const detailsMedia = readToolResultDetailsMedia(record);
|
||||
if (detailsMedia) {
|
||||
const mediaUrls = collectStructuredMediaUrls(detailsMedia);
|
||||
if (mediaUrls.length > 0) {
|
||||
return {
|
||||
mediaUrls,
|
||||
...(detailsMedia.audioAsVoice === true ? { audioAsVoice: true } : {}),
|
||||
};
|
||||
}
|
||||
}
|
||||
|
||||
// Extract MEDIA: paths from text content blocks using the shared parser so
|
||||
// directive matching and validation stay in sync with outbound reply parsing.
|
||||
const content = Array.isArray(record.content) ? record.content : null;
|
||||
if (!content) {
|
||||
return undefined;
|
||||
}
|
||||
|
||||
// Extract legacy MEDIA: paths from text content blocks using the shared
|
||||
// parser so directive matching and validation stay in sync with outbound
|
||||
// reply parsing.
|
||||
const paths: string[] = [];
|
||||
let hasImageContent = false;
|
||||
for (const item of content) {
|
||||
@@ -226,19 +276,24 @@ export function extractToolResultMediaPaths(result: unknown): string[] {
|
||||
}
|
||||
|
||||
if (paths.length > 0) {
|
||||
return paths;
|
||||
return { mediaUrls: paths };
|
||||
}
|
||||
|
||||
// Fall back to details.path when image content exists but no MEDIA: text.
|
||||
// Fall back to legacy details.path when image content exists but no
|
||||
// structured media details or MEDIA: text.
|
||||
if (hasImageContent) {
|
||||
const details = record.details as Record<string, unknown> | undefined;
|
||||
const p = typeof details?.path === "string" ? details.path.trim() : "";
|
||||
if (p) {
|
||||
return [p];
|
||||
return { mediaUrls: [p] };
|
||||
}
|
||||
}
|
||||
|
||||
return [];
|
||||
return undefined;
|
||||
}
|
||||
|
||||
/**
 * Returns only the media URLs from a tool result.
 *
 * Thin wrapper over `extractToolResultMediaArtifact` that discards everything
 * except `mediaUrls`.
 *
 * @param result - Raw tool result of unknown shape.
 * @returns The extracted media URLs, or an empty array when no artifact is found.
 */
export function extractToolResultMediaPaths(result: unknown): string[] {
  return extractToolResultMediaArtifact(result)?.mediaUrls ?? [];
}
|
||||
|
||||
export function isToolResultError(result: unknown): boolean {
|
||||
|
||||
@@ -11,7 +11,9 @@ import {
|
||||
isMessagingToolDuplicateNormalized,
|
||||
normalizeTextForComparison,
|
||||
} from "./pi-embedded-helpers.js";
|
||||
import type { BlockReplyPayload } from "./pi-embedded-payloads.js";
|
||||
import { createEmbeddedPiSessionEventHandler } from "./pi-embedded-subscribe.handlers.js";
|
||||
import { consumePendingToolMediaIntoReply } from "./pi-embedded-subscribe.handlers.messages.js";
|
||||
import type {
|
||||
EmbeddedPiSubscribeContext,
|
||||
EmbeddedPiSubscribeState,
|
||||
@@ -78,6 +80,8 @@ export function subscribeEmbeddedPiSession(params: SubscribeEmbeddedPiSessionPar
|
||||
pendingMessagingTargets: new Map(),
|
||||
successfulCronAdds: 0,
|
||||
pendingMessagingMediaUrls: new Map(),
|
||||
pendingToolMediaUrls: [],
|
||||
pendingToolAudioAsVoice: false,
|
||||
deterministicApprovalPromptSent: false,
|
||||
};
|
||||
const usageTotals = {
|
||||
@@ -113,6 +117,9 @@ export function subscribeEmbeddedPiSession(params: SubscribeEmbeddedPiSessionPar
|
||||
log.warn(`block reply callback failed: ${String(err)}`);
|
||||
});
|
||||
};
|
||||
const emitBlockReply = (payload: BlockReplyPayload) => {
|
||||
emitBlockReplySafely(consumePendingToolMediaIntoReply(state, payload));
|
||||
};
|
||||
|
||||
const resetAssistantMessageState = (nextAssistantTextBaseline: number) => {
|
||||
state.deltaBuffer = "";
|
||||
@@ -523,7 +530,7 @@ export function subscribeEmbeddedPiSession(params: SubscribeEmbeddedPiSessionPar
|
||||
if (!cleanedText && (!mediaUrls || mediaUrls.length === 0) && !audioAsVoice) {
|
||||
return;
|
||||
}
|
||||
emitBlockReplySafely({
|
||||
emitBlockReply({
|
||||
text: cleanedText,
|
||||
mediaUrls: mediaUrls?.length ? mediaUrls : undefined,
|
||||
audioAsVoice,
|
||||
@@ -599,6 +606,8 @@ export function subscribeEmbeddedPiSession(params: SubscribeEmbeddedPiSessionPar
|
||||
pendingMessagingTargets.clear();
|
||||
state.successfulCronAdds = 0;
|
||||
state.pendingMessagingMediaUrls.clear();
|
||||
state.pendingToolMediaUrls = [];
|
||||
state.pendingToolAudioAsVoice = false;
|
||||
state.deterministicApprovalPromptSent = false;
|
||||
resetAssistantMessageState(0);
|
||||
};
|
||||
@@ -624,6 +633,7 @@ export function subscribeEmbeddedPiSession(params: SubscribeEmbeddedPiSessionPar
|
||||
stripBlockTags,
|
||||
emitBlockChunk,
|
||||
flushBlockReplyBuffer,
|
||||
emitBlockReply,
|
||||
emitReasoningStream,
|
||||
consumeReplyDirectives,
|
||||
consumePartialReplyDirectives,
|
||||
|
||||
@@ -6,6 +6,8 @@ export function createBaseToolHandlerState() {
|
||||
pendingMessagingTexts: new Map<string, string>(),
|
||||
pendingMessagingTargets: new Map<string, unknown>(),
|
||||
pendingMessagingMediaUrls: new Map<string, string[]>(),
|
||||
pendingToolMediaUrls: [] as string[],
|
||||
pendingToolAudioAsVoice: false,
|
||||
messagingToolSentTexts: [] as string[],
|
||||
messagingToolSentTextsNormalized: [] as string[],
|
||||
messagingToolSentMediaUrls: [] as string[],
|
||||
|
||||
@@ -110,7 +110,7 @@ describe("createImageGenerateTool", () => {
|
||||
});
|
||||
});
|
||||
|
||||
it("generates images and returns MEDIA paths", async () => {
|
||||
it("generates images and returns details.media paths", async () => {
|
||||
const generateImage = vi.spyOn(imageGenerationRuntime, "generateImage").mockResolvedValue({
|
||||
provider: "openai",
|
||||
model: "gpt-image-1",
|
||||
@@ -215,14 +215,16 @@ describe("createImageGenerateTool", () => {
|
||||
provider: "openai",
|
||||
model: "gpt-image-1",
|
||||
count: 2,
|
||||
media: {
|
||||
mediaUrls: ["/tmp/generated-1.png", "/tmp/generated-2.png"],
|
||||
},
|
||||
paths: ["/tmp/generated-1.png", "/tmp/generated-2.png"],
|
||||
filename: "cats/output.png",
|
||||
revisedPrompts: ["A more cinematic cat"],
|
||||
},
|
||||
});
|
||||
const text = (result.content?.[0] as { text: string } | undefined)?.text ?? "";
|
||||
expect(text).toContain("MEDIA:/tmp/generated-1.png");
|
||||
expect(text).toContain("MEDIA:/tmp/generated-2.png");
|
||||
expect(text).not.toContain("MEDIA:");
|
||||
});
|
||||
|
||||
it("rejects counts outside the supported range", async () => {
|
||||
|
||||
@@ -610,7 +610,6 @@ export function createImageGenerateTool(options?: {
|
||||
.filter((entry): entry is string => Boolean(entry));
|
||||
const lines = [
|
||||
`Generated ${savedImages.length} image${savedImages.length === 1 ? "" : "s"} with ${result.provider}/${result.model}.`,
|
||||
...savedImages.map((image) => `MEDIA:${image.path}`),
|
||||
];
|
||||
|
||||
return {
|
||||
@@ -619,6 +618,9 @@ export function createImageGenerateTool(options?: {
|
||||
provider: result.provider,
|
||||
model: result.model,
|
||||
count: savedImages.length,
|
||||
media: {
|
||||
mediaUrls: savedImages.map((image) => image.path),
|
||||
},
|
||||
paths: savedImages.map((image) => image.path),
|
||||
...(imageInputs.length === 1
|
||||
? {
|
||||
|
||||
@@ -310,7 +310,6 @@ export function createNodesTool(options?: {
|
||||
expectedHost: resolvedNode.remoteIp,
|
||||
invalidPayloadMessage: "invalid camera.snap payload",
|
||||
});
|
||||
content.push({ type: "text", text: `MEDIA:${filePath}` });
|
||||
if (options?.modelHasVision && payload.base64) {
|
||||
content.push({
|
||||
type: "image",
|
||||
|
||||
@@ -4,7 +4,12 @@ vi.mock("../../auto-reply/tokens.js", () => ({
|
||||
SILENT_REPLY_TOKEN: "QUIET_TOKEN",
|
||||
}));
|
||||
|
||||
vi.mock("../../tts/tts.js", () => ({
|
||||
textToSpeech: vi.fn(),
|
||||
}));
|
||||
|
||||
const { createTtsTool } = await import("./tts-tool.js");
|
||||
const { textToSpeech } = await import("../../tts/tts.js");
|
||||
|
||||
describe("createTtsTool", () => {
|
||||
it("uses SILENT_REPLY_TOKEN in guidance text", () => {
|
||||
@@ -13,4 +18,29 @@ describe("createTtsTool", () => {
|
||||
expect(tool.description).toContain("QUIET_TOKEN");
|
||||
expect(tool.description).not.toContain("NO_REPLY");
|
||||
});
|
||||
|
||||
it("stores audio delivery in details.media", async () => {
|
||||
vi.mocked(textToSpeech).mockResolvedValue({
|
||||
success: true,
|
||||
audioPath: "/tmp/reply.opus",
|
||||
provider: "test",
|
||||
voiceCompatible: true,
|
||||
});
|
||||
|
||||
const tool = createTtsTool();
|
||||
const result = await tool.execute("call-1", { text: "hello" });
|
||||
|
||||
expect(result).toMatchObject({
|
||||
content: [{ type: "text", text: "Generated audio reply." }],
|
||||
details: {
|
||||
audioPath: "/tmp/reply.opus",
|
||||
provider: "test",
|
||||
media: {
|
||||
mediaUrl: "/tmp/reply.opus",
|
||||
audioAsVoice: true,
|
||||
},
|
||||
},
|
||||
});
|
||||
expect(JSON.stringify(result.content)).not.toContain("MEDIA:");
|
||||
});
|
||||
});
|
||||
|
||||
@@ -35,15 +35,16 @@ export function createTtsTool(opts?: {
|
||||
});
|
||||
|
||||
if (result.success && result.audioPath) {
|
||||
const lines: string[] = [];
|
||||
// Tag Telegram Opus output as a voice bubble instead of a file attachment.
|
||||
if (result.voiceCompatible) {
|
||||
lines.push("[[audio_as_voice]]");
|
||||
}
|
||||
lines.push(`MEDIA:${result.audioPath}`);
|
||||
return {
|
||||
content: [{ type: "text", text: lines.join("\n") }],
|
||||
details: { audioPath: result.audioPath, provider: result.provider },
|
||||
content: [{ type: "text", text: "Generated audio reply." }],
|
||||
details: {
|
||||
audioPath: result.audioPath,
|
||||
provider: result.provider,
|
||||
media: {
|
||||
mediaUrl: result.audioPath,
|
||||
...(result.voiceCompatible ? { audioAsVoice: true } : {}),
|
||||
},
|
||||
},
|
||||
};
|
||||
}
|
||||
|
||||
|
||||
Reference in New Issue
Block a user