refactor(agents): unify tool media reply delivery

This commit is contained in:
Peter Steinberger
2026-03-22 00:49:18 -07:00
parent 2d492ab534
commit 3cd4978fc2
19 changed files with 414 additions and 70 deletions

View File

@@ -390,9 +390,9 @@ Notes:
## Agent tool
The `tts` tool converts text to speech and returns a `MEDIA:` path. When the
result is Telegram-compatible, the tool includes `[[audio_as_voice]]` so
Telegram sends a voice bubble.
The `tts` tool converts text to speech and returns an audio attachment for
reply delivery. When the result is Telegram-compatible, OpenClaw marks it for
voice-bubble delivery.
## Gateway RPC

View File

@@ -390,9 +390,9 @@ Notes:
## Agent tool
The `tts` tool converts text to speech and returns a `MEDIA:` path. When the
result is Telegram-compatible, the tool includes `[[audio_as_voice]]` so
Telegram sends a voice bubble.
The `tts` tool converts text to speech and returns an audio attachment for
reply delivery. When the result is Telegram-compatible, OpenClaw marks it for
voice-bubble delivery.
## Gateway RPC

View File

@@ -11,16 +11,20 @@ function createContext(
lastAssistant: unknown,
overrides?: { onAgentEvent?: (event: unknown) => void },
): EmbeddedPiSubscribeContext {
const onBlockReply = vi.fn();
return {
params: {
runId: "run-1",
config: {},
sessionKey: "agent:main:main",
onAgentEvent: overrides?.onAgentEvent,
onBlockReply,
},
state: {
lastAssistant: lastAssistant as EmbeddedPiSubscribeContext["state"]["lastAssistant"],
pendingCompactionRetry: 0,
pendingToolMediaUrls: [],
pendingToolAudioAsVoice: false,
blockState: {
thinking: true,
final: true,
@@ -32,6 +36,7 @@ function createContext(
warn: vi.fn(),
},
flushBlockReplyBuffer: vi.fn(),
emitBlockReply: onBlockReply,
resolveCompactionRetry: vi.fn(),
maybeResolveCompactionWait: vi.fn(),
} as unknown as EmbeddedPiSubscribeContext;
@@ -159,4 +164,19 @@ describe("handleAgentEnd", () => {
expect(ctx.log.warn).not.toHaveBeenCalled();
expect(ctx.log.debug).toHaveBeenCalledWith("embedded run agent end: runId=run-1 isError=false");
});
it("flushes orphaned tool media as a media-only block reply", () => {
const ctx = createContext(undefined);
ctx.state.pendingToolMediaUrls = ["/tmp/reply.opus"];
ctx.state.pendingToolAudioAsVoice = true;
handleAgentEnd(ctx);
expect(ctx.emitBlockReply).toHaveBeenCalledWith({
mediaUrls: ["/tmp/reply.opus"],
audioAsVoice: true,
});
expect(ctx.state.pendingToolMediaUrls).toEqual([]);
expect(ctx.state.pendingToolAudioAsVoice).toBe(false);
});
});

View File

@@ -6,6 +6,10 @@ import {
sanitizeForConsole,
} from "./pi-embedded-error-observation.js";
import { classifyFailoverReason, formatAssistantErrorText } from "./pi-embedded-helpers.js";
import {
consumePendingToolMediaReply,
hasAssistantVisibleReply,
} from "./pi-embedded-subscribe.handlers.messages.js";
import type { EmbeddedPiSubscribeContext } from "./pi-embedded-subscribe.handlers.types.js";
import { isAssistantMessage } from "./pi-embedded-utils.js";
@@ -97,6 +101,10 @@ export function handleAgentEnd(ctx: EmbeddedPiSubscribeContext) {
}
ctx.flushBlockReplyBuffer();
const pendingToolMediaReply = consumePendingToolMediaReply(ctx.state);
if (pendingToolMediaReply && hasAssistantVisibleReply(pendingToolMediaReply)) {
ctx.emitBlockReply(pendingToolMediaReply);
}
// Flush the reply pipeline so the response reaches the channel before
// compaction wait blocks the run. This mirrors the pattern used by
// handleToolExecutionStart and ensures delivery is not held hostage to

View File

@@ -1,6 +1,8 @@
import { describe, expect, it } from "vitest";
import {
buildAssistantStreamData,
consumePendingToolMediaIntoReply,
consumePendingToolMediaReply,
hasAssistantVisibleReply,
resolveSilentReplyFallbackText,
} from "./pi-embedded-subscribe.handlers.messages.js";
@@ -61,3 +63,58 @@ describe("buildAssistantStreamData", () => {
});
});
});
describe("consumePendingToolMediaIntoReply", () => {
it("attaches queued tool media to the next assistant reply", () => {
const state = {
pendingToolMediaUrls: ["/tmp/a.png", "/tmp/b.png"],
pendingToolAudioAsVoice: false,
};
expect(
consumePendingToolMediaIntoReply(state, {
text: "done",
}),
).toEqual({
text: "done",
mediaUrls: ["/tmp/a.png", "/tmp/b.png"],
audioAsVoice: undefined,
});
expect(state.pendingToolMediaUrls).toEqual([]);
});
it("preserves reasoning replies without consuming queued media", () => {
const state = {
pendingToolMediaUrls: ["/tmp/a.png"],
pendingToolAudioAsVoice: true,
};
expect(
consumePendingToolMediaIntoReply(state, {
text: "thinking",
isReasoning: true,
}),
).toEqual({
text: "thinking",
isReasoning: true,
});
expect(state.pendingToolMediaUrls).toEqual(["/tmp/a.png"]);
expect(state.pendingToolAudioAsVoice).toBe(true);
});
});
describe("consumePendingToolMediaReply", () => {
it("builds a media-only reply for orphaned tool media", () => {
const state = {
pendingToolMediaUrls: ["/tmp/reply.opus"],
pendingToolAudioAsVoice: true,
};
expect(consumePendingToolMediaReply(state)).toEqual({
mediaUrls: ["/tmp/reply.opus"],
audioAsVoice: true,
});
expect(state.pendingToolMediaUrls).toEqual([]);
expect(state.pendingToolAudioAsVoice).toBe(false);
});
});

View File

@@ -8,7 +8,11 @@ import {
isMessagingToolDuplicateNormalized,
normalizeTextForComparison,
} from "./pi-embedded-helpers.js";
import type { EmbeddedPiSubscribeContext } from "./pi-embedded-subscribe.handlers.types.js";
import type { BlockReplyPayload } from "./pi-embedded-payloads.js";
import type {
EmbeddedPiSubscribeContext,
EmbeddedPiSubscribeState,
} from "./pi-embedded-subscribe.handlers.types.js";
import { appendRawStream } from "./pi-embedded-subscribe.raw-stream.js";
import {
extractAssistantText,
@@ -57,6 +61,51 @@ export function resolveSilentReplyFallbackText(params: {
return fallback;
}
/**
 * Reset the queued tool-media state so attachments that were already
 * delivered are never re-sent with a later reply.
 */
function clearPendingToolMedia(
  state: Pick<EmbeddedPiSubscribeState, "pendingToolMediaUrls" | "pendingToolAudioAsVoice">,
) {
  state.pendingToolAudioAsVoice = false;
  state.pendingToolMediaUrls = [];
}
/**
 * Merge any queued tool media into an outgoing assistant reply payload.
 *
 * Reasoning payloads pass through untouched so queued attachments stay
 * pending for the next user-visible reply. When media is merged, the queue
 * is cleared so the same attachment is not delivered twice. Media URLs are
 * de-duplicated, keeping payload order ahead of queued order.
 */
export function consumePendingToolMediaIntoReply(
  state: Pick<EmbeddedPiSubscribeState, "pendingToolMediaUrls" | "pendingToolAudioAsVoice">,
  payload: BlockReplyPayload,
): BlockReplyPayload {
  if (payload.isReasoning) {
    return payload;
  }
  const hasQueuedMedia =
    state.pendingToolMediaUrls.length > 0 || state.pendingToolAudioAsVoice;
  if (!hasQueuedMedia) {
    return payload;
  }
  // Set preserves insertion order: payload media first, then queued media.
  const combined = new Set(payload.mediaUrls ?? []);
  for (const url of state.pendingToolMediaUrls) {
    combined.add(url);
  }
  const merged: BlockReplyPayload = {
    ...payload,
    mediaUrls: combined.size > 0 ? [...combined] : undefined,
    audioAsVoice: payload.audioAsVoice || state.pendingToolAudioAsVoice || undefined,
  };
  state.pendingToolMediaUrls = [];
  state.pendingToolAudioAsVoice = false;
  return merged;
}
/**
 * Drain queued tool media into a standalone, media-only reply payload.
 *
 * Used when a run ends with tool attachments that were never absorbed by an
 * assistant reply (orphaned media). Returns null when nothing is queued;
 * otherwise clears the queue and returns de-duplicated media URLs plus the
 * voice-bubble flag when it was set.
 */
export function consumePendingToolMediaReply(
  state: Pick<EmbeddedPiSubscribeState, "pendingToolMediaUrls" | "pendingToolAudioAsVoice">,
): BlockReplyPayload | null {
  const { pendingToolMediaUrls, pendingToolAudioAsVoice } = state;
  if (pendingToolMediaUrls.length === 0 && !pendingToolAudioAsVoice) {
    return null;
  }
  const dedupedUrls = [...new Set(pendingToolMediaUrls)];
  state.pendingToolMediaUrls = [];
  state.pendingToolAudioAsVoice = false;
  return {
    mediaUrls: dedupedUrls.length > 0 ? dedupedUrls : undefined,
    audioAsVoice: pendingToolAudioAsVoice || undefined,
  };
}
export function hasAssistantVisibleReply(params: {
text?: string;
mediaUrls?: string[];
@@ -390,7 +439,7 @@ export function handleMessageEnd(
} = splitResult;
// Emit if there's content OR audioAsVoice flag (to propagate the flag).
if (hasAssistantVisibleReply({ text: cleanedText, mediaUrls, audioAsVoice })) {
emitBlockReplySafely({
ctx.emitBlockReply({
text: cleanedText,
mediaUrls: mediaUrls?.length ? mediaUrls : undefined,
audioAsVoice,

View File

@@ -24,6 +24,8 @@ function createMockContext(overrides?: {
pendingMessagingTexts: new Map(),
pendingMessagingTargets: new Map(),
pendingMessagingMediaUrls: new Map(),
pendingToolMediaUrls: [],
pendingToolAudioAsVoice: false,
messagingToolSentTexts: [],
messagingToolSentTextsNormalized: [],
messagingToolSentMediaUrls: [],
@@ -36,6 +38,7 @@ function createMockContext(overrides?: {
emitToolSummary: vi.fn(),
emitToolOutput: vi.fn(),
trimMessagingToolSent: vi.fn(),
emitBlockReply: vi.fn(),
hookRunner: undefined,
// Fill in remaining required fields with no-ops.
blockChunker: null,
@@ -114,9 +117,8 @@ describe("handleToolExecutionEnd media emission", () => {
await emitPngMediaToolResult(ctx);
expect(onToolResult).toHaveBeenCalledWith({
mediaUrls: ["/tmp/screenshot.png"],
});
expect(onToolResult).not.toHaveBeenCalled();
expect(ctx.state.pendingToolMediaUrls).toEqual(["/tmp/screenshot.png"]);
});
it("does NOT emit local media for untrusted tools", async () => {
@@ -126,6 +128,7 @@ describe("handleToolExecutionEnd media emission", () => {
await emitUntrustedToolMediaResult(ctx, "/tmp/secret.png");
expect(onToolResult).not.toHaveBeenCalled();
expect(ctx.state.pendingToolMediaUrls).toEqual([]);
});
it("emits remote media for untrusted tools", async () => {
@@ -134,12 +137,11 @@ describe("handleToolExecutionEnd media emission", () => {
await emitUntrustedToolMediaResult(ctx, "https://example.com/file.png");
expect(onToolResult).toHaveBeenCalledWith({
mediaUrls: ["https://example.com/file.png"],
});
expect(onToolResult).not.toHaveBeenCalled();
expect(ctx.state.pendingToolMediaUrls).toEqual(["https://example.com/file.png"]);
});
it("does NOT emit media when verbose is full (emitToolOutput handles it)", async () => {
it("does NOT queue legacy MEDIA paths when verbose is full", async () => {
const onToolResult = vi.fn();
const ctx = createMockContext({ shouldEmitToolOutput: true, onToolResult });
@@ -149,15 +151,31 @@ describe("handleToolExecutionEnd media emission", () => {
// It may be called by emitToolOutput, but the new block should not fire.
// Verify emitToolOutput was called instead.
expect(ctx.emitToolOutput).toHaveBeenCalled();
// The direct media emission should not have been called with just mediaUrls.
const directMediaCalls = onToolResult.mock.calls.filter(
(call: unknown[]) =>
call[0] &&
typeof call[0] === "object" &&
"mediaUrls" in (call[0] as Record<string, unknown>) &&
!("text" in (call[0] as Record<string, unknown>)),
);
expect(directMediaCalls).toHaveLength(0);
expect(ctx.state.pendingToolMediaUrls).toEqual([]);
});
it("still queues structured media when verbose is full", async () => {
const ctx = createMockContext({ shouldEmitToolOutput: true, onToolResult: vi.fn() });
await handleToolExecutionEnd(ctx, {
type: "tool_execution_end",
toolName: "tts",
toolCallId: "tc-1",
isError: false,
result: {
content: [{ type: "text", text: "Generated audio reply." }],
details: {
media: {
mediaUrl: "/tmp/reply.opus",
audioAsVoice: true,
},
},
},
});
expect(ctx.emitToolOutput).toHaveBeenCalled();
expect(ctx.state.pendingToolMediaUrls).toEqual(["/tmp/reply.opus"]);
expect(ctx.state.pendingToolAudioAsVoice).toBe(true);
});
it("does NOT emit media for error results", async () => {
@@ -167,6 +185,7 @@ describe("handleToolExecutionEnd media emission", () => {
await emitPngMediaToolResult(ctx, { isError: true });
expect(onToolResult).not.toHaveBeenCalled();
expect(ctx.state.pendingToolMediaUrls).toEqual([]);
});
it("does NOT emit when tool result has no media", async () => {
@@ -184,6 +203,7 @@ describe("handleToolExecutionEnd media emission", () => {
});
expect(onToolResult).not.toHaveBeenCalled();
expect(ctx.state.pendingToolMediaUrls).toEqual([]);
});
it("does NOT emit media for <media:audio> placeholder text", async () => {
@@ -206,6 +226,7 @@ describe("handleToolExecutionEnd media emission", () => {
});
expect(onToolResult).not.toHaveBeenCalled();
expect(ctx.state.pendingToolMediaUrls).toEqual([]);
});
it("does NOT emit media for malformed MEDIA:-prefixed prose", async () => {
@@ -228,9 +249,10 @@ describe("handleToolExecutionEnd media emission", () => {
});
expect(onToolResult).not.toHaveBeenCalled();
expect(ctx.state.pendingToolMediaUrls).toEqual([]);
});
it("emits media from details.path fallback when no MEDIA: text", async () => {
it("queues media from details.path fallback when no MEDIA: text", async () => {
const onToolResult = vi.fn();
const ctx = createMockContext({ shouldEmitToolOutput: false, onToolResult });
@@ -248,8 +270,29 @@ describe("handleToolExecutionEnd media emission", () => {
},
});
expect(onToolResult).toHaveBeenCalledWith({
mediaUrls: ["/tmp/canvas-output.png"],
expect(onToolResult).not.toHaveBeenCalled();
expect(ctx.state.pendingToolMediaUrls).toEqual(["/tmp/canvas-output.png"]);
});
it("queues structured details.media and voice metadata", async () => {
const ctx = createMockContext({ shouldEmitToolOutput: false, onToolResult: vi.fn() });
await handleToolExecutionEnd(ctx, {
type: "tool_execution_end",
toolName: "tts",
toolCallId: "tc-1",
isError: false,
result: {
details: {
media: {
mediaUrl: "/tmp/reply.opus",
audioAsVoice: true,
},
},
},
});
expect(ctx.state.pendingToolMediaUrls).toEqual(["/tmp/reply.opus"]);
expect(ctx.state.pendingToolAudioAsVoice).toBe(true);
});
});

View File

@@ -40,6 +40,8 @@ function createTestContext(): {
pendingMessagingTargets: new Map<string, MessagingToolSend>(),
pendingMessagingTexts: new Map<string, string>(),
pendingMessagingMediaUrls: new Map<string, string[]>(),
pendingToolMediaUrls: [],
pendingToolAudioAsVoice: false,
messagingToolSentTexts: [],
messagingToolSentTextsNormalized: [],
messagingToolSentMediaUrls: [],

View File

@@ -13,9 +13,9 @@ import type {
ToolHandlerContext,
} from "./pi-embedded-subscribe.handlers.types.js";
import {
extractToolResultMediaArtifact,
extractMessagingToolSend,
extractToolErrorMessage,
extractToolResultMediaPaths,
extractToolResultText,
filterToolResultMediaUrls,
isToolResultError,
@@ -143,6 +143,23 @@ function collectMessagingMediaUrlsFromToolResult(result: unknown): string[] {
return urls;
}
/**
 * Append tool-result media to the pending queue for later reply delivery.
 *
 * Skips URLs already queued (and duplicates within this call) so a tool
 * emitting the same path twice does not double-deliver. The voice-bubble
 * flag is sticky: once any queued media requests it, it stays set until the
 * queue is consumed.
 */
function queuePendingToolMedia(
  ctx: ToolHandlerContext,
  mediaReply: { mediaUrls: string[]; audioAsVoice?: boolean },
) {
  const queue = ctx.state.pendingToolMediaUrls;
  const known = new Set(queue);
  const fresh = mediaReply.mediaUrls.filter((url) => {
    if (known.has(url)) {
      return false;
    }
    known.add(url);
    return true;
  });
  queue.push(...fresh);
  if (mediaReply.audioAsVoice) {
    ctx.state.pendingToolAudioAsVoice = true;
  }
}
function readExecApprovalPendingDetails(result: unknown): {
approvalId: string;
approvalSlug: string;
@@ -226,12 +243,20 @@ async function emitToolResultOutput(params: {
sanitizedResult: unknown;
}) {
const { ctx, toolName, meta, isToolError, result, sanitizedResult } = params;
if (!ctx.params.onToolResult) {
return;
}
const hasStructuredMedia =
result &&
typeof result === "object" &&
(result as { details?: unknown }).details &&
typeof (result as { details?: unknown }).details === "object" &&
!Array.isArray((result as { details?: unknown }).details) &&
typeof ((result as { details?: { media?: unknown } }).details?.media ?? undefined) ===
"object" &&
!Array.isArray((result as { details?: { media?: unknown } }).details?.media);
const approvalPending = readExecApprovalPendingDetails(result);
if (!isToolError && approvalPending) {
if (!ctx.params.onToolResult) {
return;
}
try {
await ctx.params.onToolResult(
buildExecApprovalPendingReplyPayload({
@@ -254,6 +279,9 @@ async function emitToolResultOutput(params: {
const approvalUnavailable = readExecApprovalUnavailableDetails(result);
if (!isToolError && approvalUnavailable) {
if (!ctx.params.onToolResult) {
return;
}
try {
await ctx.params.onToolResult?.(
buildExecApprovalUnavailableReplyPayload({
@@ -275,24 +303,27 @@ async function emitToolResultOutput(params: {
if (outputText) {
ctx.emitToolOutput(toolName, meta, outputText);
}
return;
if (!hasStructuredMedia) {
return;
}
}
if (isToolError) {
return;
}
// emitToolOutput() already handles MEDIA: directives when enabled; this path
// only sends raw media URLs for non-verbose delivery mode.
const mediaPaths = filterToolResultMediaUrls(toolName, extractToolResultMediaPaths(result));
if (mediaPaths.length === 0) {
const mediaReply = extractToolResultMediaArtifact(result);
if (!mediaReply) {
return;
}
try {
void ctx.params.onToolResult({ mediaUrls: mediaPaths });
} catch {
// ignore delivery failures
const mediaUrls = filterToolResultMediaUrls(toolName, mediaReply.mediaUrls);
if (mediaUrls.length === 0) {
return;
}
queuePendingToolMedia(ctx, {
mediaUrls,
...(mediaReply.audioAsVoice ? { audioAsVoice: true } : {}),
});
}
export async function handleToolExecutionStart(

View File

@@ -5,6 +5,7 @@ import type { InlineCodeState } from "../markdown/code-spans.js";
import type { HookRunner } from "../plugins/hooks.js";
import type { EmbeddedBlockChunker } from "./pi-embedded-block-chunker.js";
import type { MessagingToolSend } from "./pi-embedded-messaging.js";
import type { BlockReplyPayload } from "./pi-embedded-payloads.js";
import type {
BlockReplyChunking,
SubscribeEmbeddedPiSessionParams,
@@ -76,6 +77,8 @@ export type EmbeddedPiSubscribeState = {
pendingMessagingTargets: Map<string, MessagingToolSend>;
successfulCronAdds: number;
pendingMessagingMediaUrls: Map<string, string[]>;
pendingToolMediaUrls: string[];
pendingToolAudioAsVoice: boolean;
deterministicApprovalPromptSent: boolean;
lastAssistant?: AgentMessage;
};
@@ -124,6 +127,7 @@ export type EmbeddedPiSubscribeContext = {
incrementCompactionCount: () => void;
getUsageTotals: () => NormalizedUsage | undefined;
getCompactionCount: () => number;
emitBlockReply: (payload: BlockReplyPayload) => void;
};
/**
@@ -151,6 +155,8 @@ export type ToolHandlerState = Pick<
| "pendingMessagingTargets"
| "pendingMessagingTexts"
| "pendingMessagingMediaUrls"
| "pendingToolMediaUrls"
| "pendingToolAudioAsVoice"
| "messagingToolSentTexts"
| "messagingToolSentTextsNormalized"
| "messagingToolSentMediaUrls"

View File

@@ -1,5 +1,6 @@
import { describe, expect, it } from "vitest";
import {
extractToolResultMediaArtifact,
extractToolResultMediaPaths,
isToolResultMediaTrusted,
} from "./pi-embedded-subscribe.tools.js";
@@ -15,14 +16,40 @@ describe("extractToolResultMediaPaths", () => {
expect(extractToolResultMediaPaths(42)).toEqual([]);
});
it("returns empty array when content is missing", () => {
expect(extractToolResultMediaPaths({ details: { path: "/tmp/img.png" } })).toEqual([]);
it("extracts structured details.media without content blocks", () => {
expect(
extractToolResultMediaArtifact({
details: {
media: {
mediaUrls: ["/tmp/img.png", "/tmp/img-2.png"],
},
},
}),
).toEqual({
mediaUrls: ["/tmp/img.png", "/tmp/img-2.png"],
});
});
it("returns empty array when content has no text or image blocks", () => {
expect(extractToolResultMediaPaths({ content: [{ type: "other" }] })).toEqual([]);
});
it("extracts structured media with audioAsVoice", () => {
expect(
extractToolResultMediaArtifact({
details: {
media: {
mediaUrl: "/tmp/reply.opus",
audioAsVoice: true,
},
},
}),
).toEqual({
mediaUrls: ["/tmp/reply.opus"],
audioAsVoice: true,
});
});
it("extracts MEDIA: path from text content block", () => {
const result = {
content: [

View File

@@ -187,25 +187,75 @@ export function filterToolResultMediaUrls(
* Extract media file paths from a tool result.
*
* Strategy (first match wins):
* 1. Parse `MEDIA:` tokens from text content blocks (all OpenClaw tools).
* 2. Fall back to `details.path` when image content exists (OpenClaw imageResult).
* 1. Read structured `details.media` attachments from tool details.
* 2. Parse legacy `MEDIA:` tokens from text content blocks.
* 3. Fall back to `details.path` when image content exists (legacy imageResult).
*
* Returns an empty array when no media is found (e.g. Pi SDK `read` tool
* returns base64 image data but no file path; those need a different delivery
* path like saving to a temp file).
*/
export function extractToolResultMediaPaths(result: unknown): string[] {
export type ToolResultMediaArtifact = {
mediaUrls: string[];
audioAsVoice?: boolean;
};
/**
 * Safely read the structured `details.media` object off a tool result.
 *
 * Returns undefined unless both `details` and `details.media` are plain
 * (non-array, non-null) objects — tool results are untyped at this boundary,
 * so every level is validated before use.
 */
function readToolResultDetailsMedia(
  result: Record<string, unknown>,
): Record<string, unknown> | undefined {
  const isPlainObject = (value: unknown): value is Record<string, unknown> =>
    typeof value === "object" && value !== null && !Array.isArray(value);
  if (!isPlainObject(result.details)) {
    return undefined;
  }
  const media = result.details.media;
  return isPlainObject(media) ? media : undefined;
}
/**
 * Gather media URLs from a structured `details.media` object.
 *
 * Accepts a singular `mediaUrl` string and/or a `mediaUrls` array; non-string
 * entries and blank strings are dropped, values are trimmed, and the result
 * is de-duplicated while preserving order (singular URL first).
 */
function collectStructuredMediaUrls(media: Record<string, unknown>): string[] {
  const candidates: unknown[] = [media.mediaUrl];
  if (Array.isArray(media.mediaUrls)) {
    candidates.push(...media.mediaUrls);
  }
  const urls = candidates
    .filter((value): value is string => typeof value === "string")
    .map((value) => value.trim())
    .filter((value) => value.length > 0);
  return [...new Set(urls)];
}
export function extractToolResultMediaArtifact(
result: unknown,
): ToolResultMediaArtifact | undefined {
if (!result || typeof result !== "object") {
return [];
return undefined;
}
const record = result as Record<string, unknown>;
const content = Array.isArray(record.content) ? record.content : null;
if (!content) {
return [];
const detailsMedia = readToolResultDetailsMedia(record);
if (detailsMedia) {
const mediaUrls = collectStructuredMediaUrls(detailsMedia);
if (mediaUrls.length > 0) {
return {
mediaUrls,
...(detailsMedia.audioAsVoice === true ? { audioAsVoice: true } : {}),
};
}
}
// Extract MEDIA: paths from text content blocks using the shared parser so
// directive matching and validation stay in sync with outbound reply parsing.
const content = Array.isArray(record.content) ? record.content : null;
if (!content) {
return undefined;
}
// Extract legacy MEDIA: paths from text content blocks using the shared
// parser so directive matching and validation stay in sync with outbound
// reply parsing.
const paths: string[] = [];
let hasImageContent = false;
for (const item of content) {
@@ -226,19 +276,24 @@ export function extractToolResultMediaPaths(result: unknown): string[] {
}
if (paths.length > 0) {
return paths;
return { mediaUrls: paths };
}
// Fall back to details.path when image content exists but no MEDIA: text.
// Fall back to legacy details.path when image content exists but no
// structured media details or MEDIA: text.
if (hasImageContent) {
const details = record.details as Record<string, unknown> | undefined;
const p = typeof details?.path === "string" ? details.path.trim() : "";
if (p) {
return [p];
return { mediaUrls: [p] };
}
}
return [];
return undefined;
}
/**
 * Legacy accessor: surface only the media URLs from a tool result,
 * discarding voice-bubble metadata. Returns an empty array when the result
 * carries no media artifact.
 */
export function extractToolResultMediaPaths(result: unknown): string[] {
  const artifact = extractToolResultMediaArtifact(result);
  return artifact ? artifact.mediaUrls : [];
}
export function isToolResultError(result: unknown): boolean {

View File

@@ -11,7 +11,9 @@ import {
isMessagingToolDuplicateNormalized,
normalizeTextForComparison,
} from "./pi-embedded-helpers.js";
import type { BlockReplyPayload } from "./pi-embedded-payloads.js";
import { createEmbeddedPiSessionEventHandler } from "./pi-embedded-subscribe.handlers.js";
import { consumePendingToolMediaIntoReply } from "./pi-embedded-subscribe.handlers.messages.js";
import type {
EmbeddedPiSubscribeContext,
EmbeddedPiSubscribeState,
@@ -78,6 +80,8 @@ export function subscribeEmbeddedPiSession(params: SubscribeEmbeddedPiSessionPar
pendingMessagingTargets: new Map(),
successfulCronAdds: 0,
pendingMessagingMediaUrls: new Map(),
pendingToolMediaUrls: [],
pendingToolAudioAsVoice: false,
deterministicApprovalPromptSent: false,
};
const usageTotals = {
@@ -113,6 +117,9 @@ export function subscribeEmbeddedPiSession(params: SubscribeEmbeddedPiSessionPar
log.warn(`block reply callback failed: ${String(err)}`);
});
};
const emitBlockReply = (payload: BlockReplyPayload) => {
emitBlockReplySafely(consumePendingToolMediaIntoReply(state, payload));
};
const resetAssistantMessageState = (nextAssistantTextBaseline: number) => {
state.deltaBuffer = "";
@@ -523,7 +530,7 @@ export function subscribeEmbeddedPiSession(params: SubscribeEmbeddedPiSessionPar
if (!cleanedText && (!mediaUrls || mediaUrls.length === 0) && !audioAsVoice) {
return;
}
emitBlockReplySafely({
emitBlockReply({
text: cleanedText,
mediaUrls: mediaUrls?.length ? mediaUrls : undefined,
audioAsVoice,
@@ -599,6 +606,8 @@ export function subscribeEmbeddedPiSession(params: SubscribeEmbeddedPiSessionPar
pendingMessagingTargets.clear();
state.successfulCronAdds = 0;
state.pendingMessagingMediaUrls.clear();
state.pendingToolMediaUrls = [];
state.pendingToolAudioAsVoice = false;
state.deterministicApprovalPromptSent = false;
resetAssistantMessageState(0);
};
@@ -624,6 +633,7 @@ export function subscribeEmbeddedPiSession(params: SubscribeEmbeddedPiSessionPar
stripBlockTags,
emitBlockChunk,
flushBlockReplyBuffer,
emitBlockReply,
emitReasoningStream,
consumeReplyDirectives,
consumePartialReplyDirectives,

View File

@@ -6,6 +6,8 @@ export function createBaseToolHandlerState() {
pendingMessagingTexts: new Map<string, string>(),
pendingMessagingTargets: new Map<string, unknown>(),
pendingMessagingMediaUrls: new Map<string, string[]>(),
pendingToolMediaUrls: [] as string[],
pendingToolAudioAsVoice: false,
messagingToolSentTexts: [] as string[],
messagingToolSentTextsNormalized: [] as string[],
messagingToolSentMediaUrls: [] as string[],

View File

@@ -110,7 +110,7 @@ describe("createImageGenerateTool", () => {
});
});
it("generates images and returns MEDIA paths", async () => {
it("generates images and returns details.media paths", async () => {
const generateImage = vi.spyOn(imageGenerationRuntime, "generateImage").mockResolvedValue({
provider: "openai",
model: "gpt-image-1",
@@ -215,14 +215,16 @@ describe("createImageGenerateTool", () => {
provider: "openai",
model: "gpt-image-1",
count: 2,
media: {
mediaUrls: ["/tmp/generated-1.png", "/tmp/generated-2.png"],
},
paths: ["/tmp/generated-1.png", "/tmp/generated-2.png"],
filename: "cats/output.png",
revisedPrompts: ["A more cinematic cat"],
},
});
const text = (result.content?.[0] as { text: string } | undefined)?.text ?? "";
expect(text).toContain("MEDIA:/tmp/generated-1.png");
expect(text).toContain("MEDIA:/tmp/generated-2.png");
expect(text).not.toContain("MEDIA:");
});
it("rejects counts outside the supported range", async () => {

View File

@@ -610,7 +610,6 @@ export function createImageGenerateTool(options?: {
.filter((entry): entry is string => Boolean(entry));
const lines = [
`Generated ${savedImages.length} image${savedImages.length === 1 ? "" : "s"} with ${result.provider}/${result.model}.`,
...savedImages.map((image) => `MEDIA:${image.path}`),
];
return {
@@ -619,6 +618,9 @@ export function createImageGenerateTool(options?: {
provider: result.provider,
model: result.model,
count: savedImages.length,
media: {
mediaUrls: savedImages.map((image) => image.path),
},
paths: savedImages.map((image) => image.path),
...(imageInputs.length === 1
? {

View File

@@ -310,7 +310,6 @@ export function createNodesTool(options?: {
expectedHost: resolvedNode.remoteIp,
invalidPayloadMessage: "invalid camera.snap payload",
});
content.push({ type: "text", text: `MEDIA:${filePath}` });
if (options?.modelHasVision && payload.base64) {
content.push({
type: "image",

View File

@@ -4,7 +4,12 @@ vi.mock("../../auto-reply/tokens.js", () => ({
SILENT_REPLY_TOKEN: "QUIET_TOKEN",
}));
vi.mock("../../tts/tts.js", () => ({
textToSpeech: vi.fn(),
}));
const { createTtsTool } = await import("./tts-tool.js");
const { textToSpeech } = await import("../../tts/tts.js");
describe("createTtsTool", () => {
it("uses SILENT_REPLY_TOKEN in guidance text", () => {
@@ -13,4 +18,29 @@ describe("createTtsTool", () => {
expect(tool.description).toContain("QUIET_TOKEN");
expect(tool.description).not.toContain("NO_REPLY");
});
it("stores audio delivery in details.media", async () => {
vi.mocked(textToSpeech).mockResolvedValue({
success: true,
audioPath: "/tmp/reply.opus",
provider: "test",
voiceCompatible: true,
});
const tool = createTtsTool();
const result = await tool.execute("call-1", { text: "hello" });
expect(result).toMatchObject({
content: [{ type: "text", text: "Generated audio reply." }],
details: {
audioPath: "/tmp/reply.opus",
provider: "test",
media: {
mediaUrl: "/tmp/reply.opus",
audioAsVoice: true,
},
},
});
expect(JSON.stringify(result.content)).not.toContain("MEDIA:");
});
});

View File

@@ -35,15 +35,16 @@ export function createTtsTool(opts?: {
});
if (result.success && result.audioPath) {
const lines: string[] = [];
// Tag Telegram Opus output as a voice bubble instead of a file attachment.
if (result.voiceCompatible) {
lines.push("[[audio_as_voice]]");
}
lines.push(`MEDIA:${result.audioPath}`);
return {
content: [{ type: "text", text: lines.join("\n") }],
details: { audioPath: result.audioPath, provider: result.provider },
content: [{ type: "text", text: "Generated audio reply." }],
details: {
audioPath: result.audioPath,
provider: result.provider,
media: {
mediaUrl: result.audioPath,
...(result.voiceCompatible ? { audioAsVoice: true } : {}),
},
},
};
}