mirror of
https://github.com/openclaw/openclaw.git
synced 2026-05-06 05:50:43 +00:00
refactor(tts): clarify text media directives
This commit is contained in:
@@ -25,9 +25,9 @@ Docs: https://docs.openclaw.ai

### Fixes

- Agents/TTS: preserve legacy `[[audio_as_voice]]` hints on trusted tool-result
  `MEDIA:` payloads so generated audio still delivers as a voice note. (#46535)
  Thanks @azade-c.
- Agents/TTS: preserve `[[audio_as_voice]]` directives on trusted text
  tool-result `MEDIA:` payloads so generated audio still delivers as a voice
  note. (#46535) Thanks @azade-c.
- Agents/TTS: keep queued tool media when an assistant ends with `NO_REPLY` on
  non-block delivery paths, so media-only generated audio replies still send.
  (#60025) Thanks @bradlind1.

@@ -14,7 +14,7 @@ Assistant output can carry a small set of delivery/render directives:

- `[embed ...]` for Control UI rich rendering

These directives are separate. `MEDIA:` and reply/voice tags remain delivery metadata; `[embed ...]` is the web-only rich render path.
Trusted tool-result media uses the same `MEDIA:` / `[[audio_as_voice]]` parser before delivery, so legacy tool outputs can still mark an audio attachment as a voice note.
Trusted tool-result media uses the same `MEDIA:` / `[[audio_as_voice]]` parser before delivery, so text tool outputs can still mark an audio attachment as a voice note.

When block streaming is enabled, `MEDIA:` remains single-delivery metadata for a
turn. If the same media URL is sent in a streamed block and repeated in the final

@@ -165,7 +165,7 @@ describe("handleToolExecutionEnd media emission", () => {
    expect(ctx.state.pendingToolMediaUrls).toEqual(["/tmp/screenshot.png"]);
  });

  it("preserves legacy audio_as_voice when queuing trusted MEDIA tool output", async () => {
  it("preserves audio_as_voice when queuing trusted text MEDIA tool output", async () => {
    const onToolResult = vi.fn();
    const ctx = createMockContext({
      shouldEmitToolOutput: false,
@@ -245,7 +245,7 @@ describe("handleToolExecutionEnd media emission", () => {
    expect(ctx.state.pendingToolMediaUrls).toEqual(["https://example.com/file.png"]);
  });

  it("does NOT queue legacy MEDIA paths when verbose is full", async () => {
  it("does NOT queue text MEDIA paths when verbose is full", async () => {
    const onToolResult = vi.fn();
    const ctx = createMockContext({ shouldEmitToolOutput: true, onToolResult });

@@ -51,7 +51,7 @@ describe("extractToolResultMediaPaths", () => {
    });
  });

  it("extracts audioAsVoice from legacy MEDIA text", () => {
  it("extracts audioAsVoice from text MEDIA directives", () => {
    expect(
      extractToolResultMediaArtifact({
        content: [
@@ -64,7 +64,7 @@ describe("extractToolResultMediaPaths", () => {
    });
  });

  it("keeps legacy audioAsVoice when the tag and MEDIA path are in separate text blocks", () => {
  it("keeps audioAsVoice when the tag and MEDIA path are in separate text blocks", () => {
    expect(
      extractToolResultMediaArtifact({
        content: [

@@ -239,7 +239,7 @@ export function filterToolResultMediaUrls(
 *
 * Strategy (first match wins):
 * 1. Read structured `details.media` attachments from tool details.
 * 2. Parse legacy `MEDIA:` tokens from text content blocks.
 * 2. Parse `MEDIA:` directive tokens from text content blocks.
 * 3. Fall back to `details.path` when image content exists (legacy imageResult).
 *
 * Returns an empty array when no media is found (e.g. Pi SDK `read` tool
@@ -279,6 +279,44 @@ function collectStructuredMediaUrls(media: Record<string, unknown>): string[] {
  return Array.from(new Set(urls));
}

function extractTextContentMediaArtifact(content: unknown[]): {
|
||||
mediaUrls: string[];
|
||||
audioAsVoice?: boolean;
|
||||
hasImageContent: boolean;
|
||||
} {
|
||||
const mediaUrls: string[] = [];
|
||||
let audioAsVoice = false;
|
||||
let hasImageContent = false;
|
||||
|
||||
for (const item of content) {
|
||||
if (!item || typeof item !== "object") {
|
||||
continue;
|
||||
}
|
||||
const entry = item as Record<string, unknown>;
|
||||
if (entry.type === "image") {
|
||||
hasImageContent = true;
|
||||
continue;
|
||||
}
|
||||
if (entry.type !== "text" || typeof entry.text !== "string") {
|
||||
continue;
|
||||
}
|
||||
|
||||
const parsed = splitMediaFromOutput(entry.text);
|
||||
if (parsed.audioAsVoice) {
|
||||
audioAsVoice = true;
|
||||
}
|
||||
if (parsed.mediaUrls?.length) {
|
||||
mediaUrls.push(...parsed.mediaUrls);
|
||||
}
|
||||
}
|
||||
|
||||
return {
|
||||
mediaUrls,
|
||||
...(audioAsVoice ? { audioAsVoice: true } : {}),
|
||||
hasImageContent,
|
||||
};
|
||||
}

export function extractToolResultMediaArtifact(
  result: unknown,
): ToolResultMediaArtifact | undefined {
@@ -303,42 +341,18 @@ export function extractToolResultMediaArtifact(
    return undefined;
  }

  // Extract legacy MEDIA: paths from text content blocks using the shared
  // parser so directive matching and validation stay in sync with outbound
  // reply parsing.
  const paths: string[] = [];
  let audioAsVoice = false;
  let hasImageContent = false;
  for (const item of content) {
    if (!item || typeof item !== "object") {
      continue;
    }
    const entry = item as Record<string, unknown>;
    if (entry.type === "image") {
      hasImageContent = true;
      continue;
    }
    if (entry.type === "text" && typeof entry.text === "string") {
      const parsed = splitMediaFromOutput(entry.text);
      if (parsed.audioAsVoice) {
        audioAsVoice = true;
      }
      if (parsed.mediaUrls?.length) {
        paths.push(...parsed.mediaUrls);
      }
    }
  }
  const textMedia = extractTextContentMediaArtifact(content);

  if (paths.length > 0) {
  if (textMedia.mediaUrls.length > 0) {
    return {
      mediaUrls: paths,
      ...(audioAsVoice ? { audioAsVoice: true } : {}),
      mediaUrls: textMedia.mediaUrls,
      ...(textMedia.audioAsVoice ? { audioAsVoice: true } : {}),
    };
  }

  // Fall back to legacy details.path when image content exists but no
  // structured media details or MEDIA: text.
  if (hasImageContent) {
  if (textMedia.hasImageContent) {
    const details = record.details as Record<string, unknown> | undefined;
    const p = normalizeOptionalString(details?.path) ?? "";
    if (p) {

||||
Reference in New Issue
Block a user