mirror of
https://github.com/openclaw/openclaw.git
synced 2026-05-06 20:40:43 +00:00
refactor(tts): clarify text media directives
This commit is contained in:
@@ -165,7 +165,7 @@ describe("handleToolExecutionEnd media emission", () => {
|
||||
expect(ctx.state.pendingToolMediaUrls).toEqual(["/tmp/screenshot.png"]);
|
||||
});
|
||||
|
||||
it("preserves legacy audio_as_voice when queuing trusted MEDIA tool output", async () => {
|
||||
it("preserves audio_as_voice when queuing trusted text MEDIA tool output", async () => {
|
||||
const onToolResult = vi.fn();
|
||||
const ctx = createMockContext({
|
||||
shouldEmitToolOutput: false,
|
||||
@@ -245,7 +245,7 @@ describe("handleToolExecutionEnd media emission", () => {
|
||||
expect(ctx.state.pendingToolMediaUrls).toEqual(["https://example.com/file.png"]);
|
||||
});
|
||||
|
||||
it("does NOT queue legacy MEDIA paths when verbose is full", async () => {
|
||||
it("does NOT queue text MEDIA paths when verbose is full", async () => {
|
||||
const onToolResult = vi.fn();
|
||||
const ctx = createMockContext({ shouldEmitToolOutput: true, onToolResult });
|
||||
|
||||
|
||||
@@ -51,7 +51,7 @@ describe("extractToolResultMediaPaths", () => {
|
||||
});
|
||||
});
|
||||
|
||||
it("extracts audioAsVoice from legacy MEDIA text", () => {
|
||||
it("extracts audioAsVoice from text MEDIA directives", () => {
|
||||
expect(
|
||||
extractToolResultMediaArtifact({
|
||||
content: [
|
||||
@@ -64,7 +64,7 @@ describe("extractToolResultMediaPaths", () => {
|
||||
});
|
||||
});
|
||||
|
||||
it("keeps legacy audioAsVoice when the tag and MEDIA path are in separate text blocks", () => {
|
||||
it("keeps audioAsVoice when the tag and MEDIA path are in separate text blocks", () => {
|
||||
expect(
|
||||
extractToolResultMediaArtifact({
|
||||
content: [
|
||||
|
||||
@@ -239,7 +239,7 @@ export function filterToolResultMediaUrls(
|
||||
*
|
||||
* Strategy (first match wins):
|
||||
* 1. Read structured `details.media` attachments from tool details.
|
||||
* 2. Parse legacy `MEDIA:` tokens from text content blocks.
|
||||
* 2. Parse `MEDIA:` directive tokens from text content blocks.
|
||||
* 3. Fall back to `details.path` when image content exists (legacy imageResult).
|
||||
*
|
||||
* Returns an empty array when no media is found (e.g. Pi SDK `read` tool
|
||||
@@ -279,6 +279,44 @@ function collectStructuredMediaUrls(media: Record<string, unknown>): string[] {
|
||||
return Array.from(new Set(urls));
|
||||
}
|
||||
|
||||
/**
 * Scans tool-result content blocks for text `MEDIA:` directives.
 *
 * Each block is handled by its `type`:
 * - `"image"` blocks only record that image content was present.
 * - `"text"` blocks with a string `text` are run through
 *   `splitMediaFromOutput`; any media URLs it reports are accumulated in
 *   encounter order, and a truthy `audioAsVoice` from any block is remembered.
 * - Anything else (non-objects, `null`, other block types) is ignored.
 *
 * @param content Raw content blocks from a tool result.
 * @returns The collected media URLs, whether any image block was seen, and
 *   `audioAsVoice: true` only when at least one text block requested it (the
 *   key is omitted otherwise).
 */
function extractTextContentMediaArtifact(content: unknown[]): {
  mediaUrls: string[];
  audioAsVoice?: boolean;
  hasImageContent: boolean;
} {
  const collectedUrls: string[] = [];
  let wantsVoice = false;
  let sawImage = false;

  for (const block of content) {
    if (typeof block !== "object" || block === null) {
      continue;
    }
    const entry = block as Record<string, unknown>;

    switch (entry.type) {
      case "image":
        sawImage = true;
        break;
      case "text": {
        const text = entry.text;
        if (typeof text !== "string") {
          break;
        }
        const parsed = splitMediaFromOutput(text);
        wantsVoice = wantsVoice || Boolean(parsed.audioAsVoice);
        if (parsed.mediaUrls && parsed.mediaUrls.length > 0) {
          collectedUrls.push(...parsed.mediaUrls);
        }
        break;
      }
      default:
        break;
    }
  }

  // Key order matches the original literal: mediaUrls, audioAsVoice?, hasImageContent.
  return wantsVoice
    ? { mediaUrls: collectedUrls, audioAsVoice: true, hasImageContent: sawImage }
    : { mediaUrls: collectedUrls, hasImageContent: sawImage };
}
|
||||
|
||||
export function extractToolResultMediaArtifact(
|
||||
result: unknown,
|
||||
): ToolResultMediaArtifact | undefined {
|
||||
@@ -303,42 +341,18 @@ export function extractToolResultMediaArtifact(
|
||||
return undefined;
|
||||
}
|
||||
|
||||
// Extract legacy MEDIA: paths from text content blocks using the shared
|
||||
// parser so directive matching and validation stay in sync with outbound
|
||||
// reply parsing.
|
||||
const paths: string[] = [];
|
||||
let audioAsVoice = false;
|
||||
let hasImageContent = false;
|
||||
for (const item of content) {
|
||||
if (!item || typeof item !== "object") {
|
||||
continue;
|
||||
}
|
||||
const entry = item as Record<string, unknown>;
|
||||
if (entry.type === "image") {
|
||||
hasImageContent = true;
|
||||
continue;
|
||||
}
|
||||
if (entry.type === "text" && typeof entry.text === "string") {
|
||||
const parsed = splitMediaFromOutput(entry.text);
|
||||
if (parsed.audioAsVoice) {
|
||||
audioAsVoice = true;
|
||||
}
|
||||
if (parsed.mediaUrls?.length) {
|
||||
paths.push(...parsed.mediaUrls);
|
||||
}
|
||||
}
|
||||
}
|
||||
const textMedia = extractTextContentMediaArtifact(content);
|
||||
|
||||
if (paths.length > 0) {
|
||||
if (textMedia.mediaUrls.length > 0) {
|
||||
return {
|
||||
mediaUrls: paths,
|
||||
...(audioAsVoice ? { audioAsVoice: true } : {}),
|
||||
mediaUrls: textMedia.mediaUrls,
|
||||
...(textMedia.audioAsVoice ? { audioAsVoice: true } : {}),
|
||||
};
|
||||
}
|
||||
|
||||
// Fall back to legacy details.path when image content exists but no
|
||||
// structured media details or MEDIA: text.
|
||||
if (hasImageContent) {
|
||||
if (textMedia.hasImageContent) {
|
||||
const details = record.details as Record<string, unknown> | undefined;
|
||||
const p = normalizeOptionalString(details?.path) ?? "";
|
||||
if (p) {
|
||||
|
||||
Reference in New Issue
Block a user