From 60d892d700440a2e9df65a80cbe856f727437c1a Mon Sep 17 00:00:00 2001 From: Vincent Koc Date: Thu, 23 Apr 2026 19:34:30 -0700 Subject: [PATCH] fix(reply): parse markdown image replies as media * fix(reply): parse markdown image replies as media * fix(reply): preserve inline markdown image captions * fix(reply): harden markdown image parsing --- CHANGELOG.md | 1 + .../reply/agent-runner-payloads.test.ts | 43 +++ src/media/parse.test.ts | 70 +++++ src/media/parse.ts | 291 +++++++++++++++++- 4 files changed, 394 insertions(+), 11 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 4cf86d6fdf6..445c77856e7 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -24,6 +24,7 @@ Docs: https://docs.openclaw.ai ### Fixes - Agents/replay: stop OpenAI/Codex transcript replay from synthesizing missing tool results while still preserving synthetic repair on Anthropic, Gemini, and Bedrock transport-owned sessions. (#61556) Thanks @VictorJeon and @vincentkoc. +- Telegram/media replies: parse remote markdown image syntax into outbound media payloads on the final reply path, so Telegram group chats stop falling back to plain-text image URLs when the model or a tool emits `![...](...)` instead of a `MEDIA:` token. (#66191) Thanks @apezam and @vincentkoc. - Agents/WebChat: surface non-retryable provider failures such as billing, auth, and rate-limit errors from the embedded runner instead of logging `surface_error` and leaving webchat with no rendered error. Fixes #70124. (#70848) Thanks @truffle-dev. - Memory/CLI: declare the built-in `local` embedding provider in the memory-core manifest, so standalone `openclaw memory status`, `index`, and `search` can resolve local embeddings just like the gateway runtime. Fixes #70836. (#70873) Thanks @mattznojassist. - Gateway/WebChat: preserve image attachments for text-only primary models by offloading them as media refs instead of dropping them, so configured image tools can still inspect the original file. Fixes #68513, #44276, #51656, #70212. diff --git a/src/auto-reply/reply/agent-runner-payloads.test.ts b/src/auto-reply/reply/agent-runner-payloads.test.ts index 75ad795e288..cd783b149b9 100644 --- a/src/auto-reply/reply/agent-runner-payloads.test.ts +++ b/src/auto-reply/reply/agent-runner-payloads.test.ts @@ -246,6 +246,49 @@ describe("buildReplyPayloads media filter integration", () => { expect(replyPayloads).toHaveLength(0); }); + it("extracts markdown image replies into final payload media urls", async () => { + const { replyPayloads } = await buildReplyPayloads({ + ...baseParams, + payloads: [{ text: "Here you go\n\n![chart](https://example.com/chart.png)" }], + }); + + expect(replyPayloads).toHaveLength(1); + expect(replyPayloads[0]).toMatchObject({ + text: "Here you go", + mediaUrl: "https://example.com/chart.png", + mediaUrls: ["https://example.com/chart.png"], + }); + }); + + it("preserves inline caption text when lifting markdown image replies into media", async () => { + const { replyPayloads } = await buildReplyPayloads({ + ...baseParams, + payloads: [{ text: 'Look ![chart](https://example.com/chart.png "Quarterly chart") now' }], + }); + + expect(replyPayloads).toHaveLength(1); + expect(replyPayloads[0]).toMatchObject({ + text: "Look now", + mediaUrl: "https://example.com/chart.png", + mediaUrls: ["https://example.com/chart.png"], + }); + }); + + it("keeps markdown local file images as plain text in final replies", async () => { + const text = "Look ![chart](file:///etc/passwd) now"; + const { replyPayloads } = await buildReplyPayloads({ + ...baseParams, + payloads: [{ text }], + }); + + expect(replyPayloads).toHaveLength(1); + expect(replyPayloads[0]).toMatchObject({ + text, + }); + expect(replyPayloads[0]?.mediaUrl).toBeUndefined(); + expect(replyPayloads[0]?.mediaUrls).toBeUndefined(); + }); + it("deduplicates final payloads against directly sent block keys regardless of replyToId", async () => { // When block streaming is not active but directlySentBlockKeys has entries // (e.g. from pre-tool flush), the key should match even if replyToId differs. diff --git a/src/media/parse.test.ts b/src/media/parse.test.ts index 9a5cdab0394..5ca8a2714dc 100644 --- a/src/media/parse.test.ts +++ b/src/media/parse.test.ts @@ -103,4 +103,74 @@ describe("splitMediaFromOutput", () => { { type: "text", text: "```text\nMEDIA:https://example.com/ignored.png\n```\nAfter" }, ]); }); + + it("extracts markdown image urls while keeping surrounding caption text", () => { + expectParsedMediaOutputCase("Caption\n\n![chart](https://example.com/chart.png)", { + text: "Caption", + mediaUrls: ["https://example.com/chart.png"], + }); + }); + + it("keeps inline caption text around markdown images", () => { + expectParsedMediaOutputCase("Look ![chart](https://example.com/chart.png) now", { + text: "Look now", + mediaUrls: ["https://example.com/chart.png"], + }); + }); + + it("extracts multiple markdown image urls in order", () => { + expectParsedMediaOutputCase( + "Before\n![one](https://example.com/one.png)\nMiddle\n![two](https://example.com/two.png)\nAfter", + { + text: "Before\nMiddle\nAfter", + mediaUrls: ["https://example.com/one.png", "https://example.com/two.png"], + }, + ); + }); + + it("strips markdown image title suffixes from extracted urls", () => { + expectParsedMediaOutputCase( + 'Caption ![chart](https://example.com/chart.png "Quarterly chart")', + { + text: "Caption", + mediaUrls: ["https://example.com/chart.png"], + }, + ); + }); + + it("keeps balanced parentheses inside markdown image urls", () => { + expectParsedMediaOutputCase("Chart ![img](https://example.com/a_(1).png) now", { + text: "Chart now", + mediaUrls: ["https://example.com/a_(1).png"], + }); + }); + + it.each([ + "![x](file:///etc/passwd)", + "![x](/var/run/secrets/kubernetes.io/serviceaccount/token)", + "![x](C:\\\\Windows\\\\System32\\\\drivers\\\\etc\\\\hosts)", + ] as const)("does not lift local markdown image target: %s", (input) => { + expectParsedMediaOutputCase(input, { + text: input, + mediaUrls: undefined, + }); + }); + + it("does not lift markdown image urls that fail media validation", () => { + const longUrl = `![x](https://example.com/${"a".repeat(4097)}.png)`; + + expectParsedMediaOutputCase(longUrl, { + text: longUrl, + mediaUrls: undefined, + }); + }); + + it("leaves very long markdown-image candidate lines as text", () => { + const input = `${"prefix ".repeat(3000)}![x](https://example.com/image.png)`; + + expectParsedMediaOutputCase(input, { + text: input, + mediaUrls: undefined, + }); + }); }); diff --git a/src/media/parse.ts b/src/media/parse.ts index 3fef2db7a96..a037ddd5433 100644 --- a/src/media/parse.ts +++ b/src/media/parse.ts @@ -125,6 +125,265 @@ function mayContainFenceMarkers(input: string): boolean { return input.includes("```") || input.includes("~~~"); } +function cleanLineText(text: string): string { + return text.replace(/[ \t]{2,}/g, " ").trim(); +} + +type MarkdownImageMatch = { + start: number; + end: number; + destination: string; +}; + +const MAX_MARKDOWN_IMAGE_LINE_LENGTH = 20_000; +const MAX_MARKDOWN_IMAGE_ATTEMPTS_PER_LINE = 80; +const MAX_MARKDOWN_IMAGE_MATCHES_PER_LINE = 50; + +function findMatchingBracket( + input: string, + start: number, + open: string, + close: string, +): number | undefined { + let depth = 1; + for (let i = start; i < input.length; i += 1) { + const ch = input[i]; + if (ch === "\\") { + i += 1; + continue; + } + if (ch === open) { + depth += 1; + continue; + } + if (ch !== close) { + continue; + } + depth -= 1; + if (depth === 0) { + return i; + } + } + return undefined; +} + +function isRemoteMarkdownImageMedia(candidate: string): boolean { + return /^https?:\/\//i.test(candidate) && isValidMedia(candidate); +} + +function parseMarkdownTitle(input: string, start: number): number | undefined { + let index = start; + while (index < input.length && /\s/.test(input[index] ?? "")) { + index += 1; + } + const opener = input[index]; + if (!opener) { + return undefined; + } + const closer = opener === '"' || opener === "'" ? opener : opener === "(" ? ")" : null; + if (!closer) { + return undefined; + } + const closingIndex = + opener === "(" + ? findMatchingBracket(input, index + 1, "(", ")") + : (() => { + for (let i = index + 1; i < input.length; i += 1) { + const ch = input[i]; + if (ch === "\\") { + i += 1; + continue; + } + if (ch === closer) { + return i; + } + } + return undefined; + })(); + if (closingIndex == null) { + return undefined; + } + let tailIndex = closingIndex + 1; + while (tailIndex < input.length && /\s/.test(input[tailIndex] ?? "")) { + tailIndex += 1; + } + return input[tailIndex] === ")" ? tailIndex + 1 : undefined; +} + +function parseMarkdownImageDestination( + input: string, + start: number, +): { destination: string; end: number } | undefined { + let index = start; + while (index < input.length && /\s/.test(input[index] ?? "")) { + index += 1; + } + if (index >= input.length) { + return undefined; + } + + if (input[index] === "<") { + let closing = index + 1; + while (closing < input.length) { + const ch = input[closing]; + if (ch === "\\") { + closing += 2; + continue; + } + if (ch === ">") { + const destination = input.slice(index + 1, closing).trim(); + if (!destination) { + return undefined; + } + let tailIndex = closing + 1; + while (tailIndex < input.length && /\s/.test(input[tailIndex] ?? "")) { + tailIndex += 1; + } + if (input[tailIndex] === ")") { + return { destination, end: tailIndex + 1 }; + } + const titledEnd = parseMarkdownTitle(input, tailIndex); + return titledEnd ? { destination, end: titledEnd } : undefined; + } + closing += 1; + } + return undefined; + } + + const destinationStart = index; + let destinationEnd = index; + let parenDepth = 0; + while (index < input.length) { + const ch = input[index]; + if (ch === "\\") { + index += 2; + destinationEnd = index; + continue; + } + if (ch === "(") { + parenDepth += 1; + index += 1; + destinationEnd = index; + continue; + } + if (ch === ")") { + if (parenDepth === 0) { + const destination = input.slice(destinationStart, destinationEnd).trim(); + return destination ? { destination, end: index + 1 } : undefined; + } + parenDepth -= 1; + index += 1; + destinationEnd = index; + continue; + } + if (/\s/.test(ch) && parenDepth === 0) { + const destination = input.slice(destinationStart, destinationEnd).trim(); + if (!destination) { + return undefined; + } + const titledEnd = parseMarkdownTitle(input, index); + return titledEnd ? { destination, end: titledEnd } : undefined; + } + index += 1; + destinationEnd = index; + } + return undefined; +} + +function findMarkdownImageMatches(line: string): MarkdownImageMatch[] { + if (line.length > MAX_MARKDOWN_IMAGE_LINE_LENGTH) { + return []; + } + const matches: MarkdownImageMatch[] = []; + let searchIndex = 0; + let attempts = 0; + while ( + matches.length < MAX_MARKDOWN_IMAGE_MATCHES_PER_LINE && + attempts < MAX_MARKDOWN_IMAGE_ATTEMPTS_PER_LINE + ) { + const index = line.indexOf("![", searchIndex); + if (index < 0) { + break; + } + attempts += 1; + const altEnd = findMatchingBracket(line, index + 2, "[", "]"); + if (altEnd == null || line[altEnd + 1] !== "(") { + searchIndex = index + 2; + continue; + } + const parsed = parseMarkdownImageDestination(line, altEnd + 2); + if (!parsed) { + searchIndex = index + 2; + continue; + } + matches.push({ + start: index, + end: parsed.end, + destination: parsed.destination, + }); + searchIndex = parsed.end; + } + return matches; +} + +function collectMarkdownImageSegments(params: { line: string; media: string[] }): { + cleanedLine?: string; + lineSegments: ParsedMediaOutputSegment[]; + foundMedia: boolean; +} { + const matches = findMarkdownImageMatches(params.line); + if (matches.length === 0) { + return { lineSegments: [], foundMedia: false }; + } + + const segmentPieces: string[] = []; + const visiblePieces: string[] = []; + const lineSegments: ParsedMediaOutputSegment[] = []; + let cursor = 0; + let foundMedia = false; + + for (const match of matches) { + const before = params.line.slice(cursor, match.start); + segmentPieces.push(before); + visiblePieces.push(before); + + const target = normalizeMediaSource( + cleanCandidate(unwrapQuoted(match.destination) ?? match.destination), + ); + if (isRemoteMarkdownImageMedia(target)) { + const beforeText = cleanLineText(segmentPieces.join("")); + if (beforeText) { + lineSegments.push({ type: "text", text: beforeText }); + } + segmentPieces.length = 0; + params.media.push(target); + lineSegments.push({ type: "media", url: target }); + foundMedia = true; + } else { + const original = params.line.slice(match.start, match.end); + segmentPieces.push(original); + visiblePieces.push(original); + } + + cursor = match.end; + } + + const after = params.line.slice(cursor); + segmentPieces.push(after); + visiblePieces.push(after); + const trailingText = cleanLineText(segmentPieces.join("")); + if (trailingText) { + lineSegments.push({ type: "text", text: trailingText }); + } + const cleanedLine = cleanLineText(visiblePieces.join("")); + + return { + cleanedLine: cleanedLine || undefined, + lineSegments, + foundMedia, + }; +} + // Check if a character offset is inside any fenced code block function isInsideFence(fenceSpans: Array<{ start: number; end: number }>, offset: number): boolean { return fenceSpans.some((span) => offset >= span.start && offset < span.end); @@ -144,8 +403,9 @@ export function splitMediaFromOutput(raw: string): { return { text: "" }; } const mayContainMediaToken = /media:/i.test(trimmedRaw); + const mayContainMarkdownImage = /!\[[^\]]*]\(/.test(trimmedRaw); const mayContainAudioTag = trimmedRaw.includes("[["); - if (!mayContainMediaToken && !mayContainAudioTag) { + if (!mayContainMediaToken && !mayContainMarkdownImage && !mayContainAudioTag) { return { text: trimmedRaw }; } @@ -185,8 +445,23 @@ export function splitMediaFromOutput(raw: string): { const trimmedStart = line.trimStart(); if (!trimmedStart.toUpperCase().startsWith("MEDIA:")) { - keptLines.push(line); - pushTextSegment(line); + const markdownImageResult = collectMarkdownImageSegments({ line, media }); + if (!markdownImageResult.foundMedia) { + keptLines.push(line); + pushTextSegment(line); + } else { + foundMediaToken = true; + if (markdownImageResult.cleanedLine) { + keptLines.push(markdownImageResult.cleanedLine); + } + for (const segment of markdownImageResult.lineSegments) { + if (segment.type === "text") { + pushTextSegment(segment.text); + continue; + } + segments.push(segment); + } + } lineOffset += line.length + 1; // +1 for newline continue; } @@ -269,10 +544,7 @@ export function splitMediaFromOutput(raw: string): { } if (hasValidMedia) { - const beforeText = pieces - .join("") - .replace(/[ \t]{2,}/g, " ") - .trim(); + const beforeText = cleanLineText(pieces.join("")); if (beforeText) { lineSegments.push({ type: "text", text: beforeText }); } @@ -297,10 +569,7 @@ export function splitMediaFromOutput(raw: string): { pieces.push(line.slice(cursor)); - const cleanedLine = pieces - .join("") - .replace(/[ \t]{2,}/g, " ") - .trim(); + const cleanedLine = cleanLineText(pieces.join("")); // If the line becomes empty, drop it. if (cleanedLine) {