fix(reply): parse markdown image replies as media

* fix(reply): parse markdown image replies as media

* fix(reply): preserve inline markdown image captions

* fix(reply): harden markdown image parsing
This commit is contained in:
Vincent Koc
2026-04-23 19:34:30 -07:00
committed by GitHub
parent 04066d246a
commit 60d892d700
4 changed files with 394 additions and 11 deletions

View File

@@ -24,6 +24,7 @@ Docs: https://docs.openclaw.ai
### Fixes
- Agents/replay: stop OpenAI/Codex transcript replay from synthesizing missing tool results while still preserving synthetic repair on Anthropic, Gemini, and Bedrock transport-owned sessions. (#61556) Thanks @VictorJeon and @vincentkoc.
- Telegram/media replies: parse remote markdown image syntax into outbound media payloads on the final reply path, so Telegram group chats stop falling back to plain-text image URLs when the model or a tool emits `![...](...)` instead of a `MEDIA:` token. (#66191) Thanks @apezam and @vincentkoc.
- Agents/WebChat: surface non-retryable provider failures such as billing, auth, and rate-limit errors from the embedded runner instead of logging `surface_error` and leaving webchat with no rendered error. Fixes #70124. (#70848) Thanks @truffle-dev.
- Memory/CLI: declare the built-in `local` embedding provider in the memory-core manifest, so standalone `openclaw memory status`, `index`, and `search` can resolve local embeddings just like the gateway runtime. Fixes #70836. (#70873) Thanks @mattznojassist.
- Gateway/WebChat: preserve image attachments for text-only primary models by offloading them as media refs instead of dropping them, so configured image tools can still inspect the original file. Fixes #68513, #44276, #51656, #70212.

View File

@@ -246,6 +246,49 @@ describe("buildReplyPayloads media filter integration", () => {
expect(replyPayloads).toHaveLength(0);
});
// A markdown image placed in its own paragraph is lifted out of the reply text:
// the caption stays in `text` and the URL is exposed through both `mediaUrl`
// and `mediaUrls`.
it("extracts markdown image replies into final payload media urls", async () => {
const { replyPayloads } = await buildReplyPayloads({
...baseParams,
payloads: [{ text: "Here you go\n\n![chart](https://example.com/chart.png)" }],
});
expect(replyPayloads).toHaveLength(1);
expect(replyPayloads[0]).toMatchObject({
text: "Here you go",
mediaUrl: "https://example.com/chart.png",
mediaUrls: ["https://example.com/chart.png"],
});
});
// An image embedded mid-sentence (including a quoted markdown title) is removed
// from the text, and the words on either side are joined with a single space.
it("preserves inline caption text when lifting markdown image replies into media", async () => {
const { replyPayloads } = await buildReplyPayloads({
...baseParams,
payloads: [{ text: 'Look ![chart](https://example.com/chart.png "Quarterly chart") now' }],
});
expect(replyPayloads).toHaveLength(1);
expect(replyPayloads[0]).toMatchObject({
text: "Look now",
mediaUrl: "https://example.com/chart.png",
mediaUrls: ["https://example.com/chart.png"],
});
});
// A `file://` image target must not be turned into media: the reply text is
// passed through unchanged and no media fields are set on the payload.
it("keeps markdown local file images as plain text in final replies", async () => {
const text = "Look ![chart](file:///etc/passwd) now";
const { replyPayloads } = await buildReplyPayloads({
...baseParams,
payloads: [{ text }],
});
expect(replyPayloads).toHaveLength(1);
expect(replyPayloads[0]).toMatchObject({
text,
});
expect(replyPayloads[0]?.mediaUrl).toBeUndefined();
expect(replyPayloads[0]?.mediaUrls).toBeUndefined();
});
it("deduplicates final payloads against directly sent block keys regardless of replyToId", async () => {
// When block streaming is not active but directlySentBlockKeys has entries
// (e.g. from pre-tool flush), the key should match even if replyToId differs.

View File

@@ -103,4 +103,74 @@ describe("splitMediaFromOutput", () => {
{ type: "text", text: "```text\nMEDIA:https://example.com/ignored.png\n```\nAfter" },
]);
});
// Image on its own paragraph: the URL is extracted and only the caption remains.
it("extracts markdown image urls while keeping surrounding caption text", () => {
expectParsedMediaOutputCase("Caption\n\n![chart](https://example.com/chart.png)", {
text: "Caption",
mediaUrls: ["https://example.com/chart.png"],
});
});
// Image in the middle of a sentence: the text on both sides is kept and the
// gap left by the image collapses to a single space.
it("keeps inline caption text around markdown images", () => {
expectParsedMediaOutputCase("Look ![chart](https://example.com/chart.png) now", {
text: "Look now",
mediaUrls: ["https://example.com/chart.png"],
});
});
// Several image-only lines: URLs come out in source order and the lines that
// held only an image disappear from the text.
it("extracts multiple markdown image urls in order", () => {
expectParsedMediaOutputCase(
"Before\n![one](https://example.com/one.png)\nMiddle\n![two](https://example.com/two.png)\nAfter",
{
text: "Before\nMiddle\nAfter",
mediaUrls: ["https://example.com/one.png", "https://example.com/two.png"],
},
);
});
// A quoted markdown title after the URL is not part of the extracted URL.
it("strips markdown image title suffixes from extracted urls", () => {
expectParsedMediaOutputCase(
'Caption ![chart](https://example.com/chart.png "Quarterly chart")',
{
text: "Caption",
mediaUrls: ["https://example.com/chart.png"],
},
);
});
// Parentheses that are balanced inside the URL must not end the destination early.
it("keeps balanced parentheses inside markdown image urls", () => {
expectParsedMediaOutputCase("Chart ![img](https://example.com/a_(1).png) now", {
text: "Chart now",
mediaUrls: ["https://example.com/a_(1).png"],
});
});
// Local targets (file URL, absolute POSIX path, Windows drive path) are never
// lifted into media; the original markdown is returned as text.
it.each([
"![x](file:///etc/passwd)",
"![x](/var/run/secrets/kubernetes.io/serviceaccount/token)",
"![x](C:\\\\Windows\\\\System32\\\\drivers\\\\etc\\\\hosts)",
] as const)("does not lift local markdown image target: %s", (input) => {
expectParsedMediaOutputCase(input, {
text: input,
mediaUrls: undefined,
});
});
// A remote URL that the media validator rejects is left in the text untouched.
// NOTE(review): the 4097-character path presumably exceeds a length limit in
// the validator — confirm against isValidMedia.
it("does not lift markdown image urls that fail media validation", () => {
const longUrl = `![x](https://example.com/${"a".repeat(4097)}.png)`;
expectParsedMediaOutputCase(longUrl, {
text: longUrl,
mediaUrls: undefined,
});
});
// The prefix alone is 3000 * 7 = 21,000 characters, so the line is longer than
// the 20,000-character per-line scan limit and is returned as plain text.
it("leaves very long markdown-image candidate lines as text", () => {
const input = `${"prefix ".repeat(3000)}![x](https://example.com/image.png)`;
expectParsedMediaOutputCase(input, {
text: input,
mediaUrls: undefined,
});
});
});

View File

@@ -125,6 +125,265 @@ function mayContainFenceMarkers(input: string): boolean {
return input.includes("```") || input.includes("~~~");
}
/**
 * Tidies a line after pieces have been cut out of it.
 *
 * Every run of two or more spaces/tabs is collapsed to a single space, then
 * leading and trailing whitespace is removed. Single tabs and newlines inside
 * the text are left as they are.
 *
 * @param text - The raw line (or joined line fragments) to tidy.
 * @returns The tidied text; may be the empty string.
 */
function cleanLineText(text: string): string {
  const collapsed = text.replace(/[ \t]{2,}/g, " ");
  return collapsed.trim();
}
/** One `![alt](destination)` occurrence located inside a single line. */
type MarkdownImageMatch = {
// Index of the `!` that opens the image syntax.
start: number;
// Index just past the closing `)`; used as an exclusive slice bound.
end: number;
// Destination text as parsed from inside the parentheses (title excluded).
destination: string;
};
// Lines longer than this are not scanned for markdown images at all.
const MAX_MARKDOWN_IMAGE_LINE_LENGTH = 20_000;
// Upper bound on how many `![` candidates are examined in one line.
const MAX_MARKDOWN_IMAGE_ATTEMPTS_PER_LINE = 80;
// Upper bound on how many image matches are collected from one line.
const MAX_MARKDOWN_IMAGE_MATCHES_PER_LINE = 50;
/**
 * Finds the `close` character that balances an opening bracket which the
 * caller has already consumed.
 *
 * Scanning begins at `start` with a nesting depth of one. Each further `open`
 * deepens the nesting, each `close` reduces it, and a backslash causes the
 * character after it to be skipped without being inspected.
 *
 * @param input - Text to scan.
 * @param start - Index of the first character after the opening bracket.
 * @param open - The opening bracket character.
 * @param close - The closing bracket character.
 * @returns Index of the balancing `close`, or `undefined` when the input ends
 *   before the nesting returns to zero.
 */
function findMatchingBracket(
  input: string,
  start: number,
  open: string,
  close: string,
): number | undefined {
  let pending = 1;
  let pos = start;
  while (pos < input.length) {
    const current = input[pos];
    if (current === "\\") {
      // Step over the backslash and the character it escapes.
      pos += 2;
      continue;
    }
    if (current === open) {
      pending += 1;
    } else if (current === close) {
      pending -= 1;
      if (pending === 0) {
        return pos;
      }
    }
    pos += 1;
  }
  return undefined;
}
/**
 * Decides whether a markdown image destination may be lifted into media.
 *
 * The candidate must start with `http://` or `https://` (case-insensitive) and
 * must also be accepted by `isValidMedia`. Anything without an HTTP(S) scheme
 * is rejected before the validator is consulted.
 *
 * @param candidate - The normalized destination string.
 * @returns `true` only when both checks pass.
 */
function isRemoteMarkdownImageMedia(candidate: string): boolean {
  if (!/^https?:\/\//i.test(candidate)) {
    return false;
  }
  return isValidMedia(candidate);
}
/**
 * Parses an optional markdown image title and the `)` that closes the image.
 *
 * Leading whitespace is skipped first. The title must then open with `"`, `'`
 * or `(` and be closed by the matching delimiter; a backslash skips the
 * character after it. After the title, optional whitespace and then `)` are
 * required.
 *
 * @param input - The full line being parsed.
 * @param start - Index at which to begin looking for the title.
 * @returns Index just past the closing `)` of the image, or `undefined` when
 *   no well-formed title followed by `)` is found.
 */
function parseMarkdownTitle(input: string, start: number): number | undefined {
  const skipWhitespace = (from: number): number => {
    let pos = from;
    while (pos < input.length && /\s/.test(input[pos] ?? "")) {
      pos += 1;
    }
    return pos;
  };

  const titleStart = skipWhitespace(start);
  const opener = input[titleStart];

  let closingIndex: number | undefined;
  if (opener === "(") {
    // Parenthesized titles may nest, so use the balanced-bracket scan.
    closingIndex = findMatchingBracket(input, titleStart + 1, "(", ")");
  } else if (opener === '"' || opener === "'") {
    // Quoted titles end at the first unescaped occurrence of the same quote.
    for (let pos = titleStart + 1; pos < input.length; pos += 1) {
      const current = input[pos];
      if (current === "\\") {
        pos += 1;
        continue;
      }
      if (current === opener) {
        closingIndex = pos;
        break;
      }
    }
  } else {
    // End of input, or a character that cannot open a title.
    return undefined;
  }

  if (closingIndex == null) {
    return undefined;
  }
  const afterTitle = skipWhitespace(closingIndex + 1);
  return input[afterTitle] === ")" ? afterTitle + 1 : undefined;
}
/**
 * Parses the destination of a markdown image, starting just after the `(`
 * that follows the alt text, up to and including the closing `)`.
 *
 * Two destination forms are recognized:
 * - `<...>`: everything up to the first unescaped `>` is the destination.
 * - bare: characters up to the first whitespace or unbalanced `)`; balanced
 *   parentheses inside the destination are kept.
 *
 * In both forms the destination may be followed by optional whitespace and
 * then either the closing `)` directly or a title (see `parseMarkdownTitle`)
 * and the closing `)`. A backslash skips the character after it.
 *
 * @param input - The full line being parsed.
 * @param start - Index of the first character after the opening `(`.
 * @returns The trimmed destination and the index just past the closing `)`,
 *   or `undefined` when the text is not a well-formed image destination.
 */
function parseMarkdownImageDestination(
  input: string,
  start: number,
): { destination: string; end: number } | undefined {
  let index = start;
  while (index < input.length && /\s/.test(input[index] ?? "")) {
    index += 1;
  }
  if (index >= input.length) {
    return undefined;
  }

  if (input[index] === "<") {
    let closing = index + 1;
    while (closing < input.length) {
      const ch = input[closing];
      if (ch === "\\") {
        closing += 2;
        continue;
      }
      if (ch === ">") {
        const destination = input.slice(index + 1, closing).trim();
        if (!destination) {
          return undefined;
        }
        let tailIndex = closing + 1;
        while (tailIndex < input.length && /\s/.test(input[tailIndex] ?? "")) {
          tailIndex += 1;
        }
        if (input[tailIndex] === ")") {
          return { destination, end: tailIndex + 1 };
        }
        const titledEnd = parseMarkdownTitle(input, tailIndex);
        return titledEnd ? { destination, end: titledEnd } : undefined;
      }
      closing += 1;
    }
    return undefined;
  }

  const destinationStart = index;
  let destinationEnd = index;
  let parenDepth = 0;
  while (index < input.length) {
    const ch = input[index];
    if (ch === "\\") {
      // Keep the backslash and the escaped character as part of the destination.
      index += 2;
      destinationEnd = index;
      continue;
    }
    if (ch === "(") {
      parenDepth += 1;
      index += 1;
      destinationEnd = index;
      continue;
    }
    if (ch === ")") {
      if (parenDepth === 0) {
        // Unbalanced `)` closes the image.
        const destination = input.slice(destinationStart, destinationEnd).trim();
        return destination ? { destination, end: index + 1 } : undefined;
      }
      parenDepth -= 1;
      index += 1;
      destinationEnd = index;
      continue;
    }
    if (/\s/.test(ch) && parenDepth === 0) {
      const destination = input.slice(destinationStart, destinationEnd).trim();
      if (!destination) {
        return undefined;
      }
      // Whitespace ends a bare destination. Like the `<...>` form above, allow
      // the closing `)` to follow directly after that whitespace; otherwise a
      // title must come next.
      let tailIndex = index;
      while (tailIndex < input.length && /\s/.test(input[tailIndex] ?? "")) {
        tailIndex += 1;
      }
      if (input[tailIndex] === ")") {
        return { destination, end: tailIndex + 1 };
      }
      const titledEnd = parseMarkdownTitle(input, tailIndex);
      return titledEnd ? { destination, end: titledEnd } : undefined;
    }
    index += 1;
    destinationEnd = index;
  }
  return undefined;
}
/**
 * Locates `![alt](destination)` occurrences in a single line, left to right.
 *
 * Lines longer than `MAX_MARKDOWN_IMAGE_LINE_LENGTH` are not scanned. The scan
 * stops once `MAX_MARKDOWN_IMAGE_ATTEMPTS_PER_LINE` candidates have been
 * examined or `MAX_MARKDOWN_IMAGE_MATCHES_PER_LINE` matches were collected.
 * A `![` that does not parse as a full image is skipped and scanning resumes
 * right after it.
 *
 * @param line - One line of output text.
 * @returns The matches in source order; empty when none were found.
 */
function findMarkdownImageMatches(line: string): MarkdownImageMatch[] {
  const found: MarkdownImageMatch[] = [];
  if (line.length > MAX_MARKDOWN_IMAGE_LINE_LENGTH) {
    return found;
  }

  let resumeAt = 0;
  for (
    let tries = 0;
    found.length < MAX_MARKDOWN_IMAGE_MATCHES_PER_LINE &&
    tries < MAX_MARKDOWN_IMAGE_ATTEMPTS_PER_LINE;
    tries += 1
  ) {
    const bang = line.indexOf("![", resumeAt);
    if (bang < 0) {
      break;
    }

    // The alt text must close with `]` and be followed immediately by `(`.
    const altClose = findMatchingBracket(line, bang + 2, "[", "]");
    const parsed =
      altClose != null && line[altClose + 1] === "("
        ? parseMarkdownImageDestination(line, altClose + 2)
        : undefined;

    if (!parsed) {
      // Not an image; continue just past this `![`.
      resumeAt = bang + 2;
      continue;
    }

    found.push({ start: bang, end: parsed.end, destination: parsed.destination });
    resumeAt = parsed.end;
  }
  return found;
}
/**
 * Splits one line into text and media segments based on the markdown images
 * it contains.
 *
 * Every image whose normalized destination passes
 * `isRemoteMarkdownImageMedia` is removed from the visible text, appended to
 * `params.media`, and emitted as a media segment. Images that fail the check
 * are left in the text exactly as written.
 *
 * @param params.line - The line to inspect.
 * @param params.media - Accumulator that receives each accepted URL (mutated).
 * @returns `lineSegments` in source order, `foundMedia` telling whether any
 *   image was lifted, and `cleanedLine` — the line with lifted images removed
 *   and whitespace tidied, or `undefined` when nothing visible remains. When
 *   the line has no image syntax at all, only empty `lineSegments` and
 *   `foundMedia: false` are returned.
 */
function collectMarkdownImageSegments(params: { line: string; media: string[] }): {
  cleanedLine?: string;
  lineSegments: ParsedMediaOutputSegment[];
  foundMedia: boolean;
} {
  const { line, media } = params;
  const imageMatches = findMarkdownImageMatches(line);
  if (imageMatches.length === 0) {
    return { lineSegments: [], foundMedia: false };
  }

  const lineSegments: ParsedMediaOutputSegment[] = [];
  // Text gathered since the last emitted media segment.
  let pendingText = "";
  // All text that stays visible on the line.
  let visibleText = "";
  let foundMedia = false;
  let consumed = 0;

  const keepText = (chunk: string): void => {
    pendingText += chunk;
    visibleText += chunk;
  };
  const flushPendingText = (): void => {
    const tidy = cleanLineText(pendingText);
    if (tidy) {
      lineSegments.push({ type: "text", text: tidy });
    }
    pendingText = "";
  };

  for (const { start, end, destination } of imageMatches) {
    keepText(line.slice(consumed, start));
    const url = normalizeMediaSource(cleanCandidate(unwrapQuoted(destination) ?? destination));
    if (isRemoteMarkdownImageMedia(url)) {
      // Emit the text before the image first so segment order follows the source.
      flushPendingText();
      media.push(url);
      lineSegments.push({ type: "media", url });
      foundMedia = true;
    } else {
      // Rejected target: keep the original markdown as ordinary text.
      keepText(line.slice(start, end));
    }
    consumed = end;
  }

  keepText(line.slice(consumed));
  flushPendingText();

  const cleanedLine = cleanLineText(visibleText);
  return {
    cleanedLine: cleanedLine || undefined,
    lineSegments,
    foundMedia,
  };
}
// Check if a character offset is inside any fenced code block
function isInsideFence(fenceSpans: Array<{ start: number; end: number }>, offset: number): boolean {
return fenceSpans.some((span) => offset >= span.start && offset < span.end);
@@ -144,8 +403,9 @@ export function splitMediaFromOutput(raw: string): {
return { text: "" };
}
const mayContainMediaToken = /media:/i.test(trimmedRaw);
const mayContainMarkdownImage = /!\[[^\]]*]\(/.test(trimmedRaw);
const mayContainAudioTag = trimmedRaw.includes("[[");
if (!mayContainMediaToken && !mayContainAudioTag) {
if (!mayContainMediaToken && !mayContainMarkdownImage && !mayContainAudioTag) {
return { text: trimmedRaw };
}
@@ -185,8 +445,23 @@ export function splitMediaFromOutput(raw: string): {
const trimmedStart = line.trimStart();
if (!trimmedStart.toUpperCase().startsWith("MEDIA:")) {
keptLines.push(line);
pushTextSegment(line);
const markdownImageResult = collectMarkdownImageSegments({ line, media });
if (!markdownImageResult.foundMedia) {
keptLines.push(line);
pushTextSegment(line);
} else {
foundMediaToken = true;
if (markdownImageResult.cleanedLine) {
keptLines.push(markdownImageResult.cleanedLine);
}
for (const segment of markdownImageResult.lineSegments) {
if (segment.type === "text") {
pushTextSegment(segment.text);
continue;
}
segments.push(segment);
}
}
lineOffset += line.length + 1; // +1 for newline
continue;
}
@@ -269,10 +544,7 @@ export function splitMediaFromOutput(raw: string): {
}
if (hasValidMedia) {
const beforeText = pieces
.join("")
.replace(/[ \t]{2,}/g, " ")
.trim();
const beforeText = cleanLineText(pieces.join(""));
if (beforeText) {
lineSegments.push({ type: "text", text: beforeText });
}
@@ -297,10 +569,7 @@ export function splitMediaFromOutput(raw: string): {
pieces.push(line.slice(cursor));
const cleanedLine = pieces
.join("")
.replace(/[ \t]{2,}/g, " ")
.trim();
const cleanedLine = cleanLineText(pieces.join(""));
// If the line becomes empty, drop it.
if (cleanedLine) {