fix(reply): parse markdown image replies as media

* fix(reply): parse markdown image replies as media
* fix(reply): preserve inline markdown image captions
* fix(reply): harden markdown image parsing
2026-05-06 06:10:44 +00:00 · 2026-04-23 19:34:30 -07:00
parent 04066d246a
commit 60d892d700
4 changed files with 394 additions and 11 deletions
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -24,6 +24,7 @@ Docs: https://docs.openclaw.ai
 ### Fixes

 - Agents/replay: stop OpenAI/Codex transcript replay from synthesizing missing tool results while still preserving synthetic repair on Anthropic, Gemini, and Bedrock transport-owned sessions. (#61556) Thanks @VictorJeon and @vincentkoc.
+- Telegram/media replies: parse remote markdown image syntax into outbound media payloads on the final reply path, so Telegram group chats stop falling back to plain-text image URLs when the model or a tool emits `![...](...)` instead of a `MEDIA:` token. (#66191) Thanks @apezam and @vincentkoc.
 - Agents/WebChat: surface non-retryable provider failures such as billing, auth, and rate-limit errors from the embedded runner instead of logging `surface_error` and leaving webchat with no rendered error. Fixes #70124. (#70848) Thanks @truffle-dev.
 - Memory/CLI: declare the built-in `local` embedding provider in the memory-core manifest, so standalone `openclaw memory status`, `index`, and `search` can resolve local embeddings just like the gateway runtime. Fixes #70836. (#70873) Thanks @mattznojassist.
 - Gateway/WebChat: preserve image attachments for text-only primary models by offloading them as media refs instead of dropping them, so configured image tools can still inspect the original file. Fixes #68513, #44276, #51656, #70212.
--- a/src/auto-reply/reply/agent-runner-payloads.test.ts
+++ b/src/auto-reply/reply/agent-runner-payloads.test.ts
@@ -246,6 +246,49 @@ describe("buildReplyPayloads media filter integration", () => {
    expect(replyPayloads).toHaveLength(0);
  });

+  it("extracts markdown image replies into final payload media urls", async () => { // Image sits on its own line after a caption.
+    const { replyPayloads } = await buildReplyPayloads({
+      ...baseParams,
+      payloads: [{ text: "Here you go\n\n![chart](https://example.com/chart.png)" }],
+    });
+
+    expect(replyPayloads).toHaveLength(1);
+    expect(replyPayloads[0]).toMatchObject({
+      text: "Here you go", // The image markdown and the blank line before it are gone.
+      mediaUrl: "https://example.com/chart.png",
+      mediaUrls: ["https://example.com/chart.png"],
+    });
+  });
+
+  it("preserves inline caption text when lifting markdown image replies into media", async () => { // Titled image sits mid-sentence.
+    const { replyPayloads } = await buildReplyPayloads({
+      ...baseParams,
+      payloads: [{ text: 'Look ![chart](https://example.com/chart.png "Quarterly chart") now' }],
+    });
+
+    expect(replyPayloads).toHaveLength(1);
+    expect(replyPayloads[0]).toMatchObject({
+      text: "Look now", // The title is dropped with the image and the gap collapses to one space.
+      mediaUrl: "https://example.com/chart.png",
+      mediaUrls: ["https://example.com/chart.png"],
+    });
+  });
+
+  it("keeps markdown local file images as plain text in final replies", async () => { // A file: target must not be lifted into media.
+    const text = "Look ![chart](file:///etc/passwd) now";
+    const { replyPayloads } = await buildReplyPayloads({
+      ...baseParams,
+      payloads: [{ text }],
+    });
+
+    expect(replyPayloads).toHaveLength(1);
+    expect(replyPayloads[0]).toMatchObject({
+      text, // The reply text is expected back unchanged.
+    });
+    expect(replyPayloads[0]?.mediaUrl).toBeUndefined(); // No media fields may be set for the local target.
+    expect(replyPayloads[0]?.mediaUrls).toBeUndefined();
+  });
+
  it("deduplicates final payloads against directly sent block keys regardless of replyToId", async () => {
    // When block streaming is not active but directlySentBlockKeys has entries
    // (e.g. from pre-tool flush), the key should match even if replyToId differs.
--- a/src/media/parse.test.ts
+++ b/src/media/parse.test.ts
@@ -103,4 +103,74 @@ describe("splitMediaFromOutput", () => {
      { type: "text", text: "```text\nMEDIA:https://example.com/ignored.png\n```\nAfter" },
    ]);
  });
+
+  it("extracts markdown image urls while keeping surrounding caption text", () => { // Image on its own line: only the caption text remains.
+    expectParsedMediaOutputCase("Caption\n\n![chart](https://example.com/chart.png)", {
+      text: "Caption",
+      mediaUrls: ["https://example.com/chart.png"],
+    });
+  });
+
+  it("keeps inline caption text around markdown images", () => { // Inline image: text on both sides is joined by a single space.
+    expectParsedMediaOutputCase("Look ![chart](https://example.com/chart.png) now", {
+      text: "Look now",
+      mediaUrls: ["https://example.com/chart.png"],
+    });
+  });
+
+  it("extracts multiple markdown image urls in order", () => { // Image-only lines vanish from the text; media keeps source order.
+    expectParsedMediaOutputCase(
+      "Before\n![one](https://example.com/one.png)\nMiddle\n![two](https://example.com/two.png)\nAfter",
+      {
+        text: "Before\nMiddle\nAfter",
+        mediaUrls: ["https://example.com/one.png", "https://example.com/two.png"],
+      },
+    );
+  });
+
+  it("strips markdown image title suffixes from extracted urls", () => { // The quoted title must not end up in the URL or the text.
+    expectParsedMediaOutputCase(
+      'Caption ![chart](https://example.com/chart.png "Quarterly chart")',
+      {
+        text: "Caption",
+        mediaUrls: ["https://example.com/chart.png"],
+      },
+    );
+  });
+
+  it("keeps balanced parentheses inside markdown image urls", () => { // The "(1)" stays in the URL instead of ending the destination early.
+    expectParsedMediaOutputCase("Chart ![img](https://example.com/a_(1).png) now", {
+      text: "Chart now",
+      mediaUrls: ["https://example.com/a_(1).png"],
+    });
+  });
+
+  it.each([ // Cases: a file: URL, an absolute POSIX path, and a Windows drive path.
+    "![x](file:///etc/passwd)",
+    "![x](/var/run/secrets/kubernetes.io/serviceaccount/token)",
+    "![x](C:\\\\Windows\\\\System32\\\\drivers\\\\etc\\\\hosts)",
+  ] as const)("does not lift local markdown image target: %s", (input) => { // Each input must come back as unchanged text with no media.
+    expectParsedMediaOutputCase(input, {
+      text: input,
+      mediaUrls: undefined,
+    });
+  });
+
+  it("does not lift markdown image urls that fail media validation", () => { // NOTE(review): presumably the 4097-char path exceeds a length limit in isValidMedia; confirm.
+    const longUrl = `![x](https://example.com/${"a".repeat(4097)}.png)`;
+
+    expectParsedMediaOutputCase(longUrl, {
+      text: longUrl,
+      mediaUrls: undefined,
+    });
+  });
+
+  it("leaves very long markdown-image candidate lines as text", () => { // "prefix " x 3000 is 21,000 chars, above MAX_MARKDOWN_IMAGE_LINE_LENGTH (20_000) in parse.ts.
+    const input = `${"prefix ".repeat(3000)}![x](https://example.com/image.png)`;
+
+    expectParsedMediaOutputCase(input, {
+      text: input,
+      mediaUrls: undefined,
+    });
+  });
 });
--- a/src/media/parse.ts
+++ b/src/media/parse.ts
@@ -125,6 +125,265 @@ function mayContainFenceMarkers(input: string): boolean {
  return input.includes("```") || input.includes("~~~");
 }

+function cleanLineText(text: string): string { // Collapse each run of 2+ spaces/tabs into one space, then trim both ends.
+  return text.replace(/[ \t]{2,}/g, " ").trim();
+}
+
+type MarkdownImageMatch = { // One image span located within a single line by findMarkdownImageMatches.
+  start: number; // Index of the "![" opener in the line.
+  end: number; // Index just past the closing ")" (exclusive bound, usable with slice).
+  destination: string; // Link target text, trimmed; angle brackets and any title are excluded, escapes are not resolved.
+};
+
+const MAX_MARKDOWN_IMAGE_LINE_LENGTH = 20_000; // Lines longer than this are not scanned for images at all.
+const MAX_MARKDOWN_IMAGE_ATTEMPTS_PER_LINE = 80; // Cap on "![" candidates examined per line, matched or not.
+const MAX_MARKDOWN_IMAGE_MATCHES_PER_LINE = 50; // Cap on image matches collected per line.
+
+function findMatchingBracket( // Find the closer that balances an opener assumed to sit just before start.
+  input: string,
+  start: number, // First index to inspect, i.e. the position after the already-seen opener.
+  open: string,
+  close: string,
+): number | undefined { // Returns the index of the balancing closer, or undefined when none is found.
+  let depth = 1; // Starts at 1 to account for the opener preceding start.
+  for (let i = start; i < input.length; i += 1) {
+    const ch = input[i];
+    if (ch === "\\") { // A backslash escapes the next character: skip both.
+      i += 1;
+      continue;
+    }
+    if (ch === open) { // Nested opener: one more closer is needed.
+      depth += 1;
+      continue;
+    }
+    if (ch !== close) {
+      continue;
+    }
+    depth -= 1;
+    if (depth === 0) { // This closer balances the original opener.
+      return i;
+    }
+  }
+  return undefined; // Ran out of input while still nested.
+}
+
+function isRemoteMarkdownImageMedia(candidate: string): boolean { // True only for http(s) URLs that also pass isValidMedia.
+  return /^https?:\/\//i.test(candidate) && isValidMedia(candidate);
+}
+
+function parseMarkdownTitle(input: string, start: number): number | undefined { // Parse whitespace, a title, whitespace and the closing ")" from start; returns the index just past ")" or undefined.
+  let index = start;
+  while (index < input.length && /\s/.test(input[index] ?? "")) { // Skip whitespace before the title.
+    index += 1;
+  }
+  const opener = input[index];
+  if (!opener) { // Reached the end of input without a title.
+    return undefined;
+  }
+  const closer = opener === '"' || opener === "'" ? opener : opener === "(" ? ")" : null; // Accepted title delimiters: double quotes, single quotes, or parentheses.
+  if (!closer) { // Any other character here means this is not a title.
+    return undefined;
+  }
+  const closingIndex =
+    opener === "("
+      ? findMatchingBracket(input, index + 1, "(", ")") // Parenthesized title: nested parentheses must balance.
+      : (() => { // Quoted title: scan for the first unescaped matching quote.
+          for (let i = index + 1; i < input.length; i += 1) {
+            const ch = input[i];
+            if (ch === "\\") { // Skip the escaped character.
+              i += 1;
+              continue;
+            }
+            if (ch === closer) {
+              return i;
+            }
+          }
+          return undefined;
+        })();
+  if (closingIndex == null) { // Unterminated title.
+    return undefined;
+  }
+  let tailIndex = closingIndex + 1;
+  while (tailIndex < input.length && /\s/.test(input[tailIndex] ?? "")) { // Skip whitespace after the title.
+    tailIndex += 1;
+  }
+  return input[tailIndex] === ")" ? tailIndex + 1 : undefined; // The image must close right after the title.
+}
+
+function parseMarkdownImageDestination( // Parse an image target (destination plus optional title) up to its closing ")".
+  input: string,
+  start: number, // Index to begin at; the caller in this file passes the position right after "(".
+): { destination: string; end: number } | undefined { // end is the index just past the closing ")"; undefined means no valid target.
+  let index = start;
+  while (index < input.length && /\s/.test(input[index] ?? "")) { // Skip leading whitespace.
+    index += 1;
+  }
+  if (index >= input.length) {
+    return undefined;
+  }
+
+  if (input[index] === "<") { // Angle-bracket form: <destination>, optionally followed by a title.
+    let closing = index + 1;
+    while (closing < input.length) {
+      const ch = input[closing];
+      if (ch === "\\") {
+        closing += 2; // Skip the backslash and the character it escapes.
+        continue;
+      }
+      if (ch === ">") {
+        const destination = input.slice(index + 1, closing).trim();
+        if (!destination) { // Empty "<>" is rejected.
+          return undefined;
+        }
+        let tailIndex = closing + 1;
+        while (tailIndex < input.length && /\s/.test(input[tailIndex] ?? "")) {
+          tailIndex += 1;
+        }
+        if (input[tailIndex] === ")") { // No title: the image closes here.
+          return { destination, end: tailIndex + 1 };
+        }
+        const titledEnd = parseMarkdownTitle(input, tailIndex); // Otherwise a valid title must follow.
+        return titledEnd ? { destination, end: titledEnd } : undefined;
+      }
+      closing += 1;
+    }
+    return undefined; // Unterminated "<".
+  }
+
+  const destinationStart = index; // Bare form: read until top-level whitespace or the closing ")".
+  let destinationEnd = index;
+  let parenDepth = 0; // Tracks "(" nesting so balanced parentheses stay part of the destination.
+  while (index < input.length) {
+    const ch = input[index];
+    if (ch === "\\") {
+      index += 2; // Keep the escape sequence in the destination and move past it.
+      destinationEnd = index;
+      continue;
+    }
+    if (ch === "(") {
+      parenDepth += 1;
+      index += 1;
+      destinationEnd = index;
+      continue;
+    }
+    if (ch === ")") {
+      if (parenDepth === 0) { // An unnested ")" closes the image.
+        const destination = input.slice(destinationStart, destinationEnd).trim();
+        return destination ? { destination, end: index + 1 } : undefined;
+      }
+      parenDepth -= 1;
+      index += 1;
+      destinationEnd = index;
+      continue;
+    }
+    if (/\s/.test(ch) && parenDepth === 0) { // Top-level whitespace ends the destination.
+      const destination = input.slice(destinationStart, destinationEnd).trim();
+      if (!destination) {
+        return undefined;
+      }
+      const titledEnd = parseMarkdownTitle(input, index); // What follows must be a valid title, else the target is rejected.
+      return titledEnd ? { destination, end: titledEnd } : undefined;
+    }
+    index += 1;
+    destinationEnd = index;
+  }
+  return undefined; // Ran off the end without a closing ")".
+}
+
+function findMarkdownImageMatches(line: string): MarkdownImageMatch[] { // Scan one line left to right for ![alt](destination) spans.
+  if (line.length > MAX_MARKDOWN_IMAGE_LINE_LENGTH) { // Over-long lines are not scanned.
+    return [];
+  }
+  const matches: MarkdownImageMatch[] = [];
+  let searchIndex = 0;
+  let attempts = 0; // Counts every "![" candidate examined, matched or not.
+  while (
+    matches.length < MAX_MARKDOWN_IMAGE_MATCHES_PER_LINE &&
+    attempts < MAX_MARKDOWN_IMAGE_ATTEMPTS_PER_LINE
+  ) {
+    const index = line.indexOf("![", searchIndex);
+    if (index < 0) {
+      break;
+    }
+    attempts += 1;
+    const altEnd = findMatchingBracket(line, index + 2, "[", "]"); // Alt text may contain nested or escaped brackets.
+    if (altEnd == null || line[altEnd + 1] !== "(") { // The closing "]" must be immediately followed by "(".
+      searchIndex = index + 2; // Resume just after this "![".
+      continue;
+    }
+    const parsed = parseMarkdownImageDestination(line, altEnd + 2);
+    if (!parsed) {
+      searchIndex = index + 2;
+      continue;
+    }
+    matches.push({
+      start: index,
+      end: parsed.end,
+      destination: parsed.destination,
+    });
+    searchIndex = parsed.end; // Continue after the matched image.
+  }
+  return matches;
+}
+
+function collectMarkdownImageSegments(params: { line: string; media: string[] }): { // Split one line into text/media segments; accepted URLs are appended to params.media.
+  cleanedLine?: string; // Line text with lifted images removed; undefined when nothing is left.
+  lineSegments: ParsedMediaOutputSegment[]; // Text and media segments in source order.
+  foundMedia: boolean; // True when at least one image was lifted into media.
+} {
+  const matches = findMarkdownImageMatches(params.line);
+  if (matches.length === 0) { // No image syntax was matched on this line.
+    return { lineSegments: [], foundMedia: false };
+  }
+
+  const segmentPieces: string[] = []; // Text accumulated since the last emitted media segment.
+  const visiblePieces: string[] = []; // All text that stays on the line.
+  const lineSegments: ParsedMediaOutputSegment[] = [];
+  let cursor = 0; // Index in the line up to which text has been consumed.
+  let foundMedia = false;
+
+  for (const match of matches) {
+    const before = params.line.slice(cursor, match.start);
+    segmentPieces.push(before);
+    visiblePieces.push(before);
+
+    const target = normalizeMediaSource( // Destination passed through unwrapQuoted, cleanCandidate and normalizeMediaSource before validation.
+      cleanCandidate(unwrapQuoted(match.destination) ?? match.destination),
+    );
+    if (isRemoteMarkdownImageMedia(target)) { // Remote image: flush pending text, then emit a media segment.
+      const beforeText = cleanLineText(segmentPieces.join(""));
+      if (beforeText) {
+        lineSegments.push({ type: "text", text: beforeText });
+      }
+      segmentPieces.length = 0; // Start a fresh text run after the media segment.
+      params.media.push(target);
+      lineSegments.push({ type: "media", url: target });
+      foundMedia = true;
+    } else { // Not liftable (e.g. a local path): keep the original markdown as text.
+      const original = params.line.slice(match.start, match.end);
+      segmentPieces.push(original);
+      visiblePieces.push(original);
+    }
+
+    cursor = match.end;
+  }
+
+  const after = params.line.slice(cursor); // Text following the last match.
+  segmentPieces.push(after);
+  visiblePieces.push(after);
+  const trailingText = cleanLineText(segmentPieces.join(""));
+  if (trailingText) {
+    lineSegments.push({ type: "text", text: trailingText });
+  }
+  const cleanedLine = cleanLineText(visiblePieces.join(""));
+
+  return {
+    cleanedLine: cleanedLine || undefined, // An empty string is reported as undefined.
+    lineSegments,
+    foundMedia,
+  };
+}
+
 // Check if a character offset is inside any fenced code block
 function isInsideFence(fenceSpans: Array<{ start: number; end: number }>, offset: number): boolean {
  return fenceSpans.some((span) => offset >= span.start && offset < span.end);
@@ -144,8 +403,9 @@ export function splitMediaFromOutput(raw: string): {
    return { text: "" };
  }
  const mayContainMediaToken = /media:/i.test(trimmedRaw);
+  const mayContainMarkdownImage = /!\[[^\]]*]\(/.test(trimmedRaw);
  const mayContainAudioTag = trimmedRaw.includes("[[");
-  if (!mayContainMediaToken && !mayContainAudioTag) {
+  if (!mayContainMediaToken && !mayContainMarkdownImage && !mayContainAudioTag) {
    return { text: trimmedRaw };
  }

@@ -185,8 +445,23 @@ export function splitMediaFromOutput(raw: string): {

    const trimmedStart = line.trimStart();
    if (!trimmedStart.toUpperCase().startsWith("MEDIA:")) {
-      keptLines.push(line);
-      pushTextSegment(line);
+      const markdownImageResult = collectMarkdownImageSegments({ line, media });
+      if (!markdownImageResult.foundMedia) {
+        keptLines.push(line);
+        pushTextSegment(line);
+      } else {
+        foundMediaToken = true;
+        if (markdownImageResult.cleanedLine) {
+          keptLines.push(markdownImageResult.cleanedLine);
+        }
+        for (const segment of markdownImageResult.lineSegments) {
+          if (segment.type === "text") {
+            pushTextSegment(segment.text);
+            continue;
+          }
+          segments.push(segment);
+        }
+      }
      lineOffset += line.length + 1; // +1 for newline
      continue;
    }
@@ -269,10 +544,7 @@ export function splitMediaFromOutput(raw: string): {
      }

      if (hasValidMedia) {
-        const beforeText = pieces
-          .join("")
-          .replace(/[ \t]{2,}/g, " ")
-          .trim();
+        const beforeText = cleanLineText(pieces.join(""));
        if (beforeText) {
          lineSegments.push({ type: "text", text: beforeText });
        }
@@ -297,10 +569,7 @@ export function splitMediaFromOutput(raw: string): {

    pieces.push(line.slice(cursor));

-    const cleanedLine = pieces
-      .join("")
-      .replace(/[ \t]{2,}/g, " ")
-      .trim();
+    const cleanedLine = cleanLineText(pieces.join(""));

    // If the line becomes empty, drop it.
    if (cleanedLine) {