diff --git a/CHANGELOG.md b/CHANGELOG.md index fe066f4fd48..ae15b4601c8 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -97,6 +97,7 @@ Docs: https://docs.openclaw.ai - Google Meet: route stateful `google_meet` tool actions through the gateway-owned runtime so created or joined realtime sessions remain visible to status, speak, and leave after the agent turn ends. Fixes #72440. (#72441) Thanks @BsnizND. - Google Meet/Voice Call: send Gemini Live a non-blocking consult continuation before long OpenClaw agent consults finish, then deliver the final result when idle so calls and meetings do not sit silent during tool-backed answers. (#72189) Thanks @VACInc. - Google Meet: preserve Gemini Live function names when replying to realtime tool calls so Google SDK validation accepts the `FunctionResponse` payload. Fixes #72425. (#72426) Thanks @BsnizND. +- Discord/media: keep incidental Markdown image badges in final replies as text unless a channel opts into Markdown-image media extraction, while preserving Telegram Markdown-image media replies and explicit `MEDIA:` attachments. Fixes #72642. Thanks @solavrc and @Bartok9. - Matrix/E2EE: stabilize recovery and broken-device QA flows while avoiding Matrix device-cleanup sync races that could leave shutdown-time crypto work running. Thanks @gumadeiras. - Cron: apply `cron.maxConcurrentRuns` to a dedicated `cron-nested` isolated agent-turn lane as well as cron dispatch, so parallel cron jobs no longer serialize on inner LLM execution while non-cron nested flows keep their existing lane behavior. Fixes #72707. Thanks @kagura-agent. - Cron: report isolated runs as successful when verified cron delivery already delivered the reply, while keeping unresolved Message/Canvas tool failures fatal. Fixes #72732 and #50170; follow-up to #54188. Thanks @zNatix, @pixeldyn, and @ChickenEggRoll. diff --git a/docs/reference/rich-output-protocol.md b/docs/reference/rich-output-protocol.md index c869dd20072..f31dd569f66 100644 --- a/docs/reference/rich-output-protocol.md +++ b/docs/reference/rich-output-protocol.md @@ -17,6 +17,10 @@ Remote `MEDIA:` attachments must be public `https:` URLs. Plain `http:`, loopback, link-local, private, and internal hostnames are ignored as attachment directives; server-side media fetchers still enforce their own network guards. +Plain Markdown image syntax stays text by default. Channels that intentionally +map Markdown image replies to media attachments opt in at their outbound +adapter; Telegram does this so `![alt](url)` can still become a media reply. + These directives are separate. `MEDIA:` and reply/voice tags remain delivery metadata; `[embed ...]` is the web-only rich render path. Trusted tool-result media uses the same `MEDIA:` / `[[audio_as_voice]]` parser before delivery, so text tool outputs can still mark an audio attachment as a voice note. diff --git a/extensions/telegram/src/outbound-adapter.ts b/extensions/telegram/src/outbound-adapter.ts index 4fb8d0395cd..03e32a888b7 100644 --- a/extensions/telegram/src/outbound-adapter.ts +++ b/extensions/telegram/src/outbound-adapter.ts @@ -121,6 +121,7 @@ export const telegramOutbound: ChannelOutboundAdapter = { deliveryMode: "direct", chunker: markdownToTelegramHtmlChunks, chunkerMode: "markdown", + extractMarkdownImages: true, textChunkLimit: TELEGRAM_TEXT_CHUNK_LIMIT, sanitizeText: ({ text }) => sanitizeForPlainText(text), shouldSkipPlainTextSanitization: ({ payload }) => Boolean(payload.channelData), diff --git a/extensions/telegram/src/outbound-base.ts b/extensions/telegram/src/outbound-base.ts index e6793c8c880..a351874ee5c 100644 --- a/extensions/telegram/src/outbound-base.ts +++ b/extensions/telegram/src/outbound-base.ts @@ -4,6 +4,7 @@ export const telegramOutboundBaseAdapter = { deliveryMode: "direct" as const, chunker: chunkMarkdownText, chunkerMode: "markdown" as const, + extractMarkdownImages: true, textChunkLimit: 4000, pollMaxOptions: 10, }; diff --git a/src/auto-reply/reply/agent-runner-payloads.test.ts b/src/auto-reply/reply/agent-runner-payloads.test.ts index c29fa559ddd..b26f1360539 100644 --- a/src/auto-reply/reply/agent-runner-payloads.test.ts +++ b/src/auto-reply/reply/agent-runner-payloads.test.ts @@ -350,6 +350,7 @@ describe("buildReplyPayloads media filter integration", () => { it("extracts markdown image replies into final payload media urls", async () => { const { replyPayloads } = await buildReplyPayloads({ ...baseParams, + extractMarkdownImages: true, payloads: [{ text: "Here you go\n\n![chart](https://example.com/chart.png)" }], }); @@ -364,6 +365,7 @@ describe("buildReplyPayloads media filter integration", () => { it("preserves inline caption text when lifting markdown image replies into media", async () => { const { replyPayloads } = await buildReplyPayloads({ ...baseParams, + extractMarkdownImages: true, payloads: [{ text: 'Look ![chart](https://example.com/chart.png "Quarterly chart") now' }], }); @@ -379,6 +381,7 @@ describe("buildReplyPayloads media filter integration", () => { const text = "Look ![chart](file:///etc/passwd) now"; const { replyPayloads } = await buildReplyPayloads({ ...baseParams, + extractMarkdownImages: true, payloads: [{ text }], }); diff --git a/src/auto-reply/reply/agent-runner-payloads.ts b/src/auto-reply/reply/agent-runner-payloads.ts index 1f6e7d2e000..4806f7b2228 100644 --- a/src/auto-reply/reply/agent-runner-payloads.ts +++ b/src/auto-reply/reply/agent-runner-payloads.ts @@ -107,6 +107,7 @@ export async function buildReplyPayloads(params: { originatingChannel?: OriginatingChannelType; originatingTo?: string; accountId?: string; + extractMarkdownImages?: boolean; normalizeMediaPaths?: (payload: ReplyPayload) => Promise; }): Promise<{ replyPayloads: ReplyPayload[]; didLogHeartbeatStrip: boolean }> { let didLogHeartbeatStrip = params.didLogHeartbeatStrip; @@ -148,6 +149,7 @@ export async function buildReplyPayloads(params: { currentMessageId: params.currentMessageId, silentToken: SILENT_REPLY_TOKEN, parseMode: "always", + extractMarkdownImages: params.extractMarkdownImages, }); const mediaNormalizedPayload = await normalizeReplyPayloadMedia({ payload: parsed.payload, diff --git a/src/auto-reply/reply/reply-delivery.ts b/src/auto-reply/reply/reply-delivery.ts index c20c6ec873d..9a428e16ec5 100644 --- a/src/auto-reply/reply/reply-delivery.ts +++ b/src/auto-reply/reply/reply-delivery.ts @@ -17,6 +17,7 @@ export function normalizeReplyPayloadDirectives(params: { silentToken?: string; trimLeadingWhitespace?: boolean; parseMode?: ReplyDirectiveParseMode; + extractMarkdownImages?: boolean; }): { payload: ReplyPayload; isSilent: boolean } { const parseMode = params.parseMode ?? "always"; const silentToken = params.silentToken ?? SILENT_REPLY_TOKEN; @@ -27,12 +28,14 @@ export function normalizeReplyPayloadDirectives(params: { (parseMode === "auto" && (sourceText.includes("[[") || /media:/i.test(sourceText) || + (params.extractMarkdownImages === true && /!\[[^\]]*]\(/.test(sourceText)) || sourceText.includes(silentToken))); const parsed = shouldParse ? parseReplyDirectives(sourceText, { currentMessageId: params.currentMessageId, silentToken, + extractMarkdownImages: params.extractMarkdownImages, }) : undefined; diff --git a/src/auto-reply/reply/reply-directives.ts b/src/auto-reply/reply/reply-directives.ts index 14faf7e4e1d..586cda7c353 100644 --- a/src/auto-reply/reply/reply-directives.ts +++ b/src/auto-reply/reply/reply-directives.ts @@ -13,11 +13,19 @@ export type ReplyDirectiveParseResult = { isSilent: boolean; }; +export type ReplyDirectiveParseOptions = { + currentMessageId?: string; + silentToken?: string; + extractMarkdownImages?: boolean; +}; + export function parseReplyDirectives( raw: string, - options: { currentMessageId?: string; silentToken?: string } = {}, + options: ReplyDirectiveParseOptions = {}, ): ReplyDirectiveParseResult { - const split = splitMediaFromOutput(raw); + const split = splitMediaFromOutput(raw, { + extractMarkdownImages: options.extractMarkdownImages, + }); let text = split.text ?? ""; const replyParsed = parseInlineDirectives(text, { diff --git a/src/channels/plugins/outbound.types.ts b/src/channels/plugins/outbound.types.ts index acc8425f2a8..75c381848b1 100644 --- a/src/channels/plugins/outbound.types.ts +++ b/src/channels/plugins/outbound.types.ts @@ -76,6 +76,8 @@ export type ChannelOutboundAdapter = { deliveryMode: "direct" | "gateway" | "hybrid"; chunker?: ((text: string, limit: number, ctx?: ChannelOutboundChunkContext) => string[]) | null; chunkerMode?: "text" | "markdown"; + /** Lift remote Markdown image syntax in text into outbound media attachments. */ + extractMarkdownImages?: boolean; textChunkLimit?: number; sanitizeText?: (params: { text: string; payload: ReplyPayload }) => string; pollMaxOptions?: number; diff --git a/src/infra/outbound/deliver.test.ts b/src/infra/outbound/deliver.test.ts index 8bc2e980f56..26110e9a482 100644 --- a/src/infra/outbound/deliver.test.ts +++ b/src/infra/outbound/deliver.test.ts @@ -1235,6 +1235,54 @@ describe("deliverOutboundPayloads", () => { ); }); + it("keeps markdown images as text for channels that do not opt in", async () => { + const sendMatrix = vi.fn().mockResolvedValue({ messageId: "m-text", roomId: "!room" }); + + await deliverOutboundPayloads({ + cfg: matrixChunkConfig, + channel: "matrix", + to: "!room:example", + payloads: [{ text: "Tech: ![Node.js](https://img.shields.io/badge/Node.js-339933)" }], + deps: { matrix: sendMatrix }, + }); + + expect(sendMatrix).toHaveBeenCalledWith( + "!room:example", + "Tech: ![Node.js](https://img.shields.io/badge/Node.js-339933)", + expect.not.objectContaining({ mediaUrl: expect.any(String) }), + ); + }); + + it("extracts markdown images for channels that opt in", async () => { + const sendMatrix = vi.fn().mockResolvedValue({ messageId: "m-media", roomId: "!room" }); + setActivePluginRegistry( + createTestRegistry([ + { + pluginId: "matrix", + source: "test", + plugin: createOutboundTestPlugin({ + id: "matrix", + outbound: { ...matrixOutboundForTest, extractMarkdownImages: true }, + }), + }, + ]), + ); + + await deliverOutboundPayloads({ + cfg: matrixChunkConfig, + channel: "matrix", + to: "!room:example", + payloads: [{ text: "Chart ![chart](https://example.com/chart.png) now" }], + deps: { matrix: sendMatrix }, + }); + + expect(sendMatrix).toHaveBeenCalledWith( + "!room:example", + "Chart now", + expect.objectContaining({ mediaUrl: "https://example.com/chart.png" }), + ); + }); + it("normalizes payloads and drops empty entries", () => { const normalized = normalizeOutboundPayloads([ { text: "hi" }, diff --git a/src/infra/outbound/deliver.ts b/src/infra/outbound/deliver.ts index 0724ee5390c..4ea06eac070 100644 --- a/src/infra/outbound/deliver.ts +++ b/src/infra/outbound/deliver.ts @@ -147,6 +147,24 @@ type ChannelHandlerParams = { }; // Channel docking: outbound delivery delegates to plugin.outbound adapters. +async function resolveChannelOutboundDirectiveOptions(params: { + cfg: OpenClawConfig; + channel: Exclude; +}): Promise<{ extractMarkdownImages?: boolean }> { + let outbound = await loadChannelOutboundAdapter(params.channel); + if (!outbound) { + const { bootstrapOutboundChannelPlugin } = await loadChannelBootstrapRuntime(); + bootstrapOutboundChannelPlugin({ + channel: params.channel, + cfg: params.cfg, + }); + outbound = await loadChannelOutboundAdapter(params.channel); + } + return { + extractMarkdownImages: outbound?.extractMarkdownImages === true ? true : undefined, + }; +} + async function createChannelHandler(params: ChannelHandlerParams): Promise { let outbound = await loadChannelOutboundAdapter(params.channel); if (!outbound) { @@ -841,11 +859,13 @@ async function deliverOutboundPayloadsCore( params: DeliverOutboundPayloadsCoreParams, ): Promise { const { cfg, channel, to, payloads } = params; + const directiveOptions = await resolveChannelOutboundDirectiveOptions({ cfg, channel }); const outboundPayloadPlan = createOutboundPayloadPlan(payloads, { cfg, sessionKey: params.session?.policyKey ?? params.session?.key, surface: channel, conversationType: params.session?.conversationType, + extractMarkdownImages: directiveOptions.extractMarkdownImages, }); const accountId = params.accountId; const deps = params.deps; diff --git a/src/infra/outbound/payloads.test.ts b/src/infra/outbound/payloads.test.ts index cb0785ef187..97b06015e5f 100644 --- a/src/infra/outbound/payloads.test.ts +++ b/src/infra/outbound/payloads.test.ts @@ -642,6 +642,44 @@ describe("OutboundPayloadPlan projections", () => { const plan = createOutboundPayloadPlan(matrix); expect(projectOutboundPayloadPlanForMirror(plan)).toEqual(resolveMirrorProjection(matrix)); }); + + it("keeps markdown images as text unless extraction is enabled", () => { + const input = "Tech: ![Node.js](https://img.shields.io/badge/Node.js-339933)"; + + expect( + projectOutboundPayloadPlanForDelivery(createOutboundPayloadPlan([{ text: input }])), + ).toEqual([ + { + text: input, + mediaUrl: undefined, + mediaUrls: undefined, + replyToId: undefined, + replyToCurrent: undefined, + replyToTag: false, + audioAsVoice: false, + }, + ]); + }); + + it("extracts markdown images when the outbound channel opts in", () => { + const input = "Chart ![chart](https://example.com/chart.png) now"; + + expect( + projectOutboundPayloadPlanForDelivery( + createOutboundPayloadPlan([{ text: input }], { extractMarkdownImages: true }), + ), + ).toEqual([ + { + text: "Chart now", + mediaUrl: "https://example.com/chart.png", + mediaUrls: ["https://example.com/chart.png"], + replyToId: undefined, + replyToCurrent: undefined, + replyToTag: false, + audioAsVoice: false, + }, + ]); + }); }); describe("formatOutboundPayloadLog", () => { diff --git a/src/infra/outbound/payloads.ts b/src/infra/outbound/payloads.ts index c9a0fed521b..a67190a51bb 100644 --- a/src/infra/outbound/payloads.ts +++ b/src/infra/outbound/payloads.ts @@ -67,6 +67,7 @@ type OutboundPayloadPlanContext = { * (see `pending-spawn-query.ts`). */ hasPendingSpawnedChildren?: boolean; + extractMarkdownImages?: boolean; }; export type OutboundPayloadMirror = { @@ -131,11 +132,14 @@ type PreparedOutboundPayloadPlanEntry = { function createOutboundPayloadPlanEntry( payload: ReplyPayload, + context: Pick = {}, ): PreparedOutboundPayloadPlanEntry | null { if (shouldSuppressReasoningPayload(payload)) { return null; } - const parsed = parseReplyDirectives(payload.text ?? ""); + const parsed = parseReplyDirectives(payload.text ?? "", { + extractMarkdownImages: context.extractMarkdownImages, + }); const explicitMediaUrls = payload.mediaUrls ?? parsed.mediaUrls; const explicitMediaUrl = payload.mediaUrl ?? parsed.mediaUrl; const mergedMedia = mergeMediaUrls( @@ -193,7 +197,9 @@ export function createOutboundPayloadPlan( context.hasPendingSpawnedChildren ?? resolvePendingSpawnedChildren(context.sessionKey); const prepared: PreparedOutboundPayloadPlanEntry[] = []; for (const payload of payloads) { - const entry = createOutboundPayloadPlanEntry(payload); + const entry = createOutboundPayloadPlanEntry(payload, { + extractMarkdownImages: context.extractMarkdownImages, + }); if (!entry) { continue; } diff --git a/src/media/parse.test.ts b/src/media/parse.test.ts index 7ac1f2b3d59..a01f931a2a7 100644 --- a/src/media/parse.test.ts +++ b/src/media/parse.test.ts @@ -1,5 +1,5 @@ import { describe, expect, it } from "vitest"; -import { splitMediaFromOutput } from "./parse.js"; +import { splitMediaFromOutput, type SplitMediaFromOutputOptions } from "./parse.js"; describe("splitMediaFromOutput", () => { function expectParsedMediaOutputCase( @@ -9,8 +9,9 @@ describe("splitMediaFromOutput", () => { text?: string; audioAsVoice?: boolean; }, + options?: SplitMediaFromOutputOptions, ) { - const result = splitMediaFromOutput(input); + const result = splitMediaFromOutput(input, options); expect(result.text).toBe(expected.text ?? ""); if ("audioAsVoice" in expected) { expect(result.audioAsVoice).toBe(expected.audioAsVoice); @@ -126,18 +127,36 @@ describe("splitMediaFromOutput", () => { ]); }); - it("extracts markdown image urls while keeping surrounding caption text", () => { - expectParsedMediaOutputCase("Caption\n\n![chart](https://example.com/chart.png)", { - text: "Caption", - mediaUrls: ["https://example.com/chart.png"], + const extractMarkdownImages = { extractMarkdownImages: true } as const; + + it("keeps markdown image urls as text by default", () => { + const input = "Caption\n\n![chart](https://example.com/chart.png)"; + expectParsedMediaOutputCase(input, { + text: input, + mediaUrls: undefined, }); }); - it("keeps inline caption text around markdown images", () => { - expectParsedMediaOutputCase("Look ![chart](https://example.com/chart.png) now", { - text: "Look now", - mediaUrls: ["https://example.com/chart.png"], - }); + it("extracts markdown image urls while keeping surrounding caption text when enabled", () => { + expectParsedMediaOutputCase( + "Caption\n\n![chart](https://example.com/chart.png)", + { + text: "Caption", + mediaUrls: ["https://example.com/chart.png"], + }, + extractMarkdownImages, + ); + }); + + it("keeps inline caption text around markdown images when enabled", () => { + expectParsedMediaOutputCase( + "Look ![chart](https://example.com/chart.png) now", + { + text: "Look now", + mediaUrls: ["https://example.com/chart.png"], + }, + extractMarkdownImages, + ); }); it("extracts multiple markdown image urls in order", () => { @@ -147,6 +166,7 @@ describe("splitMediaFromOutput", () => { text: "Before\nMiddle\nAfter", mediaUrls: ["https://example.com/one.png", "https://example.com/two.png"], }, + extractMarkdownImages, ); }); @@ -157,14 +177,19 @@ describe("splitMediaFromOutput", () => { text: "Caption", mediaUrls: ["https://example.com/chart.png"], }, + extractMarkdownImages, ); }); it("keeps balanced parentheses inside markdown image urls", () => { - expectParsedMediaOutputCase("Chart ![img](https://example.com/a_(1).png) now", { - text: "Chart now", - mediaUrls: ["https://example.com/a_(1).png"], - }); + expectParsedMediaOutputCase( + "Chart ![img](https://example.com/a_(1).png) now", + { + text: "Chart now", + mediaUrls: ["https://example.com/a_(1).png"], + }, + extractMarkdownImages, + ); }); it.each([ @@ -174,27 +199,76 @@ describe("splitMediaFromOutput", () => { "![x](http://example.com/a.png)", "![x](https://127.0.0.1/a.png)", ] as const)("does not lift local markdown image target: %s", (input) => { - expectParsedMediaOutputCase(input, { - text: input, - mediaUrls: undefined, - }); + expectParsedMediaOutputCase( + input, + { + text: input, + mediaUrls: undefined, + }, + extractMarkdownImages, + ); }); it("does not lift markdown image urls that fail media validation", () => { const longUrl = `![x](https://example.com/${"a".repeat(4097)}.png)`; - expectParsedMediaOutputCase(longUrl, { - text: longUrl, - mediaUrls: undefined, - }); + expectParsedMediaOutputCase( + longUrl, + { + text: longUrl, + mediaUrls: undefined, + }, + extractMarkdownImages, + ); }); it("leaves very long markdown-image candidate lines as text", () => { const input = `${"prefix ".repeat(3000)}![x](https://example.com/image.png)`; + expectParsedMediaOutputCase( + input, + { + text: input, + mediaUrls: undefined, + }, + extractMarkdownImages, + ); + }); + + it.each([ + "![Node.js](https://img.shields.io/badge/Node.js-339933?logo=node.js&logoColor=white)", + "![build](https://img.shields.io/github/actions/workflow/status/owner/repo/ci.yml)", + "![npm](https://badge.fury.io/js/some-package.svg)", + "![badgen](https://badgen.net/npm/v/some-package)", + "![CI](https://github.com/owner/repo/actions/workflows/ci.yml/badge.svg)", + "![flat-badge](https://flat.badgen.net/npm/v/some-package)", + ] as const)("keeps markdown badge image as text by default: %s", (input) => { expectParsedMediaOutputCase(input, { text: input, mediaUrls: undefined, }); }); + + it("keeps surrounding text around inline badge images by default", () => { + expectParsedMediaOutputCase( + "tech: ![Node.js](https://img.shields.io/badge/Node.js-339933?logo=node.js&logoColor=white) stack", + { + text: "tech: ![Node.js](https://img.shields.io/badge/Node.js-339933?logo=node.js&logoColor=white) stack", + mediaUrls: undefined, + }, + ); + }); + + it("still extracts markdown images when explicitly enabled", () => { + expectParsedMediaOutputCase( + "![badge](https://img.shields.io/badge/status-passing-green)\n![photo](https://example.com/photo.png)", + { + mediaUrls: [ + "https://img.shields.io/badge/status-passing-green", + "https://example.com/photo.png", + ], + }, + extractMarkdownImages, + ); + }); }); diff --git a/src/media/parse.ts b/src/media/parse.ts index c725c191370..8d832447bda 100644 --- a/src/media/parse.ts +++ b/src/media/parse.ts @@ -26,6 +26,10 @@ export type ParsedMediaOutputSegment = url: string; }; +export type SplitMediaFromOutputOptions = { + extractMarkdownImages?: boolean; +}; + export function normalizeMediaSource(src: string) { return src.startsWith("file://") ? src.replace("file://", "") : src; } @@ -462,7 +466,10 @@ function isInsideFence(fenceSpans: Array<{ start: number; end: number }>, offset return fenceSpans.some((span) => offset >= span.start && offset < span.end); } -export function splitMediaFromOutput(raw: string): { +export function splitMediaFromOutput( + raw: string, + options: SplitMediaFromOutputOptions = {}, +): { text: string; mediaUrls?: string[]; mediaUrl?: string; // legacy first item for backward compatibility @@ -475,8 +482,9 @@ export function splitMediaFromOutput(raw: string): { if (!trimmedRaw.trim()) { return { text: "" }; } + const extractMarkdownImages = options.extractMarkdownImages === true; const mayContainMediaToken = /media:/i.test(trimmedRaw); - const mayContainMarkdownImage = /!\[[^\]]*]\(/.test(trimmedRaw); + const mayContainMarkdownImage = extractMarkdownImages && /!\[[^\]]*]\(/.test(trimmedRaw); const mayContainAudioTag = trimmedRaw.includes("[["); if (!mayContainMediaToken && !mayContainMarkdownImage && !mayContainAudioTag) { return { text: trimmedRaw }; @@ -518,7 +526,9 @@ export function splitMediaFromOutput(raw: string): { const trimmedStart = line.trimStart(); if (!trimmedStart.toUpperCase().startsWith("MEDIA:")) { - const markdownImageResult = collectMarkdownImageSegments({ line, media }); + const markdownImageResult = extractMarkdownImages + ? collectMarkdownImageSegments({ line, media }) + : { lineSegments: [], foundMedia: false }; if (!markdownImageResult.foundMedia) { keptLines.push(line); pushTextSegment(line);