From bb0cc5b0b97ee659b3f45ae82908023e5b0dd26c Mon Sep 17 00:00:00 2001 From: Muhammed Mukhthar CM Date: Tue, 17 Mar 2026 06:19:40 +0000 Subject: [PATCH] fix(mattermost): harden DM channel retry handling --- CHANGELOG.md | 1 + docs/channels/mattermost.md | 29 ++++ .../src/mattermost/client.retry.test.ts | 39 ++++- .../mattermost/src/mattermost/client.ts | 159 +++++++++++++++--- .../mattermost/src/mattermost/send.test.ts | 9 +- extensions/mattermost/src/mattermost/send.ts | 45 +++-- 6 files changed, 246 insertions(+), 36 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index a5dfdbd71e6..f24d843e508 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -133,6 +133,7 @@ Docs: https://docs.openclaw.ai - macOS/launch at login: stop emitting `KeepAlive` for the desktop app launch agent so OpenClaw no longer relaunches immediately after a manual quit while launch at login remains enabled. (#40213) Thanks @stablegenius49. - ACP/gateway startup: use direct Telegram and Discord startup/status helpers instead of routing probes through the plugin runtime, and prepend the selected daemon Node bin dir to service PATH so plugin-local installs can still find `npm` and `pnpm`. - ACP/configured bindings: reinitialize configured ACP sessions that are stuck in `error` state instead of reusing the failed runtime. +- Mattermost/DM send: retry transient direct-channel creation failures for DM deliveries, with configurable backoff and per-request timeout. (#42398) Thanks @JonathanJing. ## 2026.3.13 diff --git a/docs/channels/mattermost.md b/docs/channels/mattermost.md index 2ceb6c17626..41f6ffa19a0 100644 --- a/docs/channels/mattermost.md +++ b/docs/channels/mattermost.md @@ -191,6 +191,35 @@ OpenClaw resolves them **user-first**: If you need deterministic behavior, always use the explicit prefixes (`user:` / `channel:`). +## DM channel retry + +When OpenClaw sends to a Mattermost DM target and needs to resolve the direct channel first, it +retries transient direct-channel creation failures by default. + +Use `channels.mattermost.dmChannelRetry` to tune that behavior globally for the Mattermost plugin, +or `channels.mattermost.accounts..dmChannelRetry` for one account. + +```json5 +{ + channels: { + mattermost: { + dmChannelRetry: { + maxRetries: 3, + initialDelayMs: 1000, + maxDelayMs: 10000, + timeoutMs: 30000, + }, + }, + }, +} +``` + +Notes: + +- This applies only to DM channel creation (`/api/v4/channels/direct`), not every Mattermost API call. +- Retries apply to transient failures such as rate limits, 5xx responses, and network or timeout errors. +- 4xx client errors other than `429` are treated as permanent and are not retried. + ## Reactions (message tool) - Use `message action=react` with `channel=mattermost`. diff --git a/extensions/mattermost/src/mattermost/client.retry.test.ts b/extensions/mattermost/src/mattermost/client.retry.test.ts index 1296b8448d0..c5f62357fe4 100644 --- a/extensions/mattermost/src/mattermost/client.retry.test.ts +++ b/extensions/mattermost/src/mattermost/client.retry.test.ts @@ -1,9 +1,5 @@ import { describe, expect, it, vi, beforeEach } from "vitest"; -import { - createMattermostClient, - createMattermostDirectChannelWithRetry, - type CreateDmChannelRetryOptions, -} from "./client.js"; +import { createMattermostClient, createMattermostDirectChannelWithRetry } from "./client.js"; describe("createMattermostDirectChannelWithRetry", () => { const mockFetch = vi.fn(); @@ -20,6 +16,13 @@ describe("createMattermostDirectChannelWithRetry", () => { }); } + function createFetchFailedError(params: { message: string; code?: string }): TypeError { + const cause = Object.assign(new Error(params.message), { + code: params.code, + }); + return Object.assign(new TypeError("fetch failed"), { cause }); + } + it("succeeds on first attempt without retries", async () => { mockFetch.mockResolvedValueOnce({ ok: true, @@ -178,6 +181,32 @@ describe("createMattermostDirectChannelWithRetry", () => { expect(mockFetch).toHaveBeenCalledTimes(3); }); + it("retries on fetch failed errors when the cause carries a transient code", async () => { + mockFetch + .mockRejectedValueOnce( + createFetchFailedError({ + message: "connect ECONNREFUSED 127.0.0.1:81", + code: "ECONNREFUSED", + }), + ) + .mockResolvedValueOnce({ + ok: true, + status: 201, + headers: new Headers({ "content-type": "application/json" }), + json: async () => ({ id: "dm-channel-fetch-failed" }), + } as Response); + + const client = createMockClient(); + + const result = await createMattermostDirectChannelWithRetry(client, ["user-1", "user-2"], { + maxRetries: 3, + initialDelayMs: 10, + }); + + expect(result.id).toBe("dm-channel-fetch-failed"); + expect(mockFetch).toHaveBeenCalledTimes(2); + }); + it("does not retry on 4xx client errors (except 429)", async () => { mockFetch.mockResolvedValueOnce({ ok: false, diff --git a/extensions/mattermost/src/mattermost/client.ts b/extensions/mattermost/src/mattermost/client.ts index dfb8fdde726..c514160590f 100644 --- a/extensions/mattermost/src/mattermost/client.ts +++ b/extensions/mattermost/src/mattermost/client.ts @@ -190,6 +190,47 @@ export type CreateDmChannelRetryOptions = { onRetry?: (attempt: number, delayMs: number, error: Error) => void; }; +const RETRYABLE_NETWORK_ERROR_CODES = new Set([ + "ECONNRESET", + "ECONNREFUSED", + "ETIMEDOUT", + "ESOCKETTIMEDOUT", + "ECONNABORTED", + "ENOTFOUND", + "EAI_AGAIN", + "EHOSTUNREACH", + "ENETUNREACH", + "EPIPE", + "UND_ERR_CONNECT_TIMEOUT", + "UND_ERR_DNS_RESOLVE_FAILED", + "UND_ERR_CONNECT", + "UND_ERR_SOCKET", + "UND_ERR_HEADERS_TIMEOUT", + "UND_ERR_BODY_TIMEOUT", +]); + +const RETRYABLE_NETWORK_ERROR_NAMES = new Set([ + "AbortError", + "TimeoutError", + "ConnectTimeoutError", + "HeadersTimeoutError", + "BodyTimeoutError", +]); + +const RETRYABLE_NETWORK_MESSAGE_SNIPPETS = [ + "network error", + "timeout", + "timed out", + "abort", + "connection refused", + "econnreset", + "econnrefused", + "etimedout", + "enotfound", + "socket hang up", + "getaddrinfo", +]; + /** * Creates a Mattermost DM channel with exponential backoff retry logic. * Retries on transient errors (429, 5xx, network errors) but not on @@ -255,27 +296,37 @@ export async function createMattermostDirectChannelWithRetry( } function isRetryableError(error: Error): boolean { - const message = error.message.toLowerCase(); + const candidates = collectErrorCandidates(error); + const messages = candidates + .map((candidate) => readErrorMessage(candidate)?.toLowerCase()) + .filter((message): message is string => Boolean(message)); // Retry on 5xx server errors FIRST (before checking 4xx) // Use "mattermost api" prefix to avoid matching port numbers (e.g., :443) or IP octets // This prevents misclassification when a 5xx error detail contains a 4xx substring // e.g., "Mattermost API 503: upstream returned 404" - if (/mattermost api 5\d{2}\b/.test(message)) { + if (messages.some((message) => /mattermost api 5\d{2}\b/.test(message))) { return true; } // Check for explicit 429 rate limiting FIRST (before generic "429" text match) // This avoids retrying when error detail contains "429" but it's not the status code - if (/mattermost api 429\b/.test(message) || message.includes("too many requests")) { + if ( + messages.some( + (message) => /mattermost api 429\b/.test(message) || message.includes("too many requests"), + ) + ) { return true; } // Check for explicit 4xx status codes - these are client errors and should NOT be retried // (except 429 which is handled above) // Use "mattermost api" prefix to avoid matching port numbers like :443 - const clientErrorMatch = message.match(/mattermost api (4\d{2})\b/); - if (clientErrorMatch) { + for (const message of messages) { + const clientErrorMatch = message.match(/mattermost api (4\d{2})\b/); + if (!clientErrorMatch) { + continue; + } const statusCode = parseInt(clientErrorMatch[1], 10); if (statusCode >= 400 && statusCode < 500) { return false; @@ -286,25 +337,95 @@ function isRetryableError(error: Error): boolean { // This avoids false positives like: // - "400 Bad Request: connection timed out" (has status code) // - "connect ECONNRESET 104.18.32.10:443" (has port number, not status) - const hasMattermostApiStatusCode = /mattermost api \d{3}\b/.test(message); + const hasMattermostApiStatusCode = messages.some((message) => + /mattermost api \d{3}\b/.test(message), + ); if (hasMattermostApiStatusCode) { return false; } - // Retry on network/transient errors (no explicit HTTP status code in message) - const retryablePatterns = [ - "network error", - "timeout", - "abort", - "connection refused", - "econnreset", - "econnrefused", - "etimedout", - "enotfound", - "socket hang up", - ]; + const codes = candidates + .map((candidate) => readErrorCode(candidate)) + .filter((code): code is string => Boolean(code)); + if (codes.some((code) => RETRYABLE_NETWORK_ERROR_CODES.has(code))) { + return true; + } - return retryablePatterns.some((pattern) => message.includes(pattern)); + const names = candidates + .map((candidate) => readErrorName(candidate)) + .filter((name): name is string => Boolean(name)); + if (names.some((name) => RETRYABLE_NETWORK_ERROR_NAMES.has(name))) { + return true; + } + + return messages.some((message) => + RETRYABLE_NETWORK_MESSAGE_SNIPPETS.some((pattern) => message.includes(pattern)), + ); +} + +function collectErrorCandidates(error: unknown): unknown[] { + const queue: unknown[] = [error]; + const seen = new Set(); + const candidates: unknown[] = []; + + while (queue.length > 0) { + const current = queue.shift(); + if (!current || seen.has(current)) { + continue; + } + seen.add(current); + candidates.push(current); + + if (typeof current !== "object") { + continue; + } + + const nested = current as { + cause?: unknown; + reason?: unknown; + errors?: unknown; + }; + queue.push(nested.cause, nested.reason); + if (Array.isArray(nested.errors)) { + queue.push(...nested.errors); + } + } + + return candidates; +} + +function readErrorMessage(error: unknown): string | undefined { + if (!error || typeof error !== "object") { + return undefined; + } + const message = (error as { message?: unknown }).message; + return typeof message === "string" && message.trim() ? message : undefined; +} + +function readErrorName(error: unknown): string | undefined { + if (!error || typeof error !== "object") { + return undefined; + } + const name = (error as { name?: unknown }).name; + return typeof name === "string" && name.trim() ? name : undefined; +} + +function readErrorCode(error: unknown): string | undefined { + if (!error || typeof error !== "object") { + return undefined; + } + const { code, errno } = error as { + code?: unknown; + errno?: unknown; + }; + const raw = typeof code === "string" && code.trim() ? code : errno; + if (typeof raw === "string" && raw.trim()) { + return raw.trim().toUpperCase(); + } + if (typeof raw === "number" && Number.isFinite(raw)) { + return String(raw); + } + return undefined; } function sleep(ms: number): Promise { diff --git a/extensions/mattermost/src/mattermost/send.test.ts b/extensions/mattermost/src/mattermost/send.test.ts index 93a5dd85ec7..5f61622babb 100644 --- a/extensions/mattermost/src/mattermost/send.test.ts +++ b/extensions/mattermost/src/mattermost/send.test.ts @@ -541,7 +541,7 @@ describe("sendMessageMattermost user-first resolution", () => { ); }); - it("opts.dmRetryOptions overrides account config dmChannelRetry", async () => { + it("opts.dmRetryOptions overrides provided fields and preserves account defaults", async () => { const userId = "hhhhhh8888888888hhhhhh8888"; // 26 chars mockState.resolveMattermostAccount.mockReturnValue({ accountId: "default", @@ -570,5 +570,12 @@ describe("sendMessageMattermost user-first resolution", () => { ["bot-id", userId], expect.objectContaining(overrideOptions), ); + expect(mockState.createMattermostDirectChannelWithRetry).toHaveBeenCalledWith( + {}, + ["bot-id", userId], + expect.objectContaining({ + initialDelayMs: 1000, + }), + ); }); }); diff --git a/extensions/mattermost/src/mattermost/send.ts b/extensions/mattermost/src/mattermost/send.ts index 2f8a4bb4e4b..c589c8829a0 100644 --- a/extensions/mattermost/src/mattermost/send.ts +++ b/extensions/mattermost/src/mattermost/send.ts @@ -193,6 +193,31 @@ type ResolveTargetChannelIdParams = { logger?: { debug?: (msg: string) => void; warn?: (msg: string) => void }; }; +function mergeDmRetryOptions( + base?: CreateDmChannelRetryOptions, + override?: CreateDmChannelRetryOptions, +): CreateDmChannelRetryOptions | undefined { + const merged: CreateDmChannelRetryOptions = { + maxRetries: override?.maxRetries ?? base?.maxRetries, + initialDelayMs: override?.initialDelayMs ?? base?.initialDelayMs, + maxDelayMs: override?.maxDelayMs ?? base?.maxDelayMs, + timeoutMs: override?.timeoutMs ?? base?.timeoutMs, + onRetry: override?.onRetry, + }; + + if ( + merged.maxRetries === undefined && + merged.initialDelayMs === undefined && + merged.maxDelayMs === undefined && + merged.timeoutMs === undefined && + merged.onRetry === undefined + ) { + return undefined; + } + + return merged; +} + async function resolveTargetChannelId(params: ResolveTargetChannelIdParams): Promise { if (params.target.kind === "channel") { return params.target.id; @@ -284,17 +309,15 @@ async function resolveMattermostSendContext( ? { kind: "channel" as const, id: opaqueTarget.id } : parseMattermostTarget(trimmedTo); // Build retry options from account config, allowing opts to override - const accountRetryConfig = account.config.dmChannelRetry; - const dmRetryOptions: CreateDmChannelRetryOptions | undefined = - opts.dmRetryOptions ?? - (accountRetryConfig - ? { - maxRetries: accountRetryConfig.maxRetries, - initialDelayMs: accountRetryConfig.initialDelayMs, - maxDelayMs: accountRetryConfig.maxDelayMs, - timeoutMs: accountRetryConfig.timeoutMs, - } - : undefined); + const accountRetryConfig: CreateDmChannelRetryOptions | undefined = account.config.dmChannelRetry + ? { + maxRetries: account.config.dmChannelRetry.maxRetries, + initialDelayMs: account.config.dmChannelRetry.initialDelayMs, + maxDelayMs: account.config.dmChannelRetry.maxDelayMs, + timeoutMs: account.config.dmChannelRetry.timeoutMs, + } + : undefined; + const dmRetryOptions = mergeDmRetryOptions(accountRetryConfig, opts.dmRetryOptions); const channelId = await resolveTargetChannelId({ target,