From 2145eb5908c0f625988f2d284db1762ee3af024e Mon Sep 17 00:00:00 2001 From: Jonathan Jing Date: Tue, 17 Mar 2026 09:46:56 -0700 Subject: [PATCH] feat(mattermost): add retry logic and timeout handling for DM channel creation (#42398) Merged via squash. Prepared head SHA: 3db47be907decd78116603c6ab4a48ff91eb2c25 Co-authored-by: JonathanJing <17068507+JonathanJing@users.noreply.github.com> Co-authored-by: mukhtharcm <56378562+mukhtharcm@users.noreply.github.com> Reviewed-by: @mukhtharcm --- CHANGELOG.md | 1 + docs/channels/mattermost.md | 29 ++ extensions/mattermost/src/config-schema.ts | 28 ++ .../src/mattermost/client.retry.test.ts | 466 ++++++++++++++++++ .../mattermost/src/mattermost/client.ts | 257 ++++++++++ .../mattermost/src/mattermost/send.test.ts | 159 +++++- extensions/mattermost/src/mattermost/send.ts | 67 ++- extensions/mattermost/src/types.ts | 11 + 8 files changed, 1005 insertions(+), 13 deletions(-) create mode 100644 extensions/mattermost/src/mattermost/client.retry.test.ts diff --git a/CHANGELOG.md b/CHANGELOG.md index a5dfdbd71e6..f24d843e508 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -133,6 +133,7 @@ Docs: https://docs.openclaw.ai - macOS/launch at login: stop emitting `KeepAlive` for the desktop app launch agent so OpenClaw no longer relaunches immediately after a manual quit while launch at login remains enabled. (#40213) Thanks @stablegenius49. - ACP/gateway startup: use direct Telegram and Discord startup/status helpers instead of routing probes through the plugin runtime, and prepend the selected daemon Node bin dir to service PATH so plugin-local installs can still find `npm` and `pnpm`. - ACP/configured bindings: reinitialize configured ACP sessions that are stuck in `error` state instead of reusing the failed runtime. +- Mattermost/DM send: retry transient direct-channel creation failures for DM deliveries, with configurable backoff and per-request timeout. (#42398) Thanks @JonathanJing. ## 2026.3.13 diff --git a/docs/channels/mattermost.md b/docs/channels/mattermost.md index 2ceb6c17626..41f6ffa19a0 100644 --- a/docs/channels/mattermost.md +++ b/docs/channels/mattermost.md @@ -191,6 +191,35 @@ OpenClaw resolves them **user-first**: If you need deterministic behavior, always use the explicit prefixes (`user:` / `channel:`). +## DM channel retry + +When OpenClaw sends to a Mattermost DM target and needs to resolve the direct channel first, it +retries transient direct-channel creation failures by default. + +Use `channels.mattermost.dmChannelRetry` to tune that behavior globally for the Mattermost plugin, +or `channels.mattermost.accounts..dmChannelRetry` for one account. + +```json5 +{ + channels: { + mattermost: { + dmChannelRetry: { + maxRetries: 3, + initialDelayMs: 1000, + maxDelayMs: 10000, + timeoutMs: 30000, + }, + }, + }, +} +``` + +Notes: + +- This applies only to DM channel creation (`/api/v4/channels/direct`), not every Mattermost API call. +- Retries apply to transient failures such as rate limits, 5xx responses, and network or timeout errors. +- 4xx client errors other than `429` are treated as permanent and are not retried. + ## Reactions (message tool) - Use `message action=react` with `channel=mattermost`. diff --git a/extensions/mattermost/src/config-schema.ts b/extensions/mattermost/src/config-schema.ts index 16ee615454c..d578de86e9a 100644 --- a/extensions/mattermost/src/config-schema.ts +++ b/extensions/mattermost/src/config-schema.ts @@ -9,6 +9,32 @@ import { z } from "zod"; import { requireChannelOpenAllowFrom } from "../../shared/config-schema-helpers.js"; import { buildSecretInputSchema } from "./secret-input.js"; +const DmChannelRetrySchema = z + .object({ + /** Maximum number of retry attempts for DM channel creation (default: 3) */ + maxRetries: z.number().int().min(0).max(10).optional(), + /** Initial delay in milliseconds before first retry (default: 1000) */ + initialDelayMs: z.number().int().min(100).max(60000).optional(), + /** Maximum delay in milliseconds between retries (default: 10000) */ + maxDelayMs: z.number().int().min(1000).max(60000).optional(), + /** Timeout for each individual DM channel creation request in milliseconds (default: 30000) */ + timeoutMs: z.number().int().min(5000).max(120000).optional(), + }) + .strict() + .refine( + (data) => { + if (data.initialDelayMs !== undefined && data.maxDelayMs !== undefined) { + return data.initialDelayMs <= data.maxDelayMs; + } + return true; + }, + { + message: "initialDelayMs must be less than or equal to maxDelayMs", + path: ["initialDelayMs"], + }, + ) + .optional(); + const MattermostSlashCommandsSchema = z .object({ /** Enable native slash commands. "auto" resolves to false (opt-in). */ @@ -58,6 +84,8 @@ const MattermostAccountSchemaBase = z allowedSourceIps: z.array(z.string()).optional(), }) .optional(), + /** Retry configuration for DM channel creation */ + dmChannelRetry: DmChannelRetrySchema, }) .strict(); diff --git a/extensions/mattermost/src/mattermost/client.retry.test.ts b/extensions/mattermost/src/mattermost/client.retry.test.ts new file mode 100644 index 00000000000..c5f62357fe4 --- /dev/null +++ b/extensions/mattermost/src/mattermost/client.retry.test.ts @@ -0,0 +1,466 @@ +import { describe, expect, it, vi, beforeEach } from "vitest"; +import { createMattermostClient, createMattermostDirectChannelWithRetry } from "./client.js"; + +describe("createMattermostDirectChannelWithRetry", () => { + const mockFetch = vi.fn(); + + beforeEach(() => { + vi.resetAllMocks(); + }); + + function createMockClient() { + return createMattermostClient({ + baseUrl: "https://mattermost.example.com", + botToken: "test-token", + fetchImpl: mockFetch as unknown as typeof fetch, + }); + } + + function createFetchFailedError(params: { message: string; code?: string }): TypeError { + const cause = Object.assign(new Error(params.message), { + code: params.code, + }); + return Object.assign(new TypeError("fetch failed"), { cause }); + } + + it("succeeds on first attempt without retries", async () => { + mockFetch.mockResolvedValueOnce({ + ok: true, + status: 201, + headers: new Headers({ "content-type": "application/json" }), + json: async () => ({ id: "dm-channel-123" }), + } as Response); + + const client = createMockClient(); + const onRetry = vi.fn(); + + const result = await createMattermostDirectChannelWithRetry(client, ["user-1", "user-2"], { + onRetry, + }); + + expect(result.id).toBe("dm-channel-123"); + expect(mockFetch).toHaveBeenCalledTimes(1); + expect(onRetry).not.toHaveBeenCalled(); + }); + + it("retries on 429 rate limit error and succeeds", async () => { + mockFetch + .mockResolvedValueOnce({ + ok: false, + status: 429, + headers: new Headers({ "content-type": "application/json" }), + json: async () => ({ message: "Too many requests" }), + text: async () => "Too many requests", + } as Response) + .mockResolvedValueOnce({ + ok: true, + status: 201, + headers: new Headers({ "content-type": "application/json" }), + json: async () => ({ id: "dm-channel-456" }), + } as Response); + + const client = createMockClient(); + const onRetry = vi.fn(); + + const result = await createMattermostDirectChannelWithRetry(client, ["user-1", "user-2"], { + maxRetries: 3, + initialDelayMs: 10, + onRetry, + }); + + expect(result.id).toBe("dm-channel-456"); + expect(mockFetch).toHaveBeenCalledTimes(2); + expect(onRetry).toHaveBeenCalledTimes(1); + expect(onRetry).toHaveBeenCalledWith( + 1, + expect.any(Number), + expect.objectContaining({ message: expect.stringContaining("429") }), + ); + }); + + it("retries on port 443 connection errors (not misclassified as 4xx)", async () => { + // This tests that port numbers like :443 don't trigger false 4xx classification + mockFetch + .mockRejectedValueOnce(new Error("connect ECONNRESET 104.18.32.10:443")) + .mockResolvedValueOnce({ + ok: true, + status: 201, + headers: new Headers({ "content-type": "application/json" }), + json: async () => ({ id: "dm-channel-port" }), + } as Response); + + const client = createMockClient(); + + const result = await createMattermostDirectChannelWithRetry(client, ["user-1", "user-2"], { + maxRetries: 3, + initialDelayMs: 10, + }); + + // Should retry and succeed on second attempt (port 443 should NOT be treated as 4xx) + expect(mockFetch).toHaveBeenCalledTimes(2); + expect(result.id).toBe("dm-channel-port"); + }); + + it("does not retry on 400 even if error message contains '429' text", async () => { + // This tests that "429" in error detail doesn't trigger false rate-limit retry + // e.g., "Invalid user ID: 4294967295" should NOT be retried + mockFetch.mockResolvedValueOnce({ + ok: false, + status: 400, + headers: new Headers({ "content-type": "application/json" }), + json: async () => ({ message: "Invalid user ID: 4294967295" }), + text: async () => "Invalid user ID: 4294967295", + } as Response); + + const client = createMockClient(); + + await expect( + createMattermostDirectChannelWithRetry(client, ["user-1", "user-2"], { + maxRetries: 3, + initialDelayMs: 10, + }), + ).rejects.toThrow(); + + // Should not retry - only called once (400 is a client error, even though message contains "429") + expect(mockFetch).toHaveBeenCalledTimes(1); + }); + + it("retries on 5xx server errors", async () => { + mockFetch + .mockResolvedValueOnce({ + ok: false, + status: 503, + headers: new Headers({ "content-type": "application/json" }), + json: async () => ({ message: "Service unavailable" }), + text: async () => "Service unavailable", + } as Response) + .mockResolvedValueOnce({ + ok: false, + status: 502, + headers: new Headers({ "content-type": "application/json" }), + json: async () => ({ message: "Bad gateway" }), + text: async () => "Bad gateway", + } as Response) + .mockResolvedValueOnce({ + ok: true, + status: 201, + headers: new Headers({ "content-type": "application/json" }), + json: async () => ({ id: "dm-channel-789" }), + } as Response); + + const client = createMockClient(); + + const result = await createMattermostDirectChannelWithRetry(client, ["user-1", "user-2"], { + maxRetries: 3, + initialDelayMs: 10, + }); + + expect(result.id).toBe("dm-channel-789"); + expect(mockFetch).toHaveBeenCalledTimes(3); + }); + + it("retries on network errors", async () => { + mockFetch + .mockRejectedValueOnce(new Error("Network error: connection refused")) + .mockRejectedValueOnce(new Error("ECONNRESET")) + .mockResolvedValueOnce({ + ok: true, + status: 201, + headers: new Headers({ "content-type": "application/json" }), + json: async () => ({ id: "dm-channel-abc" }), + } as Response); + + const client = createMockClient(); + + const result = await createMattermostDirectChannelWithRetry(client, ["user-1", "user-2"], { + maxRetries: 3, + initialDelayMs: 10, + }); + + expect(result.id).toBe("dm-channel-abc"); + expect(mockFetch).toHaveBeenCalledTimes(3); + }); + + it("retries on fetch failed errors when the cause carries a transient code", async () => { + mockFetch + .mockRejectedValueOnce( + createFetchFailedError({ + message: "connect ECONNREFUSED 127.0.0.1:81", + code: "ECONNREFUSED", + }), + ) + .mockResolvedValueOnce({ + ok: true, + status: 201, + headers: new Headers({ "content-type": "application/json" }), + json: async () => ({ id: "dm-channel-fetch-failed" }), + } as Response); + + const client = createMockClient(); + + const result = await createMattermostDirectChannelWithRetry(client, ["user-1", "user-2"], { + maxRetries: 3, + initialDelayMs: 10, + }); + + expect(result.id).toBe("dm-channel-fetch-failed"); + expect(mockFetch).toHaveBeenCalledTimes(2); + }); + + it("does not retry on 4xx client errors (except 429)", async () => { + mockFetch.mockResolvedValueOnce({ + ok: false, + status: 400, + headers: new Headers({ "content-type": "application/json" }), + json: async () => ({ message: "Bad request" }), + text: async () => "Bad request", + } as Response); + + const client = createMockClient(); + + await expect( + createMattermostDirectChannelWithRetry(client, ["user-1", "user-2"], { + maxRetries: 3, + initialDelayMs: 10, + }), + ).rejects.toThrow("400"); + + expect(mockFetch).toHaveBeenCalledTimes(1); + }); + + it("does not retry on 404 not found", async () => { + mockFetch.mockResolvedValueOnce({ + ok: false, + status: 404, + headers: new Headers({ "content-type": "application/json" }), + json: async () => ({ message: "User not found" }), + text: async () => "User not found", + } as Response); + + const client = createMockClient(); + + await expect( + createMattermostDirectChannelWithRetry(client, ["user-1", "user-2"], { + maxRetries: 3, + initialDelayMs: 10, + }), + ).rejects.toThrow("404"); + + expect(mockFetch).toHaveBeenCalledTimes(1); + }); + + it("throws after exhausting all retries", async () => { + mockFetch.mockResolvedValue({ + ok: false, + status: 503, + headers: new Headers({ "content-type": "application/json" }), + json: async () => ({ message: "Service unavailable" }), + text: async () => "Service unavailable", + } as Response); + + const client = createMockClient(); + + await expect( + createMattermostDirectChannelWithRetry(client, ["user-1", "user-2"], { + maxRetries: 2, + initialDelayMs: 10, + }), + ).rejects.toThrow(); + + expect(mockFetch).toHaveBeenCalledTimes(3); // initial + 2 retries + }); + + it("respects custom timeout option and aborts fetch", async () => { + let abortSignal: AbortSignal | undefined; + let abortListenerCalled = false; + + mockFetch.mockImplementationOnce((url, init) => { + abortSignal = init?.signal; + if (abortSignal) { + abortSignal.addEventListener("abort", () => { + abortListenerCalled = true; + }); + } + // Return a promise that rejects when aborted, otherwise never resolves + return new Promise((_, reject) => { + if (abortSignal) { + const checkAbort = () => { + if (abortSignal?.aborted) { + reject(new Error("AbortError")); + } else { + setTimeout(checkAbort, 10); + } + }; + setTimeout(checkAbort, 10); + } + }); + }); + + const client = createMockClient(); + + await expect( + createMattermostDirectChannelWithRetry(client, ["user-1", "user-2"], { + timeoutMs: 50, + maxRetries: 0, + initialDelayMs: 10, + }), + ).rejects.toThrow(); + + expect(mockFetch).toHaveBeenCalledTimes(1); + expect(abortSignal).toBeDefined(); + expect(abortListenerCalled).toBe(true); + }); + + it("uses exponential backoff with jitter between retries", async () => { + const delays: number[] = []; + mockFetch + .mockRejectedValueOnce(new Error("Mattermost API 503 Service Unavailable")) + .mockRejectedValueOnce(new Error("Mattermost API 503 Service Unavailable")) + .mockResolvedValueOnce({ + ok: true, + status: 201, + headers: new Headers({ "content-type": "application/json" }), + json: async () => ({ id: "dm-channel-delay" }), + } as Response); + + const client = createMockClient(); + + await createMattermostDirectChannelWithRetry(client, ["user-1", "user-2"], { + maxRetries: 3, + initialDelayMs: 100, + maxDelayMs: 1000, + onRetry: (attempt, delayMs) => { + delays.push(delayMs); + }, + }); + + expect(delays).toHaveLength(2); + // First retry: exponentialDelay = 100ms, jitter = 0-100ms, total = 100-200ms + expect(delays[0]).toBeGreaterThanOrEqual(100); + expect(delays[0]).toBeLessThanOrEqual(200); + // Second retry: exponentialDelay = 200ms, jitter = 0-200ms, total = 200-400ms + expect(delays[1]).toBeGreaterThanOrEqual(200); + expect(delays[1]).toBeLessThanOrEqual(400); + }); + + it("respects maxDelayMs cap", async () => { + const delays: number[] = []; + mockFetch + .mockRejectedValueOnce(new Error("Mattermost API 503")) + .mockRejectedValueOnce(new Error("Mattermost API 503")) + .mockRejectedValueOnce(new Error("Mattermost API 503")) + .mockRejectedValueOnce(new Error("Mattermost API 503")) + .mockResolvedValueOnce({ + ok: true, + status: 201, + headers: new Headers({ "content-type": "application/json" }), + json: async () => ({ id: "dm-channel-max" }), + } as Response); + + const client = createMockClient(); + + await createMattermostDirectChannelWithRetry(client, ["user-1", "user-2"], { + maxRetries: 4, + initialDelayMs: 1000, + maxDelayMs: 2500, + onRetry: (attempt, delayMs) => { + delays.push(delayMs); + }, + }); + + expect(delays).toHaveLength(4); + // All delays should be capped at maxDelayMs + delays.forEach((delay) => { + expect(delay).toBeLessThanOrEqual(2500); + }); + }); + + it("does not retry on 4xx errors even if message contains retryable keywords", async () => { + // This tests the fix for false positives where a 400 error with "timeout" in the message + // would incorrectly be retried + mockFetch.mockResolvedValueOnce({ + ok: false, + status: 400, + headers: new Headers({ "content-type": "application/json" }), + json: async () => ({ message: "Request timeout: connection timed out" }), + text: async () => "Request timeout: connection timed out", + } as Response); + + const client = createMockClient(); + + await expect( + createMattermostDirectChannelWithRetry(client, ["user-1", "user-2"], { + maxRetries: 3, + initialDelayMs: 10, + }), + ).rejects.toThrow("400"); + + // Should not retry - only called once + expect(mockFetch).toHaveBeenCalledTimes(1); + }); + + it("does not retry on 403 Forbidden even with 'abort' in message", async () => { + mockFetch.mockResolvedValueOnce({ + ok: false, + status: 403, + headers: new Headers({ "content-type": "application/json" }), + json: async () => ({ message: "Request aborted: forbidden" }), + text: async () => "Request aborted: forbidden", + } as Response); + + const client = createMockClient(); + + await expect( + createMattermostDirectChannelWithRetry(client, ["user-1", "user-2"], { + maxRetries: 3, + initialDelayMs: 10, + }), + ).rejects.toThrow("403"); + + expect(mockFetch).toHaveBeenCalledTimes(1); + }); + + it("passes AbortSignal to fetch for timeout support", async () => { + let capturedSignal: AbortSignal | undefined; + mockFetch.mockImplementationOnce((url, init) => { + capturedSignal = init?.signal; + return Promise.resolve({ + ok: true, + status: 201, + headers: new Headers({ "content-type": "application/json" }), + json: async () => ({ id: "dm-channel-signal" }), + } as Response); + }); + + const client = createMockClient(); + await createMattermostDirectChannelWithRetry(client, ["user-1", "user-2"], { + timeoutMs: 5000, + }); + + expect(capturedSignal).toBeDefined(); + expect(capturedSignal).toBeInstanceOf(AbortSignal); + }); + + it("retries on 5xx even if error message contains 4xx substring", async () => { + // This tests the fix for the ordering bug: 503 with "upstream 404" should be retried + mockFetch + .mockRejectedValueOnce(new Error("Mattermost API 503: upstream returned 404 Not Found")) + .mockResolvedValueOnce({ + ok: true, + status: 201, + headers: new Headers({ "content-type": "application/json" }), + json: async () => ({ id: "dm-channel-5xx-with-404" }), + } as Response); + + const client = createMockClient(); + + const result = await createMattermostDirectChannelWithRetry(client, ["user-1", "user-2"], { + maxRetries: 3, + initialDelayMs: 10, + }); + + // Should retry and succeed on second attempt + expect(mockFetch).toHaveBeenCalledTimes(2); + expect(result.id).toBe("dm-channel-5xx-with-404"); + }); +}); diff --git a/extensions/mattermost/src/mattermost/client.ts b/extensions/mattermost/src/mattermost/client.ts index 1a8219340b9..c514160590f 100644 --- a/extensions/mattermost/src/mattermost/client.ts +++ b/extensions/mattermost/src/mattermost/client.ts @@ -168,13 +168,270 @@ export async function sendMattermostTyping( export async function createMattermostDirectChannel( client: MattermostClient, userIds: string[], + signal?: AbortSignal, ): Promise { return await client.request("/channels/direct", { method: "POST", body: JSON.stringify(userIds), + signal, }); } +export type CreateDmChannelRetryOptions = { + /** Maximum number of retry attempts (default: 3) */ + maxRetries?: number; + /** Initial delay in milliseconds (default: 1000) */ + initialDelayMs?: number; + /** Maximum delay in milliseconds (default: 10000) */ + maxDelayMs?: number; + /** Timeout for each individual request in milliseconds (default: 30000) */ + timeoutMs?: number; + /** Optional logger for retry events */ + onRetry?: (attempt: number, delayMs: number, error: Error) => void; +}; + +const RETRYABLE_NETWORK_ERROR_CODES = new Set([ + "ECONNRESET", + "ECONNREFUSED", + "ETIMEDOUT", + "ESOCKETTIMEDOUT", + "ECONNABORTED", + "ENOTFOUND", + "EAI_AGAIN", + "EHOSTUNREACH", + "ENETUNREACH", + "EPIPE", + "UND_ERR_CONNECT_TIMEOUT", + "UND_ERR_DNS_RESOLVE_FAILED", + "UND_ERR_CONNECT", + "UND_ERR_SOCKET", + "UND_ERR_HEADERS_TIMEOUT", + "UND_ERR_BODY_TIMEOUT", +]); + +const RETRYABLE_NETWORK_ERROR_NAMES = new Set([ + "AbortError", + "TimeoutError", + "ConnectTimeoutError", + "HeadersTimeoutError", + "BodyTimeoutError", +]); + +const RETRYABLE_NETWORK_MESSAGE_SNIPPETS = [ + "network error", + "timeout", + "timed out", + "abort", + "connection refused", + "econnreset", + "econnrefused", + "etimedout", + "enotfound", + "socket hang up", + "getaddrinfo", +]; + +/** + * Creates a Mattermost DM channel with exponential backoff retry logic. + * Retries on transient errors (429, 5xx, network errors) but not on + * client errors (4xx except 429) or permanent failures. + */ +export async function createMattermostDirectChannelWithRetry( + client: MattermostClient, + userIds: string[], + options: CreateDmChannelRetryOptions = {}, +): Promise { + const { + maxRetries = 3, + initialDelayMs = 1000, + maxDelayMs = 10000, + timeoutMs = 30000, + onRetry, + } = options; + + let lastError: Error | undefined; + + for (let attempt = 0; attempt <= maxRetries; attempt++) { + try { + // Use AbortController for per-request timeout + const controller = new AbortController(); + const timeoutId = setTimeout(() => controller.abort(), timeoutMs); + + try { + const result = await createMattermostDirectChannel(client, userIds, controller.signal); + return result; + } finally { + clearTimeout(timeoutId); + } + } catch (err) { + lastError = err instanceof Error ? err : new Error(String(err)); + + // Don't retry on the last attempt + if (attempt >= maxRetries) { + break; + } + + // Check if error is retryable + if (!isRetryableError(lastError)) { + throw lastError; + } + + // Calculate exponential backoff delay with full-jitter + // Jitter is proportional to the exponential delay, not a fixed 1000ms + // This ensures backoff behaves correctly for small delay configurations + const exponentialDelay = initialDelayMs * Math.pow(2, attempt); + const jitter = Math.random() * exponentialDelay; + const delayMs = Math.min(exponentialDelay + jitter, maxDelayMs); + + if (onRetry) { + onRetry(attempt + 1, delayMs, lastError); + } + + // Wait before retrying + await sleep(delayMs); + } + } + + throw lastError ?? new Error("Failed to create DM channel after retries"); +} + +function isRetryableError(error: Error): boolean { + const candidates = collectErrorCandidates(error); + const messages = candidates + .map((candidate) => readErrorMessage(candidate)?.toLowerCase()) + .filter((message): message is string => Boolean(message)); + + // Retry on 5xx server errors FIRST (before checking 4xx) + // Use "mattermost api" prefix to avoid matching port numbers (e.g., :443) or IP octets + // This prevents misclassification when a 5xx error detail contains a 4xx substring + // e.g., "Mattermost API 503: upstream returned 404" + if (messages.some((message) => /mattermost api 5\d{2}\b/.test(message))) { + return true; + } + + // Check for explicit 429 rate limiting FIRST (before generic "429" text match) + // This avoids retrying when error detail contains "429" but it's not the status code + if ( + messages.some( + (message) => /mattermost api 429\b/.test(message) || message.includes("too many requests"), + ) + ) { + return true; + } + + // Check for explicit 4xx status codes - these are client errors and should NOT be retried + // (except 429 which is handled above) + // Use "mattermost api" prefix to avoid matching port numbers like :443 + for (const message of messages) { + const clientErrorMatch = message.match(/mattermost api (4\d{2})\b/); + if (!clientErrorMatch) { + continue; + } + const statusCode = parseInt(clientErrorMatch[1], 10); + if (statusCode >= 400 && statusCode < 500) { + return false; + } + } + + // Retry on network/transient errors only if no explicit Mattermost API status code is present + // This avoids false positives like: + // - "400 Bad Request: connection timed out" (has status code) + // - "connect ECONNRESET 104.18.32.10:443" (has port number, not status) + const hasMattermostApiStatusCode = messages.some((message) => + /mattermost api \d{3}\b/.test(message), + ); + if (hasMattermostApiStatusCode) { + return false; + } + + const codes = candidates + .map((candidate) => readErrorCode(candidate)) + .filter((code): code is string => Boolean(code)); + if (codes.some((code) => RETRYABLE_NETWORK_ERROR_CODES.has(code))) { + return true; + } + + const names = candidates + .map((candidate) => readErrorName(candidate)) + .filter((name): name is string => Boolean(name)); + if (names.some((name) => RETRYABLE_NETWORK_ERROR_NAMES.has(name))) { + return true; + } + + return messages.some((message) => + RETRYABLE_NETWORK_MESSAGE_SNIPPETS.some((pattern) => message.includes(pattern)), + ); +} + +function collectErrorCandidates(error: unknown): unknown[] { + const queue: unknown[] = [error]; + const seen = new Set(); + const candidates: unknown[] = []; + + while (queue.length > 0) { + const current = queue.shift(); + if (!current || seen.has(current)) { + continue; + } + seen.add(current); + candidates.push(current); + + if (typeof current !== "object") { + continue; + } + + const nested = current as { + cause?: unknown; + reason?: unknown; + errors?: unknown; + }; + queue.push(nested.cause, nested.reason); + if (Array.isArray(nested.errors)) { + queue.push(...nested.errors); + } + } + + return candidates; +} + +function readErrorMessage(error: unknown): string | undefined { + if (!error || typeof error !== "object") { + return undefined; + } + const message = (error as { message?: unknown }).message; + return typeof message === "string" && message.trim() ? message : undefined; +} + +function readErrorName(error: unknown): string | undefined { + if (!error || typeof error !== "object") { + return undefined; + } + const name = (error as { name?: unknown }).name; + return typeof name === "string" && name.trim() ? name : undefined; +} + +function readErrorCode(error: unknown): string | undefined { + if (!error || typeof error !== "object") { + return undefined; + } + const { code, errno } = error as { + code?: unknown; + errno?: unknown; + }; + const raw = typeof code === "string" && code.trim() ? code : errno; + if (typeof raw === "string" && raw.trim()) { + return raw.trim().toUpperCase(); + } + if (typeof raw === "number" && Number.isFinite(raw)) { + return String(raw); + } + return undefined; +} + +function sleep(ms: number): Promise { + return new Promise((resolve) => setTimeout(resolve, ms)); +} + export async function createMattermostPost( client: MattermostClient, params: { diff --git a/extensions/mattermost/src/mattermost/send.test.ts b/extensions/mattermost/src/mattermost/send.test.ts index 15cf05eb541..784b27677e6 100644 --- a/extensions/mattermost/src/mattermost/send.test.ts +++ b/extensions/mattermost/src/mattermost/send.test.ts @@ -13,9 +13,11 @@ const mockState = vi.hoisted(() => ({ accountId: "default", botToken: "bot-token", baseUrl: "https://mattermost.example.com", + config: {}, })), createMattermostClient: vi.fn(), createMattermostDirectChannel: vi.fn(), + createMattermostDirectChannelWithRetry: vi.fn(), createMattermostPost: vi.fn(), fetchMattermostChannelByName: vi.fn(), fetchMattermostMe: vi.fn(), @@ -37,6 +39,7 @@ vi.mock("./accounts.js", () => ({ vi.mock("./client.js", () => ({ createMattermostClient: mockState.createMattermostClient, createMattermostDirectChannel: mockState.createMattermostDirectChannel, + createMattermostDirectChannelWithRetry: mockState.createMattermostDirectChannelWithRetry, createMattermostPost: mockState.createMattermostPost, fetchMattermostChannelByName: mockState.fetchMattermostChannelByName, fetchMattermostMe: mockState.fetchMattermostMe, @@ -77,10 +80,12 @@ describe("sendMessageMattermost", () => { accountId: "default", botToken: "bot-token", baseUrl: "https://mattermost.example.com", + config: {}, }); mockState.loadOutboundMediaFromUrl.mockReset(); mockState.createMattermostClient.mockReset(); mockState.createMattermostDirectChannel.mockReset(); + mockState.createMattermostDirectChannelWithRetry.mockReset(); mockState.createMattermostPost.mockReset(); mockState.fetchMattermostChannelByName.mockReset(); mockState.fetchMattermostMe.mockReset(); @@ -91,6 +96,7 @@ describe("sendMessageMattermost", () => { resetMattermostOpaqueTargetCacheForTests(); mockState.createMattermostClient.mockReturnValue({}); mockState.createMattermostPost.mockResolvedValue({ id: "post-1" }); + mockState.createMattermostDirectChannelWithRetry.mockResolvedValue({ id: "dm-channel-1" }); mockState.fetchMattermostMe.mockResolvedValue({ id: "bot-user" }); mockState.fetchMattermostUserTeams.mockResolvedValue([{ id: "team-1" }]); mockState.fetchMattermostChannelByName.mockResolvedValue({ id: "town-square" }); @@ -105,6 +111,12 @@ describe("sendMessageMattermost", () => { }, }, }; + mockState.resolveMattermostAccount.mockReturnValue({ + accountId: "work", + botToken: "provided-token", + baseUrl: "https://mattermost.example.com", + config: {}, + }); await sendMessageMattermost("channel:town-square", "hello", { cfg: providedCfg as any, @@ -128,6 +140,12 @@ describe("sendMessageMattermost", () => { }, }; mockState.loadConfig.mockReturnValueOnce(runtimeCfg); + mockState.resolveMattermostAccount.mockReturnValue({ + accountId: "default", + botToken: "runtime-token", + baseUrl: "https://mattermost.example.com", + config: {}, + }); await sendMessageMattermost("channel:town-square", "hello"); @@ -146,6 +164,12 @@ describe("sendMessageMattermost", () => { contentType: "image/png", kind: "image", }); + mockState.resolveMattermostAccount.mockReturnValue({ + accountId: "default", + botToken: "bot-token", + baseUrl: "https://mattermost.example.com", + config: {}, + }); await sendMessageMattermost("channel:town-square", "hello", { mediaUrl: "file:///tmp/agent-workspace/photo.png", @@ -169,6 +193,13 @@ describe("sendMessageMattermost", () => { }); it("builds interactive button props when buttons are provided", async () => { + mockState.resolveMattermostAccount.mockReturnValue({ + accountId: "default", + botToken: "bot-token", + baseUrl: "https://mattermost.example.com", + config: {}, + }); + await sendMessageMattermost("channel:town-square", "Pick a model", { buttons: [[{ callback_data: "mdlprov", text: "Browse providers" }]], }); @@ -196,8 +227,13 @@ describe("sendMessageMattermost", () => { it("resolves a bare Mattermost user id as a DM target before upload", async () => { const userId = "dthcxgoxhifn3pwh65cut3ud3w"; + mockState.resolveMattermostAccount.mockReturnValue({ + accountId: "default", + botToken: "bot-token", + baseUrl: "https://mattermost.example.com", + config: {}, + }); mockState.fetchMattermostUser.mockResolvedValueOnce({ id: userId }); - mockState.createMattermostDirectChannel.mockResolvedValueOnce({ id: "dm-channel-1" }); mockState.loadOutboundMediaFromUrl.mockResolvedValueOnce({ buffer: Buffer.from("media-bytes"), fileName: "photo.png", @@ -211,7 +247,11 @@ describe("sendMessageMattermost", () => { }); expect(mockState.fetchMattermostUser).toHaveBeenCalledWith({}, userId); - expect(mockState.createMattermostDirectChannel).toHaveBeenCalledWith({}, ["bot-user", userId]); + expect(mockState.createMattermostDirectChannelWithRetry).toHaveBeenCalledWith( + {}, + ["bot-user", userId], + expect.any(Object), + ); expect(mockState.uploadMattermostFile).toHaveBeenCalledWith( {}, expect.objectContaining({ @@ -223,6 +263,12 @@ describe("sendMessageMattermost", () => { it("falls back to a channel target when bare Mattermost id is not a user", async () => { const channelId = "aaaaaaaaaaaaaaaaaaaaaaaaaa"; + mockState.resolveMattermostAccount.mockReturnValue({ + accountId: "default", + botToken: "bot-token", + baseUrl: "https://mattermost.example.com", + config: {}, + }); mockState.fetchMattermostUser.mockRejectedValueOnce( new Error("Mattermost API 404 Not Found: user not found"), ); @@ -239,7 +285,7 @@ describe("sendMessageMattermost", () => { }); expect(mockState.fetchMattermostUser).toHaveBeenCalledWith({}, channelId); - expect(mockState.createMattermostDirectChannel).not.toHaveBeenCalled(); + expect(mockState.createMattermostDirectChannelWithRetry).not.toHaveBeenCalled(); expect(mockState.uploadMattermostFile).toHaveBeenCalledWith( {}, expect.objectContaining({ @@ -337,11 +383,12 @@ describe("parseMattermostTarget", () => { // userIdResolutionCache and dmChannelCache are module singletons that survive across tests. // Using unique cache keys per test ensures full isolation without needing a cache reset API. describe("sendMessageMattermost user-first resolution", () => { - function makeAccount(token: string) { + function makeAccount(token: string, config = {}) { return { accountId: "default", botToken: token, baseUrl: "https://mattermost.example.com", + config, }; } @@ -350,6 +397,7 @@ describe("sendMessageMattermost user-first resolution", () => { mockState.createMattermostClient.mockReturnValue({}); mockState.createMattermostPost.mockResolvedValue({ id: "post-id" }); mockState.createMattermostDirectChannel.mockResolvedValue({ id: "dm-channel-id" }); + mockState.createMattermostDirectChannelWithRetry.mockResolvedValue({ id: "dm-channel-id" }); mockState.fetchMattermostMe.mockResolvedValue({ id: "bot-id" }); }); @@ -362,7 +410,7 @@ describe("sendMessageMattermost user-first resolution", () => { const res = await sendMessageMattermost(userId, "hello"); expect(mockState.fetchMattermostUser).toHaveBeenCalledTimes(1); - expect(mockState.createMattermostDirectChannel).toHaveBeenCalledTimes(1); + expect(mockState.createMattermostDirectChannelWithRetry).toHaveBeenCalledTimes(1); const params = mockState.createMattermostPost.mock.calls[0]?.[1]; expect(params.channelId).toBe("dm-channel-id"); expect(res.channelId).toBe("dm-channel-id"); @@ -379,7 +427,7 @@ describe("sendMessageMattermost user-first resolution", () => { const res = await sendMessageMattermost(channelId, "hello"); expect(mockState.fetchMattermostUser).toHaveBeenCalledTimes(1); - expect(mockState.createMattermostDirectChannel).not.toHaveBeenCalled(); + expect(mockState.createMattermostDirectChannelWithRetry).not.toHaveBeenCalled(); const params = mockState.createMattermostPost.mock.calls[0]?.[1]; expect(params.channelId).toBe(channelId); expect(res.channelId).toBe(channelId); @@ -403,7 +451,7 @@ describe("sendMessageMattermost user-first resolution", () => { vi.clearAllMocks(); mockState.createMattermostClient.mockReturnValue({}); mockState.createMattermostPost.mockResolvedValue({ id: "post-id-2" }); - mockState.createMattermostDirectChannel.mockResolvedValue({ id: "dm-channel-id" }); + mockState.createMattermostDirectChannelWithRetry.mockResolvedValue({ id: "dm-channel-id" }); mockState.fetchMattermostMe.mockResolvedValue({ id: "bot-id" }); mockState.resolveMattermostAccount.mockReturnValue(makeAccount(tokenB)); mockState.fetchMattermostUser.mockResolvedValueOnce({ id: userId }); @@ -417,11 +465,12 @@ describe("sendMessageMattermost user-first resolution", () => { // Unique token + id — explicit user: prefix bypasses probe, goes straight to DM const userId = "dddddd4444444444dddddd4444"; // 26 chars mockState.resolveMattermostAccount.mockReturnValue(makeAccount("token-explicit-user-t4")); + mockState.createMattermostDirectChannelWithRetry.mockResolvedValue({ id: "dm-channel-id" }); const res = await sendMessageMattermost(`user:${userId}`, "hello"); expect(mockState.fetchMattermostUser).not.toHaveBeenCalled(); - expect(mockState.createMattermostDirectChannel).toHaveBeenCalledTimes(1); + expect(mockState.createMattermostDirectChannelWithRetry).toHaveBeenCalledTimes(1); expect(res.channelId).toBe("dm-channel-id"); }); @@ -433,9 +482,101 @@ describe("sendMessageMattermost user-first resolution", () => { const res = await sendMessageMattermost(`channel:${chanId}`, "hello"); expect(mockState.fetchMattermostUser).not.toHaveBeenCalled(); - expect(mockState.createMattermostDirectChannel).not.toHaveBeenCalled(); + expect(mockState.createMattermostDirectChannelWithRetry).not.toHaveBeenCalled(); const params = mockState.createMattermostPost.mock.calls[0]?.[1]; expect(params.channelId).toBe(chanId); expect(res.channelId).toBe(chanId); }); + + it("passes dmRetryOptions from opts to createMattermostDirectChannelWithRetry", async () => { + const userId = "ffffff6666666666ffffff6666"; // 26 chars + mockState.resolveMattermostAccount.mockReturnValue(makeAccount("token-retry-opts-t6")); + mockState.fetchMattermostUser.mockResolvedValueOnce({ id: userId }); + + const retryOptions = { + maxRetries: 5, + initialDelayMs: 500, + maxDelayMs: 5000, + timeoutMs: 10000, + }; + + await sendMessageMattermost(`user:${userId}`, "hello", { + dmRetryOptions: retryOptions, + }); + + expect(mockState.createMattermostDirectChannelWithRetry).toHaveBeenCalledWith( + {}, + ["bot-id", userId], + expect.objectContaining(retryOptions), + ); + }); + + it("uses dmChannelRetry from account config when opts.dmRetryOptions not provided", async () => { + const userId = "gggggg7777777777gggggg7777"; // 26 chars + mockState.resolveMattermostAccount.mockReturnValue({ + accountId: "default", + botToken: "token-retry-config-t7", + baseUrl: "https://mattermost.example.com", + config: { + dmChannelRetry: { + maxRetries: 4, + initialDelayMs: 2000, + maxDelayMs: 8000, + timeoutMs: 15000, + }, + }, + }); + mockState.fetchMattermostUser.mockResolvedValueOnce({ id: userId }); + + await sendMessageMattermost(`user:${userId}`, "hello"); + + expect(mockState.createMattermostDirectChannelWithRetry).toHaveBeenCalledWith( + {}, + ["bot-id", userId], + expect.objectContaining({ + maxRetries: 4, + initialDelayMs: 2000, + maxDelayMs: 8000, + timeoutMs: 15000, + }), + ); + }); + + it("opts.dmRetryOptions overrides provided fields and preserves account defaults", async () => { + const userId = "hhhhhh8888888888hhhhhh8888"; // 26 chars + mockState.resolveMattermostAccount.mockReturnValue({ + accountId: "default", + botToken: "token-retry-override-t8", + baseUrl: "https://mattermost.example.com", + config: { + dmChannelRetry: { + maxRetries: 2, + initialDelayMs: 1000, + }, + }, + }); + mockState.fetchMattermostUser.mockResolvedValueOnce({ id: userId }); + + const overrideOptions = { + maxRetries: 7, + timeoutMs: 20000, + }; + + await sendMessageMattermost(`user:${userId}`, "hello", { + dmRetryOptions: overrideOptions, + }); + + expect(mockState.createMattermostDirectChannelWithRetry).toHaveBeenCalledWith( + {}, + ["bot-id", userId], + expect.objectContaining(overrideOptions), + ); + expect(mockState.createMattermostDirectChannelWithRetry).toHaveBeenCalledWith( + {}, + ["bot-id", userId], + expect.objectContaining({ + initialDelayMs: 1000, + }), + ); + }); }); diff --git a/extensions/mattermost/src/mattermost/send.ts b/extensions/mattermost/src/mattermost/send.ts index 4655dab2f7d..c589c8829a0 100644 --- a/extensions/mattermost/src/mattermost/send.ts +++ b/extensions/mattermost/src/mattermost/send.ts @@ -3,7 +3,7 @@ import { getMattermostRuntime } from "../runtime.js"; import { resolveMattermostAccount } from "./accounts.js"; import { createMattermostClient, - createMattermostDirectChannel, + createMattermostDirectChannelWithRetry, createMattermostPost, fetchMattermostChannelByName, fetchMattermostMe, @@ -12,6 +12,7 @@ import { normalizeMattermostBaseUrl, uploadMattermostFile, type MattermostUser, + type CreateDmChannelRetryOptions, } from "./client.js"; import { buildButtonProps, @@ -32,6 +33,8 @@ export type MattermostSendOpts = { props?: Record; buttons?: Array; attachmentText?: string; + /** Retry options for DM channel creation */ + dmRetryOptions?: CreateDmChannelRetryOptions; }; export type MattermostSendResult = { @@ -182,11 +185,40 @@ async function resolveChannelIdByName(params: { throw new Error(`Mattermost channel "#${name}" not found in any team the bot belongs to`); } -async function resolveTargetChannelId(params: { +type ResolveTargetChannelIdParams = { target: MattermostTarget; baseUrl: string; token: string; -}): Promise { + dmRetryOptions?: CreateDmChannelRetryOptions; + logger?: { debug?: (msg: string) => void; warn?: (msg: string) => void }; +}; + +function mergeDmRetryOptions( + base?: CreateDmChannelRetryOptions, + override?: CreateDmChannelRetryOptions, +): CreateDmChannelRetryOptions | undefined { + const merged: CreateDmChannelRetryOptions = { + maxRetries: override?.maxRetries ?? base?.maxRetries, + initialDelayMs: override?.initialDelayMs ?? base?.initialDelayMs, + maxDelayMs: override?.maxDelayMs ?? base?.maxDelayMs, + timeoutMs: override?.timeoutMs ?? base?.timeoutMs, + onRetry: override?.onRetry, + }; + + if ( + merged.maxRetries === undefined && + merged.initialDelayMs === undefined && + merged.maxDelayMs === undefined && + merged.timeoutMs === undefined && + merged.onRetry === undefined + ) { + return undefined; + } + + return merged; +} + +async function resolveTargetChannelId(params: ResolveTargetChannelIdParams): Promise { if (params.target.kind === "channel") { return params.target.id; } @@ -214,7 +246,20 @@ async function resolveTargetChannelId(params: { baseUrl: params.baseUrl, botToken: params.token, }); - const channel = await createMattermostDirectChannel(client, [botUser.id, userId]); + + const channel = await createMattermostDirectChannelWithRetry(client, [botUser.id, userId], { + ...params.dmRetryOptions, + onRetry: (attempt, delayMs, error) => { + // Call user's onRetry if provided + params.dmRetryOptions?.onRetry?.(attempt, delayMs, error); + // Log if verbose mode is enabled + if (params.logger) { + params.logger.warn?.( + `DM channel creation retry ${attempt} after ${delayMs}ms: ${error.message}`, + ); + } + }, + }); dmChannelCache.set(dmKey, channel.id); return channel.id; } @@ -232,6 +277,7 @@ async function resolveMattermostSendContext( opts: MattermostSendOpts = {}, ): Promise { const core = getCore(); + const logger = core.logging.getChildLogger({ module: "mattermost" }); const cfg = opts.cfg ?? core.config.loadConfig(); const account = resolveMattermostAccount({ cfg, @@ -262,10 +308,23 @@ async function resolveMattermostSendContext( : opaqueTarget?.kind === "channel" ? { kind: "channel" as const, id: opaqueTarget.id } : parseMattermostTarget(trimmedTo); + // Build retry options from account config, allowing opts to override + const accountRetryConfig: CreateDmChannelRetryOptions | undefined = account.config.dmChannelRetry + ? { + maxRetries: account.config.dmChannelRetry.maxRetries, + initialDelayMs: account.config.dmChannelRetry.initialDelayMs, + maxDelayMs: account.config.dmChannelRetry.maxDelayMs, + timeoutMs: account.config.dmChannelRetry.timeoutMs, + } + : undefined; + const dmRetryOptions = mergeDmRetryOptions(accountRetryConfig, opts.dmRetryOptions); + const channelId = await resolveTargetChannelId({ target, baseUrl, token, + dmRetryOptions, + logger: core.logging.shouldLogVerbose() ? logger : undefined, }); return { diff --git a/extensions/mattermost/src/types.ts b/extensions/mattermost/src/types.ts index f4038ac6920..e6fcc19098c 100644 --- a/extensions/mattermost/src/types.ts +++ b/extensions/mattermost/src/types.ts @@ -90,6 +90,17 @@ export type MattermostAccountConfig = { */ allowedSourceIps?: string[]; }; + /** Retry configuration for DM channel creation */ + dmChannelRetry?: { + /** Maximum number of retry attempts (default: 3) */ + maxRetries?: number; + /** Initial delay in milliseconds before first retry (default: 1000) */ + initialDelayMs?: number; + /** Maximum delay in milliseconds between retries (default: 10000) */ + maxDelayMs?: number; + /** Timeout for each individual request in milliseconds (default: 30000) */ + timeoutMs?: number; + }; }; export type MattermostConfig = {