fix(mattermost): harden DM channel retry handling

This commit is contained in:
Muhammed Mukhthar CM
2026-03-17 06:19:40 +00:00
parent 84fa7e2baa
commit bb0cc5b0b9
6 changed files with 246 additions and 36 deletions

View File

@@ -133,6 +133,7 @@ Docs: https://docs.openclaw.ai
- macOS/launch at login: stop emitting `KeepAlive` for the desktop app launch agent so OpenClaw no longer relaunches immediately after a manual quit while launch at login remains enabled. (#40213) Thanks @stablegenius49.
- ACP/gateway startup: use direct Telegram and Discord startup/status helpers instead of routing probes through the plugin runtime, and prepend the selected daemon Node bin dir to service PATH so plugin-local installs can still find `npm` and `pnpm`.
- ACP/configured bindings: reinitialize configured ACP sessions that are stuck in `error` state instead of reusing the failed runtime.
- Mattermost/DM send: retry transient direct-channel creation failures for DM deliveries, with configurable backoff and per-request timeout. (#42398) Thanks @JonathanJing.
## 2026.3.13

View File

@@ -191,6 +191,35 @@ OpenClaw resolves them **user-first**:
If you need deterministic behavior, always use the explicit prefixes (`user:<id>` / `channel:<id>`).
## DM channel retry
When OpenClaw sends to a Mattermost DM target and needs to resolve the direct channel first, it
retries transient direct-channel creation failures by default.
Use `channels.mattermost.dmChannelRetry` to tune that behavior globally for the Mattermost plugin,
or `channels.mattermost.accounts.<id>.dmChannelRetry` for one account.
```json5
{
channels: {
mattermost: {
dmChannelRetry: {
maxRetries: 3,
initialDelayMs: 1000,
maxDelayMs: 10000,
timeoutMs: 30000,
},
},
},
}
```
Notes:
- This applies only to DM channel creation (`/api/v4/channels/direct`), not every Mattermost API call.
- Retries apply to transient failures such as rate limits, 5xx responses, and network or timeout errors.
- 4xx client errors other than `429` are treated as permanent and are not retried.
## Reactions (message tool)
- Use `message action=react` with `channel=mattermost`.

View File

@@ -1,9 +1,5 @@
import { describe, expect, it, vi, beforeEach } from "vitest";
import {
createMattermostClient,
createMattermostDirectChannelWithRetry,
type CreateDmChannelRetryOptions,
} from "./client.js";
import { createMattermostClient, createMattermostDirectChannelWithRetry } from "./client.js";
describe("createMattermostDirectChannelWithRetry", () => {
const mockFetch = vi.fn();
@@ -20,6 +16,13 @@ describe("createMattermostDirectChannelWithRetry", () => {
});
}
function createFetchFailedError(params: { message: string; code?: string }): TypeError {
const cause = Object.assign(new Error(params.message), {
code: params.code,
});
return Object.assign(new TypeError("fetch failed"), { cause });
}
it("succeeds on first attempt without retries", async () => {
mockFetch.mockResolvedValueOnce({
ok: true,
@@ -178,6 +181,32 @@ describe("createMattermostDirectChannelWithRetry", () => {
expect(mockFetch).toHaveBeenCalledTimes(3);
});
it("retries on fetch failed errors when the cause carries a transient code", async () => {
mockFetch
.mockRejectedValueOnce(
createFetchFailedError({
message: "connect ECONNREFUSED 127.0.0.1:81",
code: "ECONNREFUSED",
}),
)
.mockResolvedValueOnce({
ok: true,
status: 201,
headers: new Headers({ "content-type": "application/json" }),
json: async () => ({ id: "dm-channel-fetch-failed" }),
} as Response);
const client = createMockClient();
const result = await createMattermostDirectChannelWithRetry(client, ["user-1", "user-2"], {
maxRetries: 3,
initialDelayMs: 10,
});
expect(result.id).toBe("dm-channel-fetch-failed");
expect(mockFetch).toHaveBeenCalledTimes(2);
});
it("does not retry on 4xx client errors (except 429)", async () => {
mockFetch.mockResolvedValueOnce({
ok: false,

View File

@@ -190,6 +190,47 @@ export type CreateDmChannelRetryOptions = {
onRetry?: (attempt: number, delayMs: number, error: Error) => void;
};
const RETRYABLE_NETWORK_ERROR_CODES = new Set([
"ECONNRESET",
"ECONNREFUSED",
"ETIMEDOUT",
"ESOCKETTIMEDOUT",
"ECONNABORTED",
"ENOTFOUND",
"EAI_AGAIN",
"EHOSTUNREACH",
"ENETUNREACH",
"EPIPE",
"UND_ERR_CONNECT_TIMEOUT",
"UND_ERR_DNS_RESOLVE_FAILED",
"UND_ERR_CONNECT",
"UND_ERR_SOCKET",
"UND_ERR_HEADERS_TIMEOUT",
"UND_ERR_BODY_TIMEOUT",
]);
const RETRYABLE_NETWORK_ERROR_NAMES = new Set([
"AbortError",
"TimeoutError",
"ConnectTimeoutError",
"HeadersTimeoutError",
"BodyTimeoutError",
]);
const RETRYABLE_NETWORK_MESSAGE_SNIPPETS = [
"network error",
"timeout",
"timed out",
"abort",
"connection refused",
"econnreset",
"econnrefused",
"etimedout",
"enotfound",
"socket hang up",
"getaddrinfo",
];
/**
* Creates a Mattermost DM channel with exponential backoff retry logic.
* Retries on transient errors (429, 5xx, network errors) but not on
@@ -255,27 +296,37 @@ export async function createMattermostDirectChannelWithRetry(
}
function isRetryableError(error: Error): boolean {
const message = error.message.toLowerCase();
const candidates = collectErrorCandidates(error);
const messages = candidates
.map((candidate) => readErrorMessage(candidate)?.toLowerCase())
.filter((message): message is string => Boolean(message));
// Retry on 5xx server errors FIRST (before checking 4xx)
// Use "mattermost api" prefix to avoid matching port numbers (e.g., :443) or IP octets
// This prevents misclassification when a 5xx error detail contains a 4xx substring
// e.g., "Mattermost API 503: upstream returned 404"
if (/mattermost api 5\d{2}\b/.test(message)) {
if (messages.some((message) => /mattermost api 5\d{2}\b/.test(message))) {
return true;
}
// Check for explicit 429 rate limiting FIRST (before generic "429" text match)
// This avoids retrying when error detail contains "429" but it's not the status code
if (/mattermost api 429\b/.test(message) || message.includes("too many requests")) {
if (
messages.some(
(message) => /mattermost api 429\b/.test(message) || message.includes("too many requests"),
)
) {
return true;
}
// Check for explicit 4xx status codes - these are client errors and should NOT be retried
// (except 429 which is handled above)
// Use "mattermost api" prefix to avoid matching port numbers like :443
const clientErrorMatch = message.match(/mattermost api (4\d{2})\b/);
if (clientErrorMatch) {
for (const message of messages) {
const clientErrorMatch = message.match(/mattermost api (4\d{2})\b/);
if (!clientErrorMatch) {
continue;
}
const statusCode = parseInt(clientErrorMatch[1], 10);
if (statusCode >= 400 && statusCode < 500) {
return false;
@@ -286,25 +337,95 @@ function isRetryableError(error: Error): boolean {
// This avoids false positives like:
// - "400 Bad Request: connection timed out" (has status code)
// - "connect ECONNRESET 104.18.32.10:443" (has port number, not status)
const hasMattermostApiStatusCode = /mattermost api \d{3}\b/.test(message);
const hasMattermostApiStatusCode = messages.some((message) =>
/mattermost api \d{3}\b/.test(message),
);
if (hasMattermostApiStatusCode) {
return false;
}
// Retry on network/transient errors (no explicit HTTP status code in message)
const retryablePatterns = [
"network error",
"timeout",
"abort",
"connection refused",
"econnreset",
"econnrefused",
"etimedout",
"enotfound",
"socket hang up",
];
const codes = candidates
.map((candidate) => readErrorCode(candidate))
.filter((code): code is string => Boolean(code));
if (codes.some((code) => RETRYABLE_NETWORK_ERROR_CODES.has(code))) {
return true;
}
return retryablePatterns.some((pattern) => message.includes(pattern));
const names = candidates
.map((candidate) => readErrorName(candidate))
.filter((name): name is string => Boolean(name));
if (names.some((name) => RETRYABLE_NETWORK_ERROR_NAMES.has(name))) {
return true;
}
return messages.some((message) =>
RETRYABLE_NETWORK_MESSAGE_SNIPPETS.some((pattern) => message.includes(pattern)),
);
}
function collectErrorCandidates(error: unknown): unknown[] {
const queue: unknown[] = [error];
const seen = new Set<unknown>();
const candidates: unknown[] = [];
while (queue.length > 0) {
const current = queue.shift();
if (!current || seen.has(current)) {
continue;
}
seen.add(current);
candidates.push(current);
if (typeof current !== "object") {
continue;
}
const nested = current as {
cause?: unknown;
reason?: unknown;
errors?: unknown;
};
queue.push(nested.cause, nested.reason);
if (Array.isArray(nested.errors)) {
queue.push(...nested.errors);
}
}
return candidates;
}
function readErrorMessage(error: unknown): string | undefined {
if (!error || typeof error !== "object") {
return undefined;
}
const message = (error as { message?: unknown }).message;
return typeof message === "string" && message.trim() ? message : undefined;
}
function readErrorName(error: unknown): string | undefined {
if (!error || typeof error !== "object") {
return undefined;
}
const name = (error as { name?: unknown }).name;
return typeof name === "string" && name.trim() ? name : undefined;
}
function readErrorCode(error: unknown): string | undefined {
if (!error || typeof error !== "object") {
return undefined;
}
const { code, errno } = error as {
code?: unknown;
errno?: unknown;
};
const raw = typeof code === "string" && code.trim() ? code : errno;
if (typeof raw === "string" && raw.trim()) {
return raw.trim().toUpperCase();
}
if (typeof raw === "number" && Number.isFinite(raw)) {
return String(raw);
}
return undefined;
}
function sleep(ms: number): Promise<void> {

View File

@@ -541,7 +541,7 @@ describe("sendMessageMattermost user-first resolution", () => {
);
});
it("opts.dmRetryOptions overrides account config dmChannelRetry", async () => {
it("opts.dmRetryOptions overrides provided fields and preserves account defaults", async () => {
const userId = "hhhhhh8888888888hhhhhh8888"; // 26 chars
mockState.resolveMattermostAccount.mockReturnValue({
accountId: "default",
@@ -570,5 +570,12 @@ describe("sendMessageMattermost user-first resolution", () => {
["bot-id", userId],
expect.objectContaining(overrideOptions),
);
expect(mockState.createMattermostDirectChannelWithRetry).toHaveBeenCalledWith(
{},
["bot-id", userId],
expect.objectContaining({
initialDelayMs: 1000,
}),
);
});
});

View File

@@ -193,6 +193,31 @@ type ResolveTargetChannelIdParams = {
logger?: { debug?: (msg: string) => void; warn?: (msg: string) => void };
};
function mergeDmRetryOptions(
base?: CreateDmChannelRetryOptions,
override?: CreateDmChannelRetryOptions,
): CreateDmChannelRetryOptions | undefined {
const merged: CreateDmChannelRetryOptions = {
maxRetries: override?.maxRetries ?? base?.maxRetries,
initialDelayMs: override?.initialDelayMs ?? base?.initialDelayMs,
maxDelayMs: override?.maxDelayMs ?? base?.maxDelayMs,
timeoutMs: override?.timeoutMs ?? base?.timeoutMs,
onRetry: override?.onRetry,
};
if (
merged.maxRetries === undefined &&
merged.initialDelayMs === undefined &&
merged.maxDelayMs === undefined &&
merged.timeoutMs === undefined &&
merged.onRetry === undefined
) {
return undefined;
}
return merged;
}
async function resolveTargetChannelId(params: ResolveTargetChannelIdParams): Promise<string> {
if (params.target.kind === "channel") {
return params.target.id;
@@ -284,17 +309,15 @@ async function resolveMattermostSendContext(
? { kind: "channel" as const, id: opaqueTarget.id }
: parseMattermostTarget(trimmedTo);
// Build retry options from account config, allowing opts to override
const accountRetryConfig = account.config.dmChannelRetry;
const dmRetryOptions: CreateDmChannelRetryOptions | undefined =
opts.dmRetryOptions ??
(accountRetryConfig
? {
maxRetries: accountRetryConfig.maxRetries,
initialDelayMs: accountRetryConfig.initialDelayMs,
maxDelayMs: accountRetryConfig.maxDelayMs,
timeoutMs: accountRetryConfig.timeoutMs,
}
: undefined);
const accountRetryConfig: CreateDmChannelRetryOptions | undefined = account.config.dmChannelRetry
? {
maxRetries: account.config.dmChannelRetry.maxRetries,
initialDelayMs: account.config.dmChannelRetry.initialDelayMs,
maxDelayMs: account.config.dmChannelRetry.maxDelayMs,
timeoutMs: account.config.dmChannelRetry.timeoutMs,
}
: undefined;
const dmRetryOptions = mergeDmRetryOptions(accountRetryConfig, opts.dmRetryOptions);
const channelId = await resolveTargetChannelId({
target,