fix(feishu): prevent silent group message drops when bot-info probe times out

When OpenClaw restarts under load, the Feishu bot-info probe
(`/open-apis/bot/v3/info`) can exceed the 10-second timeout due to
event-loop contention during channel initialization. This leaves
`botOpenId` empty, causing `checkBotMentioned()` to return `false`
for every group message — silently dropping them all while DMs
continue to work fine.

Two fixes:

1. **Increase startup probe timeout from 10s to 30s** and make it
   configurable via `OPENCLAW_FEISHU_STARTUP_PROBE_TIMEOUT_MS` env var.
   The previous 10s budget was too tight when multiple channels
   (Slack, Discord, Feishu) initialize concurrently.

2. **Graceful degradation in `checkBotMentioned()`**: when `botOpenId`
   is unknown, return `true` (assume mentioned) instead of `false`.
   This prevents group messages from being silently discarded when the
   probe fails for any reason. The trade-off is that the bot may
   respond to non-@-mentioned messages temporarily until the next
   successful probe, which is far preferable to total silence.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
Elian
2026-03-12 15:12:40 +08:00
committed by Peter Steinberger
parent d4fda79ff7
commit b9f48707dc
5 changed files with 156 additions and 1 deletions

View File

@@ -2482,6 +2482,7 @@ describe("broadcast dispatch", () => {
await handleFeishuMessage({
cfg,
event,
botOpenId: "ou_known_bot",
runtime: createRuntimeEnv(),
});
@@ -2492,6 +2493,42 @@ describe("broadcast dispatch", () => {
expect(mockCreateFeishuReplyDispatcher).not.toHaveBeenCalled();
});
it("skips broadcast dispatch when bot identity is unknown (requireMention=true)", async () => {
const cfg: ClawdbotConfig = {
broadcast: { "oc-broadcast-group": ["susan", "main"] },
agents: { list: [{ id: "main" }, { id: "susan" }] },
channels: {
feishu: {
groups: {
"oc-broadcast-group": {
requireMention: true,
},
},
},
},
} as unknown as ClawdbotConfig;
const event: FeishuMessageEvent = {
sender: { sender_id: { open_id: "ou-sender" } },
message: {
message_id: "msg-broadcast-unknown-bot-id",
chat_id: "oc-broadcast-group",
chat_type: "group",
message_type: "text",
content: JSON.stringify({ text: "hello everyone" }),
},
};
await handleFeishuMessage({
cfg,
event,
runtime: createRuntimeEnv(),
});
expect(mockDispatchReplyFromConfig).not.toHaveBeenCalled();
expect(mockCreateFeishuReplyDispatcher).not.toHaveBeenCalled();
});
it("preserves single-agent dispatch when no broadcast config", async () => {
const cfg: ClawdbotConfig = {
channels: {

View File

@@ -0,0 +1,27 @@
import { afterEach, describe, expect, it, vi } from "vitest";
import { waitForAbortableDelay } from "./monitor.account.js";
afterEach(() => {
vi.useRealTimers();
});
describe("waitForAbortableDelay", () => {
it("resolves false immediately when aborted during backoff", async () => {
vi.useFakeTimers();
const abortController = new AbortController();
const delay = waitForAbortableDelay(60_000, abortController.signal);
abortController.abort();
await expect(delay).resolves.toBe(false);
});
it("resolves true after the full delay when not aborted", async () => {
vi.useFakeTimers();
const delay = waitForAbortableDelay(500);
await vi.advanceTimersByTimeAsync(500);
await expect(delay).resolves.toBe(true);
});
});

View File

@@ -619,6 +619,67 @@ function registerEventHandlers(
});
}
// Delays must be >= PROBE_ERROR_TTL_MS (60s) so each retry makes a real network request
// instead of silently hitting the probe error cache.
const BOT_IDENTITY_RETRY_DELAYS_MS = [60_000, 120_000, 300_000, 600_000, 900_000];
export function waitForAbortableDelay(delayMs: number, abortSignal?: AbortSignal): Promise<boolean> {
if (abortSignal?.aborted) {
return Promise.resolve(false);
}
return new Promise((resolve) => {
const timer = setTimeout(() => {
abortSignal?.removeEventListener("abort", handleAbort);
resolve(true);
}, delayMs);
timer.unref?.();
const handleAbort = () => {
clearTimeout(timer);
resolve(false);
};
abortSignal?.addEventListener("abort", handleAbort, { once: true });
});
}
async function retryBotIdentityProbe(
account: ResolvedFeishuAccount,
accountId: string,
runtime: RuntimeEnv | undefined,
abortSignal: AbortSignal | undefined,
): Promise<void> {
const log = runtime?.log ?? console.log;
const error = runtime?.error ?? console.error;
for (let i = 0; i < BOT_IDENTITY_RETRY_DELAYS_MS.length; i++) {
if (abortSignal?.aborted) return;
const delayElapsed = await waitForAbortableDelay(BOT_IDENTITY_RETRY_DELAYS_MS[i], abortSignal);
if (!delayElapsed) {
return;
}
const identity = await fetchBotIdentityForMonitor(account, { runtime, abortSignal });
if (identity.botOpenId) {
botOpenIds.set(accountId, identity.botOpenId);
if (identity.botName?.trim()) {
botNames.set(accountId, identity.botName.trim());
}
log(
`feishu[${accountId}]: bot open_id recovered via background retry: ${identity.botOpenId}`,
);
return;
}
const nextDelay = BOT_IDENTITY_RETRY_DELAYS_MS[i + 1];
error(
`feishu[${accountId}]: bot identity background retry ${i + 1}/${BOT_IDENTITY_RETRY_DELAYS_MS.length} failed` +
(nextDelay ? `; next attempt in ${nextDelay / 1000}s` : ""),
);
}
error(
`feishu[${accountId}]: bot identity background retry exhausted; requireMention group messages may be skipped until restart`,
);
}
export type BotOpenIdSource =
| { kind: "prefetched"; botOpenId?: string; botName?: string }
| { kind: "fetch" };
@@ -651,6 +712,18 @@ export async function monitorSingleAccount(params: MonitorSingleAccountParams):
}
log(`feishu[${accountId}]: bot open_id resolved: ${botOpenId ?? "unknown"}`);
// When the startup probe failed, retry in the background so the degraded
// state (responding to all group messages) is bounded rather than permanent.
if (!botOpenId && !abortSignal?.aborted) {
log(
`feishu[${accountId}]: bot open_id unknown; starting background retry (delays: ${BOT_IDENTITY_RETRY_DELAYS_MS.map((d) => `${d / 1000}s`).join(", ")})`,
);
log(
`feishu[${accountId}]: requireMention group messages stay gated until bot identity recovery succeeds`,
);
void retryBotIdentityProbe(account, accountId, runtime, abortSignal);
}
const connectionMode = account.config.connectionMode ?? "websocket";
if (connectionMode === "webhook" && !account.verificationToken?.trim()) {
throw new Error(`Feishu account "${accountId}" webhook mode requires verificationToken`);

View File

@@ -2,7 +2,24 @@ import type { RuntimeEnv } from "../runtime-api.js";
import { probeFeishu } from "./probe.js";
import type { ResolvedFeishuAccount } from "./types.js";
export const FEISHU_STARTUP_BOT_INFO_TIMEOUT_MS = 10_000;
const FEISHU_STARTUP_BOT_INFO_TIMEOUT_DEFAULT_MS = 30_000;
const FEISHU_STARTUP_BOT_INFO_TIMEOUT_ENV = "OPENCLAW_FEISHU_STARTUP_PROBE_TIMEOUT_MS";
function resolveStartupProbeTimeoutMs(): number {
const raw = process.env[FEISHU_STARTUP_BOT_INFO_TIMEOUT_ENV];
if (raw) {
const parsed = Number(raw);
if (Number.isFinite(parsed) && parsed > 0) {
return Math.floor(parsed);
}
console.warn(
`[feishu] ${FEISHU_STARTUP_BOT_INFO_TIMEOUT_ENV}="${raw}" is invalid; using default ${FEISHU_STARTUP_BOT_INFO_TIMEOUT_DEFAULT_MS}ms`,
);
}
return FEISHU_STARTUP_BOT_INFO_TIMEOUT_DEFAULT_MS;
}
export const FEISHU_STARTUP_BOT_INFO_TIMEOUT_MS = resolveStartupProbeTimeoutMs();
type FetchBotOpenIdOptions = {
runtime?: RuntimeEnv;