mirror of
https://github.com/openclaw/openclaw.git
synced 2026-04-26 08:31:55 +00:00
fix(feishu): prevent silent group message drops when bot-info probe times out
When OpenClaw restarts under load, the Feishu bot-info probe (`/open-apis/bot/v3/info`) can exceed the 10-second timeout due to event-loop contention during channel initialization. This leaves `botOpenId` empty, causing `checkBotMentioned()` to return `false` for every group message — silently dropping them all while DMs continue to work fine. Two fixes: 1. **Increase startup probe timeout from 10s to 30s** and make it configurable via `OPENCLAW_FEISHU_STARTUP_PROBE_TIMEOUT_MS` env var. The previous 10s budget was too tight when multiple channels (Slack, Discord, Feishu) initialize concurrently. 2. **Graceful degradation in `checkBotMentioned()`**: when `botOpenId` is unknown, return `true` (assume mentioned) instead of `false`. This prevents group messages from being silently discarded when the probe fails for any reason. The trade-off is that the bot may respond to non-@-mentioned messages temporarily until the next successful probe, which is far preferable to total silence. Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
@@ -2482,6 +2482,7 @@ describe("broadcast dispatch", () => {
|
||||
await handleFeishuMessage({
|
||||
cfg,
|
||||
event,
|
||||
botOpenId: "ou_known_bot",
|
||||
runtime: createRuntimeEnv(),
|
||||
});
|
||||
|
||||
@@ -2492,6 +2493,42 @@ describe("broadcast dispatch", () => {
|
||||
expect(mockCreateFeishuReplyDispatcher).not.toHaveBeenCalled();
|
||||
});
|
||||
|
||||
it("skips broadcast dispatch when bot identity is unknown (requireMention=true)", async () => {
  // One group that is both a broadcast target and gated behind an @-mention.
  const groupId = "oc-broadcast-group";

  const cfg = {
    broadcast: { [groupId]: ["susan", "main"] },
    agents: { list: [{ id: "main" }, { id: "susan" }] },
    channels: {
      feishu: {
        groups: {
          [groupId]: { requireMention: true },
        },
      },
    },
  } as unknown as ClawdbotConfig;

  // Plain group text message carrying no mention data at all.
  const event: FeishuMessageEvent = {
    sender: { sender_id: { open_id: "ou-sender" } },
    message: {
      message_id: "msg-broadcast-unknown-bot-id",
      chat_id: groupId,
      chat_type: "group",
      message_type: "text",
      content: JSON.stringify({ text: "hello everyone" }),
    },
  };

  // No botOpenId is passed: that is how this test models an unresolved bot identity.
  const runtime = createRuntimeEnv();
  await handleFeishuMessage({ cfg, event, runtime });

  // Neither the reply dispatcher nor the dispatch itself may be reached.
  expect(mockCreateFeishuReplyDispatcher).not.toHaveBeenCalled();
  expect(mockDispatchReplyFromConfig).not.toHaveBeenCalled();
});
|
||||
|
||||
it("preserves single-agent dispatch when no broadcast config", async () => {
|
||||
const cfg: ClawdbotConfig = {
|
||||
channels: {
|
||||
|
||||
27
extensions/feishu/src/monitor.account.test.ts
Normal file
27
extensions/feishu/src/monitor.account.test.ts
Normal file
@@ -0,0 +1,27 @@
|
||||
import { afterEach, describe, expect, it, vi } from "vitest";
|
||||
import { waitForAbortableDelay } from "./monitor.account.js";
|
||||
|
||||
afterEach(() => {
|
||||
vi.useRealTimers();
|
||||
});
|
||||
|
||||
describe("waitForAbortableDelay", () => {
  // Covers the early return taken when the signal was aborted before the call.
  it("resolves false without waiting when the signal is already aborted", async () => {
    vi.useFakeTimers();
    const abortController = new AbortController();
    abortController.abort();

    const delay = waitForAbortableDelay(60_000, abortController.signal);

    // No timers are advanced: the promise must already be settled.
    await expect(delay).resolves.toBe(false);
  });

  // Covers cancellation while the backoff timer is pending.
  it("resolves false immediately when aborted during backoff", async () => {
    vi.useFakeTimers();
    const abortController = new AbortController();

    const delay = waitForAbortableDelay(60_000, abortController.signal);
    abortController.abort();

    await expect(delay).resolves.toBe(false);
  });

  // Covers the normal path: no signal, timer runs to completion.
  it("resolves true after the full delay when not aborted", async () => {
    vi.useFakeTimers();

    const delay = waitForAbortableDelay(500);
    await vi.advanceTimersByTimeAsync(500);

    await expect(delay).resolves.toBe(true);
  });
});
|
||||
@@ -619,6 +619,67 @@ function registerEventHandlers(
|
||||
});
|
||||
}
|
||||
|
||||
// Delays must be >= PROBE_ERROR_TTL_MS (60s) so each retry makes a real network request
|
||||
// instead of silently hitting the probe error cache.
|
||||
const BOT_IDENTITY_RETRY_DELAYS_MS = [60_000, 120_000, 300_000, 600_000, 900_000];
|
||||
|
||||
/**
 * Waits for `delayMs` milliseconds unless `abortSignal` fires first.
 *
 * The timer is unref'd where the runtime supports it, so a pending wait does
 * not by itself keep the process alive.
 *
 * @param delayMs - How long to wait, in milliseconds. Values above the
 *   largest delay `setTimeout` honours (2^31-1 ms, roughly 24.8 days) are
 *   clamped to that maximum instead of being collapsed to 1 ms by the runtime.
 * @param abortSignal - Optional signal that cancels the wait.
 * @returns `true` when the delay elapsed, `false` when the signal was already
 *   aborted or aborted while waiting.
 */
export function waitForAbortableDelay(delayMs: number, abortSignal?: AbortSignal): Promise<boolean> {
  if (abortSignal?.aborted) {
    return Promise.resolve(false);
  }

  // Node replaces any delay larger than 2^31-1 ms with 1 ms, which would turn
  // a very long backoff into an almost immediate "elapsed" result.
  const maxTimerDelayMs = 2_147_483_647;
  const effectiveDelayMs = Math.min(delayMs, maxTimerDelayMs);

  return new Promise((resolve) => {
    // Declared before the timer so neither callback reads an uninitialised binding.
    const handleAbort = () => {
      clearTimeout(timer);
      resolve(false);
    };

    const timer = setTimeout(() => {
      // The wait completed normally; drop the listener so it does not linger on the signal.
      abortSignal?.removeEventListener("abort", handleAbort);
      resolve(true);
    }, effectiveDelayMs);
    timer.unref?.();

    abortSignal?.addEventListener("abort", handleAbort, { once: true });
  });
}
|
||||
|
||||
/**
 * Background recovery loop for an account whose bot identity is still unknown.
 *
 * Before each attempt it waits the matching entry of
 * `BOT_IDENTITY_RETRY_DELAYS_MS`, then calls `fetchBotIdentityForMonitor`.
 * On the first result carrying a `botOpenId` it records the id (and the
 * trimmed bot name, when non-empty) in the module-level `botOpenIds` /
 * `botNames` collections and stops. When every attempt fails it logs that
 * retries are exhausted.
 *
 * This function is started without being awaited, so it must never reject:
 * an exception thrown by a probe attempt is treated as a failed attempt
 * instead of escaping as an unhandled rejection and ending the retry chain.
 *
 * @param account - Resolved account handed to the identity probe.
 * @param accountId - Key used for log prefixes and for the identity collections.
 * @param runtime - Optional runtime supplying `log` / `error`; falls back to `console`.
 * @param abortSignal - Optional signal; once aborted the loop stops quietly.
 */
async function retryBotIdentityProbe(
  account: ResolvedFeishuAccount,
  accountId: string,
  runtime: RuntimeEnv | undefined,
  abortSignal: AbortSignal | undefined,
): Promise<void> {
  const log = runtime?.log ?? console.log;
  const error = runtime?.error ?? console.error;
  const totalAttempts = BOT_IDENTITY_RETRY_DELAYS_MS.length;

  for (let i = 0; i < totalAttempts; i++) {
    if (abortSignal?.aborted) return;
    const delayElapsed = await waitForAbortableDelay(BOT_IDENTITY_RETRY_DELAYS_MS[i], abortSignal);
    if (!delayElapsed) {
      return;
    }

    // Filled only when the probe threw; appended to the failure log below.
    let failureDetail = "";
    try {
      const identity = await fetchBotIdentityForMonitor(account, { runtime, abortSignal });
      if (identity.botOpenId) {
        botOpenIds.set(accountId, identity.botOpenId);
        if (identity.botName?.trim()) {
          botNames.set(accountId, identity.botName.trim());
        }
        log(
          `feishu[${accountId}]: bot open_id recovered via background retry: ${identity.botOpenId}`,
        );
        return;
      }
    } catch (err: unknown) {
      failureDetail = `: ${err instanceof Error ? err.message : String(err)}`;
    }

    // An abort that landed during the probe is a shutdown, not a failed attempt.
    if (abortSignal?.aborted) return;

    const nextDelay = BOT_IDENTITY_RETRY_DELAYS_MS[i + 1];
    error(
      `feishu[${accountId}]: bot identity background retry ${i + 1}/${totalAttempts} failed${failureDetail}` +
        (nextDelay ? `; next attempt in ${nextDelay / 1000}s` : ""),
    );
  }

  error(
    `feishu[${accountId}]: bot identity background retry exhausted; requireMention group messages may be skipped until restart`,
  );
}
|
||||
|
||||
/**
 * Discriminated union (tagged by `kind`) describing where the bot identity
 * comes from.
 *
 * - `prefetched`: the variant itself carries an optional `botOpenId` and
 *   `botName`.
 * - `fetch`: carries no identity data. NOTE(review): presumably the monitor
 *   then resolves the identity itself — confirm against the consumer.
 */
export type BotOpenIdSource =
|
||||
| { kind: "prefetched"; botOpenId?: string; botName?: string }
|
||||
| { kind: "fetch" };
|
||||
@@ -651,6 +712,18 @@ export async function monitorSingleAccount(params: MonitorSingleAccountParams):
|
||||
}
|
||||
log(`feishu[${accountId}]: bot open_id resolved: ${botOpenId ?? "unknown"}`);
|
||||
|
||||
// When the startup probe failed, retry in the background so the degraded
|
||||
// state (requireMention group messages staying gated) is bounded rather than permanent.
|
||||
if (!botOpenId && !abortSignal?.aborted) {
|
||||
log(
|
||||
`feishu[${accountId}]: bot open_id unknown; starting background retry (delays: ${BOT_IDENTITY_RETRY_DELAYS_MS.map((d) => `${d / 1000}s`).join(", ")})`,
|
||||
);
|
||||
log(
|
||||
`feishu[${accountId}]: requireMention group messages stay gated until bot identity recovery succeeds`,
|
||||
);
|
||||
void retryBotIdentityProbe(account, accountId, runtime, abortSignal);
|
||||
}
|
||||
|
||||
const connectionMode = account.config.connectionMode ?? "websocket";
|
||||
if (connectionMode === "webhook" && !account.verificationToken?.trim()) {
|
||||
throw new Error(`Feishu account "${accountId}" webhook mode requires verificationToken`);
|
||||
|
||||
@@ -2,7 +2,24 @@ import type { RuntimeEnv } from "../runtime-api.js";
|
||||
import { probeFeishu } from "./probe.js";
|
||||
import type { ResolvedFeishuAccount } from "./types.js";
|
||||
|
||||
export const FEISHU_STARTUP_BOT_INFO_TIMEOUT_MS = 10_000;
|
||||
/** Startup bot-info probe timeout (ms) used when no valid override is set. */
const FEISHU_STARTUP_BOT_INFO_TIMEOUT_DEFAULT_MS = 30_000;
/** Environment variable that overrides the startup probe timeout, in milliseconds. */
const FEISHU_STARTUP_BOT_INFO_TIMEOUT_ENV = "OPENCLAW_FEISHU_STARTUP_PROBE_TIMEOUT_MS";

/**
 * Resolves the startup probe timeout from the environment.
 *
 * An unset or empty variable silently selects the default. Any other value
 * is converted with `Number()` and rounded down to whole milliseconds; it is
 * accepted only when the rounded result is a finite integer of at least 1.
 * Everything else (non-numeric text, zero, negatives, infinities, and
 * fractions below 1 ms) logs a warning and selects the default.
 *
 * @returns The timeout in whole milliseconds, always >= 1.
 */
function resolveStartupProbeTimeoutMs(): number {
  const raw = process.env[FEISHU_STARTUP_BOT_INFO_TIMEOUT_ENV];
  if (raw) {
    // Round down BEFORE validating so values such as "0.5" cannot slip
    // through the positivity check and then collapse to a 0ms timeout.
    const wholeMs = Math.floor(Number(raw));
    if (Number.isFinite(wholeMs) && wholeMs > 0) {
      return wholeMs;
    }
    console.warn(
      `[feishu] ${FEISHU_STARTUP_BOT_INFO_TIMEOUT_ENV}="${raw}" is invalid; using default ${FEISHU_STARTUP_BOT_INFO_TIMEOUT_DEFAULT_MS}ms`,
    );
  }
  return FEISHU_STARTUP_BOT_INFO_TIMEOUT_DEFAULT_MS;
}

/** Effective startup probe timeout, resolved once when this module is loaded. */
export const FEISHU_STARTUP_BOT_INFO_TIMEOUT_MS = resolveStartupProbeTimeoutMs();
|
||||
|
||||
type FetchBotOpenIdOptions = {
|
||||
runtime?: RuntimeEnv;
|
||||
|
||||
Reference in New Issue
Block a user