Subagents: stop retrying external completion timeouts (#41235) (#43847)

* Changelog: add subagent announce timeout note

* Tests: cover subagent completion timeout no-retry

* Subagents: stop retrying external completion timeouts

* Config: update subagent announce timeout default docs

* Tests: use fake timers for subagent timeout retry guard
This commit is contained in:
Vincent Koc
2026-03-12 11:03:06 -04:00
committed by GitHub
parent 7844bc89a1
commit 8ad0ca309e
4 changed files with 59 additions and 9 deletions

View File

@@ -34,6 +34,7 @@ Docs: https://docs.openclaw.ai
- Mattermost/block streaming: fix duplicate message delivery (one threaded, one top-level) when block streaming is active by excluding `replyToId` from the block reply dedup key and adding an explicit `threading` dock to the Mattermost plugin. (#41362) Thanks @mathiasnagler and @vincentkoc.
- BlueBubbles/self-chat echo dedupe: drop reflected duplicate webhook copies only when a matching `fromMe` event was just seen for the same chat, body, and timestamp, preventing self-chat loops without broad webhook suppression. Related to #32166. (#38442) Thanks @vincentkoc.
- Models/Kimi Coding: send `anthropic-messages` tools in native Anthropic format again so `kimi-coding` stops degrading tool calls into XML/plain-text pseudo invocations instead of real `tool_use` blocks. (#38669, #39907, #40552) Thanks @opriz.
- Subagents/completion announce retries: raise the default announce timeout to 90 seconds and stop retrying gateway-timeout failures for externally delivered completion announces, preventing duplicate user-facing completion messages after slow gateway responses. Fixes #41235. Thanks @vasujain00 and @vincentkoc.
- Sandbox/write: preserve pinned mutation-helper payload stdin so sandboxed `write` no longer reports success while creating empty files. (#43876) Thanks @glitch418x.
- Gateway/main-session routing: keep TUI and other `mode:UI` main-session sends on the internal surface when `deliver` is enabled, so replies no longer inherit the session's persisted Telegram/WhatsApp route. (#43918) Thanks @obviyus.
- Doctor/gateway service audit: canonicalize service entrypoint paths before comparing them so symlink-vs-realpath installs no longer trigger false "entrypoint does not match the current install" repair prompts. (#43882) Thanks @ngutman.

View File

@@ -8,6 +8,12 @@ type GatewayCall = {
};
const gatewayCalls: GatewayCall[] = [];
// Swappable gateway behavior: the mocked `callGateway` delegates here, and
// individual tests reassign it to simulate failures. The default answers
// "chat.history" with an empty message list and every other method with `{}`.
let callGatewayImpl: (request: GatewayCall) => Promise<unknown> = async (request) =>
  request.method === "chat.history" ? { messages: [] } : {};
let sessionStore: Record<string, Record<string, unknown>> = {};
let configOverride: ReturnType<(typeof import("../config/config.js"))["loadConfig"]> = {
session: {
@@ -27,10 +33,7 @@ let fallbackRequesterResolution: {
vi.mock("../gateway/call.js", () => ({
callGateway: vi.fn(async (request: GatewayCall) => {
gatewayCalls.push(request);
if (request.method === "chat.history") {
return { messages: [] };
}
return {};
return await callGatewayImpl(request);
}),
}));
@@ -120,6 +123,12 @@ function findGatewayCall(predicate: (call: GatewayCall) => boolean): GatewayCall
describe("subagent announce timeout config", () => {
beforeEach(() => {
gatewayCalls.length = 0;
callGatewayImpl = async (request) => {
if (request.method === "chat.history") {
return { messages: [] };
}
return {};
};
sessionStore = {};
configOverride = {
session: defaultSessionConfig,
@@ -131,13 +140,13 @@ describe("subagent announce timeout config", () => {
fallbackRequesterResolution = null;
});
it("uses 60s timeout by default for direct announce agent call", async () => {
it("uses 90s timeout by default for direct announce agent call", async () => {
await runAnnounceFlowForTest("run-default-timeout");
const directAgentCall = findGatewayCall(
(call) => call.method === "agent" && call.expectFinal === true,
);
expect(directAgentCall?.timeoutMs).toBe(60_000);
expect(directAgentCall?.timeoutMs).toBe(90_000);
});
it("honors configured announce timeout for direct announce agent call", async () => {
@@ -166,6 +175,35 @@ describe("subagent announce timeout config", () => {
expect(completionDirectAgentCall?.timeoutMs).toBe(90_000);
});
// Regression guard: a gateway timeout on an externally delivered completion
// announce must surface after a single attempt instead of being retried.
it("does not retry gateway timeout for externally delivered completion announces", async () => {
// Fake timers are installed for the duration of the test and always restored
// in the `finally` block below, even if an assertion fails.
vi.useFakeTimers();
try {
// History lookups still succeed; every other gateway call fails with a
// message that matches the "gateway timeout" pattern.
callGatewayImpl = async (request) => {
if (request.method === "chat.history") {
return { messages: [] };
}
throw new Error("gateway timeout after 90000ms");
};
// A requester origin with channel/to plus `expectsCompletionMessage` is the
// scenario under test; the flow is expected to resolve to `false`.
await expect(
runAnnounceFlowForTest("run-completion-timeout-no-retry", {
requesterOrigin: {
channel: "telegram",
to: "12345",
},
expectsCompletionMessage: true,
}),
).resolves.toBe(false);
// Exactly one recorded direct agent call means no retry was attempted.
const directAgentCalls = gatewayCalls.filter(
(call) => call.method === "agent" && call.expectFinal === true,
);
expect(directAgentCalls).toHaveLength(1);
} finally {
vi.useRealTimers();
}
});
it("regression, skips parent announce while descendants are still pending", async () => {
requesterDepthResolver = () => 1;
pendingDescendantRuns = 2;

View File

@@ -51,8 +51,9 @@ import { isAnnounceSkip } from "./tools/sessions-send-helpers.js";
const FAST_TEST_MODE = process.env.OPENCLAW_TEST_FAST === "1";
const FAST_TEST_RETRY_INTERVAL_MS = 8;
const DEFAULT_SUBAGENT_ANNOUNCE_TIMEOUT_MS = 60_000;
const DEFAULT_SUBAGENT_ANNOUNCE_TIMEOUT_MS = 90_000;
const MAX_TIMER_SAFE_TIMEOUT_MS = 2_147_000_000;
const GATEWAY_TIMEOUT_PATTERN = /gateway timeout/i;
let subagentRegistryRuntimePromise: Promise<
typeof import("./subagent-registry-runtime.js")
> | null = null;
@@ -107,7 +108,7 @@ const TRANSIENT_ANNOUNCE_DELIVERY_ERROR_PATTERNS: readonly RegExp[] = [
/no active .* listener/i,
/gateway not connected/i,
/gateway closed \(1006/i,
/gateway timeout/i,
GATEWAY_TIMEOUT_PATTERN,
/\b(econnreset|econnrefused|etimedout|enotfound|ehostunreach|network error)\b/i,
];
@@ -133,6 +134,11 @@ function isTransientAnnounceDeliveryError(error: unknown): boolean {
return TRANSIENT_ANNOUNCE_DELIVERY_ERROR_PATTERNS.some((re) => re.test(message));
}
/**
 * Reports whether a delivery failure is a gateway timeout.
 *
 * The error is first reduced to text via `summarizeDeliveryError`; an empty
 * summary never counts as a timeout.
 *
 * @param error - The value caught from a failed announce delivery attempt.
 * @returns `true` when the summarized text matches `GATEWAY_TIMEOUT_PATTERN`.
 */
function isGatewayTimeoutError(error: unknown): boolean {
  const summary = summarizeDeliveryError(error);
  if (!summary) {
    return false;
  }
  return GATEWAY_TIMEOUT_PATTERN.test(summary);
}
async function waitForAnnounceRetryDelay(ms: number, signal?: AbortSignal): Promise<void> {
if (ms <= 0) {
return;
@@ -160,6 +166,7 @@ async function waitForAnnounceRetryDelay(ms: number, signal?: AbortSignal): Prom
async function runAnnounceDeliveryWithRetry<T>(params: {
operation: string;
noRetryOnGatewayTimeout?: boolean;
signal?: AbortSignal;
run: () => Promise<T>;
}): Promise<T> {
@@ -171,6 +178,9 @@ async function runAnnounceDeliveryWithRetry<T>(params: {
try {
return await params.run();
} catch (err) {
if (params.noRetryOnGatewayTimeout && isGatewayTimeoutError(err)) {
throw err;
}
const delayMs = DIRECT_ANNOUNCE_TRANSIENT_RETRY_DELAYS_MS[retryIndex];
if (delayMs == null || !isTransientAnnounceDeliveryError(err) || params.signal?.aborted) {
throw err;
@@ -789,6 +799,7 @@ async function sendSubagentAnnounceDirectly(params: {
operation: params.expectsCompletionMessage
? "completion direct announce agent call"
: "direct announce agent call",
noRetryOnGatewayTimeout: params.expectsCompletionMessage && shouldDeliverExternally,
signal: params.signal,
run: async () =>
await callGateway({

View File

@@ -279,7 +279,7 @@ export type AgentDefaultsConfig = {
thinking?: string;
/** Default run timeout in seconds for spawned sub-agents (0 = no timeout). */
runTimeoutSeconds?: number;
/** Gateway timeout in ms for sub-agent announce delivery calls (default: 60000). */
/** Gateway timeout in ms for sub-agent announce delivery calls (default: 90000). */
announceTimeoutMs?: number;
};
/** Optional sandbox settings for non-main sessions. */