From 8ad0ca309e846bd2fe4efbf8cf7b61ea9bbec7e9 Mon Sep 17 00:00:00 2001 From: Vincent Koc Date: Thu, 12 Mar 2026 11:03:06 -0400 Subject: [PATCH] Subagents: stop retrying external completion timeouts (#41235) (#43847) * Changelog: add subagent announce timeout note * Tests: cover subagent completion timeout no-retry * Subagents: stop retrying external completion timeouts * Config: update subagent announce timeout default docs * Tests: use fake timers for subagent timeout retry guard --- CHANGELOG.md | 1 + src/agents/subagent-announce.timeout.test.ts | 50 +++++++++++++++++--- src/agents/subagent-announce.ts | 15 +++++- src/config/types.agent-defaults.ts | 2 +- 4 files changed, 59 insertions(+), 9 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 83eab5cde4e..130fb9e20d1 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -34,6 +34,7 @@ Docs: https://docs.openclaw.ai - Mattermost/block streaming: fix duplicate message delivery (one threaded, one top-level) when block streaming is active by excluding `replyToId` from the block reply dedup key and adding an explicit `threading` dock to the Mattermost plugin. (#41362) Thanks @mathiasnagler and @vincentkoc. - BlueBubbles/self-chat echo dedupe: drop reflected duplicate webhook copies only when a matching `fromMe` event was just seen for the same chat, body, and timestamp, preventing self-chat loops without broad webhook suppression. Related to #32166. (#38442) Thanks @vincentkoc. - Models/Kimi Coding: send `anthropic-messages` tools in native Anthropic format again so `kimi-coding` stops degrading tool calls into XML/plain-text pseudo invocations instead of real `tool_use` blocks. (#38669, #39907, #40552) Thanks @opriz. +- Subagents/completion announce retries: raise the default announce timeout to 90 seconds and stop retrying gateway-timeout failures for externally delivered completion announces, preventing duplicate user-facing completion messages after slow gateway responses. Fixes #41235. Thanks @vasujain00 and @vincentkoc. - Sandbox/write: preserve pinned mutation-helper payload stdin so sandboxed `write` no longer reports success while creating empty files. (#43876) Thanks @glitch418x. - Gateway/main-session routing: keep TUI and other `mode:UI` main-session sends on the internal surface when `deliver` is enabled, so replies no longer inherit the session's persisted Telegram/WhatsApp route. (#43918) Thanks @obviyus. - Doctor/gateway service audit: canonicalize service entrypoint paths before comparing them so symlink-vs-realpath installs no longer trigger false "entrypoint does not match the current install" repair prompts. (#43882) Thanks @ngutman. diff --git a/src/agents/subagent-announce.timeout.test.ts b/src/agents/subagent-announce.timeout.test.ts index 1c4925d9272..b003276e56e 100644 --- a/src/agents/subagent-announce.timeout.test.ts +++ b/src/agents/subagent-announce.timeout.test.ts @@ -8,6 +8,12 @@ type GatewayCall = { }; const gatewayCalls: GatewayCall[] = []; +let callGatewayImpl: (request: GatewayCall) => Promise = async (request) => { + if (request.method === "chat.history") { + return { messages: [] }; + } + return {}; +}; let sessionStore: Record> = {}; let configOverride: ReturnType<(typeof import("../config/config.js"))["loadConfig"]> = { session: { @@ -27,10 +33,7 @@ let fallbackRequesterResolution: { vi.mock("../gateway/call.js", () => ({ callGateway: vi.fn(async (request: GatewayCall) => { gatewayCalls.push(request); - if (request.method === "chat.history") { - return { messages: [] }; - } - return {}; + return await callGatewayImpl(request); }), })); @@ -120,6 +123,12 @@ function findGatewayCall(predicate: (call: GatewayCall) => boolean): GatewayCall describe("subagent announce timeout config", () => { beforeEach(() => { gatewayCalls.length = 0; + callGatewayImpl = async (request) => { + if (request.method === "chat.history") { + return { messages: [] }; + } + return {}; + }; sessionStore = {}; configOverride = { session: defaultSessionConfig, @@ -131,13 +140,13 @@ describe("subagent announce timeout config", () => { fallbackRequesterResolution = null; }); - it("uses 60s timeout by default for direct announce agent call", async () => { + it("uses 90s timeout by default for direct announce agent call", async () => { await runAnnounceFlowForTest("run-default-timeout"); const directAgentCall = findGatewayCall( (call) => call.method === "agent" && call.expectFinal === true, ); - expect(directAgentCall?.timeoutMs).toBe(60_000); + expect(directAgentCall?.timeoutMs).toBe(90_000); }); it("honors configured announce timeout for direct announce agent call", async () => { @@ -166,6 +175,35 @@ describe("subagent announce timeout config", () => { expect(completionDirectAgentCall?.timeoutMs).toBe(90_000); }); + it("does not retry gateway timeout for externally delivered completion announces", async () => { + vi.useFakeTimers(); + try { + callGatewayImpl = async (request) => { + if (request.method === "chat.history") { + return { messages: [] }; + } + throw new Error("gateway timeout after 90000ms"); + }; + + await expect( + runAnnounceFlowForTest("run-completion-timeout-no-retry", { + requesterOrigin: { + channel: "telegram", + to: "12345", + }, + expectsCompletionMessage: true, + }), + ).resolves.toBe(false); + + const directAgentCalls = gatewayCalls.filter( + (call) => call.method === "agent" && call.expectFinal === true, + ); + expect(directAgentCalls).toHaveLength(1); + } finally { + vi.useRealTimers(); + } + }); + it("regression, skips parent announce while descendants are still pending", async () => { requesterDepthResolver = () => 1; pendingDescendantRuns = 2; diff --git a/src/agents/subagent-announce.ts b/src/agents/subagent-announce.ts index 62b2cc6f0d3..5070b204392 100644 --- a/src/agents/subagent-announce.ts +++ b/src/agents/subagent-announce.ts @@ -51,8 +51,9 @@ import { isAnnounceSkip } from "./tools/sessions-send-helpers.js"; const FAST_TEST_MODE = process.env.OPENCLAW_TEST_FAST === "1"; const FAST_TEST_RETRY_INTERVAL_MS = 8; -const DEFAULT_SUBAGENT_ANNOUNCE_TIMEOUT_MS = 60_000; +const DEFAULT_SUBAGENT_ANNOUNCE_TIMEOUT_MS = 90_000; const MAX_TIMER_SAFE_TIMEOUT_MS = 2_147_000_000; +const GATEWAY_TIMEOUT_PATTERN = /gateway timeout/i; let subagentRegistryRuntimePromise: Promise< typeof import("./subagent-registry-runtime.js") > | null = null; @@ -107,7 +108,7 @@ const TRANSIENT_ANNOUNCE_DELIVERY_ERROR_PATTERNS: readonly RegExp[] = [ /no active .* listener/i, /gateway not connected/i, /gateway closed \(1006/i, - /gateway timeout/i, + GATEWAY_TIMEOUT_PATTERN, /\b(econnreset|econnrefused|etimedout|enotfound|ehostunreach|network error)\b/i, ]; @@ -133,6 +134,11 @@ function isTransientAnnounceDeliveryError(error: unknown): boolean { return TRANSIENT_ANNOUNCE_DELIVERY_ERROR_PATTERNS.some((re) => re.test(message)); } +function isGatewayTimeoutError(error: unknown): boolean { + const message = summarizeDeliveryError(error); + return Boolean(message) && GATEWAY_TIMEOUT_PATTERN.test(message); +} + async function waitForAnnounceRetryDelay(ms: number, signal?: AbortSignal): Promise { if (ms <= 0) { return; @@ -160,6 +166,7 @@ async function waitForAnnounceRetryDelay(ms: number, signal?: AbortSignal): Prom async function runAnnounceDeliveryWithRetry(params: { operation: string; + noRetryOnGatewayTimeout?: boolean; signal?: AbortSignal; run: () => Promise; }): Promise { @@ -171,6 +178,9 @@ async function runAnnounceDeliveryWithRetry(params: { try { return await params.run(); } catch (err) { + if (params.noRetryOnGatewayTimeout && isGatewayTimeoutError(err)) { + throw err; + } const delayMs = DIRECT_ANNOUNCE_TRANSIENT_RETRY_DELAYS_MS[retryIndex]; if (delayMs == null || !isTransientAnnounceDeliveryError(err) || params.signal?.aborted) { throw err; @@ -789,6 +799,7 @@ async function sendSubagentAnnounceDirectly(params: { operation: params.expectsCompletionMessage ? "completion direct announce agent call" : "direct announce agent call", + noRetryOnGatewayTimeout: params.expectsCompletionMessage && shouldDeliverExternally, signal: params.signal, run: async () => await callGateway({ diff --git a/src/config/types.agent-defaults.ts b/src/config/types.agent-defaults.ts index 9124e4084d8..5abaab2c169 100644 --- a/src/config/types.agent-defaults.ts +++ b/src/config/types.agent-defaults.ts @@ -279,7 +279,7 @@ export type AgentDefaultsConfig = { thinking?: string; /** Default run timeout in seconds for spawned sub-agents (0 = no timeout). */ runTimeoutSeconds?: number; - /** Gateway timeout in ms for sub-agent announce delivery calls (default: 60000). */ + /** Gateway timeout in ms for sub-agent announce delivery calls (default: 90000). */ announceTimeoutMs?: number; }; /** Optional sandbox settings for non-main sessions. */