From ebfb834dcdad5f09675d76af05fb80d3ecfd9fde Mon Sep 17 00:00:00 2001 From: Peter Steinberger Date: Fri, 22 May 2026 13:07:40 +0100 Subject: [PATCH] fix(cron): classify network retry errors (#85344) --- CHANGELOG.md | 1 + src/cron/retry-hint.test.ts | 37 ++++++++++++++++++++++++++++++++++++ src/cron/retry-hint.ts | 38 +++++++++++++++++++++++++++++++++++++ src/cron/service/timer.ts | 36 ++++++++++------------------------- 4 files changed, 86 insertions(+), 26 deletions(-) create mode 100644 src/cron/retry-hint.test.ts create mode 100644 src/cron/retry-hint.ts diff --git a/CHANGELOG.md b/CHANGELOG.md index 53df81988ac..e35ef21b995 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -33,6 +33,7 @@ Docs: https://docs.openclaw.ai ### Fixes +- Cron: honor `cron.retry.retryOn: ["network"]` for common network error codes such as `EAI_AGAIN`, `EHOSTUNREACH`, and `ENETUNREACH`. - Agents/OpenAI: preserve structured provider error code, type, and redacted body metadata on boundary-aware transport failures. - CLI/agents: retry transient normal-close Gateway handshakes before falling back to embedded `openclaw agent` execution. - CLI/update: keep managed Gateway service stop/restart status lines out of `openclaw update --json` stdout so package-update automation can parse the JSON payload. 
diff --git a/src/cron/retry-hint.test.ts b/src/cron/retry-hint.test.ts new file mode 100644 index 00000000000..54fb14c1eb6 --- /dev/null +++ b/src/cron/retry-hint.test.ts @@ -0,0 +1,37 @@ +import { describe, expect, it } from "vitest"; +import { resolveCronExecutionRetryHint } from "./retry-hint.js"; + +describe("resolveCronExecutionRetryHint", () => { + it("matches classified transient errors", () => { + expect(resolveCronExecutionRetryHint("HTTP 529", ["overloaded"])).toEqual({ + retryable: true, + category: "overloaded", + }); + expect(resolveCronExecutionRetryHint("429 rate limit exceeded", ["rate_limit"])).toEqual({ + retryable: true, + category: "rate_limit", + }); + }); + + it("treats common network error codes as network when retryOn only includes network", () => { + for (const code of [ + "EAI_AGAIN", + "EHOSTUNREACH", + "EHOSTDOWN", + "ENETRESET", + "ENETUNREACH", + "EPIPE", + ]) { + expect(resolveCronExecutionRetryHint(`temporary DNS failure: ${code}`, ["network"])).toEqual({ + retryable: true, + category: "network", + }); + } + }); + + it("does not retry permanent errors", () => { + expect(resolveCronExecutionRetryHint("invalid API key", ["network"])).toEqual({ + retryable: false, + }); + }); +}); diff --git a/src/cron/retry-hint.ts b/src/cron/retry-hint.ts new file mode 100644 index 00000000000..e84efd229c5 --- /dev/null +++ b/src/cron/retry-hint.ts @@ -0,0 +1,38 @@ +import type { CronRetryOn } from "../config/types.cron.js"; + +export type CronRetryHint = { + retryable: boolean; + category?: CronRetryOn; +}; + +const TRANSIENT_PATTERNS: Record<string, RegExp> = { + rate_limit: + /(rate[_ ]limit|too many requests|429|resource has been exhausted|cloudflare|tokens per day)/i, + overloaded: + /\b529\b|\boverloaded(?:_error)?\b|high demand|temporar(?:ily|y) overloaded|capacity exceeded/i, + network: + /(network|fetch failed|socket|econnreset|econnrefused|eai_again|ehostunreach|ehostdown|enetreset|enetunreach|epipe)/i, + timeout: /(timeout|etimedout)/i, + server_error: 
/\b5\d{2}\b/, +}; + +export function resolveCronExecutionRetryHint( + error: string | undefined, + retryOn?: CronRetryOn[], + classifiedReason?: string | null, +): CronRetryHint { + if (!error || typeof error !== "string") { + return { retryable: false }; + } + const keys = retryOn?.length ? retryOn : (Object.keys(TRANSIENT_PATTERNS) as CronRetryOn[]); + const classified = classifiedReason ?? undefined; + if (classified && keys.includes(classified as CronRetryOn)) { + return { retryable: true, category: classified as CronRetryOn }; + } + for (const key of keys) { + if (TRANSIENT_PATTERNS[key]?.test(error)) { + return { retryable: true, category: key }; + } + } + return { retryable: false }; +} diff --git a/src/cron/service/timer.ts b/src/cron/service/timer.ts index fe7babc32e7..28184187776 100644 --- a/src/cron/service/timer.ts +++ b/src/cron/service/timer.ts @@ -2,7 +2,7 @@ import { resolveFailoverReasonFromError } from "../../agents/failover-error.js"; import { formatEmbeddedAgentExecutionPhase } from "../../agents/pi-embedded-runner/execution-phase.js"; import { readSessionEntry } from "../../config/sessions/store-load.js"; import type { SessionEntry } from "../../config/sessions/types.js"; -import type { CronConfig, CronRetryOn } from "../../config/types.cron.js"; +import type { CronConfig } from "../../config/types.cron.js"; import type { HeartbeatRunResult } from "../../infra/heartbeat-wake.js"; import { HEARTBEAT_SKIP_CRON_IN_PROGRESS, @@ -25,6 +25,7 @@ import type { DeliveryContext } from "../../utils/delivery-context.types.js"; import { clearCronJobActive, markCronJobActive } from "../active-jobs.js"; import { resolveCronDeliveryPlan, resolveFailureDestination } from "../delivery-plan.js"; import { resolveCronAgentSessionKey } from "../isolated-agent/session-key.js"; +import { resolveCronExecutionRetryHint } from "../retry-hint.js"; import { createCronRunDiagnosticsFromError, normalizeCronRunDiagnostics, @@ -560,28 +561,6 @@ function tryFinishCronTaskRun( 
/** Default max retries for one-shot jobs on transient errors (#24355). */ const DEFAULT_MAX_TRANSIENT_RETRIES = 3; -const TRANSIENT_PATTERNS: Record<string, RegExp> = { - rate_limit: - /(rate[_ ]limit|too many requests|429|resource has been exhausted|cloudflare|tokens per day)/i, - overloaded: - /\b529\b|\boverloaded(?:_error)?\b|high demand|temporar(?:ily|y) overloaded|capacity exceeded/i, - network: /(network|econnreset|econnrefused|fetch failed|socket)/i, - timeout: /(timeout|etimedout)/i, - server_error: /\b5\d{2}\b/, -}; - -function isTransientCronError(error: string | undefined, retryOn?: CronRetryOn[]): boolean { - if (!error || typeof error !== "string") { - return false; - } - const keys = retryOn?.length ? retryOn : (Object.keys(TRANSIENT_PATTERNS) as CronRetryOn[]); - const classified = resolveFailoverReasonFromError(error); - if (classified && keys.includes(classified as CronRetryOn)) { - return true; - } - return keys.some((k) => TRANSIENT_PATTERNS[k]?.test(error)); -} - function resolveCronNextRunWithLowerBound(params: { state: CronServiceState; job: CronJob; @@ -970,10 +949,14 @@ export function applyJobResult( job.state.nextRunAtMs = undefined; } else if (result.status === "error") { const retryConfig = resolveRetryConfig(state.deps.cronConfig); - const transient = isTransientCronError(result.error, retryConfig.retryOn); + const retryHint = resolveCronExecutionRetryHint( + result.error, + retryConfig.retryOn, + job.state.lastErrorReason, + ); // consecutiveErrors is always set to ≥1 by the increment block above. const consecutive = job.state.consecutiveErrors; - if (transient && consecutive <= retryConfig.maxAttempts) { + if (retryHint.retryable && consecutive <= retryConfig.maxAttempts) { // Schedule retry with backoff (#24355). 
const backoff = errorBackoffMs(consecutive, retryConfig.backoffMs); job.state.nextRunAtMs = result.endedAt + backoff; @@ -1000,7 +983,8 @@ export function applyJobResult( jobName: job.name, consecutiveErrors: consecutive, error: result.error, - reason: transient ? "max retries exhausted" : "permanent error", + reason: retryHint.retryable ? "max retries exhausted" : "permanent error", + retryCategory: retryHint.category, }, "cron: disabling one-shot job after error", );