fix(cron): classify network retry errors (#85344)

This commit is contained in:
Peter Steinberger
2026-05-22 13:07:40 +01:00
committed by GitHub
parent 3551e98433
commit ebfb834dcd
4 changed files with 86 additions and 26 deletions

View File

@@ -33,6 +33,7 @@ Docs: https://docs.openclaw.ai
### Fixes
- Cron: honor `cron.retry.retryOn: ["network"]` for common network error codes such as `EAI_AGAIN`, `EHOSTUNREACH`, and `ENETUNREACH`.
- Agents/OpenAI: preserve structured provider error code, type, and redacted body metadata on boundary-aware transport failures.
- CLI/agents: retry transient normal-close Gateway handshakes before falling back to embedded `openclaw agent` execution.
- CLI/update: keep managed Gateway service stop/restart status lines out of `openclaw update --json` stdout so package-update automation can parse the JSON payload.

View File

@@ -0,0 +1,37 @@
import { describe, expect, it } from "vitest";
import { resolveCronExecutionRetryHint } from "./retry-hint.js";
// Unit coverage for the cron retry classifier: each case feeds an error string
// plus an explicit retryOn list and checks the returned hint object.
describe("resolveCronExecutionRetryHint", () => {
  it("matches classified transient errors", () => {
    const overloadedHint = resolveCronExecutionRetryHint("HTTP 529", ["overloaded"]);
    expect(overloadedHint).toEqual({ retryable: true, category: "overloaded" });

    const rateLimitedHint = resolveCronExecutionRetryHint("429 rate limit exceeded", [
      "rate_limit",
    ]);
    expect(rateLimitedHint).toEqual({ retryable: true, category: "rate_limit" });
  });

  it("treats common network error codes as network when retryOn only includes network", () => {
    // The surrounding message contains no other network keyword, so a match
    // can only come from the error code itself.
    const networkCodes = [
      "EAI_AGAIN",
      "EHOSTUNREACH",
      "EHOSTDOWN",
      "ENETRESET",
      "ENETUNREACH",
      "EPIPE",
    ];
    networkCodes.forEach((code) => {
      const hint = resolveCronExecutionRetryHint(`temporary DNS failure: ${code}`, ["network"]);
      expect(hint).toEqual({ retryable: true, category: "network" });
    });
  });

  it("does not retry permanent errors", () => {
    const hint = resolveCronExecutionRetryHint("invalid API key", ["network"]);
    expect(hint).toEqual({ retryable: false });
  });
});

38
src/cron/retry-hint.ts Normal file
View File

@@ -0,0 +1,38 @@
import type { CronRetryOn } from "../config/types.cron.js";
/**
 * Result of classifying a cron execution error for retry purposes, as
 * returned by `resolveCronExecutionRetryHint`.
 */
export type CronRetryHint = {
/** True when the error matched one of the retry categories that were considered. */
retryable: boolean;
/** The retry category that matched; left unset on non-retryable results. */
category?: CronRetryOn;
};
/**
 * Text patterns used to recognise transient failures, keyed by retry category.
 * Each pattern is tested against the raw error message string.
 */
const TRANSIENT_PATTERNS: Record<CronRetryOn, RegExp> = {
// Rate limiting / quota wording (case-insensitive).
// NOTE(review): `429` has no word boundaries here, so it also matches inside
// longer digit runs; the `529` alternative below is bounded with `\b`.
rate_limit:
/(rate[_ ]limit|too many requests|429|resource has been exhausted|cloudflare|tokens per day)/i,
// Provider overload wording plus a standalone 529 status (case-insensitive).
overloaded:
/\b529\b|\boverloaded(?:_error)?\b|high demand|temporar(?:ily|y) overloaded|capacity exceeded/i,
// Generic network wording plus network error codes such as EAI_AGAIN and
// EHOSTUNREACH, matched case-insensitively.
network:
/(network|fetch failed|socket|econnreset|econnrefused|eai_again|ehostunreach|ehostdown|enetreset|enetunreach|epipe)/i,
// Timeout wording or the ETIMEDOUT code (case-insensitive).
timeout: /(timeout|etimedout)/i,
// Any standalone three-digit number starting with 5. This also matches 529,
// so which category is reported for such a message depends on the order in
// which categories are tried.
server_error: /\b5\d{2}\b/,
};
/**
 * Decide whether a failed cron execution looks transient and, if so, which
 * retry category it falls under.
 *
 * @param error - Error text of the failed run. `undefined` and the empty
 *   string are never retryable.
 * @param retryOn - Categories to consider. When omitted or empty, every key of
 *   `TRANSIENT_PATTERNS` is considered, in declaration order.
 * @param classifiedReason - Optional pre-computed classification. If it equals
 *   one of the considered categories it is used without pattern matching.
 * @returns `{ retryable: true, category }` for the first category that
 *   matches, otherwise `{ retryable: false }`.
 */
export function resolveCronExecutionRetryHint(
  error: string | undefined,
  retryOn?: CronRetryOn[],
  classifiedReason?: string | null,
): CronRetryHint {
  if (typeof error !== "string" || error.length === 0) {
    return { retryable: false };
  }
  const message = error;

  const categories: CronRetryOn[] =
    retryOn && retryOn.length > 0 ? retryOn : (Object.keys(TRANSIENT_PATTERNS) as CronRetryOn[]);

  // A caller-supplied classification takes precedence over text matching,
  // but only when it names a category that is actually enabled.
  if (classifiedReason) {
    const preclassified = classifiedReason as CronRetryOn;
    if (categories.includes(preclassified)) {
      return { retryable: true, category: preclassified };
    }
  }

  // Otherwise report the first enabled category whose pattern matches.
  const matched = categories.find((category) => TRANSIENT_PATTERNS[category]?.test(message));
  if (matched === undefined) {
    return { retryable: false };
  }
  return { retryable: true, category: matched };
}

View File

@@ -2,7 +2,7 @@ import { resolveFailoverReasonFromError } from "../../agents/failover-error.js";
import { formatEmbeddedAgentExecutionPhase } from "../../agents/pi-embedded-runner/execution-phase.js";
import { readSessionEntry } from "../../config/sessions/store-load.js";
import type { SessionEntry } from "../../config/sessions/types.js";
import type { CronConfig, CronRetryOn } from "../../config/types.cron.js";
import type { CronConfig } from "../../config/types.cron.js";
import type { HeartbeatRunResult } from "../../infra/heartbeat-wake.js";
import {
HEARTBEAT_SKIP_CRON_IN_PROGRESS,
@@ -25,6 +25,7 @@ import type { DeliveryContext } from "../../utils/delivery-context.types.js";
import { clearCronJobActive, markCronJobActive } from "../active-jobs.js";
import { resolveCronDeliveryPlan, resolveFailureDestination } from "../delivery-plan.js";
import { resolveCronAgentSessionKey } from "../isolated-agent/session-key.js";
import { resolveCronExecutionRetryHint } from "../retry-hint.js";
import {
createCronRunDiagnosticsFromError,
normalizeCronRunDiagnostics,
@@ -560,28 +561,6 @@ function tryFinishCronTaskRun(
/** Default max retries for one-shot jobs on transient errors (#24355). */
const DEFAULT_MAX_TRANSIENT_RETRIES = 3;
/**
 * Text patterns used to recognise transient cron failures, keyed by retry
 * category name. Each pattern is tested against the raw error message.
 */
const TRANSIENT_PATTERNS: Record<string, RegExp> = {
// Rate limiting / quota wording (case-insensitive); `429` is matched without
// word boundaries.
rate_limit:
/(rate[_ ]limit|too many requests|429|resource has been exhausted|cloudflare|tokens per day)/i,
// Provider overload wording plus a standalone 529 status (case-insensitive).
overloaded:
/\b529\b|\boverloaded(?:_error)?\b|high demand|temporar(?:ily|y) overloaded|capacity exceeded/i,
// Generic network wording plus the ECONNRESET / ECONNREFUSED codes only.
network: /(network|econnreset|econnrefused|fetch failed|socket)/i,
// Timeout wording or the ETIMEDOUT code (case-insensitive).
timeout: /(timeout|etimedout)/i,
// Any standalone three-digit number starting with 5 (this includes 529).
server_error: /\b5\d{2}\b/,
};
/**
 * Report whether a cron error message looks transient for the given retry
 * categories.
 *
 * @param error - Error text of the failed run; `undefined` and the empty
 *   string are never transient.
 * @param retryOn - Categories to consider. When omitted or empty, every key of
 *   `TRANSIENT_PATTERNS` is considered.
 * @returns `true` when the failover classification of the message is one of
 *   the considered categories, or when any considered pattern matches.
 */
function isTransientCronError(error: string | undefined, retryOn?: CronRetryOn[]): boolean {
  if (typeof error !== "string" || error.length === 0) {
    return false;
  }

  const categories: CronRetryOn[] =
    retryOn && retryOn.length > 0 ? retryOn : (Object.keys(TRANSIENT_PATTERNS) as CronRetryOn[]);

  // Prefer the shared failover classifier when it yields an enabled category.
  const failoverReason = resolveFailoverReasonFromError(error);
  if (failoverReason && categories.includes(failoverReason as CronRetryOn)) {
    return true;
  }

  // Fall back to plain text matching against each enabled category.
  for (const category of categories) {
    if (TRANSIENT_PATTERNS[category]?.test(error)) {
      return true;
    }
  }
  return false;
}
function resolveCronNextRunWithLowerBound(params: {
state: CronServiceState;
job: CronJob;
@@ -970,10 +949,14 @@ export function applyJobResult(
job.state.nextRunAtMs = undefined;
} else if (result.status === "error") {
const retryConfig = resolveRetryConfig(state.deps.cronConfig);
const transient = isTransientCronError(result.error, retryConfig.retryOn);
const retryHint = resolveCronExecutionRetryHint(
result.error,
retryConfig.retryOn,
job.state.lastErrorReason,
);
// consecutiveErrors is always set to ≥1 by the increment block above.
const consecutive = job.state.consecutiveErrors;
if (transient && consecutive <= retryConfig.maxAttempts) {
if (retryHint.retryable && consecutive <= retryConfig.maxAttempts) {
// Schedule retry with backoff (#24355).
const backoff = errorBackoffMs(consecutive, retryConfig.backoffMs);
job.state.nextRunAtMs = result.endedAt + backoff;
@@ -1000,7 +983,8 @@ export function applyJobResult(
jobName: job.name,
consecutiveErrors: consecutive,
error: result.error,
reason: transient ? "max retries exhausted" : "permanent error",
reason: retryHint.retryable ? "max retries exhausted" : "permanent error",
retryCategory: retryHint.category,
},
"cron: disabling one-shot job after error",
);