mirror of
https://github.com/openclaw/openclaw.git
synced 2026-05-28 05:36:17 +00:00
fix(cron): classify network retry errors (#85344)
This commit is contained in:
committed by
GitHub
parent
3551e98433
commit
ebfb834dcd
@@ -33,6 +33,7 @@ Docs: https://docs.openclaw.ai
|
||||
|
||||
### Fixes
|
||||
|
||||
- Cron: honor `cron.retry.retryOn: ["network"]` for common network error codes such as `EAI_AGAIN`, `EHOSTUNREACH`, and `ENETUNREACH`.
|
||||
- Agents/OpenAI: preserve structured provider error code, type, and redacted body metadata on boundary-aware transport failures.
|
||||
- CLI/agents: retry transient normal-close Gateway handshakes before falling back to embedded `openclaw agent` execution.
|
||||
- CLI/update: keep managed Gateway service stop/restart status lines out of `openclaw update --json` stdout so package-update automation can parse the JSON payload.
|
||||
|
||||
37
src/cron/retry-hint.test.ts
Normal file
37
src/cron/retry-hint.test.ts
Normal file
@@ -0,0 +1,37 @@
|
||||
import { describe, expect, it } from "vitest";
|
||||
import { resolveCronExecutionRetryHint } from "./retry-hint.js";
|
||||
|
||||
// Unit tests for resolveCronExecutionRetryHint: each case passes an error
// message plus an explicit retryOn list and checks the returned hint object.
describe("resolveCronExecutionRetryHint", () => {
|
||||
// With a single enabled category, a message that matches that category's
// pattern yields retryable: true and reports the same category.
it("matches classified transient errors", () => {
|
||||
expect(resolveCronExecutionRetryHint("HTTP 529", ["overloaded"])).toEqual({
|
||||
retryable: true,
|
||||
category: "overloaded",
|
||||
});
|
||||
expect(resolveCronExecutionRetryHint("429 rate limit exceeded", ["rate_limit"])).toEqual({
|
||||
retryable: true,
|
||||
category: "rate_limit",
|
||||
});
|
||||
});
|
||||
|
||||
// Each listed error code, embedded in an otherwise neutral message, must be
// classified as "network" when "network" is the only enabled category.
it("treats common network error codes as network when retryOn only includes network", () => {
|
||||
for (const code of [
|
||||
"EAI_AGAIN",
|
||||
"EHOSTUNREACH",
|
||||
"EHOSTDOWN",
|
||||
"ENETRESET",
|
||||
"ENETUNREACH",
|
||||
"EPIPE",
|
||||
]) {
|
||||
// The codes are upper-case here; the expectation only holds if matching
// is case-insensitive.
expect(resolveCronExecutionRetryHint(`temporary DNS failure: ${code}`, ["network"])).toEqual({
|
||||
retryable: true,
|
||||
category: "network",
|
||||
});
|
||||
}
|
||||
});
|
||||
|
||||
// A message that matches no enabled pattern yields exactly
// { retryable: false } — toEqual also asserts that no category is set.
it("does not retry permanent errors", () => {
|
||||
expect(resolveCronExecutionRetryHint("invalid API key", ["network"])).toEqual({
|
||||
retryable: false,
|
||||
});
|
||||
});
|
||||
});
|
||||
38
src/cron/retry-hint.ts
Normal file
38
src/cron/retry-hint.ts
Normal file
@@ -0,0 +1,38 @@
|
||||
import type { CronRetryOn } from "../config/types.cron.js";
|
||||
|
||||
/**
 * Result of classifying a cron execution error for retry purposes.
 *
 * As produced by resolveCronExecutionRetryHint in this file, `category` is
 * present only when `retryable` is true.
 */
export type CronRetryHint = {
|
||||
// True when the error matched one of the enabled retry categories.
retryable: boolean;
|
||||
// The retry category that matched; omitted for non-retryable results.
category?: CronRetryOn;
|
||||
};
|
||||
|
||||
/**
 * One regular expression per retry category, tested against the raw error
 * message. Declaration order matters: when the caller supplies no retryOn
 * list, resolveCronExecutionRetryHint walks these keys in this order and
 * reports the first category whose pattern matches.
 */
const TRANSIENT_PATTERNS: Record<CronRetryOn, RegExp> = {
|
||||
// Case-insensitive. Note that `429` has no word boundaries, so it also
// matches when those digits appear inside a longer token.
rate_limit:
|
||||
/(rate[_ ]limit|too many requests|429|resource has been exhausted|cloudflare|tokens per day)/i,
|
||||
// Case-insensitive. Unlike `429` above, `529` and `overloaded` are
// word-bounded.
overloaded:
|
||||
/\b529\b|\boverloaded(?:_error)?\b|high demand|temporar(?:ily|y) overloaded|capacity exceeded/i,
|
||||
// Case-insensitive substring match on generic words ("network", "socket",
// "fetch failed") and on lower-cased errno-style codes.
network:
|
||||
/(network|fetch failed|socket|econnreset|econnrefused|eai_again|ehostunreach|ehostdown|enetreset|enetunreach|epipe)/i,
|
||||
// Case-insensitive substring match.
timeout: /(timeout|etimedout)/i,
|
||||
// Any standalone three-digit number starting with 5. This also matches
// "529", which is why `overloaded` is declared earlier.
server_error: /\b5\d{2}\b/,
|
||||
};
|
||||
|
||||
/**
 * Decides whether a cron execution error should be retried and, if so, under
 * which retry category.
 *
 * @param error - Error message to classify. An undefined or empty value is
 *   never retryable.
 * @param retryOn - Categories enabled for retry. When omitted or empty, every
 *   category in TRANSIENT_PATTERNS is considered.
 * @param classifiedReason - Optional reason already determined by the caller.
 *   If it equals one of the enabled categories it is returned as the category
 *   without testing any pattern.
 * @returns `{ retryable: true, category }` for the first enabled category
 *   that applies, otherwise `{ retryable: false }`.
 */
export function resolveCronExecutionRetryHint(
|
||||
error: string | undefined,
|
||||
retryOn?: CronRetryOn[],
|
||||
classifiedReason?: string | null,
|
||||
): CronRetryHint {
|
||||
// The typeof check is a runtime guard; the static type already excludes
// non-string values.
if (!error || typeof error !== "string") {
|
||||
return { retryable: false };
|
||||
}
|
||||
// An empty retryOn array is treated the same as "not configured".
const keys = retryOn?.length ? retryOn : (Object.keys(TRANSIENT_PATTERNS) as CronRetryOn[]);
|
||||
// Normalize null to undefined so one truthiness check covers both.
const classified = classifiedReason ?? undefined;
|
||||
// A caller-supplied classification takes precedence over pattern matching,
// but only when that category is enabled.
if (classified && keys.includes(classified as CronRetryOn)) {
|
||||
return { retryable: true, category: classified as CronRetryOn };
|
||||
}
|
||||
// Fall back to the patterns; the first enabled category that matches wins.
for (const key of keys) {
|
||||
// Optional chaining tolerates a key with no pattern entry at runtime.
if (TRANSIENT_PATTERNS[key]?.test(error)) {
|
||||
return { retryable: true, category: key };
|
||||
}
|
||||
}
|
||||
return { retryable: false };
|
||||
}
|
||||
@@ -2,7 +2,7 @@ import { resolveFailoverReasonFromError } from "../../agents/failover-error.js";
|
||||
import { formatEmbeddedAgentExecutionPhase } from "../../agents/pi-embedded-runner/execution-phase.js";
|
||||
import { readSessionEntry } from "../../config/sessions/store-load.js";
|
||||
import type { SessionEntry } from "../../config/sessions/types.js";
|
||||
import type { CronConfig, CronRetryOn } from "../../config/types.cron.js";
|
||||
import type { CronConfig } from "../../config/types.cron.js";
|
||||
import type { HeartbeatRunResult } from "../../infra/heartbeat-wake.js";
|
||||
import {
|
||||
HEARTBEAT_SKIP_CRON_IN_PROGRESS,
|
||||
@@ -25,6 +25,7 @@ import type { DeliveryContext } from "../../utils/delivery-context.types.js";
|
||||
import { clearCronJobActive, markCronJobActive } from "../active-jobs.js";
|
||||
import { resolveCronDeliveryPlan, resolveFailureDestination } from "../delivery-plan.js";
|
||||
import { resolveCronAgentSessionKey } from "../isolated-agent/session-key.js";
|
||||
import { resolveCronExecutionRetryHint } from "../retry-hint.js";
|
||||
import {
|
||||
createCronRunDiagnosticsFromError,
|
||||
normalizeCronRunDiagnostics,
|
||||
@@ -560,28 +561,6 @@ function tryFinishCronTaskRun(
|
||||
/** Default max retries for one-shot jobs on transient errors (#24355). */
|
||||
const DEFAULT_MAX_TRANSIENT_RETRIES = 3;
|
||||
|
||||
const TRANSIENT_PATTERNS: Record<string, RegExp> = {
|
||||
rate_limit:
|
||||
/(rate[_ ]limit|too many requests|429|resource has been exhausted|cloudflare|tokens per day)/i,
|
||||
overloaded:
|
||||
/\b529\b|\boverloaded(?:_error)?\b|high demand|temporar(?:ily|y) overloaded|capacity exceeded/i,
|
||||
network: /(network|econnreset|econnrefused|fetch failed|socket)/i,
|
||||
timeout: /(timeout|etimedout)/i,
|
||||
server_error: /\b5\d{2}\b/,
|
||||
};
|
||||
|
||||
function isTransientCronError(error: string | undefined, retryOn?: CronRetryOn[]): boolean {
|
||||
if (!error || typeof error !== "string") {
|
||||
return false;
|
||||
}
|
||||
const keys = retryOn?.length ? retryOn : (Object.keys(TRANSIENT_PATTERNS) as CronRetryOn[]);
|
||||
const classified = resolveFailoverReasonFromError(error);
|
||||
if (classified && keys.includes(classified as CronRetryOn)) {
|
||||
return true;
|
||||
}
|
||||
return keys.some((k) => TRANSIENT_PATTERNS[k]?.test(error));
|
||||
}
|
||||
|
||||
function resolveCronNextRunWithLowerBound(params: {
|
||||
state: CronServiceState;
|
||||
job: CronJob;
|
||||
@@ -970,10 +949,14 @@ export function applyJobResult(
|
||||
job.state.nextRunAtMs = undefined;
|
||||
} else if (result.status === "error") {
|
||||
const retryConfig = resolveRetryConfig(state.deps.cronConfig);
|
||||
const transient = isTransientCronError(result.error, retryConfig.retryOn);
|
||||
const retryHint = resolveCronExecutionRetryHint(
|
||||
result.error,
|
||||
retryConfig.retryOn,
|
||||
job.state.lastErrorReason,
|
||||
);
|
||||
// consecutiveErrors is always set to ≥1 by the increment block above.
|
||||
const consecutive = job.state.consecutiveErrors;
|
||||
if (transient && consecutive <= retryConfig.maxAttempts) {
|
||||
if (retryHint.retryable && consecutive <= retryConfig.maxAttempts) {
|
||||
// Schedule retry with backoff (#24355).
|
||||
const backoff = errorBackoffMs(consecutive, retryConfig.backoffMs);
|
||||
job.state.nextRunAtMs = result.endedAt + backoff;
|
||||
@@ -1000,7 +983,8 @@ export function applyJobResult(
|
||||
jobName: job.name,
|
||||
consecutiveErrors: consecutive,
|
||||
error: result.error,
|
||||
reason: transient ? "max retries exhausted" : "permanent error",
|
||||
reason: retryHint.retryable ? "max retries exhausted" : "permanent error",
|
||||
retryCategory: retryHint.category,
|
||||
},
|
||||
"cron: disabling one-shot job after error",
|
||||
);
|
||||
|
||||
Reference in New Issue
Block a user