fix(cron): fail isolated runs on run-level errors

This commit is contained in:
Peter Steinberger
2026-04-27 05:42:22 +01:00
parent 390b965460
commit 382e03a2d8
7 changed files with 225 additions and 27 deletions

View File

@@ -15,6 +15,7 @@ Docs: https://docs.openclaw.ai
- CLI/startup: read generated startup metadata from the bundled `dist` layout before falling back to live help rendering, so root/browser help and channel-option bootstrap stay on the fast path. Thanks @vincentkoc.
- CLI/help: treat positional `help` invocations like `openclaw channels help` as help paths for startup gating, avoiding model/auth warmup while preserving positional arguments such as `openclaw docs help`. Thanks @gumadeiras.
- Matrix/E2EE: stabilize recovery and broken-device QA flows while avoiding Matrix device-cleanup sync races that could leave shutdown-time crypto work running. Thanks @gumadeiras.
- Cron: treat isolated run-level agent failures as job errors even when no reply payload is produced, synthesizing a safe error payload so model/provider failures increment error counters and trigger failure notifications instead of clearing as successful. Fixes #43604; carries forward #43631. Thanks @SPFAdvisors.
- Cron: classify isolated runs as errors from structured embedded-run execution-denial metadata, with final-output marker fallback for `SYSTEM_RUN_DENIED`, `INVALID_REQUEST`, and approval-binding refusals, so blocked commands no longer appear green in cron history. Fixes #67172; carries forward #67186. Thanks @oc-gh-dr, @hclsys, and @1yihui.
- Onboarding/GitHub Copilot: add manifest-owned `--github-copilot-token` support for non-interactive setup, including env fallback, tokenRef storage in ref mode, saved-profile reuse, and current Copilot default-model wiring. Refs #50002 and supersedes #50003. Thanks @scottgl9.
- Gateway/install: add a validated `--wrapper`/`OPENCLAW_WRAPPER` service install path that persists executable LaunchAgent/systemd wrappers across forced reinstalls, updates, and doctor repairs instead of falling back to raw node/bun `ProgramArguments`. Fixes #69400. (#72445) Thanks @willtmc.

View File

@@ -48,6 +48,7 @@ Cron is the Gateway's built-in scheduler. It persists jobs, wakes the agent at t
- Isolated cron runs best-effort close tracked browser tabs/processes for their `cron:<jobId>` session when the run completes, so detached browser automation does not leave orphaned processes behind.
- Isolated cron runs also guard against stale acknowledgement replies. If the first result is just an interim status update (`on it`, `pulling everything together`, and similar hints) and no descendant subagent run is still responsible for the final answer, OpenClaw re-prompts once for the actual result before delivery.
- Isolated cron runs prefer structured execution-denial metadata from the embedded run, then fall back to known final summary/output markers such as `SYSTEM_RUN_DENIED` and `INVALID_REQUEST`, so a blocked command is not reported as a green run.
- Isolated cron runs also treat run-level agent failures as job errors even when no reply payload is produced, so model/provider failures increment error counters and trigger failure notifications instead of clearing the job as successful.
<a id="maintenance"></a>

View File

@@ -63,6 +63,10 @@ Failure notifications resolve in this order:
Main-session jobs may only use `delivery.failureDestination` when primary delivery mode is `webhook`. Isolated jobs accept it in all modes.
</Note>
Note: isolated cron runs treat run-level agent failures as job errors even when
no reply payload is produced, so model/provider failures still increment error
counters and trigger failure notifications.
## Scheduling
### One-shot jobs

View File

@@ -76,6 +76,84 @@ describe("resolveCronPayloadOutcome", () => {
expect(result.hasFatalErrorPayload).toBe(true);
expect(result.embeddedRunError).toContain("Model context overflow");
expect(result.outputText).toBe("Model context overflow");
expect(result.deliveryPayloads).toEqual([{ text: "Model context overflow", isError: true }]);
});
it("treats standalone run-level errors as fatal and synthesizes delivery", () => {
const result = resolveCronPayloadOutcome({
payloads: [],
runLevelError: { kind: "provider_error", message: "model provider unreachable" },
});
expect(result.hasFatalErrorPayload).toBe(true);
expect(result.embeddedRunError).toBe("cron isolated run failed: model provider unreachable");
expect(result.summary).toBe("cron isolated run failed: model provider unreachable");
expect(result.outputText).toBe("cron isolated run failed: model provider unreachable");
expect(result.synthesizedText).toBe("cron isolated run failed: model provider unreachable");
expect(result.deliveryPayload).toEqual({
text: "cron isolated run failed: model provider unreachable",
isError: true,
});
expect(result.deliveryPayloads).toEqual([
{ text: "cron isolated run failed: model provider unreachable", isError: true },
]);
expect(result.deliveryPayloadHasStructuredContent).toBe(false);
});
it("uses string run-level errors when no error payload exists", () => {
const result = resolveCronPayloadOutcome({
payloads: [{ text: " " }],
runLevelError: "rate limit exceeded",
});
expect(result.hasFatalErrorPayload).toBe(true);
expect(result.embeddedRunError).toBe("cron isolated run failed: rate limit exceeded");
expect(result.deliveryPayloads).toEqual([
{ text: "cron isolated run failed: rate limit exceeded", isError: true },
]);
});
it("falls back to run-level error kind without exposing arbitrary objects", () => {
const result = resolveCronPayloadOutcome({
payloads: [{ text: "Partial assistant text before failure" }],
runLevelError: { kind: "retry_limit", detail: { provider: "example" } },
});
expect(result.hasFatalErrorPayload).toBe(true);
expect(result.embeddedRunError).toBe("cron isolated run failed: retry_limit");
expect(result.outputText).toBe("cron isolated run failed: retry_limit");
expect(result.deliveryPayloads).toEqual([
{ text: "cron isolated run failed: retry_limit", isError: true },
]);
});
it("uses a generic run-level error for unrecognized objects", () => {
const result = resolveCronPayloadOutcome({
payloads: [],
runLevelError: { detail: { provider: "example" } },
});
expect(result.hasFatalErrorPayload).toBe(true);
expect(result.embeddedRunError).toBe("cron isolated run failed");
expect(result.deliveryPayloads).toEqual([{ text: "cron isolated run failed", isError: true }]);
});
it("does not let later success clear a run-level error", () => {
const result = resolveCronPayloadOutcome({
payloads: [
{ text: "Temporary provider failure", isError: true },
{ text: "Partial success-looking text" },
],
runLevelError: "retry limit exceeded",
});
expect(result.hasFatalErrorPayload).toBe(true);
expect(result.embeddedRunError).toBe("Temporary provider failure");
expect(result.outputText).toBe("Temporary provider failure");
expect(result.deliveryPayloads).toEqual([
{ text: "Temporary provider failure", isError: true },
]);
});
it("truncates long summaries", () => {

View File

@@ -108,6 +108,26 @@ function formatCronFailureSignal(signal: NormalizedCronFailureSignal): string {
}: ${signal.message}`;
}
function formatCronRunLevelError(error: unknown): string | undefined {
const direct = normalizeOptionalString(error);
if (direct) {
return `cron isolated run failed: ${direct}`;
}
if (!error || typeof error !== "object") {
return undefined;
}
const record = error as { message?: unknown; kind?: unknown };
const message = normalizeOptionalString(record.message);
if (message) {
return `cron isolated run failed: ${message}`;
}
const kind = normalizeOptionalString(record.kind);
if (kind) {
return `cron isolated run failed: ${kind}`;
}
return "cron isolated run failed";
}
export function pickSummaryFromOutput(text: string | undefined) {
const clean = (text ?? "").trim();
if (!clean) {
@@ -289,33 +309,43 @@ export function resolveCronPayloadOutcome(params: {
})),
]);
const failureSignal = normalizeCronFailureSignal(params.failureSignal);
const runLevelError = formatCronRunLevelError(params.runLevelError);
const hasFatalErrorPayload =
hasFatalStructuredErrorPayload || failureSignal !== undefined || denialSignal !== undefined;
const shouldUseFailureSignalPayload =
failureSignal !== undefined && !hasFatalStructuredErrorPayload;
const failureSignalDeliveryPayload = shouldUseFailureSignalPayload
? ({ text: failureSignal.message, isError: true } satisfies DeliveryPayload)
hasFatalStructuredErrorPayload ||
failureSignal !== undefined ||
denialSignal !== undefined ||
runLevelError !== undefined;
const structuredErrorText = hasFatalStructuredErrorPayload
? (lastErrorPayloadText ?? "cron isolated run returned an error payload")
: undefined;
const shouldUseRunLevelErrorPayload =
runLevelError !== undefined &&
structuredErrorText === undefined &&
failureSignal === undefined &&
denialSignal === undefined;
const fatalDeliveryText =
structuredErrorText ??
failureSignal?.message ??
(shouldUseRunLevelErrorPayload ? runLevelError : undefined);
const fatalDeliveryPayload = fatalDeliveryText
? ({ text: fatalDeliveryText, isError: true } satisfies DeliveryPayload)
: undefined;
return {
summary: shouldUseFailureSignalPayload
? (pickSummaryFromOutput(failureSignal.message) ?? summary)
: summary,
outputText: shouldUseFailureSignalPayload ? failureSignal.message : outputText,
synthesizedText: shouldUseFailureSignalPayload ? failureSignal.message : synthesizedText,
deliveryPayload: failureSignalDeliveryPayload ?? deliveryPayload,
deliveryPayloads: failureSignalDeliveryPayload
? [failureSignalDeliveryPayload]
: resolvedDeliveryPayloads,
deliveryPayloadHasStructuredContent: failureSignalDeliveryPayload
summary: fatalDeliveryText ? (pickSummaryFromOutput(fatalDeliveryText) ?? summary) : summary,
outputText: fatalDeliveryText ?? outputText,
synthesizedText: fatalDeliveryText ?? synthesizedText,
deliveryPayload: fatalDeliveryPayload ?? deliveryPayload,
deliveryPayloads: fatalDeliveryPayload ? [fatalDeliveryPayload] : resolvedDeliveryPayloads,
deliveryPayloadHasStructuredContent: fatalDeliveryPayload
? false
: deliveryPayloadHasStructuredContent,
hasFatalErrorPayload,
embeddedRunError: hasFatalStructuredErrorPayload
? (lastErrorPayloadText ?? "cron isolated run returned an error payload")
embeddedRunError: structuredErrorText
? structuredErrorText
: failureSignal
? formatCronFailureSignal(failureSignal)
: denialSignal
? formatCronDenialSignal(denialSignal)
: undefined,
: runLevelError,
};
}

View File

@@ -0,0 +1,54 @@
import { describe, expect, it } from "vitest";
import {
makeIsolatedAgentTurnParams,
setupRunCronIsolatedAgentTurnSuite,
} from "./run.suite-helpers.js";
import { loadRunCronIsolatedAgentTurn, runWithModelFallbackMock } from "./run.test-harness.js";
const runCronIsolatedAgentTurn = await loadRunCronIsolatedAgentTurn();
describe("runCronIsolatedAgentTurn - meta.error status propagation", () => {
setupRunCronIsolatedAgentTurnSuite();
it("marks a run-level error with empty payloads as a cron error", async () => {
runWithModelFallbackMock.mockResolvedValueOnce({
result: {
payloads: [],
meta: {
error: { kind: "provider_error", message: "model provider unreachable" },
agentMeta: { usage: { input: 0, output: 0 } },
},
},
provider: "openai",
model: "gpt-5.4",
attempts: [],
});
const result = await runCronIsolatedAgentTurn(makeIsolatedAgentTurnParams());
expect(result.status).toBe("error");
expect(result.error).toBe("cron isolated run failed: model provider unreachable");
expect(result.outputText).toBe("cron isolated run failed: model provider unreachable");
});
it("does not deliver partial success text when a run-level error is present", async () => {
runWithModelFallbackMock.mockResolvedValueOnce({
result: {
payloads: [{ text: "Partial success-looking text" }],
meta: {
error: { kind: "retry_limit", message: "retry limit exceeded" },
agentMeta: { usage: { input: 0, output: 0 } },
},
},
provider: "openai",
model: "gpt-5.4",
attempts: [],
});
const result = await runCronIsolatedAgentTurn(makeIsolatedAgentTurnParams());
expect(result.status).toBe("error");
expect(result.error).toBe("cron isolated run failed: retry limit exceeded");
expect(result.outputText).toBe("cron isolated run failed: retry limit exceeded");
});
});

View File

@@ -365,21 +365,55 @@ function resetRunOutcomeMocks(): void {
({
payloads,
failureSignal,
runLevelError,
}: {
payloads: Array<{ isError?: boolean }>;
failureSignal?: { fatalForCron?: boolean; message?: string };
runLevelError?: unknown;
}) => {
const runLevelErrorMessage =
typeof runLevelError === "string" && runLevelError.trim()
? `cron isolated run failed: ${runLevelError.trim()}`
: runLevelError && typeof runLevelError === "object"
? (() => {
const record = runLevelError as { message?: unknown; kind?: unknown };
const message =
typeof record.message === "string" && record.message.trim()
? record.message.trim()
: undefined;
if (message) {
return `cron isolated run failed: ${message}`;
}
const kind =
typeof record.kind === "string" && record.kind.trim()
? record.kind.trim()
: undefined;
return kind ? `cron isolated run failed: ${kind}` : "cron isolated run failed";
})()
: undefined;
const failureMessage =
failureSignal?.fatalForCron === true
? (failureSignal.message ?? "cron isolated run returned a fatal failure signal")
: undefined;
const outputText = failureMessage ?? pickLastNonEmptyTextFromPayloadsMock(payloads);
const errorPayloadMessage = payloads.some((payload) => payload?.isError === true)
? "cron isolated run returned an error payload"
: undefined;
const outputText =
errorPayloadMessage ??
failureMessage ??
runLevelErrorMessage ??
pickLastNonEmptyTextFromPayloadsMock(payloads);
const synthesizedText = outputText?.trim() || "summary";
const hasFatalErrorPayload =
payloads.some((payload) => payload?.isError === true) || failureMessage !== undefined;
const deliveryPayload = failureMessage ? { text: failureMessage, isError: true } : undefined;
errorPayloadMessage !== undefined ||
failureMessage !== undefined ||
runLevelErrorMessage !== undefined;
const deliveryPayload =
errorPayloadMessage || failureMessage || runLevelErrorMessage
? { text: errorPayloadMessage ?? failureMessage ?? runLevelErrorMessage, isError: true }
: undefined;
return {
summary: failureMessage ?? "summary",
summary: errorPayloadMessage ?? failureMessage ?? runLevelErrorMessage ?? "summary",
outputText,
synthesizedText,
deliveryPayload,
@@ -391,11 +425,7 @@ function resetRunOutcomeMocks(): void {
deliveryPayloadHasStructuredContent: false,
hasFatalErrorPayload,
embeddedRunError:
failureMessage !== undefined
? failureMessage
: hasFatalErrorPayload
? "cron isolated run returned an error payload"
: undefined,
errorPayloadMessage ?? failureMessage ?? runLevelErrorMessage ?? undefined,
};
},
);