fix(cron): fail isolated runs on run-level errors

2026-05-06 12:50:42 +00:00 · 2026-04-27 05:42:22 +01:00
parent 390b965460
commit 382e03a2d8
7 changed files with 225 additions and 27 deletions
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -15,6 +15,7 @@ Docs: https://docs.openclaw.ai
 - CLI/startup: read generated startup metadata from the bundled `dist` layout before falling back to live help rendering, so root/browser help and channel-option bootstrap stay on the fast path. Thanks @vincentkoc.
 - CLI/help: treat positional `help` invocations like `openclaw channels help` as help paths for startup gating, avoiding model/auth warmup while preserving positional arguments such as `openclaw docs help`. Thanks @gumadeiras.
 - Matrix/E2EE: stabilize recovery and broken-device QA flows while avoiding Matrix device-cleanup sync races that could leave shutdown-time crypto work running. Thanks @gumadeiras.
+- Cron: treat isolated run-level agent failures as job errors even when no reply payload is produced, synthesizing a safe error payload so model/provider failures increment error counters and trigger failure notifications instead of clearing as successful. Fixes #43604; carries forward #43631. Thanks @SPFAdvisors.
 - Cron: classify isolated runs as errors from structured embedded-run execution-denial metadata, with final-output marker fallback for `SYSTEM_RUN_DENIED`, `INVALID_REQUEST`, and approval-binding refusals, so blocked commands no longer appear green in cron history. Fixes #67172; carries forward #67186. Thanks @oc-gh-dr, @hclsys, and @1yihui.
 - Onboarding/GitHub Copilot: add manifest-owned `--github-copilot-token` support for non-interactive setup, including env fallback, tokenRef storage in ref mode, saved-profile reuse, and current Copilot default-model wiring. Refs #50002 and supersedes #50003. Thanks @scottgl9.
 - Gateway/install: add a validated `--wrapper`/`OPENCLAW_WRAPPER` service install path that persists executable LaunchAgent/systemd wrappers across forced reinstalls, updates, and doctor repairs instead of falling back to raw node/bun `ProgramArguments`. Fixes #69400. (#72445) Thanks @willtmc.
--- a/docs/automation/cron-jobs.md
+++ b/docs/automation/cron-jobs.md
@@ -48,6 +48,7 @@ Cron is the Gateway's built-in scheduler. It persists jobs, wakes the agent at t
 - Isolated cron runs best-effort close tracked browser tabs/processes for their `cron:<jobId>` session when the run completes, so detached browser automation does not leave orphaned processes behind.
 - Isolated cron runs also guard against stale acknowledgement replies. If the first result is just an interim status update (`on it`, `pulling everything together`, and similar hints) and no descendant subagent run is still responsible for the final answer, OpenClaw re-prompts once for the actual result before delivery.
 - Isolated cron runs prefer structured execution-denial metadata from the embedded run, then fall back to known final summary/output markers such as `SYSTEM_RUN_DENIED` and `INVALID_REQUEST`, so a blocked command is not reported as a green run.
+- Isolated cron runs also treat run-level agent failures as job errors even when no reply payload is produced, so model/provider failures increment error counters and trigger failure notifications instead of clearing the job as successful.

 <a id="maintenance"></a>

--- a/docs/cli/cron.md
+++ b/docs/cli/cron.md
@@ -63,6 +63,10 @@ Failure notifications resolve in this order:
 Main-session jobs may only use `delivery.failureDestination` when primary delivery mode is `webhook`. Isolated jobs accept it in all modes.
 </Note>

+Note: isolated cron runs treat run-level agent failures as job errors even when
+no reply payload is produced, so model/provider failures still increment error
+counters and trigger failure notifications.
+
 ## Scheduling

 ### One-shot jobs
--- a/src/cron/isolated-agent.helpers.test.ts
+++ b/src/cron/isolated-agent.helpers.test.ts
@@ -76,6 +76,84 @@ describe("resolveCronPayloadOutcome", () => {

    expect(result.hasFatalErrorPayload).toBe(true);
    expect(result.embeddedRunError).toContain("Model context overflow");
+    expect(result.outputText).toBe("Model context overflow");
+    expect(result.deliveryPayloads).toEqual([{ text: "Model context overflow", isError: true }]);
+  });
+
+  it("treats standalone run-level errors as fatal and synthesizes delivery", () => {
+    const result = resolveCronPayloadOutcome({
+      payloads: [],
+      runLevelError: { kind: "provider_error", message: "model provider unreachable" },
+    });
+
+    expect(result.hasFatalErrorPayload).toBe(true);
+    expect(result.embeddedRunError).toBe("cron isolated run failed: model provider unreachable");
+    expect(result.summary).toBe("cron isolated run failed: model provider unreachable");
+    expect(result.outputText).toBe("cron isolated run failed: model provider unreachable");
+    expect(result.synthesizedText).toBe("cron isolated run failed: model provider unreachable");
+    expect(result.deliveryPayload).toEqual({
+      text: "cron isolated run failed: model provider unreachable",
+      isError: true,
+    });
+    expect(result.deliveryPayloads).toEqual([
+      { text: "cron isolated run failed: model provider unreachable", isError: true },
+    ]);
+    expect(result.deliveryPayloadHasStructuredContent).toBe(false);
+  });
+
+  it("uses string run-level errors when no error payload exists", () => {
+    const result = resolveCronPayloadOutcome({
+      payloads: [{ text: " " }],
+      runLevelError: "rate limit exceeded",
+    });
+
+    expect(result.hasFatalErrorPayload).toBe(true);
+    expect(result.embeddedRunError).toBe("cron isolated run failed: rate limit exceeded");
+    expect(result.deliveryPayloads).toEqual([
+      { text: "cron isolated run failed: rate limit exceeded", isError: true },
+    ]);
+  });
+
+  it("falls back to run-level error kind without exposing arbitrary objects", () => {
+    const result = resolveCronPayloadOutcome({
+      payloads: [{ text: "Partial assistant text before failure" }],
+      runLevelError: { kind: "retry_limit", detail: { provider: "example" } },
+    });
+
+    expect(result.hasFatalErrorPayload).toBe(true);
+    expect(result.embeddedRunError).toBe("cron isolated run failed: retry_limit");
+    expect(result.outputText).toBe("cron isolated run failed: retry_limit");
+    expect(result.deliveryPayloads).toEqual([
+      { text: "cron isolated run failed: retry_limit", isError: true },
+    ]);
+  });
+
+  it("uses a generic run-level error for unrecognized objects", () => {
+    const result = resolveCronPayloadOutcome({
+      payloads: [],
+      runLevelError: { detail: { provider: "example" } },
+    });
+
+    expect(result.hasFatalErrorPayload).toBe(true);
+    expect(result.embeddedRunError).toBe("cron isolated run failed");
+    expect(result.deliveryPayloads).toEqual([{ text: "cron isolated run failed", isError: true }]);
+  });
+
+  it("does not let later success clear a run-level error", () => {
+    const result = resolveCronPayloadOutcome({
+      payloads: [
+        { text: "Temporary provider failure", isError: true },
+        { text: "Partial success-looking text" },
+      ],
+      runLevelError: "retry limit exceeded",
+    });
+
+    expect(result.hasFatalErrorPayload).toBe(true);
+    expect(result.embeddedRunError).toBe("Temporary provider failure");
+    expect(result.outputText).toBe("Temporary provider failure");
+    expect(result.deliveryPayloads).toEqual([
+      { text: "Temporary provider failure", isError: true },
+    ]);
  });

  it("truncates long summaries", () => {
--- a/src/cron/isolated-agent/helpers.ts
+++ b/src/cron/isolated-agent/helpers.ts
@@ -108,6 +108,26 @@ function formatCronFailureSignal(signal: NormalizedCronFailureSignal): string {
  }: ${signal.message}`;
 }

+function formatCronRunLevelError(error: unknown): string | undefined {
+  const direct = normalizeOptionalString(error);
+  if (direct) {
+    return `cron isolated run failed: ${direct}`;
+  }
+  if (!error || typeof error !== "object") {
+    return undefined;
+  }
+  const record = error as { message?: unknown; kind?: unknown };
+  const message = normalizeOptionalString(record.message);
+  if (message) {
+    return `cron isolated run failed: ${message}`;
+  }
+  const kind = normalizeOptionalString(record.kind);
+  if (kind) {
+    return `cron isolated run failed: ${kind}`;
+  }
+  return "cron isolated run failed";
+}
+
 export function pickSummaryFromOutput(text: string | undefined) {
  const clean = (text ?? "").trim();
  if (!clean) {
@@ -289,33 +309,43 @@ export function resolveCronPayloadOutcome(params: {
    })),
  ]);
  const failureSignal = normalizeCronFailureSignal(params.failureSignal);
+  const runLevelError = formatCronRunLevelError(params.runLevelError);
  const hasFatalErrorPayload =
-    hasFatalStructuredErrorPayload || failureSignal !== undefined || denialSignal !== undefined;
-  const shouldUseFailureSignalPayload =
-    failureSignal !== undefined && !hasFatalStructuredErrorPayload;
-  const failureSignalDeliveryPayload = shouldUseFailureSignalPayload
-    ? ({ text: failureSignal.message, isError: true } satisfies DeliveryPayload)
+    hasFatalStructuredErrorPayload ||
+    failureSignal !== undefined ||
+    denialSignal !== undefined ||
+    runLevelError !== undefined;
+  const structuredErrorText = hasFatalStructuredErrorPayload
+    ? (lastErrorPayloadText ?? "cron isolated run returned an error payload")
+    : undefined;
+  const shouldUseRunLevelErrorPayload =
+    runLevelError !== undefined &&
+    structuredErrorText === undefined &&
+    failureSignal === undefined &&
+    denialSignal === undefined;
+  const fatalDeliveryText =
+    structuredErrorText ??
+    failureSignal?.message ??
+    (shouldUseRunLevelErrorPayload ? runLevelError : undefined);
+  const fatalDeliveryPayload = fatalDeliveryText
+    ? ({ text: fatalDeliveryText, isError: true } satisfies DeliveryPayload)
    : undefined;
  return {
-    summary: shouldUseFailureSignalPayload
-      ? (pickSummaryFromOutput(failureSignal.message) ?? summary)
-      : summary,
-    outputText: shouldUseFailureSignalPayload ? failureSignal.message : outputText,
-    synthesizedText: shouldUseFailureSignalPayload ? failureSignal.message : synthesizedText,
-    deliveryPayload: failureSignalDeliveryPayload ?? deliveryPayload,
-    deliveryPayloads: failureSignalDeliveryPayload
-      ? [failureSignalDeliveryPayload]
-      : resolvedDeliveryPayloads,
-    deliveryPayloadHasStructuredContent: failureSignalDeliveryPayload
+    summary: fatalDeliveryText ? (pickSummaryFromOutput(fatalDeliveryText) ?? summary) : summary,
+    outputText: fatalDeliveryText ?? outputText,
+    synthesizedText: fatalDeliveryText ?? synthesizedText,
+    deliveryPayload: fatalDeliveryPayload ?? deliveryPayload,
+    deliveryPayloads: fatalDeliveryPayload ? [fatalDeliveryPayload] : resolvedDeliveryPayloads,
+    deliveryPayloadHasStructuredContent: fatalDeliveryPayload
      ? false
      : deliveryPayloadHasStructuredContent,
    hasFatalErrorPayload,
-    embeddedRunError: hasFatalStructuredErrorPayload
-      ? (lastErrorPayloadText ?? "cron isolated run returned an error payload")
+    embeddedRunError: structuredErrorText
+      ? structuredErrorText
      : failureSignal
        ? formatCronFailureSignal(failureSignal)
        : denialSignal
          ? formatCronDenialSignal(denialSignal)
-          : undefined,
+          : runLevelError,
  };
 }
--- a/src/cron/isolated-agent/run.meta-error-status.test.ts
+++ b/src/cron/isolated-agent/run.meta-error-status.test.ts
@@ -0,0 +1,54 @@
+import { describe, expect, it } from "vitest";
+import {
+  makeIsolatedAgentTurnParams,
+  setupRunCronIsolatedAgentTurnSuite,
+} from "./run.suite-helpers.js";
+import { loadRunCronIsolatedAgentTurn, runWithModelFallbackMock } from "./run.test-harness.js";
+
+const runCronIsolatedAgentTurn = await loadRunCronIsolatedAgentTurn();
+
+describe("runCronIsolatedAgentTurn - meta.error status propagation", () => {
+  setupRunCronIsolatedAgentTurnSuite();
+
+  it("marks a run-level error with empty payloads as a cron error", async () => {
+    runWithModelFallbackMock.mockResolvedValueOnce({
+      result: {
+        payloads: [],
+        meta: {
+          error: { kind: "provider_error", message: "model provider unreachable" },
+          agentMeta: { usage: { input: 0, output: 0 } },
+        },
+      },
+      provider: "openai",
+      model: "gpt-5.4",
+      attempts: [],
+    });
+
+    const result = await runCronIsolatedAgentTurn(makeIsolatedAgentTurnParams());
+
+    expect(result.status).toBe("error");
+    expect(result.error).toBe("cron isolated run failed: model provider unreachable");
+    expect(result.outputText).toBe("cron isolated run failed: model provider unreachable");
+  });
+
+  it("does not deliver partial success text when a run-level error is present", async () => {
+    runWithModelFallbackMock.mockResolvedValueOnce({
+      result: {
+        payloads: [{ text: "Partial success-looking text" }],
+        meta: {
+          error: { kind: "retry_limit", message: "retry limit exceeded" },
+          agentMeta: { usage: { input: 0, output: 0 } },
+        },
+      },
+      provider: "openai",
+      model: "gpt-5.4",
+      attempts: [],
+    });
+
+    const result = await runCronIsolatedAgentTurn(makeIsolatedAgentTurnParams());
+
+    expect(result.status).toBe("error");
+    expect(result.error).toBe("cron isolated run failed: retry limit exceeded");
+    expect(result.outputText).toBe("cron isolated run failed: retry limit exceeded");
+  });
+});
--- a/src/cron/isolated-agent/run.test-harness.ts
+++ b/src/cron/isolated-agent/run.test-harness.ts
@@ -365,21 +365,55 @@ function resetRunOutcomeMocks(): void {
    ({
      payloads,
      failureSignal,
+      runLevelError,
    }: {
      payloads: Array<{ isError?: boolean }>;
      failureSignal?: { fatalForCron?: boolean; message?: string };
+      runLevelError?: unknown;
    }) => {
+      const runLevelErrorMessage =
+        typeof runLevelError === "string" && runLevelError.trim()
+          ? `cron isolated run failed: ${runLevelError.trim()}`
+          : runLevelError && typeof runLevelError === "object"
+            ? (() => {
+                const record = runLevelError as { message?: unknown; kind?: unknown };
+                const message =
+                  typeof record.message === "string" && record.message.trim()
+                    ? record.message.trim()
+                    : undefined;
+                if (message) {
+                  return `cron isolated run failed: ${message}`;
+                }
+                const kind =
+                  typeof record.kind === "string" && record.kind.trim()
+                    ? record.kind.trim()
+                    : undefined;
+                return kind ? `cron isolated run failed: ${kind}` : "cron isolated run failed";
+              })()
+            : undefined;
      const failureMessage =
        failureSignal?.fatalForCron === true
          ? (failureSignal.message ?? "cron isolated run returned a fatal failure signal")
          : undefined;
-      const outputText = failureMessage ?? pickLastNonEmptyTextFromPayloadsMock(payloads);
+      const errorPayloadMessage = payloads.some((payload) => payload?.isError === true)
+        ? "cron isolated run returned an error payload"
+        : undefined;
+      const outputText =
+        errorPayloadMessage ??
+        failureMessage ??
+        runLevelErrorMessage ??
+        pickLastNonEmptyTextFromPayloadsMock(payloads);
      const synthesizedText = outputText?.trim() || "summary";
      const hasFatalErrorPayload =
-        payloads.some((payload) => payload?.isError === true) || failureMessage !== undefined;
-      const deliveryPayload = failureMessage ? { text: failureMessage, isError: true } : undefined;
+        errorPayloadMessage !== undefined ||
+        failureMessage !== undefined ||
+        runLevelErrorMessage !== undefined;
+      const deliveryPayload =
+        errorPayloadMessage || failureMessage || runLevelErrorMessage
+          ? { text: errorPayloadMessage ?? failureMessage ?? runLevelErrorMessage, isError: true }
+          : undefined;
      return {
-        summary: failureMessage ?? "summary",
+        summary: errorPayloadMessage ?? failureMessage ?? runLevelErrorMessage ?? "summary",
        outputText,
        synthesizedText,
        deliveryPayload,
@@ -391,11 +425,7 @@ function resetRunOutcomeMocks(): void {
        deliveryPayloadHasStructuredContent: false,
        hasFatalErrorPayload,
        embeddedRunError:
-          failureMessage !== undefined
-            ? failureMessage
-            : hasFatalErrorPayload
-              ? "cron isolated run returned an error payload"
-              : undefined,
+          errorPayloadMessage ?? failureMessage ?? runLevelErrorMessage ?? undefined,
      };
    },
  );