fix(cron): surface classified run failure causes

Surface classified cron failure causes without changing raw cron JSON error text.

- add additive CLI `cause` output for finished run entries with `errorReason`
- persist/backfill full `FailoverReason` values on cron run-log entries
- thread provider context through cron finalization so provider-specific failure causes stay accurate
- extend protocol/Swift models and regression coverage for CLI JSON, run-log parsing/search, alerts, and protocol conformance

Verification:
- `pnpm lint --threads=8`
- `pnpm protocol:check`
- `pnpm exec oxfmt --check src/cli/cron-cli/shared.ts src/cli/cron-cli/shared.cause-display.test.ts src/cron/run-log.ts src/cron/run-log.error-reason.test.ts src/cron/cron-protocol-conformance.test.ts src/cron/service.failure-alert.test.ts src/cron/service/timer.ts src/cron/service/ops.ts src/gateway/protocol/schema/cron.ts scripts/protocol-gen-swift.ts`
- `git diff --check`
- AWS Crabbox `cbx_8a6a65ab83b0` / `run_42b73a4a9750`: 4 files, 20 tests passed
- autoreview clean, no accepted/actionable findings
- GitHub CI/CodeQL/OpenGrep/Workflow Sanity green/skipped/neutral on `aa29b087b2587d0aed3d409de5e7a2c706c32cdf`

Co-authored-by: Yoshikazu Terashi <yterashi@peperon-works.jp>
This commit is contained in:
Yoshikazu Terashi
2026-05-27 17:03:17 +09:00
committed by GitHub
parent 57b1c0b3d9
commit 3104f36329
11 changed files with 420 additions and 22 deletions

View File

@@ -5508,6 +5508,7 @@ public struct CronRunLogEntry: Codable, Sendable {
public let action: String
public let status: AnyCodable?
public let error: String?
public let errorreason: AnyCodable?
public let summary: String?
public let diagnostics: [String: AnyCodable]?
public let delivered: Bool?
@@ -5531,6 +5532,7 @@ public struct CronRunLogEntry: Codable, Sendable {
action: String,
status: AnyCodable?,
error: String?,
errorreason: AnyCodable? = nil,
summary: String?,
diagnostics: [String: AnyCodable]?,
delivered: Bool?,
@@ -5553,6 +5555,7 @@ public struct CronRunLogEntry: Codable, Sendable {
self.action = action
self.status = status
self.error = error
self.errorreason = errorreason
self.summary = summary
self.diagnostics = diagnostics
self.delivered = delivered
@@ -5577,6 +5580,7 @@ public struct CronRunLogEntry: Codable, Sendable {
case action
case status
case error
case errorreason = "errorReason"
case summary
case diagnostics
case delivered

View File

@@ -47,7 +47,7 @@ const DEFAULTED_OPTIONAL_INIT_PARAMS: Record<string, Set<string>> = {
ArtifactsGetParams: new Set(["agentId"]),
ArtifactsDownloadParams: new Set(["agentId"]),
MessageActionParams: new Set(["inboundTurnKind"]),
CronRunLogEntry: new Set(["failureNotificationDelivery"]),
CronRunLogEntry: new Set(["errorReason", "failureNotificationDelivery"]),
};
const header = `// Generated by scripts/protocol-gen-swift.ts — do not edit by hand\n// swiftlint:disable file_length\nimport Foundation\n\npublic let GATEWAY_PROTOCOL_VERSION = ${PROTOCOL_VERSION}\npublic let GATEWAY_MIN_PROTOCOL_VERSION = ${MIN_CLIENT_PROTOCOL_VERSION}\n\nprivate struct GatewayAnyCodingKey: CodingKey, Hashable {\n let stringValue: String\n let intValue: Int?\n\n init?(stringValue: String) {\n self.stringValue = stringValue\n self.intValue = nil\n }\n\n init?(intValue: Int) {\n self.stringValue = String(intValue)\n self.intValue = intValue\n }\n}\n\npublic enum ErrorCode: String, Codable, Sendable {\n${Object.values(

View File

@@ -0,0 +1,52 @@
import { describe, expect, it } from "vitest";
import { defaultRuntime } from "../../runtime.js";
import { printCronJson } from "./shared.js";
describe("printCronJson cause display", () => {
it("adds an additive cause without changing raw cron run errors", () => {
let written: unknown;
const original = defaultRuntime.writeJson;
defaultRuntime.writeJson = (value: unknown) => {
written = value;
};
try {
printCronJson({
entries: [
{
ts: 1,
jobId: "job-1",
action: "finished",
status: "error",
errorReason: "timeout",
error: "cron: job execution timed out",
},
],
});
} finally {
defaultRuntime.writeJson = original;
}
const result = written as { entries: Array<Record<string, unknown>> };
expect(result.entries[0]?.cause).toBe("timeout");
expect(result.entries[0]?.error).toBe("cron: job execution timed out");
expect(result.entries[0]?.errorReason).toBe("timeout");
});
it("does not add cause fields to non-run-log entries", () => {
let written: unknown;
const original = defaultRuntime.writeJson;
defaultRuntime.writeJson = (value: unknown) => {
written = value;
};
try {
printCronJson({
entries: [{ errorReason: "timeout", status: "error" }],
});
} finally {
defaultRuntime.writeJson = original;
}
const result = written as { entries: Array<Record<string, unknown>> };
expect(result.entries[0]?.cause).toBeUndefined();
});
});

View File

@@ -25,8 +25,31 @@ export const getCronChannelOptions = () => {
return pluginIds.length > 0 ? ["last", ...pluginIds].join("|") : "last|<channel-id>";
};
function addCronRunCauseFields(value: unknown): unknown {
if (!value || typeof value !== "object") {
return value;
}
const record = value as Record<string, unknown>;
const entries = record.entries;
if (!Array.isArray(entries)) {
return value;
}
const nextEntries = entries.map((entry) => {
if (!entry || typeof entry !== "object") {
return entry;
}
const item = entry as Record<string, unknown>;
if (item.action !== "finished" || typeof item.errorReason !== "string") {
return item;
}
const cause = item.errorReason.trim();
return cause ? Object.assign({}, item, { cause }) : item;
});
return { ...record, entries: nextEntries };
}
export function printCronJson(value: unknown) {
defaultRuntime.writeJson(value);
defaultRuntime.writeJson(addCronRunCauseFields(value));
}
/**

View File

@@ -2,7 +2,11 @@ import fs from "node:fs/promises";
import path from "node:path";
import { describe, expect, it } from "vitest";
import { MACOS_APP_SOURCES_DIR } from "../compat/legacy-names.js";
import { CronDeliverySchema, CronJobStateSchema } from "../gateway/protocol/schema.js";
import {
CronDeliverySchema,
CronJobStateSchema,
CronRunLogEntrySchema,
} from "../gateway/protocol/schema.js";
type SchemaLike = {
anyOf?: Array<SchemaLike>;
@@ -30,13 +34,9 @@ function extractDeliveryModes(schema: SchemaLike): string[] {
}
function extractConstUnionValues(schema: SchemaLike): string[] {
return Array.from(
new Set(
(schema.anyOf ?? [])
.map((entry) => entry?.const)
.filter((value): value is string => typeof value === "string"),
),
);
return (schema.anyOf ?? [])
.map((entry) => entry?.const)
.filter((value): value is string => typeof value === "string");
}
const UI_FILES = ["ui/src/ui/types.ts", "ui/src/ui/ui-types.ts", "ui/src/ui/views/cron.ts"];
@@ -97,24 +97,32 @@ describe("cron protocol conformance", () => {
expect(swift).toContain("let jobs:");
});
it("cron job state schema keeps the full failover reason set", () => {
const properties = (CronJobStateSchema as SchemaLike).properties ?? {};
const lastErrorReason = properties.lastErrorReason as SchemaLike | undefined;
if (lastErrorReason === undefined) {
throw new Error("missing lastErrorReason schema");
}
expect(extractConstUnionValues(lastErrorReason)).toEqual([
it("cron public schemas keep the full failover reason set", () => {
const expectedReasons = [
"auth",
"auth_permanent",
"format",
"rate_limit",
"overloaded",
"billing",
"server_error",
"timeout",
"model_not_found",
"session_expired",
"empty_response",
"no_error_details",
"unclassified",
"unknown",
]);
];
const stateProperties = (CronJobStateSchema as SchemaLike).properties ?? {};
const lastErrorReason = stateProperties.lastErrorReason as SchemaLike | undefined;
expect(lastErrorReason).toBeDefined();
expect(extractConstUnionValues(lastErrorReason ?? {})).toEqual(expectedReasons);
const runLogProperties = (CronRunLogEntrySchema as SchemaLike).properties ?? {};
const errorReason = runLogProperties.errorReason as SchemaLike | undefined;
expect(errorReason).toBeDefined();
expect(extractConstUnionValues(errorReason ?? {})).toEqual(expectedReasons);
});
});

View File

@@ -0,0 +1,125 @@
import fs from "node:fs/promises";
import os from "node:os";
import path from "node:path";
import { describe, expect, it } from "vitest";
import { readCronRunLogEntriesPage } from "./run-log.js";
describe("cron run log errorReason", () => {
it("backfills errorReason from timeout error text for older entries", async () => {
const dir = await fs.mkdtemp(path.join(os.tmpdir(), "cron-run-log-"));
const file = path.join(dir, "job.jsonl");
await fs.writeFile(
file,
`${JSON.stringify({
ts: 1,
jobId: "job-1",
action: "finished",
status: "error",
error: "cron: job execution timed out",
})}\n`,
"utf8",
);
const page = await readCronRunLogEntriesPage(file, { limit: 10 });
expect(page.entries[0]?.errorReason).toBe("timeout");
});
it("validates persisted errorReason against the full failover reason set", async () => {
const dir = await fs.mkdtemp(path.join(os.tmpdir(), "cron-run-log-"));
const file = path.join(dir, "job.jsonl");
const reasons = [
"auth",
"auth_permanent",
"format",
"rate_limit",
"overloaded",
"billing",
"server_error",
"timeout",
"model_not_found",
"session_expired",
"empty_response",
"no_error_details",
"unclassified",
"unknown",
];
await fs.writeFile(
file,
reasons
.map((errorReason, index) =>
JSON.stringify({
ts: index + 1,
jobId: "job-1",
action: "finished",
status: "error",
errorReason,
}),
)
.join("\n") + "\n",
"utf8",
);
const page = await readCronRunLogEntriesPage(file, { limit: 50, sortDir: "asc" });
expect(page.entries.map((entry) => entry.errorReason)).toEqual(reasons);
});
it("derives an invalid persisted reason from raw error text before exposing entries", async () => {
const dir = await fs.mkdtemp(path.join(os.tmpdir(), "cron-run-log-"));
const file = path.join(dir, "job.jsonl");
await fs.writeFile(
file,
`${JSON.stringify({
ts: 1,
jobId: "job-1",
action: "finished",
status: "error",
error: "upstream unavailable: 503 overloaded",
errorReason: "not-a-real-reason",
})}\n`,
"utf8",
);
const page = await readCronRunLogEntriesPage(file, { limit: 10 });
expect(page.entries[0]?.errorReason).toBe("overloaded");
});
it("uses provider context when deriving persisted run-log reasons", async () => {
const dir = await fs.mkdtemp(path.join(os.tmpdir(), "cron-run-log-"));
const file = path.join(dir, "job.jsonl");
await fs.writeFile(
file,
`${JSON.stringify({
ts: 1,
jobId: "job-1",
action: "finished",
status: "error",
error: "403 Key limit exceeded (monthly limit)",
provider: "openrouter",
})}\n`,
"utf8",
);
const page = await readCronRunLogEntriesPage(file, { limit: 10 });
expect(page.entries[0]?.errorReason).toBe("billing");
});
it("includes derived errorReason values in run-log search", async () => {
const dir = await fs.mkdtemp(path.join(os.tmpdir(), "cron-run-log-"));
const file = path.join(dir, "job.jsonl");
await fs.writeFile(
file,
`${JSON.stringify({
ts: 1,
jobId: "job-1",
action: "finished",
status: "error",
error: "cron: job execution timed out",
})}\n`,
"utf8",
);
const page = await readCronRunLogEntriesPage(file, { limit: 10, query: "timeout" });
expect(page.entries).toHaveLength(1);
expect(page.entries[0]?.errorReason).toBe("timeout");
});
});

View File

@@ -1,6 +1,8 @@
import fsSync from "node:fs";
import fs from "node:fs/promises";
import path from "node:path";
import { resolveFailoverReasonFromError } from "../agents/failover-error.js";
import type { FailoverReason } from "../agents/pi-embedded-helpers/types.js";
import { parseByteSize } from "../cli/parse-bytes.js";
import type { CronConfig } from "../config/types.cron.js";
import { appendRegularFile, isPathInside, pathExists, root as fsRoot } from "../infra/fs-safe.js";
@@ -27,6 +29,7 @@ export type CronRunLogEntry = {
action: "finished";
status?: CronRunStatus;
error?: string;
errorReason?: FailoverReason;
summary?: string;
diagnostics?: CronRunDiagnostics;
delivered?: boolean;
@@ -72,6 +75,29 @@ type ReadCronRunLogAllPageOptions = Omit<ReadCronRunLogPageOptions, "jobId"> & {
jobNameById?: Record<string, string>;
};
const CRON_FAILOVER_REASONS = new Set<FailoverReason>([
"auth",
"auth_permanent",
"format",
"rate_limit",
"overloaded",
"billing",
"server_error",
"timeout",
"model_not_found",
"session_expired",
"empty_response",
"no_error_details",
"unclassified",
"unknown",
]);
function normalizeCronRunLogErrorReason(value: unknown): FailoverReason | undefined {
return typeof value === "string" && CRON_FAILOVER_REASONS.has(value as FailoverReason)
? (value as FailoverReason)
: undefined;
}
function assertSafeCronRunLogJobId(jobId: string): string {
const trimmed = jobId.trim();
if (!trimmed) {
@@ -309,12 +335,20 @@ function parseAllRunLogEntries(raw: string, opts?: { jobId?: string }): CronRunL
obj.usage && typeof obj.usage === "object"
? (obj.usage as Record<string, unknown>)
: undefined;
const normalizedError = typeof obj.error === "string" ? obj.error : undefined;
const normalizedProvider =
typeof obj.provider === "string" && obj.provider.trim() ? obj.provider : undefined;
const normalizedErrorReason =
normalizeCronRunLogErrorReason(obj.errorReason) ??
resolveFailoverReasonFromError(normalizedError, normalizedProvider) ??
undefined;
const entry: CronRunLogEntry = {
ts: obj.ts,
jobId: obj.jobId,
action: "finished",
status: obj.status,
error: obj.error,
error: normalizedError,
errorReason: normalizedErrorReason,
summary: obj.summary,
runId: typeof obj.runId === "string" && obj.runId.trim() ? obj.runId : undefined,
diagnostics: normalizeCronRunDiagnostics(obj.diagnostics),
@@ -322,8 +356,7 @@ function parseAllRunLogEntries(raw: string, opts?: { jobId?: string }): CronRunL
durationMs: obj.durationMs,
nextRunAtMs: obj.nextRunAtMs,
model: typeof obj.model === "string" && obj.model.trim() ? obj.model : undefined,
provider:
typeof obj.provider === "string" && obj.provider.trim() ? obj.provider : undefined,
provider: normalizedProvider,
usage: usage
? {
input_tokens: typeof usage.input_tokens === "number" ? usage.input_tokens : undefined,
@@ -447,6 +480,7 @@ export async function readCronRunLogEntriesPage(
[
entry.summary ?? "",
entry.error ?? "",
entry.errorReason ?? "",
entry.diagnostics?.summary ?? "",
...(entry.diagnostics?.entries ?? []).map((diagnostic) => diagnostic.message),
entry.jobId,
@@ -535,6 +569,7 @@ export async function readCronRunLogEntriesPageAll(
return [
entry.summary ?? "",
entry.error ?? "",
entry.errorReason ?? "",
entry.diagnostics?.summary ?? "",
...(entry.diagnostics?.entries ?? []).map((diagnostic) => diagnostic.message),
entry.jobId,

View File

@@ -414,6 +414,138 @@ describe("CronService failure alerts", () => {
await store.cleanup();
});
it("surfaces classified causes before raw errors in failure alerts", async () => {
const store = await makeStorePath();
const sendCronFailureAlert = vi.fn(async () => undefined);
const runIsolatedAgentJob = vi.fn(async () => ({
status: "error" as const,
error: "cron: job execution timed out",
}));
const cron = createFailureAlertCron({
storePath: store.storePath,
cronConfig: {
failureAlert: {
enabled: true,
after: 1,
},
},
runIsolatedAgentJob,
sendCronFailureAlert,
});
await cron.start();
const job = await cron.add({
name: "timeout cause alert",
enabled: true,
schedule: { kind: "every", everyMs: 60_000 },
sessionTarget: "isolated",
wakeMode: "next-heartbeat",
payload: { kind: "agentTurn", message: "ping" },
delivery: { mode: "announce", channel: "telegram", to: "19098680" },
});
await cron.run(job.id, "force");
expect(sendCronFailureAlert).toHaveBeenCalledTimes(1);
const alertText = alertCallArg(sendCronFailureAlert).text;
expect(alertText).toBe(
'Cron job "timeout cause alert" failed 1 times\n' +
"Cause: timeout\n" +
"Last error: cron: job execution timed out",
);
cron.stop();
await store.cleanup();
});
it("uses provider context when surfacing failure alert causes", async () => {
const store = await makeStorePath();
const sendCronFailureAlert = vi.fn(async () => undefined);
const runIsolatedAgentJob = vi.fn(async () => ({
status: "error" as const,
error: "403 Key limit exceeded (monthly limit)",
provider: "openrouter",
}));
const cron = createFailureAlertCron({
storePath: store.storePath,
cronConfig: {
failureAlert: {
enabled: true,
after: 1,
},
},
runIsolatedAgentJob,
sendCronFailureAlert,
});
await cron.start();
const job = await cron.add({
name: "provider limit alert",
enabled: true,
schedule: { kind: "every", everyMs: 60_000 },
sessionTarget: "isolated",
wakeMode: "next-heartbeat",
payload: { kind: "agentTurn", message: "ping" },
delivery: { mode: "announce", channel: "telegram", to: "19098680" },
});
await cron.run(job.id, "force");
expect(sendCronFailureAlert).toHaveBeenCalledTimes(1);
const alertText = alertCallArg(sendCronFailureAlert).text;
expect(alertText).toBe(
'Cron job "provider limit alert" failed 1 times\n' +
"Cause: billing\n" +
"Last error: 403 Key limit exceeded (monthly limit)",
);
cron.stop();
await store.cleanup();
});
it("keeps skipped alert text unchanged when the skip reason looks classifiable", async () => {
const store = await makeStorePath();
const sendCronFailureAlert = vi.fn(async () => undefined);
const runIsolatedAgentJob = vi.fn(async () => ({
status: "skipped" as const,
error: "cron: job execution timed out",
}));
const cron = createFailureAlertCron({
storePath: store.storePath,
cronConfig: {
failureAlert: {
enabled: true,
after: 1,
includeSkipped: true,
},
},
runIsolatedAgentJob,
sendCronFailureAlert,
});
await cron.start();
const job = await cron.add({
name: "skipped timeout",
enabled: true,
schedule: { kind: "every", everyMs: 60_000 },
sessionTarget: "isolated",
wakeMode: "next-heartbeat",
payload: { kind: "agentTurn", message: "ping" },
delivery: { mode: "announce", channel: "telegram", to: "19098680" },
});
await cron.run(job.id, "force");
expect(sendCronFailureAlert).toHaveBeenCalledTimes(1);
const alertText = alertCallArg(sendCronFailureAlert).text;
expect(alertText).toBe(
'Cron job "skipped timeout" skipped 1 times\nSkip reason: cron: job execution timed out',
);
cron.stop();
await store.cleanup();
});
it("tracks skipped runs without alerting or affecting error backoff when includeSkipped is off", async () => {
const store = await makeStorePath();
const sendCronFailureAlert = vi.fn(async () => undefined);

View File

@@ -775,6 +775,7 @@ async function finishPreparedManualRun(
error: coreResult.error,
diagnostics: coreResult.diagnostics,
delivered: coreResult.delivered,
provider: coreResult.provider,
startedAt,
endedAt,
},

View File

@@ -755,14 +755,20 @@ function emitFailureAlert(
mode?: "announce" | "webhook";
accountId?: string;
status: "error" | "skipped";
provider?: string;
},
) {
const safeJobName = params.job.name || params.job.id;
const truncatedError = (params.error?.trim() || "unknown reason").slice(0, 200);
const errorReason =
params.status === "error" && typeof params.error === "string"
? (resolveFailoverReasonFromError(params.error, params.provider) ?? undefined)
: undefined;
const statusVerb = params.status === "skipped" ? "skipped" : "failed";
const detailLabel = params.status === "skipped" ? "Skip reason" : "Last error";
const text = [
`Cron job "${safeJobName}" ${statusVerb} ${params.consecutiveErrors} times`,
...(errorReason ? [`Cause: ${errorReason}`] : []),
`${detailLabel}: ${truncatedError}`,
].join("\n");
@@ -802,6 +808,7 @@ function maybeEmitFailureAlert(
alertConfig: ResolvedFailureAlert | null;
status: "error" | "skipped";
error?: string;
provider?: string;
consecutiveCount: number;
},
) {
@@ -828,6 +835,7 @@ function maybeEmitFailureAlert(
mode: params.alertConfig.mode,
accountId: params.alertConfig.accountId,
status: params.status,
provider: params.provider,
});
params.job.state.lastFailureAlertAtMs = now;
}
@@ -845,6 +853,7 @@ export function applyJobResult(
error?: string;
diagnostics?: CronRunOutcome["diagnostics"];
delivered?: boolean;
provider?: string;
startedAt: number;
endedAt: number;
},
@@ -873,7 +882,7 @@ export function applyJobResult(
job.state.lastDiagnosticSummary = summarizeCronRunDiagnostics(job.state.lastDiagnostics);
job.state.lastErrorReason =
result.status === "error" && typeof result.error === "string"
? (resolveFailoverReasonFromError(result.error) ?? undefined)
? (resolveFailoverReasonFromError(result.error, result.provider) ?? undefined)
: undefined;
if (result.status === "error") {
state.deps.log.warn(
@@ -915,6 +924,7 @@ export function applyJobResult(
alertConfig,
status: "error",
error: result.error,
provider: result.provider,
consecutiveCount: job.state.consecutiveErrors,
});
} else if (result.status === "skipped") {
@@ -926,6 +936,7 @@ export function applyJobResult(
alertConfig,
status: "skipped",
error: result.error,
provider: result.provider,
consecutiveCount: job.state.consecutiveSkipped,
});
} else {
@@ -1080,6 +1091,7 @@ function applyOutcomeToStoredJob(state: CronServiceState, result: TimedCronRunOu
error: result.error,
diagnostics: result.diagnostics,
delivered: result.delivered,
provider: result.provider,
startedAt: result.startedAt,
endedAt: result.endedAt,
});
@@ -1102,6 +1114,7 @@ function applyOutcomeToStoredJob(state: CronServiceState, result: TimedCronRunOu
error: result.error,
diagnostics: result.diagnostics,
delivered: result.delivered,
provider: result.provider,
startedAt: result.startedAt,
endedAt: result.endedAt,
});
@@ -1952,6 +1965,7 @@ export async function executeJob(
error: coreResult.error,
diagnostics: coreResult.diagnostics,
delivered: coreResult.delivered,
provider: coreResult.provider,
startedAt,
endedAt,
});

View File

@@ -64,12 +64,15 @@ const CronDeliveryStatusSchema = Type.Union([
]);
const CronFailoverReasonSchema = Type.Union([
Type.Literal("auth"),
Type.Literal("auth_permanent"),
Type.Literal("format"),
Type.Literal("rate_limit"),
Type.Literal("overloaded"),
Type.Literal("billing"),
Type.Literal("server_error"),
Type.Literal("timeout"),
Type.Literal("model_not_found"),
Type.Literal("session_expired"),
Type.Literal("empty_response"),
Type.Literal("no_error_details"),
Type.Literal("unclassified"),
@@ -434,6 +437,7 @@ export const CronRunLogEntrySchema = Type.Object(
action: Type.Literal("finished"),
status: Type.Optional(CronRunStatusSchema),
error: Type.Optional(Type.String()),
errorReason: Type.Optional(CronFailoverReasonSchema),
summary: Type.Optional(Type.String()),
diagnostics: Type.Optional(CronRunDiagnosticsSchema),
delivered: Type.Optional(Type.Boolean()),