mirror of
https://github.com/openclaw/openclaw.git
synced 2026-05-27 17:46:02 +00:00
fix(cron): surface classified run failure causes
Surface classified cron failure causes without changing raw cron JSON error text. - add additive CLI `cause` output for finished run entries with `errorReason` - persist/backfill full `FailoverReason` values on cron run-log entries - thread provider context through cron finalization so provider-specific failure causes stay accurate - extend protocol/Swift models and regression coverage for CLI JSON, run-log parsing/search, alerts, and protocol conformance Verification: - `pnpm lint --threads=8` - `pnpm protocol:check` - `pnpm exec oxfmt --check src/cli/cron-cli/shared.ts src/cli/cron-cli/shared.cause-display.test.ts src/cron/run-log.ts src/cron/run-log.error-reason.test.ts src/cron/cron-protocol-conformance.test.ts src/cron/service.failure-alert.test.ts src/cron/service/timer.ts src/cron/service/ops.ts src/gateway/protocol/schema/cron.ts scripts/protocol-gen-swift.ts` - `git diff --check` - AWS Crabbox `cbx_8a6a65ab83b0` / `run_42b73a4a9750`: 4 files, 20 tests passed - autoreview clean, no accepted/actionable findings - GitHub CI/CodeQL/OpenGrep/Workflow Sanity green/skipped/neutral on `aa29b087b2587d0aed3d409de5e7a2c706c32cdf` Co-authored-by: Yoshikazu Terashi <yterashi@peperon-works.jp>
This commit is contained in:
committed by
GitHub
parent
57b1c0b3d9
commit
3104f36329
@@ -5508,6 +5508,7 @@ public struct CronRunLogEntry: Codable, Sendable {
|
||||
public let action: String
|
||||
public let status: AnyCodable?
|
||||
public let error: String?
|
||||
public let errorreason: AnyCodable?
|
||||
public let summary: String?
|
||||
public let diagnostics: [String: AnyCodable]?
|
||||
public let delivered: Bool?
|
||||
@@ -5531,6 +5532,7 @@ public struct CronRunLogEntry: Codable, Sendable {
|
||||
action: String,
|
||||
status: AnyCodable?,
|
||||
error: String?,
|
||||
errorreason: AnyCodable? = nil,
|
||||
summary: String?,
|
||||
diagnostics: [String: AnyCodable]?,
|
||||
delivered: Bool?,
|
||||
@@ -5553,6 +5555,7 @@ public struct CronRunLogEntry: Codable, Sendable {
|
||||
self.action = action
|
||||
self.status = status
|
||||
self.error = error
|
||||
self.errorreason = errorreason
|
||||
self.summary = summary
|
||||
self.diagnostics = diagnostics
|
||||
self.delivered = delivered
|
||||
@@ -5577,6 +5580,7 @@ public struct CronRunLogEntry: Codable, Sendable {
|
||||
case action
|
||||
case status
|
||||
case error
|
||||
case errorreason = "errorReason"
|
||||
case summary
|
||||
case diagnostics
|
||||
case delivered
|
||||
|
||||
@@ -47,7 +47,7 @@ const DEFAULTED_OPTIONAL_INIT_PARAMS: Record<string, Set<string>> = {
|
||||
ArtifactsGetParams: new Set(["agentId"]),
|
||||
ArtifactsDownloadParams: new Set(["agentId"]),
|
||||
MessageActionParams: new Set(["inboundTurnKind"]),
|
||||
CronRunLogEntry: new Set(["failureNotificationDelivery"]),
|
||||
CronRunLogEntry: new Set(["errorReason", "failureNotificationDelivery"]),
|
||||
};
|
||||
|
||||
const header = `// Generated by scripts/protocol-gen-swift.ts — do not edit by hand\n// swiftlint:disable file_length\nimport Foundation\n\npublic let GATEWAY_PROTOCOL_VERSION = ${PROTOCOL_VERSION}\npublic let GATEWAY_MIN_PROTOCOL_VERSION = ${MIN_CLIENT_PROTOCOL_VERSION}\n\nprivate struct GatewayAnyCodingKey: CodingKey, Hashable {\n let stringValue: String\n let intValue: Int?\n\n init?(stringValue: String) {\n self.stringValue = stringValue\n self.intValue = nil\n }\n\n init?(intValue: Int) {\n self.stringValue = String(intValue)\n self.intValue = intValue\n }\n}\n\npublic enum ErrorCode: String, Codable, Sendable {\n${Object.values(
|
||||
|
||||
52
src/cli/cron-cli/shared.cause-display.test.ts
Normal file
52
src/cli/cron-cli/shared.cause-display.test.ts
Normal file
@@ -0,0 +1,52 @@
|
||||
import { describe, expect, it } from "vitest";
|
||||
import { defaultRuntime } from "../../runtime.js";
|
||||
import { printCronJson } from "./shared.js";
|
||||
|
||||
describe("printCronJson cause display", () => {
|
||||
it("adds an additive cause without changing raw cron run errors", () => {
|
||||
let written: unknown;
|
||||
const original = defaultRuntime.writeJson;
|
||||
defaultRuntime.writeJson = (value: unknown) => {
|
||||
written = value;
|
||||
};
|
||||
try {
|
||||
printCronJson({
|
||||
entries: [
|
||||
{
|
||||
ts: 1,
|
||||
jobId: "job-1",
|
||||
action: "finished",
|
||||
status: "error",
|
||||
errorReason: "timeout",
|
||||
error: "cron: job execution timed out",
|
||||
},
|
||||
],
|
||||
});
|
||||
} finally {
|
||||
defaultRuntime.writeJson = original;
|
||||
}
|
||||
|
||||
const result = written as { entries: Array<Record<string, unknown>> };
|
||||
expect(result.entries[0]?.cause).toBe("timeout");
|
||||
expect(result.entries[0]?.error).toBe("cron: job execution timed out");
|
||||
expect(result.entries[0]?.errorReason).toBe("timeout");
|
||||
});
|
||||
|
||||
it("does not add cause fields to non-run-log entries", () => {
|
||||
let written: unknown;
|
||||
const original = defaultRuntime.writeJson;
|
||||
defaultRuntime.writeJson = (value: unknown) => {
|
||||
written = value;
|
||||
};
|
||||
try {
|
||||
printCronJson({
|
||||
entries: [{ errorReason: "timeout", status: "error" }],
|
||||
});
|
||||
} finally {
|
||||
defaultRuntime.writeJson = original;
|
||||
}
|
||||
|
||||
const result = written as { entries: Array<Record<string, unknown>> };
|
||||
expect(result.entries[0]?.cause).toBeUndefined();
|
||||
});
|
||||
});
|
||||
@@ -25,8 +25,31 @@ export const getCronChannelOptions = () => {
|
||||
return pluginIds.length > 0 ? ["last", ...pluginIds].join("|") : "last|<channel-id>";
|
||||
};
|
||||
|
||||
function addCronRunCauseFields(value: unknown): unknown {
|
||||
if (!value || typeof value !== "object") {
|
||||
return value;
|
||||
}
|
||||
const record = value as Record<string, unknown>;
|
||||
const entries = record.entries;
|
||||
if (!Array.isArray(entries)) {
|
||||
return value;
|
||||
}
|
||||
const nextEntries = entries.map((entry) => {
|
||||
if (!entry || typeof entry !== "object") {
|
||||
return entry;
|
||||
}
|
||||
const item = entry as Record<string, unknown>;
|
||||
if (item.action !== "finished" || typeof item.errorReason !== "string") {
|
||||
return item;
|
||||
}
|
||||
const cause = item.errorReason.trim();
|
||||
return cause ? Object.assign({}, item, { cause }) : item;
|
||||
});
|
||||
return { ...record, entries: nextEntries };
|
||||
}
|
||||
|
||||
export function printCronJson(value: unknown) {
|
||||
defaultRuntime.writeJson(value);
|
||||
defaultRuntime.writeJson(addCronRunCauseFields(value));
|
||||
}
|
||||
|
||||
/**
|
||||
|
||||
@@ -2,7 +2,11 @@ import fs from "node:fs/promises";
|
||||
import path from "node:path";
|
||||
import { describe, expect, it } from "vitest";
|
||||
import { MACOS_APP_SOURCES_DIR } from "../compat/legacy-names.js";
|
||||
import { CronDeliverySchema, CronJobStateSchema } from "../gateway/protocol/schema.js";
|
||||
import {
|
||||
CronDeliverySchema,
|
||||
CronJobStateSchema,
|
||||
CronRunLogEntrySchema,
|
||||
} from "../gateway/protocol/schema.js";
|
||||
|
||||
type SchemaLike = {
|
||||
anyOf?: Array<SchemaLike>;
|
||||
@@ -30,13 +34,9 @@ function extractDeliveryModes(schema: SchemaLike): string[] {
|
||||
}
|
||||
|
||||
function extractConstUnionValues(schema: SchemaLike): string[] {
|
||||
return Array.from(
|
||||
new Set(
|
||||
(schema.anyOf ?? [])
|
||||
.map((entry) => entry?.const)
|
||||
.filter((value): value is string => typeof value === "string"),
|
||||
),
|
||||
);
|
||||
return (schema.anyOf ?? [])
|
||||
.map((entry) => entry?.const)
|
||||
.filter((value): value is string => typeof value === "string");
|
||||
}
|
||||
|
||||
const UI_FILES = ["ui/src/ui/types.ts", "ui/src/ui/ui-types.ts", "ui/src/ui/views/cron.ts"];
|
||||
@@ -97,24 +97,32 @@ describe("cron protocol conformance", () => {
|
||||
expect(swift).toContain("let jobs:");
|
||||
});
|
||||
|
||||
it("cron job state schema keeps the full failover reason set", () => {
|
||||
const properties = (CronJobStateSchema as SchemaLike).properties ?? {};
|
||||
const lastErrorReason = properties.lastErrorReason as SchemaLike | undefined;
|
||||
if (lastErrorReason === undefined) {
|
||||
throw new Error("missing lastErrorReason schema");
|
||||
}
|
||||
expect(extractConstUnionValues(lastErrorReason)).toEqual([
|
||||
it("cron public schemas keep the full failover reason set", () => {
|
||||
const expectedReasons = [
|
||||
"auth",
|
||||
"auth_permanent",
|
||||
"format",
|
||||
"rate_limit",
|
||||
"overloaded",
|
||||
"billing",
|
||||
"server_error",
|
||||
"timeout",
|
||||
"model_not_found",
|
||||
"session_expired",
|
||||
"empty_response",
|
||||
"no_error_details",
|
||||
"unclassified",
|
||||
"unknown",
|
||||
]);
|
||||
];
|
||||
|
||||
const stateProperties = (CronJobStateSchema as SchemaLike).properties ?? {};
|
||||
const lastErrorReason = stateProperties.lastErrorReason as SchemaLike | undefined;
|
||||
expect(lastErrorReason).toBeDefined();
|
||||
expect(extractConstUnionValues(lastErrorReason ?? {})).toEqual(expectedReasons);
|
||||
|
||||
const runLogProperties = (CronRunLogEntrySchema as SchemaLike).properties ?? {};
|
||||
const errorReason = runLogProperties.errorReason as SchemaLike | undefined;
|
||||
expect(errorReason).toBeDefined();
|
||||
expect(extractConstUnionValues(errorReason ?? {})).toEqual(expectedReasons);
|
||||
});
|
||||
});
|
||||
|
||||
125
src/cron/run-log.error-reason.test.ts
Normal file
125
src/cron/run-log.error-reason.test.ts
Normal file
@@ -0,0 +1,125 @@
|
||||
import fs from "node:fs/promises";
|
||||
import os from "node:os";
|
||||
import path from "node:path";
|
||||
import { describe, expect, it } from "vitest";
|
||||
import { readCronRunLogEntriesPage } from "./run-log.js";
|
||||
|
||||
describe("cron run log errorReason", () => {
|
||||
it("backfills errorReason from timeout error text for older entries", async () => {
|
||||
const dir = await fs.mkdtemp(path.join(os.tmpdir(), "cron-run-log-"));
|
||||
const file = path.join(dir, "job.jsonl");
|
||||
await fs.writeFile(
|
||||
file,
|
||||
`${JSON.stringify({
|
||||
ts: 1,
|
||||
jobId: "job-1",
|
||||
action: "finished",
|
||||
status: "error",
|
||||
error: "cron: job execution timed out",
|
||||
})}\n`,
|
||||
"utf8",
|
||||
);
|
||||
|
||||
const page = await readCronRunLogEntriesPage(file, { limit: 10 });
|
||||
expect(page.entries[0]?.errorReason).toBe("timeout");
|
||||
});
|
||||
|
||||
it("validates persisted errorReason against the full failover reason set", async () => {
|
||||
const dir = await fs.mkdtemp(path.join(os.tmpdir(), "cron-run-log-"));
|
||||
const file = path.join(dir, "job.jsonl");
|
||||
const reasons = [
|
||||
"auth",
|
||||
"auth_permanent",
|
||||
"format",
|
||||
"rate_limit",
|
||||
"overloaded",
|
||||
"billing",
|
||||
"server_error",
|
||||
"timeout",
|
||||
"model_not_found",
|
||||
"session_expired",
|
||||
"empty_response",
|
||||
"no_error_details",
|
||||
"unclassified",
|
||||
"unknown",
|
||||
];
|
||||
await fs.writeFile(
|
||||
file,
|
||||
reasons
|
||||
.map((errorReason, index) =>
|
||||
JSON.stringify({
|
||||
ts: index + 1,
|
||||
jobId: "job-1",
|
||||
action: "finished",
|
||||
status: "error",
|
||||
errorReason,
|
||||
}),
|
||||
)
|
||||
.join("\n") + "\n",
|
||||
"utf8",
|
||||
);
|
||||
|
||||
const page = await readCronRunLogEntriesPage(file, { limit: 50, sortDir: "asc" });
|
||||
expect(page.entries.map((entry) => entry.errorReason)).toEqual(reasons);
|
||||
});
|
||||
|
||||
it("derives an invalid persisted reason from raw error text before exposing entries", async () => {
|
||||
const dir = await fs.mkdtemp(path.join(os.tmpdir(), "cron-run-log-"));
|
||||
const file = path.join(dir, "job.jsonl");
|
||||
await fs.writeFile(
|
||||
file,
|
||||
`${JSON.stringify({
|
||||
ts: 1,
|
||||
jobId: "job-1",
|
||||
action: "finished",
|
||||
status: "error",
|
||||
error: "upstream unavailable: 503 overloaded",
|
||||
errorReason: "not-a-real-reason",
|
||||
})}\n`,
|
||||
"utf8",
|
||||
);
|
||||
|
||||
const page = await readCronRunLogEntriesPage(file, { limit: 10 });
|
||||
expect(page.entries[0]?.errorReason).toBe("overloaded");
|
||||
});
|
||||
|
||||
it("uses provider context when deriving persisted run-log reasons", async () => {
|
||||
const dir = await fs.mkdtemp(path.join(os.tmpdir(), "cron-run-log-"));
|
||||
const file = path.join(dir, "job.jsonl");
|
||||
await fs.writeFile(
|
||||
file,
|
||||
`${JSON.stringify({
|
||||
ts: 1,
|
||||
jobId: "job-1",
|
||||
action: "finished",
|
||||
status: "error",
|
||||
error: "403 Key limit exceeded (monthly limit)",
|
||||
provider: "openrouter",
|
||||
})}\n`,
|
||||
"utf8",
|
||||
);
|
||||
|
||||
const page = await readCronRunLogEntriesPage(file, { limit: 10 });
|
||||
expect(page.entries[0]?.errorReason).toBe("billing");
|
||||
});
|
||||
|
||||
it("includes derived errorReason values in run-log search", async () => {
|
||||
const dir = await fs.mkdtemp(path.join(os.tmpdir(), "cron-run-log-"));
|
||||
const file = path.join(dir, "job.jsonl");
|
||||
await fs.writeFile(
|
||||
file,
|
||||
`${JSON.stringify({
|
||||
ts: 1,
|
||||
jobId: "job-1",
|
||||
action: "finished",
|
||||
status: "error",
|
||||
error: "cron: job execution timed out",
|
||||
})}\n`,
|
||||
"utf8",
|
||||
);
|
||||
|
||||
const page = await readCronRunLogEntriesPage(file, { limit: 10, query: "timeout" });
|
||||
expect(page.entries).toHaveLength(1);
|
||||
expect(page.entries[0]?.errorReason).toBe("timeout");
|
||||
});
|
||||
});
|
||||
@@ -1,6 +1,8 @@
|
||||
import fsSync from "node:fs";
|
||||
import fs from "node:fs/promises";
|
||||
import path from "node:path";
|
||||
import { resolveFailoverReasonFromError } from "../agents/failover-error.js";
|
||||
import type { FailoverReason } from "../agents/pi-embedded-helpers/types.js";
|
||||
import { parseByteSize } from "../cli/parse-bytes.js";
|
||||
import type { CronConfig } from "../config/types.cron.js";
|
||||
import { appendRegularFile, isPathInside, pathExists, root as fsRoot } from "../infra/fs-safe.js";
|
||||
@@ -27,6 +29,7 @@ export type CronRunLogEntry = {
|
||||
action: "finished";
|
||||
status?: CronRunStatus;
|
||||
error?: string;
|
||||
errorReason?: FailoverReason;
|
||||
summary?: string;
|
||||
diagnostics?: CronRunDiagnostics;
|
||||
delivered?: boolean;
|
||||
@@ -72,6 +75,29 @@ type ReadCronRunLogAllPageOptions = Omit<ReadCronRunLogPageOptions, "jobId"> & {
|
||||
jobNameById?: Record<string, string>;
|
||||
};
|
||||
|
||||
const CRON_FAILOVER_REASONS = new Set<FailoverReason>([
|
||||
"auth",
|
||||
"auth_permanent",
|
||||
"format",
|
||||
"rate_limit",
|
||||
"overloaded",
|
||||
"billing",
|
||||
"server_error",
|
||||
"timeout",
|
||||
"model_not_found",
|
||||
"session_expired",
|
||||
"empty_response",
|
||||
"no_error_details",
|
||||
"unclassified",
|
||||
"unknown",
|
||||
]);
|
||||
|
||||
function normalizeCronRunLogErrorReason(value: unknown): FailoverReason | undefined {
|
||||
return typeof value === "string" && CRON_FAILOVER_REASONS.has(value as FailoverReason)
|
||||
? (value as FailoverReason)
|
||||
: undefined;
|
||||
}
|
||||
|
||||
function assertSafeCronRunLogJobId(jobId: string): string {
|
||||
const trimmed = jobId.trim();
|
||||
if (!trimmed) {
|
||||
@@ -309,12 +335,20 @@ function parseAllRunLogEntries(raw: string, opts?: { jobId?: string }): CronRunL
|
||||
obj.usage && typeof obj.usage === "object"
|
||||
? (obj.usage as Record<string, unknown>)
|
||||
: undefined;
|
||||
const normalizedError = typeof obj.error === "string" ? obj.error : undefined;
|
||||
const normalizedProvider =
|
||||
typeof obj.provider === "string" && obj.provider.trim() ? obj.provider : undefined;
|
||||
const normalizedErrorReason =
|
||||
normalizeCronRunLogErrorReason(obj.errorReason) ??
|
||||
resolveFailoverReasonFromError(normalizedError, normalizedProvider) ??
|
||||
undefined;
|
||||
const entry: CronRunLogEntry = {
|
||||
ts: obj.ts,
|
||||
jobId: obj.jobId,
|
||||
action: "finished",
|
||||
status: obj.status,
|
||||
error: obj.error,
|
||||
error: normalizedError,
|
||||
errorReason: normalizedErrorReason,
|
||||
summary: obj.summary,
|
||||
runId: typeof obj.runId === "string" && obj.runId.trim() ? obj.runId : undefined,
|
||||
diagnostics: normalizeCronRunDiagnostics(obj.diagnostics),
|
||||
@@ -322,8 +356,7 @@ function parseAllRunLogEntries(raw: string, opts?: { jobId?: string }): CronRunL
|
||||
durationMs: obj.durationMs,
|
||||
nextRunAtMs: obj.nextRunAtMs,
|
||||
model: typeof obj.model === "string" && obj.model.trim() ? obj.model : undefined,
|
||||
provider:
|
||||
typeof obj.provider === "string" && obj.provider.trim() ? obj.provider : undefined,
|
||||
provider: normalizedProvider,
|
||||
usage: usage
|
||||
? {
|
||||
input_tokens: typeof usage.input_tokens === "number" ? usage.input_tokens : undefined,
|
||||
@@ -447,6 +480,7 @@ export async function readCronRunLogEntriesPage(
|
||||
[
|
||||
entry.summary ?? "",
|
||||
entry.error ?? "",
|
||||
entry.errorReason ?? "",
|
||||
entry.diagnostics?.summary ?? "",
|
||||
...(entry.diagnostics?.entries ?? []).map((diagnostic) => diagnostic.message),
|
||||
entry.jobId,
|
||||
@@ -535,6 +569,7 @@ export async function readCronRunLogEntriesPageAll(
|
||||
return [
|
||||
entry.summary ?? "",
|
||||
entry.error ?? "",
|
||||
entry.errorReason ?? "",
|
||||
entry.diagnostics?.summary ?? "",
|
||||
...(entry.diagnostics?.entries ?? []).map((diagnostic) => diagnostic.message),
|
||||
entry.jobId,
|
||||
|
||||
@@ -414,6 +414,138 @@ describe("CronService failure alerts", () => {
|
||||
await store.cleanup();
|
||||
});
|
||||
|
||||
it("surfaces classified causes before raw errors in failure alerts", async () => {
|
||||
const store = await makeStorePath();
|
||||
const sendCronFailureAlert = vi.fn(async () => undefined);
|
||||
const runIsolatedAgentJob = vi.fn(async () => ({
|
||||
status: "error" as const,
|
||||
error: "cron: job execution timed out",
|
||||
}));
|
||||
|
||||
const cron = createFailureAlertCron({
|
||||
storePath: store.storePath,
|
||||
cronConfig: {
|
||||
failureAlert: {
|
||||
enabled: true,
|
||||
after: 1,
|
||||
},
|
||||
},
|
||||
runIsolatedAgentJob,
|
||||
sendCronFailureAlert,
|
||||
});
|
||||
|
||||
await cron.start();
|
||||
const job = await cron.add({
|
||||
name: "timeout cause alert",
|
||||
enabled: true,
|
||||
schedule: { kind: "every", everyMs: 60_000 },
|
||||
sessionTarget: "isolated",
|
||||
wakeMode: "next-heartbeat",
|
||||
payload: { kind: "agentTurn", message: "ping" },
|
||||
delivery: { mode: "announce", channel: "telegram", to: "19098680" },
|
||||
});
|
||||
|
||||
await cron.run(job.id, "force");
|
||||
expect(sendCronFailureAlert).toHaveBeenCalledTimes(1);
|
||||
const alertText = alertCallArg(sendCronFailureAlert).text;
|
||||
expect(alertText).toBe(
|
||||
'Cron job "timeout cause alert" failed 1 times\n' +
|
||||
"Cause: timeout\n" +
|
||||
"Last error: cron: job execution timed out",
|
||||
);
|
||||
|
||||
cron.stop();
|
||||
await store.cleanup();
|
||||
});
|
||||
|
||||
it("uses provider context when surfacing failure alert causes", async () => {
|
||||
const store = await makeStorePath();
|
||||
const sendCronFailureAlert = vi.fn(async () => undefined);
|
||||
const runIsolatedAgentJob = vi.fn(async () => ({
|
||||
status: "error" as const,
|
||||
error: "403 Key limit exceeded (monthly limit)",
|
||||
provider: "openrouter",
|
||||
}));
|
||||
|
||||
const cron = createFailureAlertCron({
|
||||
storePath: store.storePath,
|
||||
cronConfig: {
|
||||
failureAlert: {
|
||||
enabled: true,
|
||||
after: 1,
|
||||
},
|
||||
},
|
||||
runIsolatedAgentJob,
|
||||
sendCronFailureAlert,
|
||||
});
|
||||
|
||||
await cron.start();
|
||||
const job = await cron.add({
|
||||
name: "provider limit alert",
|
||||
enabled: true,
|
||||
schedule: { kind: "every", everyMs: 60_000 },
|
||||
sessionTarget: "isolated",
|
||||
wakeMode: "next-heartbeat",
|
||||
payload: { kind: "agentTurn", message: "ping" },
|
||||
delivery: { mode: "announce", channel: "telegram", to: "19098680" },
|
||||
});
|
||||
|
||||
await cron.run(job.id, "force");
|
||||
expect(sendCronFailureAlert).toHaveBeenCalledTimes(1);
|
||||
const alertText = alertCallArg(sendCronFailureAlert).text;
|
||||
expect(alertText).toBe(
|
||||
'Cron job "provider limit alert" failed 1 times\n' +
|
||||
"Cause: billing\n" +
|
||||
"Last error: 403 Key limit exceeded (monthly limit)",
|
||||
);
|
||||
|
||||
cron.stop();
|
||||
await store.cleanup();
|
||||
});
|
||||
|
||||
it("keeps skipped alert text unchanged when the skip reason looks classifiable", async () => {
|
||||
const store = await makeStorePath();
|
||||
const sendCronFailureAlert = vi.fn(async () => undefined);
|
||||
const runIsolatedAgentJob = vi.fn(async () => ({
|
||||
status: "skipped" as const,
|
||||
error: "cron: job execution timed out",
|
||||
}));
|
||||
|
||||
const cron = createFailureAlertCron({
|
||||
storePath: store.storePath,
|
||||
cronConfig: {
|
||||
failureAlert: {
|
||||
enabled: true,
|
||||
after: 1,
|
||||
includeSkipped: true,
|
||||
},
|
||||
},
|
||||
runIsolatedAgentJob,
|
||||
sendCronFailureAlert,
|
||||
});
|
||||
|
||||
await cron.start();
|
||||
const job = await cron.add({
|
||||
name: "skipped timeout",
|
||||
enabled: true,
|
||||
schedule: { kind: "every", everyMs: 60_000 },
|
||||
sessionTarget: "isolated",
|
||||
wakeMode: "next-heartbeat",
|
||||
payload: { kind: "agentTurn", message: "ping" },
|
||||
delivery: { mode: "announce", channel: "telegram", to: "19098680" },
|
||||
});
|
||||
|
||||
await cron.run(job.id, "force");
|
||||
expect(sendCronFailureAlert).toHaveBeenCalledTimes(1);
|
||||
const alertText = alertCallArg(sendCronFailureAlert).text;
|
||||
expect(alertText).toBe(
|
||||
'Cron job "skipped timeout" skipped 1 times\nSkip reason: cron: job execution timed out',
|
||||
);
|
||||
|
||||
cron.stop();
|
||||
await store.cleanup();
|
||||
});
|
||||
|
||||
it("tracks skipped runs without alerting or affecting error backoff when includeSkipped is off", async () => {
|
||||
const store = await makeStorePath();
|
||||
const sendCronFailureAlert = vi.fn(async () => undefined);
|
||||
|
||||
@@ -775,6 +775,7 @@ async function finishPreparedManualRun(
|
||||
error: coreResult.error,
|
||||
diagnostics: coreResult.diagnostics,
|
||||
delivered: coreResult.delivered,
|
||||
provider: coreResult.provider,
|
||||
startedAt,
|
||||
endedAt,
|
||||
},
|
||||
|
||||
@@ -755,14 +755,20 @@ function emitFailureAlert(
|
||||
mode?: "announce" | "webhook";
|
||||
accountId?: string;
|
||||
status: "error" | "skipped";
|
||||
provider?: string;
|
||||
},
|
||||
) {
|
||||
const safeJobName = params.job.name || params.job.id;
|
||||
const truncatedError = (params.error?.trim() || "unknown reason").slice(0, 200);
|
||||
const errorReason =
|
||||
params.status === "error" && typeof params.error === "string"
|
||||
? (resolveFailoverReasonFromError(params.error, params.provider) ?? undefined)
|
||||
: undefined;
|
||||
const statusVerb = params.status === "skipped" ? "skipped" : "failed";
|
||||
const detailLabel = params.status === "skipped" ? "Skip reason" : "Last error";
|
||||
const text = [
|
||||
`Cron job "${safeJobName}" ${statusVerb} ${params.consecutiveErrors} times`,
|
||||
...(errorReason ? [`Cause: ${errorReason}`] : []),
|
||||
`${detailLabel}: ${truncatedError}`,
|
||||
].join("\n");
|
||||
|
||||
@@ -802,6 +808,7 @@ function maybeEmitFailureAlert(
|
||||
alertConfig: ResolvedFailureAlert | null;
|
||||
status: "error" | "skipped";
|
||||
error?: string;
|
||||
provider?: string;
|
||||
consecutiveCount: number;
|
||||
},
|
||||
) {
|
||||
@@ -828,6 +835,7 @@ function maybeEmitFailureAlert(
|
||||
mode: params.alertConfig.mode,
|
||||
accountId: params.alertConfig.accountId,
|
||||
status: params.status,
|
||||
provider: params.provider,
|
||||
});
|
||||
params.job.state.lastFailureAlertAtMs = now;
|
||||
}
|
||||
@@ -845,6 +853,7 @@ export function applyJobResult(
|
||||
error?: string;
|
||||
diagnostics?: CronRunOutcome["diagnostics"];
|
||||
delivered?: boolean;
|
||||
provider?: string;
|
||||
startedAt: number;
|
||||
endedAt: number;
|
||||
},
|
||||
@@ -873,7 +882,7 @@ export function applyJobResult(
|
||||
job.state.lastDiagnosticSummary = summarizeCronRunDiagnostics(job.state.lastDiagnostics);
|
||||
job.state.lastErrorReason =
|
||||
result.status === "error" && typeof result.error === "string"
|
||||
? (resolveFailoverReasonFromError(result.error) ?? undefined)
|
||||
? (resolveFailoverReasonFromError(result.error, result.provider) ?? undefined)
|
||||
: undefined;
|
||||
if (result.status === "error") {
|
||||
state.deps.log.warn(
|
||||
@@ -915,6 +924,7 @@ export function applyJobResult(
|
||||
alertConfig,
|
||||
status: "error",
|
||||
error: result.error,
|
||||
provider: result.provider,
|
||||
consecutiveCount: job.state.consecutiveErrors,
|
||||
});
|
||||
} else if (result.status === "skipped") {
|
||||
@@ -926,6 +936,7 @@ export function applyJobResult(
|
||||
alertConfig,
|
||||
status: "skipped",
|
||||
error: result.error,
|
||||
provider: result.provider,
|
||||
consecutiveCount: job.state.consecutiveSkipped,
|
||||
});
|
||||
} else {
|
||||
@@ -1080,6 +1091,7 @@ function applyOutcomeToStoredJob(state: CronServiceState, result: TimedCronRunOu
|
||||
error: result.error,
|
||||
diagnostics: result.diagnostics,
|
||||
delivered: result.delivered,
|
||||
provider: result.provider,
|
||||
startedAt: result.startedAt,
|
||||
endedAt: result.endedAt,
|
||||
});
|
||||
@@ -1102,6 +1114,7 @@ function applyOutcomeToStoredJob(state: CronServiceState, result: TimedCronRunOu
|
||||
error: result.error,
|
||||
diagnostics: result.diagnostics,
|
||||
delivered: result.delivered,
|
||||
provider: result.provider,
|
||||
startedAt: result.startedAt,
|
||||
endedAt: result.endedAt,
|
||||
});
|
||||
@@ -1952,6 +1965,7 @@ export async function executeJob(
|
||||
error: coreResult.error,
|
||||
diagnostics: coreResult.diagnostics,
|
||||
delivered: coreResult.delivered,
|
||||
provider: coreResult.provider,
|
||||
startedAt,
|
||||
endedAt,
|
||||
});
|
||||
|
||||
@@ -64,12 +64,15 @@ const CronDeliveryStatusSchema = Type.Union([
|
||||
]);
|
||||
const CronFailoverReasonSchema = Type.Union([
|
||||
Type.Literal("auth"),
|
||||
Type.Literal("auth_permanent"),
|
||||
Type.Literal("format"),
|
||||
Type.Literal("rate_limit"),
|
||||
Type.Literal("overloaded"),
|
||||
Type.Literal("billing"),
|
||||
Type.Literal("server_error"),
|
||||
Type.Literal("timeout"),
|
||||
Type.Literal("model_not_found"),
|
||||
Type.Literal("session_expired"),
|
||||
Type.Literal("empty_response"),
|
||||
Type.Literal("no_error_details"),
|
||||
Type.Literal("unclassified"),
|
||||
@@ -434,6 +437,7 @@ export const CronRunLogEntrySchema = Type.Object(
|
||||
action: Type.Literal("finished"),
|
||||
status: Type.Optional(CronRunStatusSchema),
|
||||
error: Type.Optional(Type.String()),
|
||||
errorReason: Type.Optional(CronFailoverReasonSchema),
|
||||
summary: Type.Optional(Type.String()),
|
||||
diagnostics: Type.Optional(CronRunDiagnosticsSchema),
|
||||
delivered: Type.Optional(Type.Boolean()),
|
||||
|
||||
Reference in New Issue
Block a user