fix(cron): skip isolated runs when local providers are down

This commit is contained in:
Peter Steinberger
2026-04-28 02:12:03 +01:00
parent 4e63f710f1
commit a66605bf23
9 changed files with 523 additions and 0 deletions

View File

@@ -28,6 +28,7 @@ Docs: https://docs.openclaw.ai
### Fixes
- Cron/providers: preflight local Ollama and OpenAI-compatible provider endpoints before isolated cron agent turns, record unreachable local providers as skipped runs, and cache dead-endpoint probes so many jobs do not hammer the same stopped local server. Fixes #58584. Thanks @jpeghead.
- CLI/status: show skipped fast-path memory checks as `not checked` and report active custom memory plugin runtime status from `status --json --all` without requiring built-in `agents.defaults.memorySearch`, so plugins such as memory-lancedb-pro and memory-cms no longer look unavailable when their own runtime is healthy. Fixes #56968. Thanks @Tony-ooo and @aderius.
- Gateway/channels: record and log unexpected clean channel monitor exits so channels that return without throwing no longer appear stopped with no error. Fixes #73099. Thanks @balaji1968-kingler.
- Channels/Telegram: centralize polling update tracking so accepted offsets remain durable across restarts, same-process handler failures can still retry, and slow offset writes cannot overwrite newer accepted watermarks. Refs #73115. Thanks @vdruts.

View File

@@ -144,6 +144,8 @@ Fast mode follows the resolved live selection too. If the selected model config
If an isolated run hits a live model-switch handoff, cron retries with the switched provider/model and persists that live selection for the active run before retrying. When the switch also carries a new auth profile, cron persists that auth profile override for the active run too. Retries are bounded: after the initial attempt plus 2 switch retries, cron aborts instead of looping forever.
Before an isolated cron run enters the agent runner, OpenClaw checks that the local provider endpoint is reachable for configured `api: "ollama"` and `api: "openai-completions"` providers whose `baseUrl` is loopback, private-network, or `.local`. If that endpoint is down, the run is recorded as `skipped` with a clear provider/model error instead of starting a model call. The endpoint result is cached for 5 minutes, so many due jobs using the same dead local Ollama, vLLM, SGLang, or LM Studio server share one small probe instead of creating a request storm. Skipped provider-preflight runs do not increment execution-error backoff; enable `failureAlert.includeSkipped` when you want repeated skip notifications.
## Delivery and output
| Mode | What happens |

View File

@@ -83,6 +83,8 @@ Recurring jobs use exponential retry backoff after consecutive errors: 30s, 1m,
Skipped runs are tracked separately from execution errors. They do not affect retry backoff, but `openclaw cron edit <job-id> --failure-alert-include-skipped` can opt failure alerts into repeated skipped-run notifications.
For isolated jobs that target a local configured model provider, cron runs a lightweight provider preflight before starting the agent turn. Loopback, private-network, and `.local` `api: "ollama"` providers are probed at `/api/tags`; local OpenAI-compatible providers such as vLLM, SGLang, and LM Studio are probed at `/models`. If the endpoint is unreachable, the run is recorded as `skipped` and retried on a later schedule; matching dead endpoints are cached for 5 minutes to avoid many jobs hammering the same local server.
Note: cron job definitions live in `jobs.json`, while pending runtime state lives in `jobs-state.json`. If `jobs.json` is edited externally, the Gateway reloads changed schedules and clears stale pending slots; formatting-only rewrites do not clear the pending slot.
### Manual runs

View File

@@ -215,6 +215,13 @@ that as an exact user selection. If the configured Ollama `baseUrl` is
unreachable, the next reply fails with the provider error instead of silently
answering from another configured fallback model.
Isolated cron jobs do one extra local safety check before they start the agent
turn. If the selected model resolves to a local, private-network, or `.local`
Ollama provider and `/api/tags` is unreachable, OpenClaw records that cron run
as `skipped` with the selected `ollama/<model>` in the error text. The endpoint
preflight is cached for 5 minutes, so multiple cron jobs pointed at the same
stopped Ollama daemon do not all launch failing model requests.
Live-verify the local text path, native stream path, and embeddings against
local Ollama with:

View File

@@ -0,0 +1,84 @@
import { beforeEach, describe, expect, it } from "vitest";
import {
loadRunCronIsolatedAgentTurn,
makeCronSession,
preflightCronModelProviderMock,
resolveConfiguredModelRefMock,
resolveCronSessionMock,
resetRunCronIsolatedAgentTurnHarness,
runEmbeddedPiAgentMock,
} from "./isolated-agent/run.test-harness.js";
// Load the module under test only after the harness has installed its mocks.
const runCronIsolatedAgentTurn = await loadRunCronIsolatedAgentTurn();

describe("runCronIsolatedAgentTurn model provider preflight", () => {
  beforeEach(() => {
    // Reset every harness mock so per-test *Once queues start empty.
    resetRunCronIsolatedAgentTurnHarness();
    // Pin the resolved model to a local Ollama selection for this suite.
    resolveConfiguredModelRefMock.mockReturnValue({
      provider: "ollama",
      model: "qwen3:32b",
    });
    resolveCronSessionMock.mockReturnValue(
      makeCronSession({
        sessionEntry: {
          sessionId: "cron-session",
          updatedAt: 0,
          systemSent: false,
          skillsSnapshot: undefined,
        },
      }),
    );
  });

  it("skips isolated cron execution when the local model provider is unavailable", async () => {
    // Queue exactly one "unavailable" preflight so this run sees a dead endpoint.
    preflightCronModelProviderMock.mockResolvedValueOnce({
      status: "unavailable",
      reason:
        "Agent cron job uses ollama/qwen3:32b but the local provider endpoint is not reachable at http://127.0.0.1:11434.",
      provider: "ollama",
      model: "qwen3:32b",
      baseUrl: "http://127.0.0.1:11434",
      retryAfterMs: 300000,
    });
    const result = await runCronIsolatedAgentTurn({
      cfg: {
        models: {
          providers: {
            ollama: {
              api: "ollama",
              baseUrl: "http://127.0.0.1:11434",
              models: [],
            },
          },
        },
      },
      deps: {} as never,
      job: {
        id: "dead-ollama",
        name: "Dead Ollama",
        enabled: true,
        createdAtMs: 0,
        updatedAtMs: 0,
        schedule: { kind: "cron", expr: "*/5 * * * *", tz: "UTC" },
        sessionTarget: "isolated",
        state: {},
        wakeMode: "next-heartbeat",
        payload: { kind: "agentTurn", message: "summarize" },
        delivery: { mode: "none" },
      },
      message: "summarize",
      sessionKey: "cron:dead-ollama",
      lane: "cron",
    });
    // The run must be recorded as skipped with provider/model context attached…
    expect(result).toMatchObject({
      status: "skipped",
      provider: "ollama",
      model: "qwen3:32b",
      sessionId: "cron-session",
    });
    expect(result.error).toContain("local provider endpoint is not reachable");
    // …and the embedded agent itself must never have been started.
    expect(runEmbeddedPiAgentMock).not.toHaveBeenCalled();
  });
});

View File

@@ -0,0 +1,163 @@
import { beforeEach, describe, expect, it, vi } from "vitest";
const { fetchWithSsrFGuardMock } = vi.hoisted(() => ({
fetchWithSsrFGuardMock: vi.fn(),
}));
vi.mock("../../infra/net/fetch-guard.js", () => ({
fetchWithSsrFGuard: fetchWithSsrFGuardMock,
}));
import {
preflightCronModelProvider,
resetCronModelProviderPreflightCacheForTest,
} from "./model-preflight.runtime.js";
/**
 * Queue a single fetch-guard result whose response carries the given HTTP
 * status. Any status at all counts as "reachable" for the preflight under test.
 */
function mockReachableResponse(status = 200) {
  const releaseStub = vi.fn(async () => {});
  fetchWithSsrFGuardMock.mockResolvedValueOnce({
    release: releaseStub,
    response: { status },
  });
}
describe("preflightCronModelProvider", () => {
  beforeEach(() => {
    // Clear the fetch mock queue AND the module-level probe cache so cached
    // results never leak between tests.
    fetchWithSsrFGuardMock.mockReset();
    resetCronModelProviderPreflightCacheForTest();
  });

  it("skips network checks for cloud provider URLs", async () => {
    const result = await preflightCronModelProvider({
      cfg: {
        models: {
          providers: {
            openai: {
              api: "openai-responses",
              baseUrl: "https://api.openai.com/v1",
              models: [],
            },
          },
        },
      },
      provider: "openai",
      model: "gpt-5.4",
    });
    // Cloud endpoints are never probed; the preflight reports available.
    expect(result).toEqual({ status: "available" });
    expect(fetchWithSsrFGuardMock).not.toHaveBeenCalled();
  });

  it("treats any HTTP response from a local OpenAI-compatible endpoint as reachable", async () => {
    // Even an auth failure (401) proves the server process is alive.
    mockReachableResponse(401);
    const result = await preflightCronModelProvider({
      cfg: {
        models: {
          providers: {
            vllm: {
              api: "openai-completions",
              baseUrl: "http://127.0.0.1:8000/v1",
              models: [],
            },
          },
        },
      },
      provider: "vllm",
      model: "llama",
    });
    expect(result).toEqual({ status: "available" });
    // OpenAI-compatible providers are probed at <baseUrl>/models.
    expect(fetchWithSsrFGuardMock).toHaveBeenCalledWith(
      expect.objectContaining({
        url: "http://127.0.0.1:8000/v1/models",
        timeoutMs: 2500,
      }),
    );
  });

  it("marks unreachable local Ollama endpoints unavailable and caches the result", async () => {
    fetchWithSsrFGuardMock.mockRejectedValueOnce(new Error("ECONNREFUSED"));
    // Provider key casing ("Ollama") deliberately differs from the lookup id
    // ("ollama"): resolution must match via normalized provider ids.
    const cfg = {
      models: {
        providers: {
          Ollama: {
            api: "ollama" as const,
            baseUrl: "http://localhost:11434",
            models: [],
          },
        },
      },
    };
    const first = await preflightCronModelProvider({
      cfg,
      provider: "ollama",
      model: "qwen3:32b",
      nowMs: 1000,
    });
    // Second call lands well inside the cache TTL and uses a different model.
    const second = await preflightCronModelProvider({
      cfg,
      provider: "ollama",
      model: "llama3.3:70b",
      nowMs: 2000,
    });
    expect(first).toMatchObject({
      status: "unavailable",
      provider: "ollama",
      model: "qwen3:32b",
      baseUrl: "http://localhost:11434",
      retryAfterMs: 300000,
    });
    expect(second).toMatchObject({
      status: "unavailable",
      provider: "ollama",
      model: "llama3.3:70b",
      baseUrl: "http://localhost:11434",
      retryAfterMs: 300000,
    });
    // Only one real probe happened: the second result came from the cache.
    expect(fetchWithSsrFGuardMock).toHaveBeenCalledTimes(1);
    expect(fetchWithSsrFGuardMock).toHaveBeenCalledWith(
      expect.objectContaining({
        url: "http://localhost:11434/api/tags",
        auditContext: "cron-model-provider-preflight",
      }),
    );
  });

  it("retries an unavailable endpoint after the cache ttl", async () => {
    // First probe fails with a connection error; the follow-up probe succeeds.
    fetchWithSsrFGuardMock.mockRejectedValueOnce(new Error("ECONNREFUSED")).mockResolvedValueOnce({
      response: { status: 200 },
      release: vi.fn(async () => {}),
    });
    const cfg = {
      models: {
        providers: {
          ollama: {
            api: "ollama" as const,
            baseUrl: "http://127.0.0.1:11434",
            models: [],
          },
        },
      },
    };
    const first = await preflightCronModelProvider({
      cfg,
      provider: "ollama",
      model: "llama3",
      nowMs: 1000,
    });
    // 300001ms later the cached failure has expired, forcing a fresh probe.
    const second = await preflightCronModelProvider({
      cfg,
      provider: "ollama",
      model: "llama3",
      nowMs: 1000 + 300001,
    });
    expect(first.status).toBe("unavailable");
    expect(second).toEqual({ status: "available" });
    expect(fetchWithSsrFGuardMock).toHaveBeenCalledTimes(2);
  });
});

View File

@@ -0,0 +1,229 @@
import { normalizeProviderId } from "../../agents/provider-id.js";
import type { ModelProviderConfig } from "../../config/types.models.js";
import type { OpenClawConfig } from "../../config/types.openclaw.js";
import { fetchWithSsrFGuard } from "../../infra/net/fetch-guard.js";
import type { SsrFPolicy } from "../../infra/net/ssrf.js";
import { normalizeLowercaseStringOrEmpty } from "../../shared/string-coerce.js";
// Both successful and failed endpoint probes are cached this long, so many
// due cron jobs pointed at the same local server share one probe result.
const PREFLIGHT_CACHE_TTL_MS = 5 * 60_000;
// Per-probe HTTP timeout; keeps cron scheduling from stalling on a dead host.
const PREFLIGHT_TIMEOUT_MS = 2_500;

// Only these two provider API kinds get a local-endpoint preflight.
type PreflightApi = "ollama" | "openai-completions";

// Public result of a preflight: either the run may proceed, or it should be
// skipped with a human-readable reason plus retry metadata for the caller.
export type CronModelProviderPreflightResult =
  | { status: "available" }
  | {
      status: "unavailable";
      reason: string;
      provider: string;
      model: string;
      baseUrl: string;
      retryAfterMs: number;
    };

// Internal per-endpoint probe outcome, before provider/model context is added.
type EndpointPreflightResult =
  | { status: "available" }
  | {
      status: "unavailable";
      error: unknown;
    };

// Cache entry: when the probe ran and what it found.
type CachedEndpointPreflightResult = {
  checkedAtMs: number;
  result: EndpointPreflightResult;
};

// Module-level probe cache keyed by `${api}\0${baseUrl}`.
const preflightCache = new Map<string, CachedEndpointPreflightResult>();
/**
 * Look up a provider entry in the config, first by its exact key and then by
 * comparing normalized provider ids, so casing/alias differences still match.
 * Returns undefined when no providers are configured or nothing matches.
 */
function resolveProviderConfig(
  cfg: OpenClawConfig,
  provider: string,
): ModelProviderConfig | undefined {
  const providers = cfg.models?.providers;
  if (!providers) {
    return undefined;
  }
  const exactMatch = providers[provider];
  if (exactMatch) {
    return exactMatch;
  }
  const wanted = normalizeProviderId(provider);
  for (const [key, value] of Object.entries(providers)) {
    if (normalizeProviderId(key) === wanted) {
      return value;
    }
  }
  return undefined;
}
/**
 * Normalize a configured base URL: trim whitespace and drop every trailing
 * slash so probe paths can be appended verbatim. Non-strings and strings that
 * normalize to empty yield undefined.
 */
function normalizeBaseUrl(value: unknown): string | undefined {
  if (typeof value !== "string") {
    return undefined;
  }
  let candidate = value.trim();
  while (candidate.endsWith("/")) {
    candidate = candidate.slice(0, -1);
  }
  return candidate === "" ? undefined : candidate;
}
/**
 * Map a provider's configured `api` field to one of the two probeable API
 * kinds; anything else (cloud APIs, missing field) yields undefined.
 */
function normalizeProbeApi(providerConfig: ModelProviderConfig): PreflightApi | undefined {
  switch (normalizeLowercaseStringOrEmpty(providerConfig.api)) {
    case "ollama":
      return "ollama";
    case "openai-completions":
      return "openai-completions";
    default:
      return undefined;
  }
}
/**
 * True when the host is a dotted-decimal IPv4 literal inside one of the
 * RFC 1918 private ranges: 10.0.0.0/8, 172.16.0.0/12, or 192.168.0.0/16.
 */
function isPrivateIpv4Host(host: string): boolean {
  const match = /^(\d+)\.(\d+)\.(\d+)\.(\d+)$/.exec(host);
  if (match === null) {
    return false;
  }
  const octets = match.slice(1).map((part) => Number.parseInt(part, 10));
  if (!octets.every((part) => Number.isInteger(part) && part >= 0 && part <= 255)) {
    return false;
  }
  const [first, second] = octets;
  if (first === 10) {
    return true;
  }
  if (first === 172) {
    return second >= 16 && second <= 31;
  }
  return first === 192 && second === 168;
}
/**
 * Decide whether a provider base URL points at a local / private-network
 * endpoint that deserves a reachability preflight before a cron run.
 *
 * Matches `localhost`, the whole IPv4 loopback block 127.0.0.0/8 (the
 * original only matched the single address 127.0.0.1, missing servers bound
 * to e.g. 127.0.0.2), the IPv6 loopback forms, 0.0.0.0, `.local` mDNS hosts,
 * and RFC 1918 private IPv4 ranges. Unparseable URLs return false so they
 * fall through to the normal model runner.
 */
function isLocalProviderBaseUrl(baseUrl: string): boolean {
  try {
    // WHATWG URL lowercases hostnames but keeps IPv6 literals bracketed;
    // strip the brackets before comparing.
    let host = new URL(baseUrl).hostname.toLowerCase();
    if (host.startsWith("[") && host.endsWith("]")) {
      host = host.slice(1, -1);
    }
    return (
      host === "localhost" ||
      host === "0.0.0.0" ||
      host === "::1" ||
      host === "::ffff:7f00:1" ||
      host === "::ffff:127.0.0.1" ||
      host.endsWith(".local") ||
      // Any 127.x.y.z address is loopback, not only 127.0.0.1. The URL
      // parser rejects dotted-decimal octets > 255, so this cannot over-match.
      /^127\.\d{1,3}\.\d{1,3}\.\d{1,3}$/.test(host) ||
      isPrivateIpv4Host(host)
    );
  } catch {
    return false;
  }
}
/**
 * Build the cheap health-probe URL for a local provider: Ollama exposes
 * `/api/tags`, OpenAI-compatible servers expose `/models` under the base URL.
 */
function buildProbeUrl(api: PreflightApi, baseUrl: string): string {
  const probePath = api === "ollama" ? "/api/tags" : "/models";
  return baseUrl + probePath;
}
/**
 * Build an SSRF policy that allows exactly this base URL's hostname over
 * http/https, including private-network addresses. Returns undefined for
 * unparseable URLs or non-HTTP schemes so the guard falls back to defaults.
 */
function buildLocalProviderSsrFPolicy(baseUrl: string): SsrFPolicy | undefined {
  let parsed: URL;
  try {
    parsed = new URL(baseUrl);
  } catch {
    return undefined;
  }
  const isHttpScheme = parsed.protocol === "http:" || parsed.protocol === "https:";
  if (!isHttpScheme) {
    return undefined;
  }
  return {
    hostnameAllowlist: [parsed.hostname],
    allowPrivateNetwork: true,
  };
}
/**
 * Compose the human-readable skip reason for a dead local provider endpoint:
 * what the job wanted, what OpenClaw will do next, and the underlying error.
 */
function formatUnavailableReason(params: {
  provider: string;
  model: string;
  baseUrl: string;
  error: unknown;
}): string {
  const headline = `Agent cron job uses ${params.provider}/${params.model} but the local provider endpoint is not reachable at ${params.baseUrl}.`;
  const followUp = "Skipping this cron run; OpenClaw will retry the provider preflight on a later scheduled run.";
  const detail = `Last error: ${String(params.error)}`;
  return `${headline} ${followUp} ${detail}`;
}
/**
 * Assemble the public "unavailable" preflight result for a dead endpoint:
 * provider/model context, the formatted reason, and a retry hint aligned
 * with the probe cache TTL.
 */
function buildUnavailableResult(params: {
  provider: string;
  model: string;
  baseUrl: string;
  error: unknown;
}): CronModelProviderPreflightResult {
  const { provider, model, baseUrl, error } = params;
  const reason = formatUnavailableReason({ provider, model, baseUrl, error });
  return {
    status: "unavailable",
    provider,
    model,
    baseUrl,
    // Matches the cache window: re-probing sooner would hit the cache anyway.
    retryAfterMs: PREFLIGHT_CACHE_TTL_MS,
    reason,
  };
}
/**
 * Probe the local provider endpoint with one short guarded GET. Resolves when
 * the server answered with any HTTP status; rejects (propagating the fetch
 * error) when it is unreachable. The guard handle is always released.
 */
async function probeLocalProviderEndpoint(params: {
  api: PreflightApi;
  baseUrl: string;
}): Promise<void> {
  const guarded = await fetchWithSsrFGuard({
    url: buildProbeUrl(params.api, params.baseUrl),
    init: { method: "GET" },
    policy: buildLocalProviderSsrFPolicy(params.baseUrl),
    timeoutMs: PREFLIGHT_TIMEOUT_MS,
    auditContext: "cron-model-provider-preflight",
  });
  try {
    // Any HTTP response at all means the endpoint is alive; status-level
    // problems (auth, missing model) are left to the normal model runner,
    // which has the full provider context for fallback and diagnostics.
    void guarded.response.status;
  } finally {
    await guarded.release();
  }
}
/**
 * Check whether the configured endpoint behind `provider` is worth calling
 * before an isolated cron run starts a model turn.
 *
 * Unknown providers, non-probeable APIs, and non-local base URLs all
 * short-circuit to "available" so the normal model runner handles them.
 * Probe outcomes — good and bad — are cached per api+baseUrl for
 * PREFLIGHT_CACHE_TTL_MS, so many due jobs share one probe of the same server.
 */
export async function preflightCronModelProvider(params: {
  cfg: OpenClawConfig;
  provider: string;
  model: string;
  nowMs?: number; // injectable clock for tests; defaults to Date.now()
}): Promise<CronModelProviderPreflightResult> {
  const providerConfig = resolveProviderConfig(params.cfg, params.provider);
  if (!providerConfig) {
    return { status: "available" };
  }
  const baseUrl = normalizeBaseUrl(providerConfig.baseUrl);
  const api = normalizeProbeApi(providerConfig);
  if (!baseUrl || !api || !isLocalProviderBaseUrl(baseUrl)) {
    return { status: "available" };
  }

  const nowMs = params.nowMs ?? Date.now();
  const cacheKey = `${api}\0${baseUrl}`;

  // Serve a fresh cached outcome when one exists; otherwise probe and cache.
  let outcome: EndpointPreflightResult | undefined;
  const cached = preflightCache.get(cacheKey);
  if (cached && nowMs - cached.checkedAtMs < PREFLIGHT_CACHE_TTL_MS) {
    outcome = cached.result;
  }
  if (!outcome) {
    try {
      await probeLocalProviderEndpoint({ api, baseUrl });
      outcome = { status: "available" };
    } catch (error) {
      outcome = { status: "unavailable", error };
    }
    preflightCache.set(cacheKey, { checkedAtMs: nowMs, result: outcome });
  }

  if (outcome.status === "unavailable") {
    return buildUnavailableResult({
      provider: params.provider,
      model: params.model,
      baseUrl,
      error: outcome.error,
    });
  }
  return { status: "available" };
}
/** Test hook: drop all cached probe results so test cases cannot leak state. */
export function resetCronModelProviderPreflightCacheForTest(): void {
  preflightCache.clear();
}

View File

@@ -64,6 +64,7 @@ export const resolveCronPayloadOutcomeMock = createMock();
export const resolveCronDeliveryPlanMock = createMock();
export const resolveDeliveryTargetMock = createMock();
export const dispatchCronDeliveryMock = createMock();
export const preflightCronModelProviderMock = createMock();
export const isHeartbeatOnlyResponseMock = createMock();
export const resolveHeartbeatAckMaxCharsMock = createMock();
export const resolveSessionAuthProfileOverrideMock = createMock();
@@ -220,6 +221,10 @@ vi.mock("./run-delivery.runtime.js", async () => {
};
});
vi.mock("./model-preflight.runtime.js", () => ({
preflightCronModelProvider: preflightCronModelProviderMock,
}));
vi.mock("./helpers.js", () => ({
isHeartbeatOnlyResponse: isHeartbeatOnlyResponseMock,
pickLastDeliverablePayload: vi.fn().mockReturnValue(undefined),
@@ -477,6 +482,8 @@ function resetRunOutcomeMocks(): void {
deliveryPayloads,
}),
);
preflightCronModelProviderMock.mockReset();
preflightCronModelProviderMock.mockResolvedValue({ status: "available" });
isHeartbeatOnlyResponseMock.mockReset();
isHeartbeatOnlyResponseMock.mockReturnValue(false);
resolveHeartbeatAckMaxCharsMock.mockReset();

View File

@@ -76,6 +76,9 @@ let cronModelCatalogRuntimePromise:
| Promise<typeof import("./run-model-catalog.runtime.js")>
| undefined;
let cronDeliveryRuntimePromise: Promise<typeof import("./run-delivery.runtime.js")> | undefined;
let cronModelPreflightRuntimePromise:
| Promise<typeof import("./model-preflight.runtime.js")>
| undefined;
async function loadSessionStoreRuntime() {
sessionStoreRuntimePromise ??= import("../../config/sessions/store.runtime.js");
@@ -112,6 +115,11 @@ async function loadCronDeliveryRuntime() {
return await cronDeliveryRuntimePromise;
}
// Lazily import the preflight runtime and memoize the import promise, so
// concurrent callers share one dynamic import instead of racing.
async function loadCronModelPreflightRuntime() {
  cronModelPreflightRuntimePromise ??= import("./model-preflight.runtime.js");
  return await cronModelPreflightRuntimePromise;
}
function hasConfiguredAuthProfiles(cfg: OpenClawConfig): boolean {
return (
Boolean(cfg.auth?.profiles && Object.keys(cfg.auth.profiles).length > 0) ||
@@ -571,6 +579,26 @@ async function prepareCronRunContext(params: {
logWarn(resolvedModelSelection.warning);
}
const preflight = await (
await loadCronModelPreflightRuntime()
).preflightCronModelProvider({
cfg: cfgWithAgentDefaults,
provider,
model,
});
if (preflight.status === "unavailable") {
logWarn(`[cron:${input.job.id}] ${preflight.reason}`);
return {
ok: false,
result: withRunSession({
status: "skipped",
error: preflight.reason,
provider,
model,
}),
};
}
const hooksGmailThinking = isGmailHook
? normalizeThinkLevel(input.cfg.hooks?.gmail?.thinking)
: undefined;