diff --git a/CHANGELOG.md b/CHANGELOG.md index a702d24b083..aae96b15d48 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -28,6 +28,7 @@ Docs: https://docs.openclaw.ai ### Fixes +- Cron/providers: preflight local Ollama and OpenAI-compatible provider endpoints before isolated cron agent turns, record unreachable local providers as skipped runs, and cache dead-endpoint probes so many jobs do not hammer the same stopped local server. Fixes #58584. Thanks @jpeghead. - CLI/status: show skipped fast-path memory checks as `not checked` and report active custom memory plugin runtime status from `status --json --all` without requiring built-in `agents.defaults.memorySearch`, so plugins such as memory-lancedb-pro and memory-cms no longer look unavailable when their own runtime is healthy. Fixes #56968. Thanks @Tony-ooo and @aderius. - Gateway/channels: record and log unexpected clean channel monitor exits so channels that return without throwing no longer appear stopped with no error. Fixes #73099. Thanks @balaji1968-kingler. - Channels/Telegram: centralize polling update tracking so accepted offsets remain durable across restarts, same-process handler failures can still retry, and slow offset writes cannot overwrite newer accepted watermarks. Refs #73115. Thanks @vdruts. diff --git a/docs/automation/cron-jobs.md b/docs/automation/cron-jobs.md index 4d4cb341194..b93335eec10 100644 --- a/docs/automation/cron-jobs.md +++ b/docs/automation/cron-jobs.md @@ -144,6 +144,8 @@ Fast mode follows the resolved live selection too. If the selected model config If an isolated run hits a live model-switch handoff, cron retries with the switched provider/model and persists that live selection for the active run before retrying. When the switch also carries a new auth profile, cron persists that auth profile override for the active run too. Retries are bounded: after the initial attempt plus 2 switch retries, cron aborts instead of looping forever. 
+Before an isolated cron run enters the agent runner, OpenClaw checks reachable local provider endpoints for configured `api: "ollama"` and `api: "openai-completions"` providers whose `baseUrl` is loopback, private-network, or `.local`. If that endpoint is down, the run is recorded as `skipped` with a clear provider/model error instead of starting a model call. The endpoint result is cached for 5 minutes, so many due jobs using the same dead local Ollama, vLLM, SGLang, or LM Studio server share one small probe instead of creating a request storm. Skipped provider-preflight runs do not increment execution-error backoff; enable `failureAlert.includeSkipped` when you want repeated skip notifications. + ## Delivery and output | Mode | What happens | diff --git a/docs/cli/cron.md b/docs/cli/cron.md index 2aaa83549f4..75a837ca75d 100644 --- a/docs/cli/cron.md +++ b/docs/cli/cron.md @@ -83,6 +83,8 @@ Recurring jobs use exponential retry backoff after consecutive errors: 30s, 1m, Skipped runs are tracked separately from execution errors. They do not affect retry backoff, but `openclaw cron edit --failure-alert-include-skipped` can opt failure alerts into repeated skipped-run notifications. +For isolated jobs that target a local configured model provider, cron runs a lightweight provider preflight before starting the agent turn. Loopback, private-network, and `.local` `api: "ollama"` providers are probed at `/api/tags`; local OpenAI-compatible providers such as vLLM, SGLang, and LM Studio are probed at `/models`. If the endpoint is unreachable, the run is recorded as `skipped` and retried on a later schedule; matching dead endpoints are cached for 5 minutes to avoid many jobs hammering the same local server. + Note: cron job definitions live in `jobs.json`, while pending runtime state lives in `jobs-state.json`. If `jobs.json` is edited externally, the Gateway reloads changed schedules and clears stale pending slots; formatting-only rewrites do not clear the pending slot. 
### Manual runs diff --git a/docs/providers/ollama.md b/docs/providers/ollama.md index 1611abae507..a152d68c46a 100644 --- a/docs/providers/ollama.md +++ b/docs/providers/ollama.md @@ -215,6 +215,13 @@ that as an exact user selection. If the configured Ollama `baseUrl` is unreachable, the next reply fails with the provider error instead of silently answering from another configured fallback model. +Isolated cron jobs do one extra local safety check before they start the agent +turn. If the selected model resolves to a local, private-network, or `.local` +Ollama provider and `/api/tags` is unreachable, OpenClaw records that cron run +as `skipped` with the selected `ollama/<model>` in the error text. The endpoint +preflight is cached for 5 minutes, so multiple cron jobs pointed at the same +stopped Ollama daemon do not all launch failing model requests. + Live-verify the local text path, native stream path, and embeddings against local Ollama with: diff --git a/src/cron/isolated-agent.model-preflight.test.ts b/src/cron/isolated-agent.model-preflight.test.ts new file mode 100644 index 00000000000..5948ae777f0 --- /dev/null +++ b/src/cron/isolated-agent.model-preflight.test.ts @@ -0,0 +1,84 @@ +import { beforeEach, describe, expect, it } from "vitest"; +import { + loadRunCronIsolatedAgentTurn, + makeCronSession, + preflightCronModelProviderMock, + resolveConfiguredModelRefMock, + resolveCronSessionMock, + resetRunCronIsolatedAgentTurnHarness, + runEmbeddedPiAgentMock, +} from "./isolated-agent/run.test-harness.js"; + +const runCronIsolatedAgentTurn = await loadRunCronIsolatedAgentTurn(); + +describe("runCronIsolatedAgentTurn model provider preflight", () => { + beforeEach(() => { + resetRunCronIsolatedAgentTurnHarness(); + resolveConfiguredModelRefMock.mockReturnValue({ + provider: "ollama", + model: "qwen3:32b", + }); + resolveCronSessionMock.mockReturnValue( + makeCronSession({ + sessionEntry: { + sessionId: "cron-session", + updatedAt: 0, + systemSent: false, + 
skillsSnapshot: undefined, + }, + }), + ); + }); + + it("skips isolated cron execution when the local model provider is unavailable", async () => { + preflightCronModelProviderMock.mockResolvedValueOnce({ + status: "unavailable", + reason: + "Agent cron job uses ollama/qwen3:32b but the local provider endpoint is not reachable at http://127.0.0.1:11434.", + provider: "ollama", + model: "qwen3:32b", + baseUrl: "http://127.0.0.1:11434", + retryAfterMs: 300000, + }); + + const result = await runCronIsolatedAgentTurn({ + cfg: { + models: { + providers: { + ollama: { + api: "ollama", + baseUrl: "http://127.0.0.1:11434", + models: [], + }, + }, + }, + }, + deps: {} as never, + job: { + id: "dead-ollama", + name: "Dead Ollama", + enabled: true, + createdAtMs: 0, + updatedAtMs: 0, + schedule: { kind: "cron", expr: "*/5 * * * *", tz: "UTC" }, + sessionTarget: "isolated", + state: {}, + wakeMode: "next-heartbeat", + payload: { kind: "agentTurn", message: "summarize" }, + delivery: { mode: "none" }, + }, + message: "summarize", + sessionKey: "cron:dead-ollama", + lane: "cron", + }); + + expect(result).toMatchObject({ + status: "skipped", + provider: "ollama", + model: "qwen3:32b", + sessionId: "cron-session", + }); + expect(result.error).toContain("local provider endpoint is not reachable"); + expect(runEmbeddedPiAgentMock).not.toHaveBeenCalled(); + }); +}); diff --git a/src/cron/isolated-agent/model-preflight.runtime.test.ts b/src/cron/isolated-agent/model-preflight.runtime.test.ts new file mode 100644 index 00000000000..96d6524aa58 --- /dev/null +++ b/src/cron/isolated-agent/model-preflight.runtime.test.ts @@ -0,0 +1,163 @@ +import { beforeEach, describe, expect, it, vi } from "vitest"; + +const { fetchWithSsrFGuardMock } = vi.hoisted(() => ({ + fetchWithSsrFGuardMock: vi.fn(), +})); + +vi.mock("../../infra/net/fetch-guard.js", () => ({ + fetchWithSsrFGuard: fetchWithSsrFGuardMock, +})); + +import { + preflightCronModelProvider, + 
resetCronModelProviderPreflightCacheForTest, +} from "./model-preflight.runtime.js"; + +function mockReachableResponse(status = 200) { + fetchWithSsrFGuardMock.mockResolvedValueOnce({ + response: { status }, + release: vi.fn(async () => {}), + }); +} + +describe("preflightCronModelProvider", () => { + beforeEach(() => { + fetchWithSsrFGuardMock.mockReset(); + resetCronModelProviderPreflightCacheForTest(); + }); + + it("skips network checks for cloud provider URLs", async () => { + const result = await preflightCronModelProvider({ + cfg: { + models: { + providers: { + openai: { + api: "openai-responses", + baseUrl: "https://api.openai.com/v1", + models: [], + }, + }, + }, + }, + provider: "openai", + model: "gpt-5.4", + }); + + expect(result).toEqual({ status: "available" }); + expect(fetchWithSsrFGuardMock).not.toHaveBeenCalled(); + }); + + it("treats any HTTP response from a local OpenAI-compatible endpoint as reachable", async () => { + mockReachableResponse(401); + + const result = await preflightCronModelProvider({ + cfg: { + models: { + providers: { + vllm: { + api: "openai-completions", + baseUrl: "http://127.0.0.1:8000/v1", + models: [], + }, + }, + }, + }, + provider: "vllm", + model: "llama", + }); + + expect(result).toEqual({ status: "available" }); + expect(fetchWithSsrFGuardMock).toHaveBeenCalledWith( + expect.objectContaining({ + url: "http://127.0.0.1:8000/v1/models", + timeoutMs: 2500, + }), + ); + }); + + it("marks unreachable local Ollama endpoints unavailable and caches the result", async () => { + fetchWithSsrFGuardMock.mockRejectedValueOnce(new Error("ECONNREFUSED")); + + const cfg = { + models: { + providers: { + Ollama: { + api: "ollama" as const, + baseUrl: "http://localhost:11434", + models: [], + }, + }, + }, + }; + const first = await preflightCronModelProvider({ + cfg, + provider: "ollama", + model: "qwen3:32b", + nowMs: 1000, + }); + const second = await preflightCronModelProvider({ + cfg, + provider: "ollama", + model: "llama3.3:70b", + 
nowMs: 2000, + }); + + expect(first).toMatchObject({ + status: "unavailable", + provider: "ollama", + model: "qwen3:32b", + baseUrl: "http://localhost:11434", + retryAfterMs: 300000, + }); + expect(second).toMatchObject({ + status: "unavailable", + provider: "ollama", + model: "llama3.3:70b", + baseUrl: "http://localhost:11434", + retryAfterMs: 300000, + }); + expect(fetchWithSsrFGuardMock).toHaveBeenCalledTimes(1); + expect(fetchWithSsrFGuardMock).toHaveBeenCalledWith( + expect.objectContaining({ + url: "http://localhost:11434/api/tags", + auditContext: "cron-model-provider-preflight", + }), + ); + }); + + it("retries an unavailable endpoint after the cache ttl", async () => { + fetchWithSsrFGuardMock.mockRejectedValueOnce(new Error("ECONNREFUSED")).mockResolvedValueOnce({ + response: { status: 200 }, + release: vi.fn(async () => {}), + }); + + const cfg = { + models: { + providers: { + ollama: { + api: "ollama" as const, + baseUrl: "http://127.0.0.1:11434", + models: [], + }, + }, + }, + }; + + const first = await preflightCronModelProvider({ + cfg, + provider: "ollama", + model: "llama3", + nowMs: 1000, + }); + const second = await preflightCronModelProvider({ + cfg, + provider: "ollama", + model: "llama3", + nowMs: 1000 + 300001, + }); + + expect(first.status).toBe("unavailable"); + expect(second).toEqual({ status: "available" }); + expect(fetchWithSsrFGuardMock).toHaveBeenCalledTimes(2); + }); +}); diff --git a/src/cron/isolated-agent/model-preflight.runtime.ts b/src/cron/isolated-agent/model-preflight.runtime.ts new file mode 100644 index 00000000000..4c1af62b8d6 --- /dev/null +++ b/src/cron/isolated-agent/model-preflight.runtime.ts @@ -0,0 +1,229 @@ +import { normalizeProviderId } from "../../agents/provider-id.js"; +import type { ModelProviderConfig } from "../../config/types.models.js"; +import type { OpenClawConfig } from "../../config/types.openclaw.js"; +import { fetchWithSsrFGuard } from "../../infra/net/fetch-guard.js"; +import type { SsrFPolicy } 
from "../../infra/net/ssrf.js"; +import { normalizeLowercaseStringOrEmpty } from "../../shared/string-coerce.js"; + +const PREFLIGHT_CACHE_TTL_MS = 5 * 60_000; +const PREFLIGHT_TIMEOUT_MS = 2_500; + +type PreflightApi = "ollama" | "openai-completions"; + +export type CronModelProviderPreflightResult = + | { status: "available" } + | { + status: "unavailable"; + reason: string; + provider: string; + model: string; + baseUrl: string; + retryAfterMs: number; + }; + +type EndpointPreflightResult = + | { status: "available" } + | { + status: "unavailable"; + error: unknown; + }; + +type CachedEndpointPreflightResult = { + checkedAtMs: number; + result: EndpointPreflightResult; +}; + +const preflightCache = new Map(); + +function resolveProviderConfig( + cfg: OpenClawConfig, + provider: string, +): ModelProviderConfig | undefined { + const providers = cfg.models?.providers; + if (!providers) { + return undefined; + } + const direct = providers[provider]; + if (direct) { + return direct; + } + const normalized = normalizeProviderId(provider); + return Object.entries(providers).find(([key]) => normalizeProviderId(key) === normalized)?.[1]; +} + +function normalizeBaseUrl(value: unknown): string | undefined { + if (typeof value !== "string") { + return undefined; + } + const trimmed = value.trim().replace(/\/+$/, ""); + return trimmed ? trimmed : undefined; +} + +function normalizeProbeApi(providerConfig: ModelProviderConfig): PreflightApi | undefined { + const api = normalizeLowercaseStringOrEmpty(providerConfig.api); + return api === "ollama" || api === "openai-completions" ? 
api : undefined; +} + +function isPrivateIpv4Host(host: string): boolean { + if (!/^\d+\.\d+\.\d+\.\d+$/.test(host)) { + return false; + } + const octets = host.split(".").map((part) => Number.parseInt(part, 10)); + if (octets.some((part) => !Number.isInteger(part) || part < 0 || part > 255)) { + return false; + } + const [a, b] = octets; + return a === 10 || (a === 172 && b >= 16 && b <= 31) || (a === 192 && b === 168); +} + +function isLocalProviderBaseUrl(baseUrl: string): boolean { + try { + let host = normalizeLowercaseStringOrEmpty(new URL(baseUrl).hostname); + if (host.startsWith("[") && host.endsWith("]")) { + host = host.slice(1, -1); + } + return ( + host === "localhost" || + host === "127.0.0.1" || + host === "0.0.0.0" || + host === "::1" || + host === "::ffff:7f00:1" || + host === "::ffff:127.0.0.1" || + host.endsWith(".local") || + isPrivateIpv4Host(host) + ); + } catch { + return false; + } +} + +function buildProbeUrl(api: PreflightApi, baseUrl: string): string { + if (api === "ollama") { + return `${baseUrl}/api/tags`; + } + return `${baseUrl}/models`; +} + +function buildLocalProviderSsrFPolicy(baseUrl: string): SsrFPolicy | undefined { + try { + const parsed = new URL(baseUrl); + if (parsed.protocol !== "http:" && parsed.protocol !== "https:") { + return undefined; + } + return { + hostnameAllowlist: [parsed.hostname], + allowPrivateNetwork: true, + }; + } catch { + return undefined; + } +} + +function formatUnavailableReason(params: { + provider: string; + model: string; + baseUrl: string; + error: unknown; +}): string { + return [ + `Agent cron job uses ${params.provider}/${params.model} but the local provider endpoint is not reachable at ${params.baseUrl}.`, + `Skipping this cron run; OpenClaw will retry the provider preflight on a later scheduled run.`, + `Last error: ${String(params.error)}`, + ].join(" "); +} + +function buildUnavailableResult(params: { + provider: string; + model: string; + baseUrl: string; + error: unknown; +}): 
CronModelProviderPreflightResult { + return { + status: "unavailable", + provider: params.provider, + model: params.model, + baseUrl: params.baseUrl, + retryAfterMs: PREFLIGHT_CACHE_TTL_MS, + reason: formatUnavailableReason({ + provider: params.provider, + model: params.model, + baseUrl: params.baseUrl, + error: params.error, + }), + }; +} + +async function probeLocalProviderEndpoint(params: { + api: PreflightApi; + baseUrl: string; +}): Promise { + const { response, release } = await fetchWithSsrFGuard({ + url: buildProbeUrl(params.api, params.baseUrl), + init: { method: "GET" }, + policy: buildLocalProviderSsrFPolicy(params.baseUrl), + timeoutMs: PREFLIGHT_TIMEOUT_MS, + auditContext: "cron-model-provider-preflight", + }); + try { + // Any HTTP response means the local endpoint is alive. Auth/model errors + // still belong to the normal model runner where fallback and diagnostics + // have the full provider context. + void response.status; + } finally { + await release(); + } +} + +export async function preflightCronModelProvider(params: { + cfg: OpenClawConfig; + provider: string; + model: string; + nowMs?: number; +}): Promise { + const providerConfig = resolveProviderConfig(params.cfg, params.provider); + if (!providerConfig) { + return { status: "available" }; + } + const baseUrl = normalizeBaseUrl(providerConfig.baseUrl); + const api = normalizeProbeApi(providerConfig); + if (!baseUrl || !api || !isLocalProviderBaseUrl(baseUrl)) { + return { status: "available" }; + } + + const nowMs = params.nowMs ?? 
Date.now(); + const cacheKey = `${api}\0${baseUrl}`; + const cached = preflightCache.get(cacheKey); + if (cached && nowMs - cached.checkedAtMs < PREFLIGHT_CACHE_TTL_MS) { + if (cached.result.status === "available") { + return { status: "available" }; + } + return buildUnavailableResult({ + provider: params.provider, + model: params.model, + baseUrl, + error: cached.result.error, + }); + } + + let result: EndpointPreflightResult; + try { + await probeLocalProviderEndpoint({ api, baseUrl }); + result = { status: "available" }; + } catch (error) { + result = { status: "unavailable", error }; + } + preflightCache.set(cacheKey, { checkedAtMs: nowMs, result }); + if (result.status === "available") { + return { status: "available" }; + } + return buildUnavailableResult({ + provider: params.provider, + model: params.model, + baseUrl, + error: result.error, + }); +} + +export function resetCronModelProviderPreflightCacheForTest(): void { + preflightCache.clear(); +} diff --git a/src/cron/isolated-agent/run.test-harness.ts b/src/cron/isolated-agent/run.test-harness.ts index 163c03936a1..cb673da5008 100644 --- a/src/cron/isolated-agent/run.test-harness.ts +++ b/src/cron/isolated-agent/run.test-harness.ts @@ -64,6 +64,7 @@ export const resolveCronPayloadOutcomeMock = createMock(); export const resolveCronDeliveryPlanMock = createMock(); export const resolveDeliveryTargetMock = createMock(); export const dispatchCronDeliveryMock = createMock(); +export const preflightCronModelProviderMock = createMock(); export const isHeartbeatOnlyResponseMock = createMock(); export const resolveHeartbeatAckMaxCharsMock = createMock(); export const resolveSessionAuthProfileOverrideMock = createMock(); @@ -220,6 +221,10 @@ vi.mock("./run-delivery.runtime.js", async () => { }; }); +vi.mock("./model-preflight.runtime.js", () => ({ + preflightCronModelProvider: preflightCronModelProviderMock, +})); + vi.mock("./helpers.js", () => ({ isHeartbeatOnlyResponse: isHeartbeatOnlyResponseMock, 
pickLastDeliverablePayload: vi.fn().mockReturnValue(undefined), @@ -477,6 +482,8 @@ function resetRunOutcomeMocks(): void { deliveryPayloads, }), ); + preflightCronModelProviderMock.mockReset(); + preflightCronModelProviderMock.mockResolvedValue({ status: "available" }); isHeartbeatOnlyResponseMock.mockReset(); isHeartbeatOnlyResponseMock.mockReturnValue(false); resolveHeartbeatAckMaxCharsMock.mockReset(); diff --git a/src/cron/isolated-agent/run.ts b/src/cron/isolated-agent/run.ts index 2a7e5e857d0..0e26b1157b2 100644 --- a/src/cron/isolated-agent/run.ts +++ b/src/cron/isolated-agent/run.ts @@ -76,6 +76,9 @@ let cronModelCatalogRuntimePromise: | Promise | undefined; let cronDeliveryRuntimePromise: Promise | undefined; +let cronModelPreflightRuntimePromise: + | Promise + | undefined; async function loadSessionStoreRuntime() { sessionStoreRuntimePromise ??= import("../../config/sessions/store.runtime.js"); @@ -112,6 +115,11 @@ async function loadCronDeliveryRuntime() { return await cronDeliveryRuntimePromise; } +async function loadCronModelPreflightRuntime() { + cronModelPreflightRuntimePromise ??= import("./model-preflight.runtime.js"); + return await cronModelPreflightRuntimePromise; +} + function hasConfiguredAuthProfiles(cfg: OpenClawConfig): boolean { return ( Boolean(cfg.auth?.profiles && Object.keys(cfg.auth.profiles).length > 0) || @@ -571,6 +579,26 @@ async function prepareCronRunContext(params: { logWarn(resolvedModelSelection.warning); } + const preflight = await ( + await loadCronModelPreflightRuntime() + ).preflightCronModelProvider({ + cfg: cfgWithAgentDefaults, + provider, + model, + }); + if (preflight.status === "unavailable") { + logWarn(`[cron:${input.job.id}] ${preflight.reason}`); + return { + ok: false, + result: withRunSession({ + status: "skipped", + error: preflight.reason, + provider, + model, + }), + }; + } + const hooksGmailThinking = isGmailHook ? normalizeThinkLevel(input.cfg.hooks?.gmail?.thinking) : undefined;