From b66459e3c2d29fe942e435979ccec5a95e4fb98e Mon Sep 17 00:00:00 2001 From: Peter Steinberger Date: Sat, 2 May 2026 06:34:14 +0100 Subject: [PATCH] fix(web-search): support self-hosted Firecrawl --- CHANGELOG.md | 1 + docs/gateway/configuration-reference.md | 2 +- docs/tools/firecrawl.md | 13 ++- docs/tools/web-fetch.md | 5 +- extensions/firecrawl/src/firecrawl-client.ts | 96 +++++++++++++++++-- .../firecrawl/src/firecrawl-tools.test.ts | 67 ++++++++++--- 6 files changed, 158 insertions(+), 26 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index b5c71551f27..5815a221af6 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -24,6 +24,7 @@ Docs: https://docs.openclaw.ai ### Fixes - CLI/directory: report unsupported directory operations for installed channel plugins instead of prompting to reinstall the plugin when it lacks a directory adapter. Fixes #75770. Thanks @lawong888. +- Web search/Firecrawl: allow self-hosted private/internal Firecrawl `baseUrl` endpoints, including HTTP for private targets, while keeping hosted Firecrawl on the strict official endpoint. Fixes #63877 and supersedes #59666, #63941, and #74013. Thanks @jhthompson12, @jzakirov, @Mlightsnow, and @shad0wca7. - Feishu: preserve Feishu/Lark HTTP error bodies for message sends, media sends, and chat member lookups, so HTTP 400 failures include vendor code, message, log id, and troubleshooter details. Fixes #73860. Thanks @desksk. - Agents/transcripts: avoid reopening large Pi transcript files through the synchronous session manager for maintenance rewrites, persisted tool-result truncation, manual compaction boundary hardening, and queued compaction rotation. Thanks @mariozechner. - Web search/Exa: accept `plugins.entries.exa.config.webSearch.baseUrl`, normalize it to the Exa `/search` endpoint, and partition cached results by endpoint. Fixes #54928 and supersedes #54939. Thanks @mrpl327 and @lyfuci. 
diff --git a/docs/gateway/configuration-reference.md b/docs/gateway/configuration-reference.md index 0c4a5d02cdb..c07ef30ff29 100644 --- a/docs/gateway/configuration-reference.md +++ b/docs/gateway/configuration-reference.md @@ -197,7 +197,7 @@ See [MCP](/cli/mcp#openclaw-as-an-mcp-client-registry) and - Channel plugin account/runtime settings live under `channels.` and should be described by the owning plugin's manifest `channelConfigs` metadata, not by a central OpenClaw option registry. - `plugins.entries.firecrawl.config.webFetch`: Firecrawl web-fetch provider settings. - `apiKey`: Firecrawl API key (accepts SecretRef). Falls back to `plugins.entries.firecrawl.config.webSearch.apiKey`, legacy `tools.web.fetch.firecrawl.apiKey`, or `FIRECRAWL_API_KEY` env var. - - `baseUrl`: Firecrawl API base URL (default: `https://api.firecrawl.dev`). + - `baseUrl`: Firecrawl API base URL (default: `https://api.firecrawl.dev`; self-hosted overrides must target private/internal endpoints). - `onlyMainContent`: extract only the main content from pages (default: `true`). - `maxAgeMs`: maximum cache age in milliseconds (default: `172800000` / 2 days). - `timeoutSeconds`: scrape request timeout in seconds (default: `60`). diff --git a/docs/tools/firecrawl.md b/docs/tools/firecrawl.md index dc960eaccc6..d5c4df9801b 100644 --- a/docs/tools/firecrawl.md +++ b/docs/tools/firecrawl.md @@ -54,7 +54,7 @@ Notes: - Choosing Firecrawl in onboarding or `openclaw configure --section web` enables the bundled Firecrawl plugin automatically. - `web_search` with Firecrawl supports `query` and `count`. - For Firecrawl-specific controls like `sources`, `categories`, or result scraping, use `firecrawl_search`. -- `baseUrl` overrides must stay on `https://api.firecrawl.dev`. +- `baseUrl` defaults to hosted Firecrawl at `https://api.firecrawl.dev`. Self-hosted overrides are allowed only for private/internal endpoints; HTTP is accepted only for those private targets. 
- `FIRECRAWL_BASE_URL` is the shared env fallback for Firecrawl search and scrape base URLs. ## Configure Firecrawl scrape + web_fetch fallback @@ -85,10 +85,19 @@ Notes: - Firecrawl fallback attempts run only when an API key is available (`plugins.entries.firecrawl.config.webFetch.apiKey` or `FIRECRAWL_API_KEY`). - `maxAgeMs` controls how old cached results can be (ms). Default is 2 days. - Legacy `tools.web.fetch.firecrawl.*` config is auto-migrated by `openclaw doctor --fix`. -- Firecrawl scrape/base URL overrides are restricted to `https://api.firecrawl.dev`. +- Firecrawl scrape/base URL overrides follow the same hosted/private rule as search: public hosted traffic uses `https://api.firecrawl.dev`; self-hosted overrides must resolve to private/internal endpoints. `firecrawl_scrape` reuses the same `plugins.entries.firecrawl.config.webFetch.*` settings and env vars. +### Self-hosted Firecrawl + +Set `plugins.entries.firecrawl.config.webSearch.baseUrl`, +`plugins.entries.firecrawl.config.webFetch.baseUrl`, or `FIRECRAWL_BASE_URL` +when you run Firecrawl yourself. OpenClaw accepts `http://` only for loopback, +private-network, `.local`, `.internal`, or `.localhost` targets. Public custom +hosts are rejected so Firecrawl API keys are not sent to arbitrary endpoints by +accident. + ## Firecrawl plugin tools ### `firecrawl_search` diff --git a/docs/tools/web-fetch.md b/docs/tools/web-fetch.md index 9b4a1475421..7904e19b8a2 100644 --- a/docs/tools/web-fetch.md +++ b/docs/tools/web-fetch.md @@ -126,8 +126,9 @@ Legacy `tools.web.fetch.firecrawl.*` config is auto-migrated by `openclaw doctor - Firecrawl `baseUrl` overrides are locked down: they must use `https://` and - the official Firecrawl host (`api.firecrawl.dev`). + Firecrawl `baseUrl` overrides are locked down: hosted traffic uses + `https://api.firecrawl.dev`; self-hosted overrides must target private or + internal endpoints, and `http://` is accepted only for those private targets. 
Current runtime behavior: diff --git a/extensions/firecrawl/src/firecrawl-client.ts b/extensions/firecrawl/src/firecrawl-client.ts index 6fc00df8e55..99612cbd89b 100644 --- a/extensions/firecrawl/src/firecrawl-client.ts +++ b/extensions/firecrawl/src/firecrawl-client.ts @@ -8,11 +8,19 @@ import { resolveCacheTtlMs, truncateText, withStrictWebToolsEndpoint, + withTrustedWebToolsEndpoint, writeCache, } from "openclaw/plugin-sdk/provider-web-fetch"; import { normalizeSecretInput } from "openclaw/plugin-sdk/secret-input"; import { wrapExternalContent, wrapWebContent } from "openclaw/plugin-sdk/security-runtime"; import { + isBlockedHostnameOrIp, + isPrivateIpAddress, + resolvePinnedHostnameWithPolicy, + type LookupFn, +} from "openclaw/plugin-sdk/ssrf-runtime"; +import { + DEFAULT_FIRECRAWL_BASE_URL, resolveFirecrawlApiKey, resolveFirecrawlBaseUrl, resolveFirecrawlMaxAgeMs, @@ -32,6 +40,16 @@ const SCRAPE_CACHE = new Map< const DEFAULT_SEARCH_COUNT = 5; const DEFAULT_SCRAPE_MAX_CHARS = 50_000; const ALLOWED_FIRECRAWL_HOSTS = new Set(["api.firecrawl.dev"]); +const FIRECRAWL_SELF_HOSTED_PRIVATE_ERROR = + "Firecrawl custom baseUrl must target a private or internal self-hosted endpoint."; +const FIRECRAWL_HTTP_PRIVATE_ERROR = + "Firecrawl HTTP baseUrl must target a private or internal self-hosted endpoint. 
Use https:// for public hosts."; + +type FirecrawlEndpointMode = "strict" | "trusted"; +type FirecrawlResolvedEndpoint = { + url: string; + mode: FirecrawlEndpointMode; +}; type FirecrawlSearchItem = { title: string; @@ -64,25 +82,75 @@ export type FirecrawlScrapeParams = { timeoutSeconds?: number; }; -function resolveEndpoint(baseUrl: string, pathname: "/v2/search" | "/v2/scrape"): string { - const url = new URL(baseUrl.trim() || "https://api.firecrawl.dev"); - if (url.protocol !== "https:") { - throw new Error("Firecrawl baseUrl must use https."); +function isOfficialFirecrawlEndpoint(url: URL): boolean { + return url.protocol === "https:" && ALLOWED_FIRECRAWL_HOSTS.has(url.hostname); +} + +async function firecrawlEndpointTargetsPrivateNetwork( + url: URL, + lookupFn?: LookupFn, +): Promise<boolean> { + if (isBlockedHostnameOrIp(url.hostname)) { + return true; } - if (!ALLOWED_FIRECRAWL_HOSTS.has(url.hostname)) { - throw new Error(`Firecrawl baseUrl host is not allowed: ${url.hostname}`); + try { + const pinned = await resolvePinnedHostnameWithPolicy(url.hostname, { + lookupFn, + policy: { allowPrivateNetwork: true }, + }); + return pinned.addresses.every((address) => isPrivateIpAddress(address)); + } catch { + return false; } +} + +async function validateFirecrawlBaseUrl( + baseUrl: string, + lookupFn?: LookupFn, +): Promise<FirecrawlEndpointMode> { + let url: URL; + try { + url = new URL(baseUrl.trim() || DEFAULT_FIRECRAWL_BASE_URL); + } catch { + throw new Error("Firecrawl baseUrl must be a valid http:// or https:// URL."); + } + + if (url.protocol !== "http:" && url.protocol !== "https:") { + throw new Error("Firecrawl baseUrl must use http:// or https://."); + } + if (isOfficialFirecrawlEndpoint(url)) { + return "strict"; + } + + const isPrivateTarget = await firecrawlEndpointTargetsPrivateNetwork(url, lookupFn); + if (isPrivateTarget) { + return "trusted"; + } + if (url.protocol === "http:") { + throw new Error(FIRECRAWL_HTTP_PRIVATE_ERROR); + } + throw new 
Error(`${FIRECRAWL_SELF_HOSTED_PRIVATE_ERROR} Host: ${url.hostname}`); +} + +async function resolveEndpoint( + baseUrl: string, + pathname: "/v2/search" | "/v2/scrape", + lookupFn?: LookupFn, +): Promise<FirecrawlResolvedEndpoint> { + const url = new URL(baseUrl.trim() || DEFAULT_FIRECRAWL_BASE_URL); + const mode = await validateFirecrawlBaseUrl(url.toString(), lookupFn); url.username = ""; url.password = ""; url.search = ""; url.hash = ""; url.pathname = pathname; - return url.toString(); + return { url: url.toString(), mode }; } async function postFirecrawlJson<T>( params: { url: string; + mode?: FirecrawlEndpointMode; timeoutSeconds: number; apiKey: string; body: Record<string, unknown>; @@ -91,7 +159,10 @@ async function postFirecrawlJson<T>( parse: (response: Response) => Promise<T>, ): Promise<T> { const apiKey = normalizeSecretInput(params.apiKey); - return await withStrictWebToolsEndpoint( + const mode = params.mode ?? (await validateFirecrawlBaseUrl(params.url)); + const withEndpoint = + mode === "trusted" ? withTrustedWebToolsEndpoint : withStrictWebToolsEndpoint; + return await withEndpoint( { url: params.url, timeoutSeconds: params.timeoutSeconds, @@ -304,9 +375,11 @@ export async function runFirecrawlSearch( } const start = Date.now(); + const endpoint = await resolveEndpoint(baseUrl, "/v2/search"); const payload = await postFirecrawlJson( { - url: resolveEndpoint(baseUrl, "/v2/search"), + url: endpoint.url, + mode: endpoint.mode, timeoutSeconds, apiKey, body, @@ -448,9 +521,11 @@ export async function runFirecrawlScrape( return { ...cached.value, cached: true }; } + const endpoint = await resolveEndpoint(baseUrl, "/v2/scrape"); const payload = await postFirecrawlJson( { - url: resolveEndpoint(baseUrl, "/v2/scrape"), + url: endpoint.url, + mode: endpoint.mode, timeoutSeconds, apiKey, errorLabel: "Firecrawl", @@ -499,5 +574,6 @@ export const __testing = { parseFirecrawlScrapePayload, postFirecrawlJson, resolveEndpoint, + validateFirecrawlBaseUrl, resolveSearchItems, }; diff --git 
a/extensions/firecrawl/src/firecrawl-tools.test.ts b/extensions/firecrawl/src/firecrawl-tools.test.ts index 683dd9c2307..fb031fc114a 100644 --- a/extensions/firecrawl/src/firecrawl-tools.test.ts +++ b/extensions/firecrawl/src/firecrawl-tools.test.ts @@ -605,19 +605,64 @@ describe("firecrawl tools", () => { expect(resolveFirecrawlApiKey(cfg)).toBeUndefined(); }); - it("only allows the official Firecrawl API host for fetch endpoints", () => { - expect(firecrawlClientTesting.resolveEndpoint("https://api.firecrawl.dev", "/v2/scrape")).toBe( - "https://api.firecrawl.dev/v2/scrape", - ); - expect(() => + it("allows hosted Firecrawl and private self-hosted endpoints only", async () => { + await expect( + firecrawlClientTesting.resolveEndpoint("https://api.firecrawl.dev", "/v2/scrape"), + ).resolves.toEqual({ + url: "https://api.firecrawl.dev/v2/scrape", + mode: "strict", + }); + await expect( + firecrawlClientTesting.resolveEndpoint("http://127.0.0.1:8787", "/v2/scrape"), + ).resolves.toEqual({ + url: "http://127.0.0.1:8787/v2/scrape", + mode: "trusted", + }); + await expect( + firecrawlClientTesting.resolveEndpoint( + "https://host.openshell.internal:444/v1", + "/v2/search", + ), + ).resolves.toEqual({ + url: "https://host.openshell.internal:444/v2/search", + mode: "trusted", + }); + await expect( firecrawlClientTesting.resolveEndpoint("http://api.firecrawl.dev", "/v2/scrape"), - ).toThrow("Firecrawl baseUrl must use https."); - expect(() => - firecrawlClientTesting.resolveEndpoint("https://127.0.0.1:8787", "/v2/scrape"), - ).toThrow("Firecrawl baseUrl host is not allowed"); - expect(() => + ).rejects.toThrow("Firecrawl HTTP baseUrl must target a private or internal"); + await expect( firecrawlClientTesting.resolveEndpoint("https://attacker.example", "/v2/search"), - ).toThrow("Firecrawl baseUrl host is not allowed"); + ).rejects.toThrow("Firecrawl custom baseUrl must target a private or internal"); + await expect( + 
firecrawlClientTesting.resolveEndpoint("ftp://127.0.0.1:8787", "/v2/scrape"), + ).rejects.toThrow("Firecrawl baseUrl must use http:// or https://."); + }); + + it("routes private self-hosted Firecrawl endpoints through the trusted fetch guard", async () => { + ssrfMock?.mockRestore(); + ssrfMock = mockPinnedHostnameResolution(["127.0.0.1"]); + const fetchSpy = vi.fn( + async () => + new Response(JSON.stringify({ success: true, data: [] }), { + status: 200, + headers: { "content-type": "application/json" }, + }), + ); + global.fetch = fetchSpy as typeof fetch; + + const result = await firecrawlClientTesting.postFirecrawlJson( + { + url: "http://127.0.0.1:8787/v2/search", + timeoutSeconds: 5, + apiKey: "firecrawl-key", + body: { query: "openclaw" }, + errorLabel: "Firecrawl Search", + }, + async (response) => (await response.json()) as Record<string, unknown>, + ); + + expect(fetchSpy).toHaveBeenCalledTimes(1); + expect(result).toMatchObject({ success: true }); }); it("respects positive numeric overrides for scrape and cache behavior", () => {