diff --git a/CHANGELOG.md b/CHANGELOG.md index 45c2011d869..898b364edbb 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -36,6 +36,7 @@ Docs: https://docs.openclaw.ai - Web search/SearXNG: pass through `img_src` image URLs from SearXNG image-category results. Supersedes #61416. Thanks @sghael. - Web search/Kimi: fail explicitly when Moonshot returns an ungrounded chat answer instead of native web-search evidence, so Kimi no longer reports generic fallback text as a successful search. Fixes #52573. Thanks @wangwllu. - Web search: keep public provider requests on the strict SSRF guard and reserve private-network access for explicit self-hosted SearXNG/Firecrawl endpoints. Fixes #74357 and supersedes #74360. Thanks @fede-kamel. +- Firecrawl: reject private, loopback, metadata, and non-HTTP(S) `firecrawl_scrape` target URLs before forwarding them to Firecrawl. Supersedes #48133. Thanks @kn1ghtc. - Web search/Firecrawl: allow self-hosted private/internal Firecrawl `baseUrl` endpoints, including HTTP for private targets, while keeping hosted Firecrawl on the strict official endpoint. Fixes #63877 and supersedes #59666, #63941, and #74013. Thanks @jhthompson12, @jzakirov, @Mlightsnow, and @shad0wca7. - Providers/OpenRouter: strip trailing assistant prefill turns from verified OpenRouter Anthropic model requests when reasoning is enabled, so Claude 4.6 routes no longer fail with Anthropic's prefill rejection through the OpenAI-compatible adapter. Fixes #75395. Thanks @sbmilburn. - Feishu: preserve Feishu/Lark HTTP error bodies for message sends, media sends, and chat member lookups, so HTTP 400 failures include vendor code, message, log id, and troubleshooter details. Fixes #73860. Thanks @desksk. diff --git a/docs/tools/firecrawl.md b/docs/tools/firecrawl.md index d5c4df9801b..bfba75d4599 100644 --- a/docs/tools/firecrawl.md +++ b/docs/tools/firecrawl.md @@ -86,6 +86,7 @@ Notes: - `maxAgeMs` controls how old cached results can be (ms). Default is 2 days. - Legacy `tools.web.fetch.firecrawl.*` config is auto-migrated by `openclaw doctor --fix`. - Firecrawl scrape/base URL overrides follow the same hosted/private rule as search: public hosted traffic uses `https://api.firecrawl.dev`; self-hosted overrides must resolve to private/internal endpoints. +- `firecrawl_scrape` rejects obvious private, loopback, metadata, and non-HTTP(S) target URLs before forwarding them to Firecrawl, matching the `web_fetch` target-safety contract for explicit Firecrawl scrape calls. `firecrawl_scrape` reuses the same `plugins.entries.firecrawl.config.webFetch.*` settings and env vars. diff --git a/extensions/firecrawl/src/firecrawl-client.ts b/extensions/firecrawl/src/firecrawl-client.ts index e68fad3c1c7..1189b7c8682 100644 --- a/extensions/firecrawl/src/firecrawl-client.ts +++ b/extensions/firecrawl/src/firecrawl-client.ts @@ -14,6 +14,7 @@ import { import { normalizeSecretInput } from "openclaw/plugin-sdk/secret-input"; import { wrapExternalContent, wrapWebContent } from "openclaw/plugin-sdk/security-runtime"; import { + SsrFBlockedError, isBlockedHostnameOrIp, isPrivateIpAddress, resolvePinnedHostnameWithPolicy, @@ -82,6 +83,25 @@ export type FirecrawlScrapeParams = { timeoutSeconds?: number; }; +export function assertFirecrawlScrapeTargetAllowed(url: string): void { + let parsed: URL; + try { + parsed = new URL(url); + } catch { + throw new SsrFBlockedError("Invalid URL supplied to Firecrawl scrape"); + } + if (parsed.protocol !== "http:" && parsed.protocol !== "https:") { + throw new SsrFBlockedError( + `Blocked non-HTTP(S) protocol in Firecrawl scrape URL: ${parsed.protocol}`, + ); + } + if (isBlockedHostnameOrIp(parsed.hostname)) { + throw new SsrFBlockedError( + `Blocked hostname or private/internal IP in Firecrawl scrape URL: ${parsed.hostname}`, + ); + } +} + function isOfficialFirecrawlEndpoint(url: URL): boolean { return url.protocol === "https:" && ALLOWED_FIRECRAWL_HOSTS.has(url.hostname); } @@ -487,6 +507,8 @@ export function parseFirecrawlScrapePayload(params: { export async function runFirecrawlScrape( params: FirecrawlScrapeParams, ): Promise> { + assertFirecrawlScrapeTargetAllowed(params.url); + const apiKey = resolveFirecrawlApiKey(params.cfg); if (!apiKey) { throw new Error( @@ -571,6 +593,7 @@ export async function runFirecrawlScrape( } export const __testing = { + assertFirecrawlScrapeTargetAllowed, parseFirecrawlScrapePayload, postFirecrawlJson, resolveEndpoint, diff --git a/extensions/firecrawl/src/firecrawl-tools.test.ts b/extensions/firecrawl/src/firecrawl-tools.test.ts index 7a4ebdf4df8..17b75e09088 100644 --- a/extensions/firecrawl/src/firecrawl-tools.test.ts +++ b/extensions/firecrawl/src/firecrawl-tools.test.ts @@ -36,6 +36,7 @@ describe("firecrawl tools", () => { let createFirecrawlSearchTool: typeof import("./firecrawl-search-tool.js").createFirecrawlSearchTool; let createFirecrawlScrapeTool: typeof import("./firecrawl-scrape-tool.js").createFirecrawlScrapeTool; let firecrawlClientTesting: typeof import("./firecrawl-client.js").__testing; + let runActualFirecrawlScrape: typeof import("./firecrawl-client.js").runFirecrawlScrape; let ssrfMock: { mockRestore: () => void } | undefined; beforeAll(async () => { @@ -44,7 +45,7 @@ describe("firecrawl tools", () => { ({ createFirecrawlWebSearchProvider } = await import("./firecrawl-search-provider.js")); ({ createFirecrawlSearchTool } = await import("./firecrawl-search-tool.js")); ({ createFirecrawlScrapeTool } = await import("./firecrawl-scrape-tool.js")); - ({ __testing: firecrawlClientTesting } = + ({ __testing: firecrawlClientTesting, runFirecrawlScrape: runActualFirecrawlScrape } = await vi.importActual("./firecrawl-client.js")); }); @@ -207,6 +208,61 @@ describe("firecrawl tools", () => { expect(authHeader).toBe("Bearer firecrawl-test-key"); }); + it("blocks private and non-http scrape targets before Firecrawl requests", async () => { + expect(() => + firecrawlClientTesting.assertFirecrawlScrapeTargetAllowed("https://example.com/page"), + ).not.toThrow(); + + for (const blockedUrl of [ + "http://localhost/admin", + "http://127.0.0.1/secret", + "http://10.0.0.5/secret", + "http://169.254.169.254/latest/meta-data/", + "http://metadata.google.internal/computeMetadata/v1/", + "file:///etc/passwd", + ]) { + expect(() => firecrawlClientTesting.assertFirecrawlScrapeTargetAllowed(blockedUrl)).toThrow( + /Blocked|non-HTTP/i, + ); + } + + try { + firecrawlClientTesting.assertFirecrawlScrapeTargetAllowed("not-a-valid-url?token=secret"); + expect.fail("Expected invalid URL to be blocked"); + } catch (error) { + expect((error as Error).message).toBe("Invalid URL supplied to Firecrawl scrape"); + expect((error as Error).message).not.toContain("token=secret"); + } + }); + + it("rejects blocked scrape targets before cache lookup or network fetch", async () => { + const fetchSpy = vi.fn(async () => new Response("should not be called")); + global.fetch = fetchSpy as typeof fetch; + + await expect( + runActualFirecrawlScrape({ + cfg: { + plugins: { + entries: { + firecrawl: { + config: { + webFetch: { + apiKey: "firecrawl-key", + baseUrl: "https://api.firecrawl.dev", + }, + }, + }, + }, + }, + } as OpenClawConfig, + url: "http://169.254.169.254/latest/meta-data/", + extractMode: "markdown", + }), + ).rejects.toThrow(/Blocked hostname or private\/internal IP/); + + expect(fetchSpy).not.toHaveBeenCalled(); + }); + it("maps generic provider args into firecrawl search params", async () => { const provider = createFirecrawlWebSearchProvider(); const tool = provider.createTool({