fix(firecrawl): block unsafe scrape targets

This commit is contained in:
Peter Steinberger
2026-05-02 07:24:34 +01:00
parent cdd8e81075
commit 189ab9f5d1
4 changed files with 82 additions and 1 deletions

View File

@@ -36,6 +36,7 @@ Docs: https://docs.openclaw.ai
- Web search/SearXNG: pass through `img_src` image URLs from SearXNG image-category results. Supersedes #61416. Thanks @sghael.
- Web search/Kimi: fail explicitly when Moonshot returns an ungrounded chat answer instead of native web-search evidence, so Kimi no longer reports generic fallback text as a successful search. Fixes #52573. Thanks @wangwllu.
- Web search: keep public provider requests on the strict SSRF guard and reserve private-network access for explicit self-hosted SearXNG/Firecrawl endpoints. Fixes #74357 and supersedes #74360. Thanks @fede-kamel.
- Firecrawl: reject private, loopback, metadata, and non-HTTP(S) `firecrawl_scrape` target URLs before forwarding them to Firecrawl. Supersedes #48133. Thanks @kn1ghtc.
- Web search/Firecrawl: allow self-hosted private/internal Firecrawl `baseUrl` endpoints, including HTTP for private targets, while keeping hosted Firecrawl on the strict official endpoint. Fixes #63877 and supersedes #59666, #63941, and #74013. Thanks @jhthompson12, @jzakirov, @Mlightsnow, and @shad0wca7.
- Providers/OpenRouter: strip trailing assistant prefill turns from verified OpenRouter Anthropic model requests when reasoning is enabled, so Claude 4.6 routes no longer fail with Anthropic's prefill rejection through the OpenAI-compatible adapter. Fixes #75395. Thanks @sbmilburn.
- Feishu: preserve Feishu/Lark HTTP error bodies for message sends, media sends, and chat member lookups, so HTTP 400 failures include vendor code, message, log id, and troubleshooter details. Fixes #73860. Thanks @desksk.

View File

@@ -86,6 +86,7 @@ Notes:
- `maxAgeMs` controls how old cached results can be (ms). Default is 2 days.
- Legacy `tools.web.fetch.firecrawl.*` config is auto-migrated by `openclaw doctor --fix`.
- Firecrawl scrape/base URL overrides follow the same hosted/private rule as search: public hosted traffic uses `https://api.firecrawl.dev`; self-hosted overrides must resolve to private/internal endpoints.
- `firecrawl_scrape` rejects obvious private, loopback, metadata, and non-HTTP(S) target URLs before forwarding them to Firecrawl, matching the `web_fetch` target-safety contract for explicit Firecrawl scrape calls.
- `firecrawl_scrape` reuses the same `plugins.entries.firecrawl.config.webFetch.*` settings and env vars.

View File

@@ -14,6 +14,7 @@ import {
import { normalizeSecretInput } from "openclaw/plugin-sdk/secret-input";
import { wrapExternalContent, wrapWebContent } from "openclaw/plugin-sdk/security-runtime";
import {
SsrFBlockedError,
isBlockedHostnameOrIp,
isPrivateIpAddress,
resolvePinnedHostnameWithPolicy,
@@ -82,6 +83,25 @@ export type FirecrawlScrapeParams = {
timeoutSeconds?: number;
};
/**
 * Validate a user-supplied Firecrawl scrape target before it is forwarded.
 *
 * Rejects URLs that fail to parse, use a non-HTTP(S) scheme, or whose
 * hostname/IP is rejected by the shared SSRF guard (`isBlockedHostnameOrIp`).
 *
 * @param url - Raw target URL as supplied by the caller.
 * @throws SsrFBlockedError when the target is not allowed. The invalid-URL
 *   message deliberately omits the raw input so any secrets embedded in a
 *   malformed URL are not echoed back.
 */
export function assertFirecrawlScrapeTargetAllowed(url: string): void {
  let target: URL;
  try {
    target = new URL(url);
  } catch {
    throw new SsrFBlockedError("Invalid URL supplied to Firecrawl scrape");
  }
  const isHttpScheme = target.protocol === "http:" || target.protocol === "https:";
  if (!isHttpScheme) {
    throw new SsrFBlockedError(
      `Blocked non-HTTP(S) protocol in Firecrawl scrape URL: ${target.protocol}`,
    );
  }
  if (isBlockedHostnameOrIp(target.hostname)) {
    throw new SsrFBlockedError(
      `Blocked hostname or private/internal IP in Firecrawl scrape URL: ${target.hostname}`,
    );
  }
}
/** True when `url` is the hosted Firecrawl API endpoint: HTTPS and an allow-listed host. */
function isOfficialFirecrawlEndpoint(url: URL): boolean {
  if (url.protocol !== "https:") {
    return false;
  }
  return ALLOWED_FIRECRAWL_HOSTS.has(url.hostname);
}
@@ -487,6 +507,8 @@ export function parseFirecrawlScrapePayload(params: {
export async function runFirecrawlScrape(
params: FirecrawlScrapeParams,
): Promise<Record<string, unknown>> {
assertFirecrawlScrapeTargetAllowed(params.url);
const apiKey = resolveFirecrawlApiKey(params.cfg);
if (!apiKey) {
throw new Error(
@@ -571,6 +593,7 @@ export async function runFirecrawlScrape(
}
export const __testing = {
assertFirecrawlScrapeTargetAllowed,
parseFirecrawlScrapePayload,
postFirecrawlJson,
resolveEndpoint,

View File

@@ -36,6 +36,7 @@ describe("firecrawl tools", () => {
let createFirecrawlSearchTool: typeof import("./firecrawl-search-tool.js").createFirecrawlSearchTool;
let createFirecrawlScrapeTool: typeof import("./firecrawl-scrape-tool.js").createFirecrawlScrapeTool;
let firecrawlClientTesting: typeof import("./firecrawl-client.js").__testing;
let runActualFirecrawlScrape: typeof import("./firecrawl-client.js").runFirecrawlScrape;
let ssrfMock: { mockRestore: () => void } | undefined;
beforeAll(async () => {
@@ -44,7 +45,7 @@ describe("firecrawl tools", () => {
({ createFirecrawlWebSearchProvider } = await import("./firecrawl-search-provider.js"));
({ createFirecrawlSearchTool } = await import("./firecrawl-search-tool.js"));
({ createFirecrawlScrapeTool } = await import("./firecrawl-scrape-tool.js"));
({ __testing: firecrawlClientTesting } =
({ __testing: firecrawlClientTesting, runFirecrawlScrape: runActualFirecrawlScrape } =
await vi.importActual<typeof import("./firecrawl-client.js")>("./firecrawl-client.js"));
});
@@ -207,6 +208,61 @@ describe("firecrawl tools", () => {
expect(authHeader).toBe("Bearer firecrawl-test-key");
});
it("blocks private and non-http scrape targets before Firecrawl requests", async () => {
  const { assertFirecrawlScrapeTargetAllowed } = firecrawlClientTesting;
  // A plain public HTTPS URL passes the guard untouched.
  expect(() => assertFirecrawlScrapeTargetAllowed("https://example.com/page")).not.toThrow();
  // Loopback, private-range, cloud-metadata, and non-HTTP(S) targets are all rejected.
  const blockedTargets = [
    "http://localhost/admin",
    "http://127.0.0.1/secret",
    "http://10.0.0.5/secret",
    "http://169.254.169.254/latest/meta-data/",
    "http://metadata.google.internal/computeMetadata/v1/",
    "file:///etc/passwd",
  ];
  for (const target of blockedTargets) {
    expect(() => assertFirecrawlScrapeTargetAllowed(target)).toThrow(/Blocked|non-HTTP/i);
  }
  // Unparsable URLs yield a fixed message that never echoes the raw input.
  try {
    assertFirecrawlScrapeTargetAllowed("not-a-valid-url?token=secret");
    expect.fail("Expected invalid URL to be blocked");
  } catch (error) {
    expect((error as Error).message).toBe("Invalid URL supplied to Firecrawl scrape");
    expect((error as Error).message).not.toContain("token=secret");
  }
});
it("rejects blocked scrape targets before cache lookup or network fetch", async () => {
  // Any call into fetch would mean the SSRF guard ran too late.
  const fetchSpy = vi.fn(async () => new Response("should not be called"));
  global.fetch = fetchSpy as typeof fetch;
  const cfg = {
    plugins: {
      entries: {
        firecrawl: {
          config: {
            webFetch: {
              apiKey: "firecrawl-key",
              baseUrl: "https://api.firecrawl.dev",
            },
          },
        },
      },
    },
  } as OpenClawConfig;
  await expect(
    runActualFirecrawlScrape({
      cfg,
      url: "http://169.254.169.254/latest/meta-data/",
      extractMode: "markdown",
    }),
  ).rejects.toThrow(/Blocked hostname or private\/internal IP/);
  expect(fetchSpy).not.toHaveBeenCalled();
});
it("maps generic provider args into firecrawl search params", async () => {
const provider = createFirecrawlWebSearchProvider();
const tool = provider.createTool({