mirror of
https://github.com/openclaw/openclaw.git
synced 2026-05-06 05:20:43 +00:00
fix(web-search): support self-hosted Firecrawl
This commit is contained in:
@@ -8,11 +8,19 @@ import {
|
||||
resolveCacheTtlMs,
|
||||
truncateText,
|
||||
withStrictWebToolsEndpoint,
|
||||
withTrustedWebToolsEndpoint,
|
||||
writeCache,
|
||||
} from "openclaw/plugin-sdk/provider-web-fetch";
|
||||
import { normalizeSecretInput } from "openclaw/plugin-sdk/secret-input";
|
||||
import { wrapExternalContent, wrapWebContent } from "openclaw/plugin-sdk/security-runtime";
|
||||
import {
|
||||
isBlockedHostnameOrIp,
|
||||
isPrivateIpAddress,
|
||||
resolvePinnedHostnameWithPolicy,
|
||||
type LookupFn,
|
||||
} from "openclaw/plugin-sdk/ssrf-runtime";
|
||||
import {
|
||||
DEFAULT_FIRECRAWL_BASE_URL,
|
||||
resolveFirecrawlApiKey,
|
||||
resolveFirecrawlBaseUrl,
|
||||
resolveFirecrawlMaxAgeMs,
|
||||
@@ -32,6 +40,16 @@ const SCRAPE_CACHE = new Map<
|
||||
const DEFAULT_SEARCH_COUNT = 5;
|
||||
const DEFAULT_SCRAPE_MAX_CHARS = 50_000;
|
||||
const ALLOWED_FIRECRAWL_HOSTS = new Set(["api.firecrawl.dev"]);
|
||||
const FIRECRAWL_SELF_HOSTED_PRIVATE_ERROR =
|
||||
"Firecrawl custom baseUrl must target a private or internal self-hosted endpoint.";
|
||||
const FIRECRAWL_HTTP_PRIVATE_ERROR =
|
||||
"Firecrawl HTTP baseUrl must target a private or internal self-hosted endpoint. Use https:// for public hosts.";
|
||||
|
||||
type FirecrawlEndpointMode = "strict" | "trusted";
|
||||
type FirecrawlResolvedEndpoint = {
|
||||
url: string;
|
||||
mode: FirecrawlEndpointMode;
|
||||
};
|
||||
|
||||
type FirecrawlSearchItem = {
|
||||
title: string;
|
||||
@@ -64,25 +82,75 @@ export type FirecrawlScrapeParams = {
|
||||
timeoutSeconds?: number;
|
||||
};
|
||||
|
||||
function resolveEndpoint(baseUrl: string, pathname: "/v2/search" | "/v2/scrape"): string {
|
||||
const url = new URL(baseUrl.trim() || "https://api.firecrawl.dev");
|
||||
if (url.protocol !== "https:") {
|
||||
throw new Error("Firecrawl baseUrl must use https.");
|
||||
/**
 * Reports whether `url` is the hosted Firecrawl API.
 *
 * A URL qualifies only when it uses `https:` and its hostname is a member of
 * `ALLOWED_FIRECRAWL_HOSTS`.
 *
 * @param url - Parsed base URL to classify.
 * @returns `true` for an HTTPS URL on an allow-listed Firecrawl host.
 */
function isOfficialFirecrawlEndpoint(url: URL): boolean {
  // Plain-HTTP (or any other scheme) never counts as the official endpoint.
  if (url.protocol !== "https:") {
    return false;
  }
  return ALLOWED_FIRECRAWL_HOSTS.has(url.hostname);
}
|
||||
|
||||
/**
 * Decides whether a custom Firecrawl base URL points at a private/internal host.
 *
 * The hostname is first passed to `isBlockedHostnameOrIp`; a hit is treated as a
 * private target without resolving anything (presumably this covers loopback,
 * private-range literals and internal names — confirm against the SSRF runtime).
 * Otherwise the hostname is resolved through `resolvePinnedHostnameWithPolicy`
 * with private networks allowed, and the target counts as private only when the
 * lookup yields at least one address and every address passes `isPrivateIpAddress`.
 *
 * @param url - Parsed base URL; only `hostname` is read.
 * @param lookupFn - Optional DNS lookup override forwarded to the resolver.
 * @returns `true` when the host is considered private/internal; `false` otherwise,
 *   including when resolution throws.
 */
async function firecrawlEndpointTargetsPrivateNetwork(
  url: URL,
  lookupFn?: LookupFn,
): Promise<boolean> {
  if (isBlockedHostnameOrIp(url.hostname)) {
    return true;
  }

  try {
    const { addresses } = await resolvePinnedHostnameWithPolicy(url.hostname, {
      lookupFn,
      policy: { allowPrivateNetwork: true },
    });
    // `[].every(...)` is vacuously true, so an empty lookup result must be
    // rejected explicitly instead of being classified as a private target.
    return addresses.length > 0 && addresses.every((address) => isPrivateIpAddress(address));
  } catch {
    // Best effort: a host that cannot be resolved is reported as "not private"
    // so the caller rejects it with its own baseUrl error.
    return false;
  }
}
|
||||
|
||||
/**
 * Validates a configured Firecrawl base URL and selects the fetch-guard mode for it.
 *
 * An empty or whitespace-only `baseUrl` falls back to `DEFAULT_FIRECRAWL_BASE_URL`.
 *
 * @param baseUrl - Raw base URL from configuration.
 * @param lookupFn - Optional DNS lookup override used for the private-network check.
 * @returns `"strict"` for the official hosted endpoint, `"trusted"` for a custom
 *   endpoint that targets a private/internal host.
 * @throws Error when the value cannot be parsed as a URL, uses a scheme other than
 *   http/https, or is a custom endpoint that does not target a private/internal host.
 */
async function validateFirecrawlBaseUrl(
  baseUrl: string,
  lookupFn?: LookupFn,
): Promise<FirecrawlEndpointMode> {
  let target: URL;
  try {
    target = new URL(baseUrl.trim() || DEFAULT_FIRECRAWL_BASE_URL);
  } catch {
    throw new Error("Firecrawl baseUrl must be a valid http:// or https:// URL.");
  }

  const usesPlainHttp = target.protocol === "http:";
  if (!usesPlainHttp && target.protocol !== "https:") {
    throw new Error("Firecrawl baseUrl must use http:// or https://.");
  }

  // The hosted API keeps the strict guard; no DNS lookup is needed for it.
  if (isOfficialFirecrawlEndpoint(target)) {
    return "strict";
  }

  // Any other host is accepted only as a private/internal self-hosted instance.
  if (await firecrawlEndpointTargetsPrivateNetwork(target, lookupFn)) {
    return "trusted";
  }

  // Public custom hosts are rejected; the message depends on the scheme used.
  throw new Error(
    usesPlainHttp
      ? FIRECRAWL_HTTP_PRIVATE_ERROR
      : `${FIRECRAWL_SELF_HOSTED_PRIVATE_ERROR} Host: ${target.hostname}`,
  );
}
|
||||
|
||||
/**
 * Builds the full Firecrawl endpoint URL for `pathname` and reports which fetch
 * guard mode applies to it.
 *
 * The base URL is validated first so that malformed input is reported through
 * `validateFirecrawlBaseUrl`'s error messages rather than a raw `TypeError` from
 * the `URL` constructor. Credentials, query string and fragment from the base URL
 * are discarded, and any path on the base URL is replaced by `pathname`.
 *
 * @param baseUrl - Raw base URL from configuration; blank falls back to
 *   `DEFAULT_FIRECRAWL_BASE_URL`.
 * @param pathname - API route to target.
 * @param lookupFn - Optional DNS lookup override forwarded to validation.
 * @returns The normalized endpoint URL and its endpoint mode.
 * @throws Error when `validateFirecrawlBaseUrl` rejects the base URL.
 */
async function resolveEndpoint(
  baseUrl: string,
  pathname: "/v2/search" | "/v2/scrape",
  lookupFn?: LookupFn,
): Promise<FirecrawlResolvedEndpoint> {
  // Validate before parsing locally: validation parses the same trimmed value
  // and throws the descriptive error when it is not a usable URL.
  const mode = await validateFirecrawlBaseUrl(baseUrl, lookupFn);

  const url = new URL(baseUrl.trim() || DEFAULT_FIRECRAWL_BASE_URL);
  url.username = "";
  url.password = "";
  url.search = "";
  url.hash = "";
  url.pathname = pathname;

  return { url: url.toString(), mode };
}
|
||||
|
||||
async function postFirecrawlJson<T>(
|
||||
params: {
|
||||
url: string;
|
||||
mode?: FirecrawlEndpointMode;
|
||||
timeoutSeconds: number;
|
||||
apiKey: string;
|
||||
body: Record<string, unknown>;
|
||||
@@ -91,7 +159,10 @@ async function postFirecrawlJson<T>(
|
||||
parse: (response: Response) => Promise<T>,
|
||||
): Promise<T> {
|
||||
const apiKey = normalizeSecretInput(params.apiKey);
|
||||
return await withStrictWebToolsEndpoint(
|
||||
const mode = params.mode ?? (await validateFirecrawlBaseUrl(params.url));
|
||||
const withEndpoint =
|
||||
mode === "trusted" ? withTrustedWebToolsEndpoint : withStrictWebToolsEndpoint;
|
||||
return await withEndpoint(
|
||||
{
|
||||
url: params.url,
|
||||
timeoutSeconds: params.timeoutSeconds,
|
||||
@@ -304,9 +375,11 @@ export async function runFirecrawlSearch(
|
||||
}
|
||||
|
||||
const start = Date.now();
|
||||
const endpoint = await resolveEndpoint(baseUrl, "/v2/search");
|
||||
const payload = await postFirecrawlJson(
|
||||
{
|
||||
url: resolveEndpoint(baseUrl, "/v2/search"),
|
||||
url: endpoint.url,
|
||||
mode: endpoint.mode,
|
||||
timeoutSeconds,
|
||||
apiKey,
|
||||
body,
|
||||
@@ -448,9 +521,11 @@ export async function runFirecrawlScrape(
|
||||
return { ...cached.value, cached: true };
|
||||
}
|
||||
|
||||
const endpoint = await resolveEndpoint(baseUrl, "/v2/scrape");
|
||||
const payload = await postFirecrawlJson(
|
||||
{
|
||||
url: resolveEndpoint(baseUrl, "/v2/scrape"),
|
||||
url: endpoint.url,
|
||||
mode: endpoint.mode,
|
||||
timeoutSeconds,
|
||||
apiKey,
|
||||
errorLabel: "Firecrawl",
|
||||
@@ -499,5 +574,6 @@ export const __testing = {
|
||||
parseFirecrawlScrapePayload,
|
||||
postFirecrawlJson,
|
||||
resolveEndpoint,
|
||||
validateFirecrawlBaseUrl,
|
||||
resolveSearchItems,
|
||||
};
|
||||
|
||||
@@ -605,19 +605,64 @@ describe("firecrawl tools", () => {
|
||||
expect(resolveFirecrawlApiKey(cfg)).toBeUndefined();
|
||||
});
|
||||
|
||||
it("only allows the official Firecrawl API host for fetch endpoints", () => {
|
||||
expect(firecrawlClientTesting.resolveEndpoint("https://api.firecrawl.dev", "/v2/scrape")).toBe(
|
||||
"https://api.firecrawl.dev/v2/scrape",
|
||||
);
|
||||
expect(() =>
|
||||
// resolveEndpoint is async, so every expectation goes through `.resolves` / `.rejects`;
// a synchronous `expect(() => ...).toThrow(...)` would never observe a rejected promise.
it("allows hosted Firecrawl and private self-hosted endpoints only", async () => {
  // Accepted: the hosted API keeps the strict guard.
  await expect(
    firecrawlClientTesting.resolveEndpoint("https://api.firecrawl.dev", "/v2/scrape"),
  ).resolves.toEqual({
    url: "https://api.firecrawl.dev/v2/scrape",
    mode: "strict",
  });

  // Accepted: loopback self-hosted instance over plain HTTP.
  await expect(
    firecrawlClientTesting.resolveEndpoint("http://127.0.0.1:8787", "/v2/scrape"),
  ).resolves.toEqual({
    url: "http://127.0.0.1:8787/v2/scrape",
    mode: "trusted",
  });

  // Accepted: internal hostname; the path on the base URL is replaced by the route.
  await expect(
    firecrawlClientTesting.resolveEndpoint(
      "https://host.openshell.internal:444/v1",
      "/v2/search",
    ),
  ).resolves.toEqual({
    url: "https://host.openshell.internal:444/v2/search",
    mode: "trusted",
  });

  // Rejected: plain HTTP against a public host.
  await expect(
    firecrawlClientTesting.resolveEndpoint("http://api.firecrawl.dev", "/v2/scrape"),
  ).rejects.toThrow("Firecrawl HTTP baseUrl must target a private or internal");

  // Rejected: HTTPS custom host that is not private/internal.
  await expect(
    firecrawlClientTesting.resolveEndpoint("https://attacker.example", "/v2/search"),
  ).rejects.toThrow("Firecrawl custom baseUrl must target a private or internal");

  // Rejected: unsupported scheme, even on a loopback address.
  await expect(
    firecrawlClientTesting.resolveEndpoint("ftp://127.0.0.1:8787", "/v2/scrape"),
  ).rejects.toThrow("Firecrawl baseUrl must use http:// or https://.");
});
|
||||
|
||||
// Posts to a loopback Firecrawl URL without an explicit `mode` and checks that the
// request reaches fetch exactly once and the parsed payload is returned.
it("routes private self-hosted Firecrawl endpoints through the trusted fetch guard", async () => {
  // Replace any earlier resolution mock with one that yields 127.0.0.1
  // (presumably pins DNS for the SSRF guard — confirm against the test helper).
  ssrfMock?.mockRestore();
  ssrfMock = mockPinnedHostnameResolution(["127.0.0.1"]);

  const responseBody = JSON.stringify({ success: true, data: [] });
  const responseInit = {
    status: 200,
    headers: { "content-type": "application/json" },
  };
  // A fresh Response is built on every call because a body can only be read once.
  const fetchSpy = vi.fn(async () => new Response(responseBody, responseInit));
  global.fetch = fetchSpy as typeof fetch;

  const request = {
    url: "http://127.0.0.1:8787/v2/search",
    timeoutSeconds: 5,
    apiKey: "firecrawl-key",
    body: { query: "openclaw" },
    errorLabel: "Firecrawl Search",
  };
  const result = await firecrawlClientTesting.postFirecrawlJson(
    request,
    async (response) => (await response.json()) as Record<string, unknown>,
  );

  expect(fetchSpy).toHaveBeenCalledTimes(1);
  expect(result).toMatchObject({ success: true });
});
|
||||
|
||||
it("respects positive numeric overrides for scrape and cache behavior", () => {
|
||||
|
||||
Reference in New Issue
Block a user