test: decouple web fetch fallbacks from provider startup

This commit is contained in:
Peter Steinberger
2026-04-05 22:01:49 +01:00
parent 8279375bdf
commit aa464f8573
5 changed files with 145 additions and 149 deletions

View File

@@ -10,6 +10,7 @@ import {
withStrictWebToolsEndpoint,
writeCache,
} from "openclaw/plugin-sdk/provider-web-fetch";
import { normalizeSecretInput } from "openclaw/plugin-sdk/secret-input";
import { wrapExternalContent, wrapWebContent } from "openclaw/plugin-sdk/security-runtime";
import {
resolveFirecrawlApiKey,
@@ -89,6 +90,7 @@ async function postFirecrawlJson<T>(
},
parse: (response: Response) => Promise<T>,
): Promise<T> {
const apiKey = normalizeSecretInput(params.apiKey);
return await withStrictWebToolsEndpoint(
{
url: params.url,
@@ -96,7 +98,7 @@ async function postFirecrawlJson<T>(
init: {
method: "POST",
headers: {
Authorization: `Bearer ${params.apiKey}`,
Authorization: `Bearer ${apiKey}`,
"Content-Type": "application/json",
},
body: JSON.stringify(params.body),

View File

@@ -176,6 +176,32 @@ describe("firecrawl tools", () => {
).rejects.toThrow(/<<<EXTERNAL_UNTRUSTED_CONTENT id="[a-f0-9]{16}">>>/);
});
// Regression test: an API key containing stray CR/LF characters must be normalized
// before it is interpolated into the `Authorization: Bearer …` header.
it("normalizes Firecrawl authorization headers before requests", async () => {
  // Declare the fetch parameters explicitly so `mock.calls[n]` is typed as
  // `[input, init?]`; a zero-argument implementation makes the recorded call tuple empty
  // and indexing `[1]` on it is rejected by the type checker.
  const fetchSpy = vi.fn(
    async (_input: RequestInfo | URL, _init?: RequestInit) =>
      new Response(JSON.stringify({ success: true, data: [] }), {
        status: 200,
        headers: { "content-type": "application/json" },
      }),
  );
  const originalFetch = global.fetch;
  global.fetch = fetchSpy as typeof fetch;
  try {
    await firecrawlClientTesting.postFirecrawlJson(
      {
        url: "https://api.firecrawl.dev/v2/search",
        timeoutSeconds: 5,
        apiKey: "firecrawl-test-\r\nkey",
        body: { query: "openclaw" },
        errorLabel: "Firecrawl search",
      },
      async () => "ok",
    );
  } finally {
    // Put the previous fetch back so the stub cannot leak into later tests.
    global.fetch = originalFetch;
  }
  // The second fetch argument is the RequestInit carrying the outgoing headers.
  const init = fetchSpy.mock.calls[0]?.[1];
  const authHeader = new Headers(init?.headers).get("Authorization");
  expect(authHeader).toBe("Bearer firecrawl-test-key");
});
it("maps generic provider args into firecrawl search params", async () => {
const provider = createFirecrawlWebSearchProvider();
const tool = provider.createTool({

View File

@@ -2,6 +2,7 @@ import { afterEach, beforeEach, describe, expect, it, vi } from "vitest";
import * as ssrf from "../../infra/net/ssrf.js";
import { type FetchMock, withFetchPreconnect } from "../../test-utils/fetch-mock.js";
import { makeFetchHeaders } from "./web-fetch.test-harness.js";
import "./web-fetch.test-mocks.js";
const lookupMock = vi.fn();
const resolvePinnedHostname = ssrf.resolvePinnedHostname;

View File

@@ -244,7 +244,7 @@ type WebFetchRuntimeParams = {
cacheTtlMs: number;
userAgent: string;
readabilityEnabled: boolean;
providerFallback: ReturnType<typeof resolveWebFetchDefinition>;
resolveProviderFallback: () => ReturnType<typeof resolveWebFetchDefinition>;
};
function isRecord(value: unknown): value is Record<string, unknown> {
@@ -341,16 +341,17 @@ async function maybeFetchProviderWebFetchPayload(
tookMs: number;
},
): Promise<Record<string, unknown> | null> {
if (!params.providerFallback) {
const providerFallback = params.resolveProviderFallback();
if (!providerFallback) {
return null;
}
const rawPayload = await params.providerFallback.definition.execute({
const rawPayload = await providerFallback.definition.execute({
url: params.urlToFetch,
extractMode: params.extractMode,
maxChars: params.maxChars,
});
const payload = normalizeProviderWebFetchPayload({
providerId: params.providerFallback.provider.id,
providerId: providerFallback.provider.id,
payload: rawPayload,
requestedUrl: params.url,
extractMode: params.extractMode,
@@ -498,7 +499,8 @@ async function runWebFetch(params: WebFetchRuntimeParams): Promise<Record<string
title = basic.title;
extractor = "raw-html";
} else {
const providerLabel = params.providerFallback?.provider.label ?? "provider fallback";
const providerLabel =
params.resolveProviderFallback()?.provider.label ?? "provider fallback";
throw new Error(
`Web fetch extraction failed: Readability, ${providerLabel}, and basic HTML cleanup returned no content.`,
);
@@ -572,16 +574,24 @@ export function createWebFetchTool(options?: {
return null;
}
const readabilityEnabled = resolveFetchReadabilityEnabled(fetch);
const providerFallback = resolveWebFetchDefinition({
config: options?.config,
sandboxed: options?.sandboxed,
runtimeWebFetch: options?.runtimeWebFetch,
preferRuntimeProviders: true,
});
const userAgent =
(fetch && "userAgent" in fetch && typeof fetch.userAgent === "string" && fetch.userAgent) ||
DEFAULT_FETCH_USER_AGENT;
const maxResponseBytes = resolveFetchMaxResponseBytes(fetch);
// Lazily resolve the provider fallback: nothing is looked up while the tool is being
// created, and the first successful lookup (even one that yields no provider) is reused
// by every later call.
let providerFallbackMemo: { value: ReturnType<typeof resolveWebFetchDefinition> } | undefined;
const resolveProviderFallback = () => {
  if (providerFallbackMemo === undefined) {
    // Wrapping the result in an object distinguishes "not resolved yet" from a resolved
    // value that is itself empty.
    providerFallbackMemo = {
      value: resolveWebFetchDefinition({
        config: options?.config,
        sandboxed: options?.sandboxed,
        runtimeWebFetch: options?.runtimeWebFetch,
        preferRuntimeProviders: true,
      }),
    };
  }
  return providerFallbackMemo.value;
};
return {
label: "Web Fetch",
name: "web_fetch",
@@ -608,7 +618,7 @@ export function createWebFetchTool(options?: {
cacheTtlMs: resolveCacheTtlMs(fetch?.cacheTtlMinutes, DEFAULT_CACHE_TTL_MINUTES),
userAgent,
readabilityEnabled,
providerFallback,
resolveProviderFallback,
});
return jsonResult(result);
},

View File

@@ -4,6 +4,22 @@ import * as ssrf from "../../infra/net/ssrf.js";
import { resolveRequestUrl } from "../../plugin-sdk/request-url.js";
import { withFetchPreconnect } from "../../test-utils/fetch-mock.js";
import { makeFetchHeaders } from "./web-fetch.test-harness.js";
// Create the mock functions through vi.hoisted so they already exist when the vi.mock
// factories below run (vitest hoists vi.mock calls above the file's imports).
const { extractReadableContentMock, resolveWebFetchDefinitionMock } = vi.hoisted(() => ({
extractReadableContentMock: vi.fn(),
resolveWebFetchDefinitionMock: vi.fn(),
}));
// Partial mock: keep every real export of web-fetch-utils and swap out only
// extractReadableContent, so each test controls what the readability step returns.
vi.mock("./web-fetch-utils.js", async () => {
const actual =
await vi.importActual<typeof import("./web-fetch-utils.js")>("./web-fetch-utils.js");
return {
...actual,
extractReadableContent: extractReadableContentMock,
};
});
// Replace the web-fetch runtime module with one that exports only a mocked
// resolveWebFetchDefinition, so each test decides which provider fallback (if any) exists.
vi.mock("../../web-fetch/runtime.js", () => ({
resolveWebFetchDefinition: resolveWebFetchDefinitionMock,
}));
import { createWebFetchTool } from "./web-tools.js";
type MockResponse = {
@@ -25,30 +41,6 @@ function htmlResponse(html: string, url = "https://example.com/"): MockResponse
};
}
const apiKeyField = ["api", "Key"].join("");
function firecrawlResponse(markdown: string, url = "https://example.com/"): MockResponse {
return {
ok: true,
status: 200,
json: async () => ({
success: true,
data: {
markdown,
metadata: { title: "Firecrawl Title", sourceURL: url, statusCode: 200 },
},
}),
};
}
function firecrawlError(): MockResponse {
return {
ok: false,
status: 403,
json: async () => ({ success: false, error: "blocked" }),
};
}
function textResponse(
text: string,
url = "https://example.com/",
@@ -115,12 +107,8 @@ function installPlainTextFetch(text: string) {
);
}
function createFirecrawlTool(apiKey = defaultFirecrawlApiKey()) {
return createFetchTool({ firecrawl: { [apiKeyField]: apiKey } });
}
function defaultFirecrawlApiKey() {
return "firecrawl-test"; // pragma: allowlist secret
function createProviderFallbackTool() {
return createFetchTool();
}
function withoutAmbientFirecrawlEnv() {
@@ -151,6 +139,10 @@ describe("web_fetch extraction fallbacks", () => {
beforeEach(() => {
withoutAmbientFirecrawlEnv();
extractReadableContentMock.mockReset();
extractReadableContentMock.mockResolvedValue(null);
resolveWebFetchDefinitionMock.mockReset();
resolveWebFetchDefinitionMock.mockReturnValue(null);
vi.spyOn(ssrf, "resolvePinnedHostname").mockImplementation(async (hostname) => {
const normalized = hostname.trim().toLowerCase().replace(/\.$/, "");
const addresses = ["93.184.216.34", "93.184.216.35"];
@@ -284,77 +276,31 @@ describe("web_fetch extraction fallbacks", () => {
// NOTE: Test for wrapping url/finalUrl/warning fields requires DNS mocking.
// The sanitization of these fields is verified by external-content.test.ts tests.
it("falls back to firecrawl when readability returns no content", async () => {
it("falls back to a configured provider when readability returns no content", async () => {
installMockFetch((input: RequestInfo | URL) => {
const url = resolveRequestUrl(input);
if (url.includes("api.firecrawl.dev")) {
return Promise.resolve(firecrawlResponse("firecrawl content")) as Promise<Response>;
}
return Promise.resolve(
htmlResponse("<!doctype html><html><head></head><body></body></html>", url),
) as Promise<Response>;
});
const tool = createFirecrawlTool();
resolveWebFetchDefinitionMock.mockReturnValue({
provider: { id: "test-fetch", label: "Test Fetch" },
definition: {
description: "test provider",
parameters: {},
execute: async () => ({
extractor: "test-fetch",
text: "provider content",
}),
},
});
const tool = createProviderFallbackTool();
const result = await executeFetch(tool, { url: "https://example.com/empty" });
const details = result?.details as { extractor?: string; text?: string };
expect(details.extractor).toBe("firecrawl");
expect(details.text).toContain("firecrawl content");
});
it("normalizes firecrawl Authorization header values", async () => {
const fetchSpy = installMockFetch((input: RequestInfo | URL) => {
const url = resolveRequestUrl(input);
if (url.includes("api.firecrawl.dev/v2/scrape")) {
return Promise.resolve(firecrawlResponse("firecrawl normalized")) as Promise<Response>;
}
return Promise.resolve(
htmlResponse("<!doctype html><html><head></head><body></body></html>", url),
) as Promise<Response>;
});
const tool = createFirecrawlTool("firecrawl-test-\r\nkey");
const result = await executeFetch(tool, {
url: "https://example.com/firecrawl",
extractMode: "text",
});
expect(result?.details).toMatchObject({ extractor: "firecrawl" });
const firecrawlCall = fetchSpy.mock.calls.find((call) =>
resolveRequestUrl(call[0]).includes("/v2/scrape"),
);
expect(firecrawlCall).toBeTruthy();
const init = firecrawlCall?.[1];
const authHeader = new Headers(init?.headers).get("Authorization");
expect(authHeader).toBe("Bearer firecrawl-test-key");
});
it("uses guarded endpoint fetch for firecrawl requests", async () => {
vi.stubEnv("HTTP_PROXY", "http://127.0.0.1:7890");
const fetchSpy = installMockFetch((input: RequestInfo | URL) => {
const url = resolveRequestUrl(input);
if (url.includes("api.firecrawl.dev/v2/scrape")) {
return Promise.resolve(
firecrawlResponse("firecrawl guarded transport"),
) as Promise<Response>;
}
return Promise.resolve(
htmlResponse("<!doctype html><html><head></head><body></body></html>", url),
) as Promise<Response>;
});
const tool = createFirecrawlTool();
const result = await executeFetch(tool, { url: "https://example.com/guarded-firecrawl" });
expect(result?.details).toMatchObject({ extractor: "firecrawl" });
const firecrawlCall = fetchSpy.mock.calls.find((call) =>
resolveRequestUrl(call[0]).includes("/v2/scrape"),
);
expect(firecrawlCall).toBeTruthy();
const requestInit = firecrawlCall?.[1] as (RequestInit & { dispatcher?: unknown }) | undefined;
expect(requestInit?.dispatcher).toBeDefined();
expect(requestInit?.dispatcher).toHaveProperty("dispatch");
expect(details.extractor).toBe("test-fetch");
expect(details.text).toContain("provider content");
});
it("throws when readability is disabled and firecrawl is unavailable", async () => {
@@ -375,21 +321,32 @@ describe("web_fetch extraction fallbacks", () => {
).rejects.toThrow("Readability disabled");
});
it("throws when readability is empty and firecrawl fails", async () => {
installMockFetch((input: RequestInfo | URL) => {
const url = resolveRequestUrl(input);
if (url.includes("api.firecrawl.dev")) {
return Promise.resolve(firecrawlError()) as Promise<Response>;
}
return Promise.resolve(
htmlResponse("<!doctype html><html><head></head><body></body></html>", url),
) as Promise<Response>;
it("throws when readability is empty and the provider fallback yields no content", async () => {
installMockFetch(
(input: RequestInfo | URL) =>
Promise.resolve(
htmlResponse(
"<!doctype html><html><head></head><body></body></html>",
resolveRequestUrl(input),
),
) as Promise<Response>,
);
resolveWebFetchDefinitionMock.mockReturnValue({
provider: { id: "test-fetch", label: "Test Fetch" },
definition: {
description: "test provider",
parameters: {},
execute: async () => {
throw new Error("provider returned no content");
},
},
});
const tool = createFirecrawlTool();
const tool = createProviderFallbackTool();
await expect(
executeFetch(tool, { url: "https://example.com/readability-empty" }),
).rejects.toThrow("Readability, Firecrawl, and basic HTML cleanup returned no content");
).rejects.toThrow("Readability, Test Fetch, and basic HTML cleanup returned no content");
});
it("falls back to basic HTML cleanup after readability and before giving up", async () => {
@@ -414,12 +371,8 @@ describe("web_fetch extraction fallbacks", () => {
expect(details.title).toContain("Shell App");
});
it("uses firecrawl when direct fetch fails", async () => {
installMockFetch((input: RequestInfo | URL) => {
const url = resolveRequestUrl(input);
if (url.includes("api.firecrawl.dev")) {
return Promise.resolve(firecrawlResponse("firecrawl fallback", url)) as Promise<Response>;
}
it("uses the provider fallback when direct fetch fails", async () => {
installMockFetch((_input: RequestInfo | URL) => {
return Promise.resolve({
ok: false,
status: 403,
@@ -428,14 +381,23 @@ describe("web_fetch extraction fallbacks", () => {
} as Response);
});
const tool = createFetchTool({
firecrawl: { apiKey: "firecrawl-test" }, // pragma: allowlist secret
resolveWebFetchDefinitionMock.mockReturnValue({
provider: { id: "test-fetch", label: "Test Fetch" },
definition: {
description: "test provider",
parameters: {},
execute: async () => ({
extractor: "test-fetch",
text: "provider fallback",
}),
},
});
const tool = createProviderFallbackTool();
const result = await tool?.execute?.("call", { url: "https://example.com/blocked" });
const details = result?.details as { extractor?: string; text?: string };
expect(details.extractor).toBe("firecrawl");
expect(details.text).toContain("firecrawl fallback");
expect(details.extractor).toBe("test-fetch");
expect(details.text).toContain("provider fallback");
});
it("wraps external content and clamps oversized maxChars", async () => {
@@ -509,30 +471,25 @@ describe("web_fetch extraction fallbacks", () => {
expect(message).toContain("Oops");
});
it("wraps firecrawl error details", async () => {
installMockFetch((input: RequestInfo | URL) => {
const url = resolveRequestUrl(input);
if (url.includes("api.firecrawl.dev")) {
return Promise.resolve({
ok: false,
status: 403,
json: async () => ({ success: false, error: "blocked" }),
} as Response);
}
return Promise.reject(new Error("network down"));
it("surfaces provider fallback errors when direct fetch throws", async () => {
installMockFetch(() => Promise.reject(new Error("network down")));
resolveWebFetchDefinitionMock.mockReturnValue({
provider: { id: "test-fetch", label: "Test Fetch" },
definition: {
description: "test provider",
parameters: {},
execute: async () => {
throw new Error("provider fallback failed");
},
},
});
const tool = createFetchTool({
firecrawl: { apiKey: "firecrawl-test" }, // pragma: allowlist secret
});
const message = await captureToolErrorMessage({
tool,
url: "https://example.com/firecrawl-error",
});
expect(message).toContain("Firecrawl API error (403):");
expect(message).toMatch(/<<<EXTERNAL_UNTRUSTED_CONTENT id="[a-f0-9]{16}">>>/);
expect(message).toContain("blocked");
const tool = createProviderFallbackTool();
await expect(
captureToolErrorMessage({
tool,
url: "https://example.com/provider-error",
}),
).resolves.toContain("provider fallback failed");
});
});