docs: document web fetch helpers

This commit is contained in:
Peter Steinberger
2026-06-04 00:30:14 -04:00
parent 4cd8b5eb78
commit 9c10ef2ffa
3 changed files with 29 additions and 0 deletions

View File

@@ -1,5 +1,12 @@
import { sanitizeHtml, stripInvisibleUnicode } from "./web-fetch-visibility.js";
/**
* Lightweight HTML/text extraction utilities for the web_fetch tool.
*
* This intentionally handles common markup without a heavy renderer so provider
* responses stay bounded and deterministic.
*/
/** Output mode requested by web_fetch extraction: coarse markdown or plain text. */
export type ExtractMode = "markdown" | "text";
function decodeEntities(value: string): string {
@@ -18,6 +25,7 @@ function stripTags(value: string): string {
return decodeEntities(value.replace(/<[^>]+>/g, ""));
}
/** Collapses display whitespace while preserving paragraph breaks. */
export function normalizeWhitespace(value: string): string {
return value
.replace(/\r/g, "")
@@ -27,6 +35,7 @@ export function normalizeWhitespace(value: string): string {
.trim();
}
/** Converts sanitized HTML into coarse markdown plus an optional title. */
export function htmlToMarkdown(html: string): { text: string; title?: string } {
const titleMatch = html.match(/<title[^>]*>([\s\S]*?)<\/title>/i);
const title = titleMatch ? normalizeWhitespace(stripTags(titleMatch[1])) : undefined;
@@ -39,6 +48,7 @@ export function htmlToMarkdown(html: string): { text: string; title?: string } {
if (!label) {
return href;
}
// Preserve link targets in markdown mode so fetched pages remain source-auditable.
return `[${label}](${href})`;
});
text = text.replace(/<h([1-6])[^>]*>([\s\S]*?)<\/h\1>/gi, (_, level, body) => {
@@ -58,6 +68,7 @@ export function htmlToMarkdown(html: string): { text: string; title?: string } {
return { text, title };
}
/** Removes markdown decoration for plain text extraction. */
export function markdownToText(markdown: string): string {
let text = markdown;
text = text.replace(/!\[[^\]]*]\([^)]+\)/g, "");
@@ -72,6 +83,7 @@ export function markdownToText(markdown: string): string {
return normalizeWhitespace(text);
}
/** Truncates text by UTF-16 code units (so a cut may split a surrogate pair) and reports whether truncation occurred. */
export function truncateText(
value: string,
maxChars: number,
@@ -82,6 +94,7 @@ export function truncateText(
return { text: value.slice(0, maxChars), truncated: true };
}
/** Sanitizes HTML and extracts either markdown or plain text content. */
export async function extractBasicHtmlContent(params: {
html: string;
extractMode: ExtractMode;

View File

@@ -12,6 +12,12 @@ import {
} from "../../infra/net/ssrf.js";
import { readPositiveIntegerParam } from "./common.js";
/**
* Guarded fetch wrappers for web tools.
*
* These helpers apply SSRF policy, timeout normalization, and optional trusted
* env proxy mode before tool-specific response handling runs.
*/
const WEB_TOOLS_SELF_HOSTED_NETWORK_SSRF_POLICY: SsrFPolicy = {
dangerouslyAllowPrivateNetwork: true,
allowRfc2544BenchmarkRange: true,
@@ -45,6 +51,7 @@ function resolveTimeoutMs(params: {
return undefined;
}
/** Runs a guarded fetch with strict or trusted-env-proxy web tool policy. */
export async function fetchWithWebToolsNetworkGuard(
params: WebToolGuardedFetchOptions,
): Promise<GuardedFetchResult> {
@@ -72,6 +79,7 @@ async function withWebToolsNetworkGuard<T>(
}
}
/** Runs a fetch for trusted endpoints, allowing env proxy with pinned-host policy. */
export async function withTrustedWebToolsEndpoint<T>(
params: WebToolEndpointFetchOptions,
run: (result: { response: Response; finalUrl: string }) => Promise<T>,
@@ -87,6 +95,7 @@ export async function withTrustedWebToolsEndpoint<T>(
);
}
/** Runs a fetch for configured self-hosted endpoints with private-network access allowed. */
export async function withSelfHostedWebToolsEndpoint<T>(
params: WebToolEndpointFetchOptions,
run: (result: { response: Response; finalUrl: string }) => Promise<T>,
@@ -101,6 +110,7 @@ export async function withSelfHostedWebToolsEndpoint<T>(
);
}
/** Runs a fetch under strict SSRF protection without env proxy trust. */
export async function withStrictWebToolsEndpoint<T>(
params: WebToolEndpointFetchOptions,
run: (result: { response: Response; finalUrl: string }) => Promise<T>,

View File

@@ -1,2 +1,8 @@
/**
* Barrel for web_fetch and web_search tool factories.
*
* Higher-level tool assembly imports this narrow module so tests can mock both
* web tools together without loading provider-specific implementations.
*/
export { createWebFetchTool } from "./web-fetch.js";
export { createWebSearchTool } from "./web-search.js";