mirror of
https://github.com/openclaw/openclaw.git
synced 2026-06-28 01:43:31 +00:00
docs: document web fetch helpers
This commit is contained in:
@@ -1,5 +1,12 @@
|
||||
import { sanitizeHtml, stripInvisibleUnicode } from "./web-fetch-visibility.js";
|
||||
|
||||
/**
|
||||
* Lightweight HTML/text extraction utilities for the web_fetch tool.
|
||||
*
|
||||
* This intentionally handles common markup without a heavy renderer so provider
|
||||
* responses stay bounded and deterministic.
|
||||
*/
|
||||
/** Output mode requested by web_fetch extraction. */
|
||||
export type ExtractMode = "markdown" | "text";
|
||||
|
||||
function decodeEntities(value: string): string {
|
||||
@@ -18,6 +25,7 @@ function stripTags(value: string): string {
|
||||
return decodeEntities(value.replace(/<[^>]+>/g, ""));
|
||||
}
|
||||
|
||||
/** Collapses display whitespace while preserving paragraph breaks. */
|
||||
export function normalizeWhitespace(value: string): string {
|
||||
return value
|
||||
.replace(/\r/g, "")
|
||||
@@ -27,6 +35,7 @@ export function normalizeWhitespace(value: string): string {
|
||||
.trim();
|
||||
}
|
||||
|
||||
/** Converts sanitized HTML into coarse markdown plus an optional title. */
|
||||
export function htmlToMarkdown(html: string): { text: string; title?: string } {
|
||||
const titleMatch = html.match(/<title[^>]*>([\s\S]*?)<\/title>/i);
|
||||
const title = titleMatch ? normalizeWhitespace(stripTags(titleMatch[1])) : undefined;
|
||||
@@ -39,6 +48,7 @@ export function htmlToMarkdown(html: string): { text: string; title?: string } {
|
||||
if (!label) {
|
||||
return href;
|
||||
}
|
||||
// Preserve link targets in markdown mode so fetched pages remain source-auditable.
|
||||
return `[${label}](${href})`;
|
||||
});
|
||||
text = text.replace(/<h([1-6])[^>]*>([\s\S]*?)<\/h\1>/gi, (_, level, body) => {
|
||||
@@ -58,6 +68,7 @@ export function htmlToMarkdown(html: string): { text: string; title?: string } {
|
||||
return { text, title };
|
||||
}
|
||||
|
||||
/** Removes markdown decoration for plain text extraction. */
|
||||
export function markdownToText(markdown: string): string {
|
||||
let text = markdown;
|
||||
text = text.replace(/!\[[^\]]*]\([^)]+\)/g, "");
|
||||
@@ -72,6 +83,7 @@ export function markdownToText(markdown: string): string {
|
||||
return normalizeWhitespace(text);
|
||||
}
|
||||
|
||||
/** Truncates text to maxChars UTF-16 code units (string slice) and reports whether truncation occurred. */
|
||||
export function truncateText(
|
||||
value: string,
|
||||
maxChars: number,
|
||||
@@ -82,6 +94,7 @@ export function truncateText(
|
||||
return { text: value.slice(0, maxChars), truncated: true };
|
||||
}
|
||||
|
||||
/** Sanitizes HTML and extracts either markdown or plain text content. */
|
||||
export async function extractBasicHtmlContent(params: {
|
||||
html: string;
|
||||
extractMode: ExtractMode;
|
||||
|
||||
@@ -12,6 +12,12 @@ import {
|
||||
} from "../../infra/net/ssrf.js";
|
||||
import { readPositiveIntegerParam } from "./common.js";
|
||||
|
||||
/**
|
||||
* Guarded fetch wrappers for web tools.
|
||||
*
|
||||
* These helpers apply SSRF policy, timeout normalization, and optional trusted
|
||||
* env proxy mode before tool-specific response handling runs.
|
||||
*/
|
||||
const WEB_TOOLS_SELF_HOSTED_NETWORK_SSRF_POLICY: SsrFPolicy = {
|
||||
dangerouslyAllowPrivateNetwork: true,
|
||||
allowRfc2544BenchmarkRange: true,
|
||||
@@ -45,6 +51,7 @@ function resolveTimeoutMs(params: {
|
||||
return undefined;
|
||||
}
|
||||
|
||||
/** Runs a guarded fetch with strict or trusted-env-proxy web tool policy. */
|
||||
export async function fetchWithWebToolsNetworkGuard(
|
||||
params: WebToolGuardedFetchOptions,
|
||||
): Promise<GuardedFetchResult> {
|
||||
@@ -72,6 +79,7 @@ async function withWebToolsNetworkGuard<T>(
|
||||
}
|
||||
}
|
||||
|
||||
/** Runs a fetch for trusted endpoints, allowing env proxy with pinned-host policy. */
|
||||
export async function withTrustedWebToolsEndpoint<T>(
|
||||
params: WebToolEndpointFetchOptions,
|
||||
run: (result: { response: Response; finalUrl: string }) => Promise<T>,
|
||||
@@ -87,6 +95,7 @@ export async function withTrustedWebToolsEndpoint<T>(
|
||||
);
|
||||
}
|
||||
|
||||
/** Runs a fetch for configured self-hosted endpoints with private-network access allowed. */
|
||||
export async function withSelfHostedWebToolsEndpoint<T>(
|
||||
params: WebToolEndpointFetchOptions,
|
||||
run: (result: { response: Response; finalUrl: string }) => Promise<T>,
|
||||
@@ -101,6 +110,7 @@ export async function withSelfHostedWebToolsEndpoint<T>(
|
||||
);
|
||||
}
|
||||
|
||||
/** Runs a fetch under strict SSRF protection without env proxy trust. */
|
||||
export async function withStrictWebToolsEndpoint<T>(
|
||||
params: WebToolEndpointFetchOptions,
|
||||
run: (result: { response: Response; finalUrl: string }) => Promise<T>,
|
||||
|
||||
@@ -1,2 +1,8 @@
|
||||
/**
|
||||
* Barrel for web_fetch and web_search tool factories.
|
||||
*
|
||||
* Higher-level tool assembly imports this narrow module so tests can mock both
|
||||
* web tools together without loading provider-specific implementations.
|
||||
*/
|
||||
export { createWebFetchTool } from "./web-fetch.js";
|
||||
export { createWebSearchTool } from "./web-search.js";
|
||||
|
||||
Reference in New Issue
Block a user