/*
 * Mirror of https://github.com/openclaw/openclaw.git
 * Synced 2026-05-06 05:10:44 +00:00
 *
 *   - refactor(web-fetch): move readability extraction to plugin
 *   - fix(web-fetch): cache extractor resolution by config
 *   - fix(test): remove redundant stat assertions
 *
 * 212 lines, 4.8 KiB, TypeScript
 */
import type {
  WebContentExtractionRequest,
  WebContentExtractionResult,
  WebContentExtractorPlugin,
} from "openclaw/plugin-sdk/web-content-extractor";
import {
  htmlToMarkdown,
  normalizeWhitespace,
  sanitizeHtml,
  stripInvisibleUnicode,
} from "openclaw/plugin-sdk/web-content-extractor";

/** Sanitized HTML longer than this many characters is not handed to Readability. */
const READABILITY_MAX_HTML_CHARS = 1_000_000;
/** Sanitized HTML whose estimated element nesting exceeds this depth is not handed to Readability. */
const READABILITY_MAX_ESTIMATED_NESTING_DEPTH = 3_000;

/** The part of an HTML parser's result that this module uses: the parsed DOM document. */
type ParsedHtml = {
  document: Document;
};

/** Parses an HTML string into a DOM document. */
type ParseHtml = (html: string) => ParsedHtml;

/** The fields of a Readability parse result that this module reads. */
type ReadabilityResult = {
  /** Extracted article markup. */
  content?: string;
  /** Plain-text rendering of the extracted article. */
  textContent?: string | null;
  /** Detected article title. */
  title?: string | null;
};

/** A constructed Readability reader; `parse()` yields `null` when nothing could be extracted. */
type ReadabilityInstance = {
  parse(): ReadabilityResult | null;
};

/** Constructor signature of the Readability class as used by this module. */
type ReadabilityConstructor = new (
  document: Document,
  options: { charThreshold: number },
) => ReadabilityInstance;

/** Shape expected from the dynamically imported Readability module. */
type ReadabilityModule = {
  Readability: ReadabilityConstructor;
};

/** Shape expected from the dynamically imported linkedom module. */
type LinkedomModule = {
  parseHTML: ParseHtml;
};

// Module specifiers passed to dynamic import() when the dependencies are first needed.
// NOTE(review): presumably held in constants so the specifiers are not resolved
// statically by bundlers or the type checker — confirm.
const READABILITY_MODULE = "@mozilla/readability";
const LINKEDOM_MODULE = "linkedom";

/**
 * Cached load of the Readability constructor and the HTML parser.
 * `undefined` until `loadReadabilityDeps` is first called, and reset to
 * `undefined` by that function when the load fails.
 */
let readabilityDepsPromise:
  | Promise<{
      Readability: ReadabilityConstructor;
      parseHTML: ParseHtml;
    }>
  | undefined;

/**
 * Loads the Readability constructor and linkedom's `parseHTML` via dynamic
 * import, sharing one cached promise between all callers.
 *
 * @returns The `Readability` constructor and the `parseHTML` function.
 * @throws Rethrows the import failure; the cache is cleared first so that a
 *   later call starts a fresh import attempt.
 */
async function loadReadabilityDeps(): Promise<{
  Readability: ReadabilityConstructor;
  parseHTML: ParseHtml;
}> {
  if (!readabilityDepsPromise) {
    readabilityDepsPromise = Promise.all([
      import(READABILITY_MODULE) as Promise<ReadabilityModule>,
      import(LINKEDOM_MODULE) as Promise<LinkedomModule>,
    ]).then(([readability, linkedom]) => ({
      Readability: readability.Readability,
      parseHTML: linkedom.parseHTML,
    }));
  }
  const pending = readabilityDepsPromise;
  try {
    return await pending;
  } catch (error) {
    // Only drop the cache entry if it is still the promise that failed, so a
    // newer attempt started in the meantime is never discarded.
    if (readabilityDepsPromise === pending) {
      readabilityDepsPromise = undefined;
    }
    throw error;
  }
}

/** Returns `value` with leading/trailing whitespace removed and converted to lower case. */
function normalizeLowercaseStringOrEmpty(value: string): string {
  return value.trim().toLowerCase();
}

/** Tag names that never increase the nesting depth because they are skipped as void elements. */
const VOID_HTML_TAGS: ReadonlySet<string> = new Set([
  "area",
  "base",
  "br",
  "col",
  "embed",
  "hr",
  "img",
  "input",
  "link",
  "meta",
  "param",
  "source",
  "track",
  "wbr",
]);

/**
 * Cheaply estimates whether `html` nests elements deeper than `maxDepth`.
 *
 * This is a single linear scan over `<` characters, not a real HTML parse:
 * every `<name` counts as an opening tag and every `</name` as a closing tag,
 * wherever it appears in the string. Closing tags never take the running
 * depth below zero.
 *
 * @param html - Markup to scan.
 * @param maxDepth - Largest depth that is still acceptable.
 * @returns `true` as soon as the running depth becomes greater than `maxDepth`,
 *   otherwise `false`.
 */
function exceedsEstimatedHtmlNestingDepth(html: string, maxDepth: number): boolean {
  const LESS_THAN = 60; // "<"
  const GREATER_THAN = 62; // ">"
  const SLASH = 47; // "/"
  const BANG = 33; // "!"
  const QUESTION_MARK = 63; // "?"
  const SPACE = 32; // " "
  const SELF_CLOSING_SCAN_LIMIT = 200;

  let depth = 0;
  const len = html.length;
  for (let i = 0; i < len; i++) {
    if (html.charCodeAt(i) !== LESS_THAN) {
      continue;
    }
    const next = html.charCodeAt(i + 1);
    // "<!" and "<?" constructs never open or close an element.
    if (next === BANG || next === QUESTION_MARK) {
      continue;
    }

    let j = i + 1;
    let closing = false;
    if (next === SLASH) {
      closing = true;
      j += 1;
    }

    // Tolerate whitespace/control characters before the tag name.
    while (j < len && html.charCodeAt(j) <= SPACE) {
      j += 1;
    }

    // Tag name characters: ASCII letters, digits, ":" and "-".
    const nameStart = j;
    while (j < len) {
      const c = html.charCodeAt(j);
      const isNameChar =
        (c >= 65 && c <= 90) ||
        (c >= 97 && c <= 122) ||
        (c >= 48 && c <= 57) ||
        c === 58 ||
        c === 45;
      if (!isNameChar) {
        break;
      }
      j += 1;
    }

    const tagName = normalizeLowercaseStringOrEmpty(html.slice(nameStart, j));
    if (!tagName) {
      // A "<" that is not followed by a tag name is not a tag.
      continue;
    }

    if (closing) {
      depth = Math.max(0, depth - 1);
      continue;
    }
    if (VOID_HTML_TAGS.has(tagName)) {
      continue;
    }

    // Look a bounded distance ahead for the end of the tag; "/>" marks it as
    // self-closing. If no ">" is found within the window the tag is counted
    // as an ordinary opening tag.
    let selfClosing = false;
    for (let k = j; k < len && k < j + SELF_CLOSING_SCAN_LIMIT; k++) {
      if (html.charCodeAt(k) === GREATER_THAN) {
        selfClosing = html.charCodeAt(k - 1) === SLASH;
        break;
      }
    }
    if (selfClosing) {
      continue;
    }

    depth += 1;
    if (depth > maxDepth) {
      return true;
    }
  }
  return false;
}

/**
 * Extracts the main article content of `request.html` with Readability.
 *
 * The HTML is sanitized first. Sanitized input that is longer than
 * `READABILITY_MAX_HTML_CHARS` or estimated to nest deeper than
 * `READABILITY_MAX_ESTIMATED_NESTING_DEPTH` is rejected before any parsing.
 *
 * @param request - Extraction request; `html`, `url` and `extractMode` are read.
 * @returns The extracted text and, when available, a title. Returns `null`
 *   when the input is rejected, Readability yields no content, the resulting
 *   text is empty, or loading/parsing/rendering throws.
 * @throws Only what `sanitizeHtml` throws; it runs outside the `try` block.
 */
async function extractWithReadability(
  request: WebContentExtractionRequest,
): Promise<WebContentExtractionResult | null> {
  const cleanHtml = await sanitizeHtml(request.html);
  if (
    cleanHtml.length > READABILITY_MAX_HTML_CHARS ||
    exceedsEstimatedHtmlNestingDepth(cleanHtml, READABILITY_MAX_ESTIMATED_NESTING_DEPTH)
  ) {
    return null;
  }
  try {
    const { Readability, parseHTML } = await loadReadabilityDeps();
    const { document } = parseHTML(cleanHtml);
    try {
      (document as { baseURI?: string }).baseURI = request.url;
    } catch {
      // Best-effort base URI for relative links.
    }
    const reader = new Readability(document, { charThreshold: 0 });
    const parsed = reader.parse();
    if (!parsed?.content) {
      return null;
    }
    const title = parsed.title || undefined;
    if (request.extractMode === "text") {
      // Text mode uses Readability's own plain-text rendering.
      const text = stripInvisibleUnicode(normalizeWhitespace(parsed.textContent ?? ""));
      return text ? { text, title } : null;
    }
    // Any other mode converts the extracted markup to markdown; the title
    // found by the converter is used only when Readability found none.
    const rendered = htmlToMarkdown(parsed.content);
    const text = stripInvisibleUnicode(rendered.text);
    return text ? { text, title: title ?? rendered.title } : null;
  } catch {
    // Deliberate best-effort: any failure while loading, parsing or
    // rendering is reported as "no result".
    return null;
  }
}

/**
 * Creates the web content extractor plugin backed by Readability.
 *
 * @returns A plugin descriptor with id `"readability"` whose `extract`
 *   handler is `extractWithReadability`.
 */
export function createReadabilityWebContentExtractor(): WebContentExtractorPlugin {
  return {
    id: "readability",
    label: "Readability",
    // NOTE(review): presumably this extractor's position when extractors are auto-detected — confirm against the plugin SDK.
    autoDetectOrder: 10,
    extract: extractWithReadability,
  };
}