mirror of
https://github.com/openclaw/openclaw.git
synced 2026-05-06 05:30:42 +00:00
fix: decode web fetch legacy charsets (#73513)
* fix: decode web fetch legacy charsets
This commit is contained in:
@@ -10,6 +10,8 @@ Docs: https://docs.openclaw.ai
|
||||
|
||||
### Fixes
|
||||
|
||||
- Tools/web_fetch: decode response bodies from raw bytes using declared HTTP, XML, or HTML meta charsets before extraction, so Shift_JIS and other legacy-charset pages no longer return mojibake. Fixes #72916. Thanks @amknight.
|
||||
- Channels/Discord: bound message read/search REST calls, route those actions through Gateway execution, and fall back to `CommandTargetSessionKey` for inbound hook session keys so Discord reads do not hang and hooks still fire when `SessionKey` is empty. Fixes #73431. (#73521) Thanks @amknight.
|
||||
- Plugins/media: auto-enable provider plugins referenced by `agents.defaults.imageGenerationModel`, `videoGenerationModel`, and `musicGenerationModel` primary/fallback refs, so configured Google and MiniMax media providers do not stay disabled behind a restrictive plugin allowlist. Thanks @vincentkoc.
|
||||
- Memory-core/dreaming: retry managed dreaming cron registration after startup when the cron service is not reachable yet, so the scheduled Memory Dreaming Promotion sweep recovers without waiting for heartbeat traffic. Fixes #72841. Thanks @amknight.
|
||||
|
||||
|
||||
@@ -94,6 +94,114 @@ export type ReadResponseTextResult = {
|
||||
bytesRead: number;
|
||||
};
|
||||
|
||||
// Number of leading body bytes scanned for an XML declaration or <meta> charset.
const RESPONSE_CHARSET_SCAN_BYTES = 4096;
// latin1 maps every byte 0x00-0xff to a code point, so the scan prefix decodes
// losslessly regardless of the body's real encoding.
const latin1Decoder = new TextDecoder("latin1");
// Shared fallback decoder for bodies with no usable declared charset.
const utf8Decoder = new TextDecoder("utf-8");
|
||||
|
||||
function normalizeCharset(value: string | undefined): string | undefined {
|
||||
const normalized = value?.trim().replace(/^["']|["']$/g, "") ?? "";
|
||||
return normalized && normalized.length <= 64 && /^[A-Za-z0-9._:-]+$/.test(normalized)
|
||||
? normalized
|
||||
: undefined;
|
||||
}
|
||||
|
||||
function readCharsetParam(value: string | null | undefined): string | undefined {
|
||||
const match = /(?:^|;)\s*charset\s*=\s*(?:"([^"]+)"|'([^']+)'|([^;\s]+))/i.exec(value ?? "");
|
||||
return normalizeCharset(match?.[1] ?? match?.[2] ?? match?.[3]);
|
||||
}
|
||||
|
||||
function readAttribute(tag: string, name: string): string | undefined {
|
||||
const target = name.toLowerCase();
|
||||
for (const match of tag.matchAll(
|
||||
/([A-Za-z0-9:_-]+)\s*=\s*(?:"([^"]*)"|'([^']*)'|([^\s"'>]+))/g,
|
||||
)) {
|
||||
if (match[1]?.toLowerCase() === target) {
|
||||
return match[2] ?? match[3] ?? match[4] ?? "";
|
||||
}
|
||||
}
|
||||
return undefined;
|
||||
}
|
||||
|
||||
function shouldSniffDocumentCharset(contentType: string | null): boolean {
|
||||
const mediaType = contentType?.split(";", 1)[0]?.trim().toLowerCase();
|
||||
if (!mediaType) {
|
||||
return true;
|
||||
}
|
||||
return (
|
||||
mediaType === "text/html" ||
|
||||
mediaType === "application/xhtml+xml" ||
|
||||
mediaType === "text/xml" ||
|
||||
mediaType === "application/xml" ||
|
||||
mediaType.endsWith("+xml")
|
||||
);
|
||||
}
|
||||
|
||||
function sniffCharset(contentType: string | null, bytes: Uint8Array): string | undefined {
|
||||
if (bytes[0] === 0xef && bytes[1] === 0xbb && bytes[2] === 0xbf) {
|
||||
return "utf-8";
|
||||
}
|
||||
if (bytes[0] === 0xff && bytes[1] === 0xfe) {
|
||||
return "utf-16le";
|
||||
}
|
||||
if (bytes[0] === 0xfe && bytes[1] === 0xff) {
|
||||
return "utf-16be";
|
||||
}
|
||||
if (!shouldSniffDocumentCharset(contentType)) {
|
||||
return undefined;
|
||||
}
|
||||
|
||||
const head = latin1Decoder.decode(
|
||||
bytes.subarray(0, Math.min(bytes.byteLength, RESPONSE_CHARSET_SCAN_BYTES)),
|
||||
);
|
||||
const xmlEncoding = /<\?xml\s+[^>]*\bencoding\s*=\s*(?:"([^"]+)"|'([^']+)')/i.exec(head);
|
||||
if (xmlEncoding) {
|
||||
return normalizeCharset(xmlEncoding[1] ?? xmlEncoding[2]);
|
||||
}
|
||||
|
||||
for (const match of head.matchAll(/<meta\b[^>]*>/gi)) {
|
||||
const tag = match[0];
|
||||
const charset = normalizeCharset(readAttribute(tag, "charset"));
|
||||
if (charset) {
|
||||
return charset;
|
||||
}
|
||||
if (/^content-type$/i.test(readAttribute(tag, "http-equiv") ?? "")) {
|
||||
const contentCharset = readCharsetParam(readAttribute(tag, "content"));
|
||||
if (contentCharset) {
|
||||
return contentCharset;
|
||||
}
|
||||
}
|
||||
}
|
||||
return undefined;
|
||||
}
|
||||
|
||||
function concatBytes(parts: Uint8Array[], totalBytes: number): Uint8Array {
|
||||
if (parts.length === 1 && parts[0]?.byteLength === totalBytes) {
|
||||
return parts[0];
|
||||
}
|
||||
const bytes = new Uint8Array(totalBytes);
|
||||
let offset = 0;
|
||||
for (const part of parts) {
|
||||
bytes.set(part, offset);
|
||||
offset += part.byteLength;
|
||||
}
|
||||
return bytes;
|
||||
}
|
||||
|
||||
function responseContentType(res: Response): string | null {
|
||||
const headers = (res as { headers?: { get?: (name: string) => string | null } }).headers;
|
||||
return typeof headers?.get === "function" ? headers.get("content-type") : null;
|
||||
}
|
||||
|
||||
function decodeResponseBytes(res: Response, bytes: Uint8Array): string {
|
||||
const contentType = responseContentType(res);
|
||||
const charset = readCharsetParam(contentType) ?? sniffCharset(contentType, bytes);
|
||||
try {
|
||||
return new TextDecoder(charset ?? "utf-8").decode(bytes);
|
||||
} catch {
|
||||
return utf8Decoder.decode(bytes);
|
||||
}
|
||||
}
|
||||
|
||||
export async function readResponseText(
|
||||
res: Response,
|
||||
options?: { maxBytes?: number },
|
||||
@@ -113,10 +221,9 @@ export async function readResponseText(
|
||||
typeof (body as { getReader: () => unknown }).getReader === "function"
|
||||
) {
|
||||
const reader = (body as ReadableStream<Uint8Array>).getReader();
|
||||
const decoder = new TextDecoder();
|
||||
let bytesRead = 0;
|
||||
let truncated = false;
|
||||
const parts: string[] = [];
|
||||
const parts: Uint8Array[] = [];
|
||||
|
||||
try {
|
||||
while (true) {
|
||||
@@ -140,7 +247,7 @@ export async function readResponseText(
|
||||
}
|
||||
|
||||
bytesRead += chunk.byteLength;
|
||||
parts.push(decoder.decode(chunk, { stream: true }));
|
||||
parts.push(chunk);
|
||||
|
||||
if (truncated || bytesRead >= maxBytes) {
|
||||
truncated = true;
|
||||
@@ -148,7 +255,7 @@ export async function readResponseText(
|
||||
}
|
||||
}
|
||||
} catch {
|
||||
// Best-effort: return whatever we decoded so far.
|
||||
// Best-effort: return whatever we read so far.
|
||||
} finally {
|
||||
if (truncated) {
|
||||
// Some mocked or non-compliant streams never settle cancel(); do not
|
||||
@@ -157,8 +264,22 @@ export async function readResponseText(
|
||||
}
|
||||
}
|
||||
|
||||
parts.push(decoder.decode());
|
||||
return { text: parts.join(""), truncated, bytesRead };
|
||||
const bytes = concatBytes(parts, bytesRead);
|
||||
return { text: decodeResponseBytes(res, bytes), truncated, bytesRead };
|
||||
}
|
||||
|
||||
const readBytes = (res as { arrayBuffer?: () => Promise<ArrayBuffer> }).arrayBuffer;
|
||||
if (typeof readBytes === "function") {
|
||||
try {
|
||||
const bytes = new Uint8Array(await readBytes.call(res));
|
||||
return {
|
||||
text: decodeResponseBytes(res, bytes),
|
||||
truncated: false,
|
||||
bytesRead: bytes.byteLength,
|
||||
};
|
||||
} catch {
|
||||
// Fall back to text() for lightweight Response-like mocks that do not expose bytes.
|
||||
}
|
||||
}
|
||||
|
||||
try {
|
||||
|
||||
@@ -231,6 +231,83 @@ describe("web_fetch extraction fallbacks", () => {
|
||||
expect(details.truncated).toBe(true);
|
||||
});
|
||||
|
||||
// Happy path: a charset declared in the Content-Type header is honored when
// decoding the body. 0xe9 is "é" in ISO-8859-1 but an invalid UTF-8 sequence,
// so a UTF-8-only decode would produce a replacement character instead.
it("decodes response bytes with a charset from Content-Type", async () => {
  installMockFetch((input: RequestInfo | URL) => {
    const response = new Response(new Uint8Array([0x63, 0x61, 0x66, 0xe9]), {
      status: 200,
      headers: { "content-type": "text/plain; charset=iso-8859-1" },
    });
    Object.defineProperty(response, "url", { value: resolveRequestUrl(input) });
    return Promise.resolve(response);
  });

  const tool = createFetchTool({ firecrawl: { enabled: false } });
  const result = await executeFetch(tool, {
    url: "https://example.com/latin1",
    extractMode: "text",
  });
  const details = result?.details as { text?: string };

  expect(details.text).toContain("café");
  // NOTE(review): "<61>" looks like this mirror's rendering of a replacement
  // character in the original literal — confirm against the canonical source.
  expect(details.text).not.toContain("caf<61>");
});
|
||||
|
||||
// Verifies that Shift_JIS bytes are decoded via the in-document
// <meta http-equiv="Content-Type"> declaration when the HTTP header carries
// no charset parameter.
it("decodes HTML using a meta http-equiv charset before extraction", async () => {
  const encoder = new TextEncoder();
  // Shift_JIS encoding of 日本語; these bytes are invalid as UTF-8.
  const japanese = new Uint8Array([0x93, 0xfa, 0x96, 0x7b, 0x8c, 0xea]);
  const responseBytes = new Uint8Array([
    ...encoder.encode(
      '<!doctype html><html><head><meta http-equiv="Content-Type" content="text/html; charset=Shift_JIS"><title>',
    ),
    ...japanese,
    ...encoder.encode("</title></head><body><p>"),
    ...japanese,
    ...encoder.encode("</p></body></html>"),
  ]);
  installMockFetch((input: RequestInfo | URL) => {
    const response = new Response(responseBytes, {
      status: 200,
      headers: { "content-type": "text/html" },
    });
    Object.defineProperty(response, "url", { value: resolveRequestUrl(input) });
    return Promise.resolve(response);
  });

  const tool = createFetchTool({ firecrawl: { enabled: false } });
  const result = await executeFetch(tool, {
    url: "https://example.com/shift-jis",
    extractMode: "text",
  });
  const details = result?.details as { text?: string; title?: string };
  const output = `${details.title ?? ""}\n${details.text ?? ""}`;

  expect(output).toContain("日本語");
  // NOTE(review): "<22>" looks like this mirror's rendering of a replacement
  // character in the original literal — confirm against the canonical source.
  expect(output).not.toContain("<22>");
});
|
||||
|
||||
// Guards against false positives: "charset=Shift_JIS" inside an unrelated
// meta "content" attribute (no http-equiv="Content-Type" on the tag) must
// not trigger a Shift_JIS decode of this UTF-8 body.
it("ignores charset text in unrelated meta content", async () => {
  const body =
    '<!doctype html><html><head><meta name="description" content="charset=Shift_JIS"><title>日本語</title></head><body>日本語</body></html>';
  installMockFetch((input: RequestInfo | URL) => {
    const response = new Response(new TextEncoder().encode(body), {
      status: 200,
      headers: { "content-type": "text/html" },
    });
    Object.defineProperty(response, "url", { value: resolveRequestUrl(input) });
    return Promise.resolve(response);
  });

  const tool = createFetchTool({ firecrawl: { enabled: false } });
  const result = await executeFetch(tool, {
    url: "https://example.com/content-only-charset",
    extractMode: "text",
  });
  const details = result?.details as { text?: string; title?: string };
  const output = `${details.title ?? ""}\n${details.text ?? ""}`;

  // UTF-8 default must survive; a wrong Shift_JIS decode would mangle 日本語.
  expect(output).toContain("日本語");
});
|
||||
|
||||
it("caps response bytes and does not hang on endless streams", async () => {
|
||||
const chunk = new TextEncoder().encode("<html><body><div>hi</div></body></html>");
|
||||
const stream = new ReadableStream<Uint8Array>({
|
||||
|
||||
Reference in New Issue
Block a user