fix: decode web fetch legacy charsets (#73513)

* fix: decode web fetch legacy charsets
This commit is contained in:
Alex Knight
2026-04-28 22:09:06 +10:00
committed by GitHub
parent e4ff7c1620
commit 7a23b2d945
3 changed files with 206 additions and 6 deletions

View File

@@ -10,6 +10,8 @@ Docs: https://docs.openclaw.ai
### Fixes
- Tools/web_fetch: decode response bodies from raw bytes using declared HTTP, XML, or HTML meta charsets before extraction, so Shift_JIS and other legacy-charset pages no longer return mojibake. Fixes #72916. Thanks @amknight.
- Channels/Discord: bound message read/search REST calls, route those actions through Gateway execution, and fall back to `CommandTargetSessionKey` for inbound hook session keys so Discord reads do not hang and hooks still fire when `SessionKey` is empty. Fixes #73431. (#73521) Thanks @amknight.
- Plugins/media: auto-enable provider plugins referenced by `agents.defaults.imageGenerationModel`, `videoGenerationModel`, and `musicGenerationModel` primary/fallback refs, so configured Google and MiniMax media providers do not stay disabled behind a restrictive plugin allowlist. Thanks @vincentkoc.
- Memory-core/dreaming: retry managed dreaming cron registration after startup when the cron service is not reachable yet, so the scheduled Memory Dreaming Promotion sweep recovers without waiting for heartbeat traffic. Fixes #72841. Thanks @amknight.

View File

@@ -94,6 +94,114 @@ export type ReadResponseTextResult = {
bytesRead: number;
};
// Only the first 4 KiB of the body are scanned for XML/<meta> charset declarations.
const RESPONSE_CHARSET_SCAN_BYTES = 4096;
// Shared decoders: latin1 maps every byte 1:1 to a code point (lossless markup
// scanning); utf-8 is the fallback when no/invalid charset is declared.
const latin1Decoder = new TextDecoder("latin1");
const utf8Decoder = new TextDecoder("utf-8");
/**
 * Validate and canonicalize a charset label.
 * Strips surrounding whitespace and quotes, then accepts only short,
 * token-safe labels so arbitrary header/markup text never reaches the
 * TextDecoder constructor. Returns undefined for anything else.
 */
function normalizeCharset(value: string | undefined): string | undefined {
  if (value === undefined) {
    return undefined;
  }
  const candidate = value.trim().replace(/^["']|["']$/g, "");
  if (!candidate || candidate.length > 64) {
    return undefined;
  }
  return /^[A-Za-z0-9._:-]+$/.test(candidate) ? candidate : undefined;
}
/**
 * Extract the `charset` parameter from a Content-Type style value
 * (e.g. `text/html; charset="utf-8"`). Accepts double-quoted,
 * single-quoted, or bare parameter values; returns undefined when the
 * parameter is absent or fails charset-label validation.
 */
function readCharsetParam(value: string | null | undefined): string | undefined {
  const source = value ?? "";
  const match = source.match(/(?:^|;)\s*charset\s*=\s*(?:"([^"]+)"|'([^']+)'|([^;\s]+))/i);
  if (!match) {
    return undefined;
  }
  return normalizeCharset(match[1] ?? match[2] ?? match[3]);
}
/**
 * Case-insensitively read an attribute value from a raw markup tag string.
 * Handles double-quoted, single-quoted, and bare attribute values.
 * Returns the first matching attribute's value ("" when the value is
 * empty) or undefined when the attribute is not present.
 */
function readAttribute(tag: string, name: string): string | undefined {
  const wanted = name.toLowerCase();
  const attrPattern = /([A-Za-z0-9:_-]+)\s*=\s*(?:"([^"]*)"|'([^']*)'|([^\s"'>]+))/g;
  let found: RegExpExecArray | null;
  while ((found = attrPattern.exec(tag)) !== null) {
    if (found[1]?.toLowerCase() === wanted) {
      return found[2] ?? found[3] ?? found[4] ?? "";
    }
  }
  return undefined;
}
/**
 * Decide whether the payload looks like an HTML/XML document whose bytes
 * should be scanned for an embedded charset declaration. An absent or
 * empty media type is also sniffed, since the document types cannot be
 * ruled out.
 */
function shouldSniffDocumentCharset(contentType: string | null): boolean {
  const mediaType = contentType?.split(";", 1)[0]?.trim().toLowerCase();
  if (!mediaType) {
    return true;
  }
  switch (mediaType) {
    case "text/html":
    case "application/xhtml+xml":
    case "text/xml":
    case "application/xml":
      return true;
    default:
      // Any other XML-flavored type (e.g. application/rss+xml) qualifies too.
      return mediaType.endsWith("+xml");
  }
}
/**
 * Sniff a charset from the response body bytes.
 * Byte-order marks are honored first regardless of media type; for
 * HTML/XML payloads only, the first RESPONSE_CHARSET_SCAN_BYTES bytes are
 * then scanned for an XML declaration encoding or a <meta> charset
 * (either `charset=` or `http-equiv="Content-Type"`). Returns undefined
 * when nothing usable is found.
 */
function sniffCharset(contentType: string | null, bytes: Uint8Array): string | undefined {
  const b0 = bytes[0];
  const b1 = bytes[1];
  // BOMs win over any declared or in-document charset.
  if (b0 === 0xef && b1 === 0xbb && bytes[2] === 0xbf) {
    return "utf-8";
  }
  if (b0 === 0xff && b1 === 0xfe) {
    return "utf-16le";
  }
  if (b0 === 0xfe && b1 === 0xff) {
    return "utf-16be";
  }
  if (!shouldSniffDocumentCharset(contentType)) {
    return undefined;
  }
  // latin1 decodes every byte to a code point, so scanning markup is lossless.
  const scanLength = Math.min(bytes.byteLength, RESPONSE_CHARSET_SCAN_BYTES);
  const head = latin1Decoder.decode(bytes.subarray(0, scanLength));
  const xmlDecl = /<\?xml\s+[^>]*\bencoding\s*=\s*(?:"([^"]+)"|'([^']+)')/i.exec(head);
  if (xmlDecl) {
    return normalizeCharset(xmlDecl[1] ?? xmlDecl[2]);
  }
  for (const metaMatch of head.matchAll(/<meta\b[^>]*>/gi)) {
    const tag = metaMatch[0];
    const direct = normalizeCharset(readAttribute(tag, "charset"));
    if (direct) {
      return direct;
    }
    // Legacy form: <meta http-equiv="Content-Type" content="...; charset=...">.
    const httpEquiv = readAttribute(tag, "http-equiv") ?? "";
    if (/^content-type$/i.test(httpEquiv)) {
      const fromContent = readCharsetParam(readAttribute(tag, "content"));
      if (fromContent) {
        return fromContent;
      }
    }
  }
  return undefined;
}
/**
 * Concatenate byte chunks into a single Uint8Array of exactly totalBytes.
 * When one chunk already spans the whole payload it is returned as-is,
 * skipping the copy in the common single-read case.
 */
function concatBytes(parts: Uint8Array[], totalBytes: number): Uint8Array {
  const sole = parts.length === 1 ? parts[0] : undefined;
  if (sole && sole.byteLength === totalBytes) {
    return sole;
  }
  const merged = new Uint8Array(totalBytes);
  let cursor = 0;
  for (const chunk of parts) {
    merged.set(chunk, cursor);
    cursor += chunk.byteLength;
  }
  return merged;
}
/**
 * Best-effort Content-Type header lookup that tolerates Response-like
 * mocks whose headers object (or its get method) may be missing.
 * Returns null when the header cannot be read.
 */
function responseContentType(res: Response): string | null {
  const maybeHeaders = (res as { headers?: { get?: (name: string) => string | null } }).headers;
  if (typeof maybeHeaders?.get !== "function") {
    return null;
  }
  return maybeHeaders.get("content-type");
}
/**
 * Decode response body bytes to text.
 * Charset priority: the Content-Type header's charset parameter, then a
 * charset sniffed from the bytes, then UTF-8. An unknown/invalid label
 * makes the TextDecoder constructor throw, in which case we degrade to a
 * plain UTF-8 decode instead of failing the fetch.
 */
function decodeResponseBytes(res: Response, bytes: Uint8Array): string {
  const contentType = responseContentType(res);
  const charset = readCharsetParam(contentType) ?? sniffCharset(contentType, bytes) ?? "utf-8";
  try {
    return new TextDecoder(charset).decode(bytes);
  } catch {
    return utf8Decoder.decode(bytes);
  }
}
export async function readResponseText(
res: Response,
options?: { maxBytes?: number },
@@ -113,10 +221,9 @@ export async function readResponseText(
typeof (body as { getReader: () => unknown }).getReader === "function"
) {
const reader = (body as ReadableStream<Uint8Array>).getReader();
const decoder = new TextDecoder();
let bytesRead = 0;
let truncated = false;
const parts: string[] = [];
const parts: Uint8Array[] = [];
try {
while (true) {
@@ -140,7 +247,7 @@ export async function readResponseText(
}
bytesRead += chunk.byteLength;
parts.push(decoder.decode(chunk, { stream: true }));
parts.push(chunk);
if (truncated || bytesRead >= maxBytes) {
truncated = true;
@@ -148,7 +255,7 @@ export async function readResponseText(
}
}
} catch {
// Best-effort: return whatever we decoded so far.
// Best-effort: return whatever we read so far.
} finally {
if (truncated) {
// Some mocked or non-compliant streams never settle cancel(); do not
@@ -157,8 +264,22 @@ export async function readResponseText(
}
}
parts.push(decoder.decode());
return { text: parts.join(""), truncated, bytesRead };
const bytes = concatBytes(parts, bytesRead);
return { text: decodeResponseBytes(res, bytes), truncated, bytesRead };
}
const readBytes = (res as { arrayBuffer?: () => Promise<ArrayBuffer> }).arrayBuffer;
if (typeof readBytes === "function") {
try {
const bytes = new Uint8Array(await readBytes.call(res));
return {
text: decodeResponseBytes(res, bytes),
truncated: false,
bytesRead: bytes.byteLength,
};
} catch {
// Fall back to text() for lightweight Response-like mocks that do not expose bytes.
}
}
try {

View File

@@ -231,6 +231,83 @@ describe("web_fetch extraction fallbacks", () => {
expect(details.truncated).toBe(true);
});
// Header-declared charset path: the body bytes 0x63 0x61 0x66 0xe9 are
// "café" in ISO-8859-1; a blind UTF-8 decode would mangle the final 0xe9.
it("decodes response bytes with a charset from Content-Type", async () => {
installMockFetch((input: RequestInfo | URL) => {
const response = new Response(new Uint8Array([0x63, 0x61, 0x66, 0xe9]), {
status: 200,
headers: { "content-type": "text/plain; charset=iso-8859-1" },
});
// executeFetch reads response.url; mirror the requested URL on the mock.
Object.defineProperty(response, "url", { value: resolveRequestUrl(input) });
return Promise.resolve(response);
});
const tool = createFetchTool({ firecrawl: { enabled: false } });
const result = await executeFetch(tool, {
url: "https://example.com/latin1",
extractMode: "text",
});
const details = result?.details as { text?: string };
expect(details.text).toContain("café");
// Guards against mojibake from decoding 0xe9 as UTF-8 (replacement char).
expect(details.text).not.toContain("caf<61>");
});
// In-document charset path: no charset in the Content-Type header, so the
// decoder must sniff the <meta http-equiv="Content-Type"> declaration.
it("decodes HTML using a meta http-equiv charset before extraction", async () => {
const encoder = new TextEncoder();
// Shift_JIS encoding of 日本語 (0x93FA 0x967B 0x8CEA) — intentionally not
// valid UTF-8, so a wrong decode would surface replacement characters.
const japanese = new Uint8Array([0x93, 0xfa, 0x96, 0x7b, 0x8c, 0xea]);
const responseBytes = new Uint8Array([
...encoder.encode(
'<!doctype html><html><head><meta http-equiv="Content-Type" content="text/html; charset=Shift_JIS"><title>',
),
...japanese,
...encoder.encode("</title></head><body><p>"),
...japanese,
...encoder.encode("</p></body></html>"),
]);
installMockFetch((input: RequestInfo | URL) => {
const response = new Response(responseBytes, {
status: 200,
headers: { "content-type": "text/html" },
});
// executeFetch reads response.url; mirror the requested URL on the mock.
Object.defineProperty(response, "url", { value: resolveRequestUrl(input) });
return Promise.resolve(response);
});
const tool = createFetchTool({ firecrawl: { enabled: false } });
const result = await executeFetch(tool, {
url: "https://example.com/shift-jis",
extractMode: "text",
});
const details = result?.details as { text?: string; title?: string };
// Extraction may place the text in title or body; check both together.
const output = `${details.title ?? ""}\n${details.text ?? ""}`;
expect(output).toContain("日本語");
// No UTF-8 replacement-character residue anywhere in the output.
expect(output).not.toContain("<22>");
});
// A charset token inside an unrelated meta "content" attribute (no
// http-equiv="Content-Type") must not override the default UTF-8 decode.
it("ignores charset text in unrelated meta content", async () => {
const html =
'<!doctype html><html><head><meta name="description" content="charset=Shift_JIS"><title>日本語</title></head><body>日本語</body></html>';
installMockFetch((input: RequestInfo | URL) => {
const mockResponse = new Response(new TextEncoder().encode(html), {
status: 200,
headers: { "content-type": "text/html" },
});
Object.defineProperty(mockResponse, "url", { value: resolveRequestUrl(input) });
return Promise.resolve(mockResponse);
});
const fetchTool = createFetchTool({ firecrawl: { enabled: false } });
const fetchResult = await executeFetch(fetchTool, {
url: "https://example.com/content-only-charset",
extractMode: "text",
});
const payload = fetchResult?.details as { text?: string; title?: string };
const combined = `${payload.title ?? ""}\n${payload.text ?? ""}`;
expect(combined).toContain("日本語");
});
it("caps response bytes and does not hang on endless streams", async () => {
const chunk = new TextEncoder().encode("<html><body><div>hi</div></body></html>");
const stream = new ReadableStream<Uint8Array>({