From 7a23b2d945cc2d2e5c452e4641213e71de8dcc66 Mon Sep 17 00:00:00 2001 From: Alex Knight Date: Tue, 28 Apr 2026 22:09:06 +1000 Subject: [PATCH] fix: decode web fetch legacy charsets (#73513) * fix: decode web fetch legacy charsets --- CHANGELOG.md | 2 + src/agents/tools/web-shared.ts | 133 ++++++++++++++++++++++- src/agents/tools/web-tools.fetch.test.ts | 77 +++++++++++++ 3 files changed, 206 insertions(+), 6 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index ea19af51c67..5c5233f9142 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -10,6 +10,8 @@ Docs: https://docs.openclaw.ai ### Fixes +- Tools/web_fetch: decode response bodies from raw bytes using declared HTTP, XML, or HTML meta charsets before extraction, so Shift_JIS and other legacy-charset pages no longer return mojibake. Fixes #72916. Thanks @amknight. +- Channels/Discord: bound message read/search REST calls, route those actions through Gateway execution, and fall back to `CommandTargetSessionKey` for inbound hook session keys so Discord reads do not hang and hooks still fire when `SessionKey` is empty. Fixes #73431. (#73521) Thanks @amknight. - Plugins/media: auto-enable provider plugins referenced by `agents.defaults.imageGenerationModel`, `videoGenerationModel`, and `musicGenerationModel` primary/fallback refs, so configured Google and MiniMax media providers do not stay disabled behind a restrictive plugin allowlist. Thanks @vincentkoc. - Memory-core/dreaming: retry managed dreaming cron registration after startup when the cron service is not reachable yet, so the scheduled Memory Dreaming Promotion sweep recovers without waiting for heartbeat traffic. Fixes #72841. Thanks @amknight. diff --git a/src/agents/tools/web-shared.ts b/src/agents/tools/web-shared.ts index cd0bcd61627..11fe364ea99 100644 --- a/src/agents/tools/web-shared.ts +++ b/src/agents/tools/web-shared.ts @@ -94,6 +94,114 @@ export type ReadResponseTextResult = { bytesRead: number; }; +const RESPONSE_CHARSET_SCAN_BYTES = 4096; +const latin1Decoder = new TextDecoder("latin1"); +const utf8Decoder = new TextDecoder("utf-8"); + +function normalizeCharset(value: string | undefined): string | undefined { + const normalized = value?.trim().replace(/^["']|["']$/g, "") ?? ""; + return normalized && normalized.length <= 64 && /^[A-Za-z0-9._:-]+$/.test(normalized) + ? normalized + : undefined; +} + +function readCharsetParam(value: string | null | undefined): string | undefined { + const match = /(?:^|;)\s*charset\s*=\s*(?:"([^"]+)"|'([^']+)'|([^;\s]+))/i.exec(value ?? ""); + return normalizeCharset(match?.[1] ?? match?.[2] ?? match?.[3]); +} + +function readAttribute(tag: string, name: string): string | undefined { + const target = name.toLowerCase(); + for (const match of tag.matchAll( + /([A-Za-z0-9:_-]+)\s*=\s*(?:"([^"]*)"|'([^']*)'|([^\s"'>]+))/g, + )) { + if (match[1]?.toLowerCase() === target) { + return match[2] ?? match[3] ?? match[4] ?? ""; + } + } + return undefined; +} + +function shouldSniffDocumentCharset(contentType: string | null): boolean { + const mediaType = contentType?.split(";", 1)[0]?.trim().toLowerCase(); + if (!mediaType) { + return true; + } + return ( + mediaType === "text/html" || + mediaType === "application/xhtml+xml" || + mediaType === "text/xml" || + mediaType === "application/xml" || + mediaType.endsWith("+xml") + ); +} + +function sniffCharset(contentType: string | null, bytes: Uint8Array): string | undefined { + if (bytes[0] === 0xef && bytes[1] === 0xbb && bytes[2] === 0xbf) { + return "utf-8"; + } + if (bytes[0] === 0xff && bytes[1] === 0xfe) { + return "utf-16le"; + } + if (bytes[0] === 0xfe && bytes[1] === 0xff) { + return "utf-16be"; + } + if (!shouldSniffDocumentCharset(contentType)) { + return undefined; + } + + const head = latin1Decoder.decode( + bytes.subarray(0, Math.min(bytes.byteLength, RESPONSE_CHARSET_SCAN_BYTES)), + ); + const xmlEncoding = /<\?xml\s+[^>]*\bencoding\s*=\s*(?:"([^"]+)"|'([^']+)')/i.exec(head); + if (xmlEncoding) { + return normalizeCharset(xmlEncoding[1] ?? xmlEncoding[2]); + } + + for (const match of head.matchAll(/]*>/gi)) { + const tag = match[0]; + const charset = normalizeCharset(readAttribute(tag, "charset")); + if (charset) { + return charset; + } + if (/^content-type$/i.test(readAttribute(tag, "http-equiv") ?? "")) { + const contentCharset = readCharsetParam(readAttribute(tag, "content")); + if (contentCharset) { + return contentCharset; + } + } + } + return undefined; +} + +function concatBytes(parts: Uint8Array[], totalBytes: number): Uint8Array { + if (parts.length === 1 && parts[0]?.byteLength === totalBytes) { + return parts[0]; + } + const bytes = new Uint8Array(totalBytes); + let offset = 0; + for (const part of parts) { + bytes.set(part, offset); + offset += part.byteLength; + } + return bytes; +} + +function responseContentType(res: Response): string | null { + const headers = (res as { headers?: { get?: (name: string) => string | null } }).headers; + return typeof headers?.get === "function" ? headers.get("content-type") : null; +} + +function decodeResponseBytes(res: Response, bytes: Uint8Array): string { + const contentType = responseContentType(res); + const charset = readCharsetParam(contentType) ?? sniffCharset(contentType, bytes); + try { + return new TextDecoder(charset ?? "utf-8").decode(bytes); + } catch { + return utf8Decoder.decode(bytes); + } +} + export async function readResponseText( res: Response, options?: { maxBytes?: number }, @@ -113,10 +221,9 @@ export async function readResponseText( typeof (body as { getReader: () => unknown }).getReader === "function" ) { const reader = (body as ReadableStream).getReader(); - const decoder = new TextDecoder(); let bytesRead = 0; let truncated = false; - const parts: string[] = []; + const parts: Uint8Array[] = []; try { while (true) { @@ -140,7 +247,7 @@ export async function readResponseText( } bytesRead += chunk.byteLength; - parts.push(decoder.decode(chunk, { stream: true })); + parts.push(chunk); if (truncated || bytesRead >= maxBytes) { truncated = true; @@ -148,7 +255,7 @@ export async function readResponseText( } } } catch { - // Best-effort: return whatever we decoded so far. + // Best-effort: return whatever we read so far. } finally { if (truncated) { // Some mocked or non-compliant streams never settle cancel(); do not @@ -157,8 +264,22 @@ export async function readResponseText( } } - parts.push(decoder.decode()); - return { text: parts.join(""), truncated, bytesRead }; + const bytes = concatBytes(parts, bytesRead); + return { text: decodeResponseBytes(res, bytes), truncated, bytesRead }; + } + + const readBytes = (res as { arrayBuffer?: () => Promise }).arrayBuffer; + if (typeof readBytes === "function") { + try { + const bytes = new Uint8Array(await readBytes.call(res)); + return { + text: decodeResponseBytes(res, bytes), + truncated: false, + bytesRead: bytes.byteLength, + }; + } catch { + // Fall back to text() for lightweight Response-like mocks that do not expose bytes. + } } try { diff --git a/src/agents/tools/web-tools.fetch.test.ts b/src/agents/tools/web-tools.fetch.test.ts index 97f0974b7b0..f447a99b77d 100644 --- a/src/agents/tools/web-tools.fetch.test.ts +++ b/src/agents/tools/web-tools.fetch.test.ts @@ -231,6 +231,83 @@ describe("web_fetch extraction fallbacks", () => { expect(details.truncated).toBe(true); }); + it("decodes response bytes with a charset from Content-Type", async () => { + installMockFetch((input: RequestInfo | URL) => { + const response = new Response(new Uint8Array([0x63, 0x61, 0x66, 0xe9]), { + status: 200, + headers: { "content-type": "text/plain; charset=iso-8859-1" }, + }); + Object.defineProperty(response, "url", { value: resolveRequestUrl(input) }); + return Promise.resolve(response); + }); + + const tool = createFetchTool({ firecrawl: { enabled: false } }); + const result = await executeFetch(tool, { + url: "https://example.com/latin1", + extractMode: "text", + }); + const details = result?.details as { text?: string }; + + expect(details.text).toContain("café"); + expect(details.text).not.toContain("caf�"); + }); + + it("decodes HTML using a meta http-equiv charset before extraction", async () => { + const encoder = new TextEncoder(); + const japanese = new Uint8Array([0x93, 0xfa, 0x96, 0x7b, 0x8c, 0xea]); + const responseBytes = new Uint8Array([ + ...encoder.encode( + '', + ), + ...japanese, + ...encoder.encode("

"), + ...japanese, + ...encoder.encode("

"), + ]); + installMockFetch((input: RequestInfo | URL) => { + const response = new Response(responseBytes, { + status: 200, + headers: { "content-type": "text/html" }, + }); + Object.defineProperty(response, "url", { value: resolveRequestUrl(input) }); + return Promise.resolve(response); + }); + + const tool = createFetchTool({ firecrawl: { enabled: false } }); + const result = await executeFetch(tool, { + url: "https://example.com/shift-jis", + extractMode: "text", + }); + const details = result?.details as { text?: string; title?: string }; + const output = `${details.title ?? ""}\n${details.text ?? ""}`; + + expect(output).toContain("日本語"); + expect(output).not.toContain("�"); + }); + + it("ignores charset text in unrelated meta content", async () => { + const body = + '日本語日本語'; + installMockFetch((input: RequestInfo | URL) => { + const response = new Response(new TextEncoder().encode(body), { + status: 200, + headers: { "content-type": "text/html" }, + }); + Object.defineProperty(response, "url", { value: resolveRequestUrl(input) }); + return Promise.resolve(response); + }); + + const tool = createFetchTool({ firecrawl: { enabled: false } }); + const result = await executeFetch(tool, { + url: "https://example.com/content-only-charset", + extractMode: "text", + }); + const details = result?.details as { text?: string; title?: string }; + const output = `${details.title ?? ""}\n${details.text ?? ""}`; + + expect(output).toContain("日本語"); + }); + it("caps response bytes and does not hang on endless streams", async () => { const chunk = new TextEncoder().encode("
hi
"); const stream = new ReadableStream({