fix: decode web fetch legacy charsets (#73513)

* fix: decode web fetch legacy charsets
This commit is contained in:
Alex Knight
2026-04-28 22:09:06 +10:00
committed by GitHub
parent e4ff7c1620
commit 7a23b2d945
3 changed files with 206 additions and 6 deletions

View File

@@ -10,6 +10,8 @@ Docs: https://docs.openclaw.ai
### Fixes
- Tools/web_fetch: decode response bodies from raw bytes using declared HTTP, XML, or HTML meta charsets before extraction, so Shift_JIS and other legacy-charset pages no longer return mojibake. Fixes #72916. Thanks @amknight.
- Channels/Discord: bound message read/search REST calls, route those actions through Gateway execution, and fall back to `CommandTargetSessionKey` for inbound hook session keys so Discord reads do not hang and hooks still fire when `SessionKey` is empty. Fixes #73431. (#73521) Thanks @amknight.
- Plugins/media: auto-enable provider plugins referenced by `agents.defaults.imageGenerationModel`, `videoGenerationModel`, and `musicGenerationModel` primary/fallback refs, so configured Google and MiniMax media providers do not stay disabled behind a restrictive plugin allowlist. Thanks @vincentkoc.
- Memory-core/dreaming: retry managed dreaming cron registration after startup when the cron service is not reachable yet, so the scheduled Memory Dreaming Promotion sweep recovers without waiting for heartbeat traffic. Fixes #72841. Thanks @amknight.

View File

@@ -94,6 +94,114 @@ export type ReadResponseTextResult = {
bytesRead: number;
};
// Only the first 4 KiB of the body are scanned for XML/<meta> charset declarations.
const RESPONSE_CHARSET_SCAN_BYTES = 4096;
// Shared decoders: latin1 maps every byte 1:1 to a code point (lossless markup
// scanning); utf-8 is the fallback when no/invalid charset is declared.
const latin1Decoder = new TextDecoder("latin1");
const utf8Decoder = new TextDecoder("utf-8");
/**
 * Validate and canonicalize a charset label.
 * Strips surrounding whitespace and quotes, then accepts only short,
 * token-safe labels so arbitrary header/markup text never reaches the
 * TextDecoder constructor. Returns undefined for anything else.
 */
function normalizeCharset(value: string | undefined): string | undefined {
  if (value === undefined) {
    return undefined;
  }
  const candidate = value.trim().replace(/^["']|["']$/g, "");
  if (!candidate || candidate.length > 64) {
    return undefined;
  }
  return /^[A-Za-z0-9._:-]+$/.test(candidate) ? candidate : undefined;
}
/**
 * Extract the `charset` parameter from a Content-Type style value
 * (e.g. `text/html; charset="utf-8"`). Accepts double-quoted,
 * single-quoted, or bare parameter values; returns undefined when the
 * parameter is absent or fails charset-label validation.
 */
function readCharsetParam(value: string | null | undefined): string | undefined {
  const source = value ?? "";
  const match = source.match(/(?:^|;)\s*charset\s*=\s*(?:"([^"]+)"|'([^']+)'|([^;\s]+))/i);
  if (!match) {
    return undefined;
  }
  return normalizeCharset(match[1] ?? match[2] ?? match[3]);
}
/**
 * Case-insensitively read an attribute value from a raw markup tag string.
 * Handles double-quoted, single-quoted, and bare attribute values.
 * Returns the first matching attribute's value ("" when the value is
 * empty) or undefined when the attribute is not present.
 */
function readAttribute(tag: string, name: string): string | undefined {
  const wanted = name.toLowerCase();
  const attrPattern = /([A-Za-z0-9:_-]+)\s*=\s*(?:"([^"]*)"|'([^']*)'|([^\s"'>]+))/g;
  let found: RegExpExecArray | null;
  while ((found = attrPattern.exec(tag)) !== null) {
    if (found[1]?.toLowerCase() === wanted) {
      return found[2] ?? found[3] ?? found[4] ?? "";
    }
  }
  return undefined;
}
/**
 * Decide whether the payload looks like an HTML/XML document whose bytes
 * should be scanned for an embedded charset declaration. An absent or
 * empty media type is also sniffed, since the document types cannot be
 * ruled out.
 */
function shouldSniffDocumentCharset(contentType: string | null): boolean {
  const mediaType = contentType?.split(";", 1)[0]?.trim().toLowerCase();
  if (!mediaType) {
    return true;
  }
  switch (mediaType) {
    case "text/html":
    case "application/xhtml+xml":
    case "text/xml":
    case "application/xml":
      return true;
    default:
      // Any other XML-flavored type (e.g. application/rss+xml) qualifies too.
      return mediaType.endsWith("+xml");
  }
}
/**
 * Sniff a charset from the response body bytes.
 * Byte-order marks are honored first regardless of media type; for
 * HTML/XML payloads only, the first RESPONSE_CHARSET_SCAN_BYTES bytes are
 * then scanned for an XML declaration encoding or a <meta> charset
 * (either `charset=` or `http-equiv="Content-Type"`). Returns undefined
 * when nothing usable is found.
 */
function sniffCharset(contentType: string | null, bytes: Uint8Array): string | undefined {
  const b0 = bytes[0];
  const b1 = bytes[1];
  // BOMs win over any declared or in-document charset.
  if (b0 === 0xef && b1 === 0xbb && bytes[2] === 0xbf) {
    return "utf-8";
  }
  if (b0 === 0xff && b1 === 0xfe) {
    return "utf-16le";
  }
  if (b0 === 0xfe && b1 === 0xff) {
    return "utf-16be";
  }
  if (!shouldSniffDocumentCharset(contentType)) {
    return undefined;
  }
  // latin1 decodes every byte to a code point, so scanning markup is lossless.
  const scanLength = Math.min(bytes.byteLength, RESPONSE_CHARSET_SCAN_BYTES);
  const head = latin1Decoder.decode(bytes.subarray(0, scanLength));
  const xmlDecl = /<\?xml\s+[^>]*\bencoding\s*=\s*(?:"([^"]+)"|'([^']+)')/i.exec(head);
  if (xmlDecl) {
    return normalizeCharset(xmlDecl[1] ?? xmlDecl[2]);
  }
  for (const metaMatch of head.matchAll(/<meta\b[^>]*>/gi)) {
    const tag = metaMatch[0];
    const direct = normalizeCharset(readAttribute(tag, "charset"));
    if (direct) {
      return direct;
    }
    // Legacy form: <meta http-equiv="Content-Type" content="...; charset=...">.
    const httpEquiv = readAttribute(tag, "http-equiv") ?? "";
    if (/^content-type$/i.test(httpEquiv)) {
      const fromContent = readCharsetParam(readAttribute(tag, "content"));
      if (fromContent) {
        return fromContent;
      }
    }
  }
  return undefined;
}
/**
 * Concatenate byte chunks into a single Uint8Array of exactly totalBytes.
 * When one chunk already spans the whole payload it is returned as-is,
 * skipping the copy in the common single-read case.
 */
function concatBytes(parts: Uint8Array[], totalBytes: number): Uint8Array {
  const sole = parts.length === 1 ? parts[0] : undefined;
  if (sole && sole.byteLength === totalBytes) {
    return sole;
  }
  const merged = new Uint8Array(totalBytes);
  let cursor = 0;
  for (const chunk of parts) {
    merged.set(chunk, cursor);
    cursor += chunk.byteLength;
  }
  return merged;
}
/**
 * Best-effort Content-Type header lookup that tolerates Response-like
 * mocks whose headers object (or its get method) may be missing.
 * Returns null when the header cannot be read.
 */
function responseContentType(res: Response): string | null {
  const maybeHeaders = (res as { headers?: { get?: (name: string) => string | null } }).headers;
  if (typeof maybeHeaders?.get !== "function") {
    return null;
  }
  return maybeHeaders.get("content-type");
}
/**
 * Decode response body bytes to text.
 * Charset priority: the Content-Type header's charset parameter, then a
 * charset sniffed from the bytes, then UTF-8. An unknown/invalid label
 * makes the TextDecoder constructor throw, in which case we degrade to a
 * plain UTF-8 decode instead of failing the fetch.
 */
function decodeResponseBytes(res: Response, bytes: Uint8Array): string {
  const contentType = responseContentType(res);
  const charset = readCharsetParam(contentType) ?? sniffCharset(contentType, bytes) ?? "utf-8";
  try {
    return new TextDecoder(charset).decode(bytes);
  } catch {
    return utf8Decoder.decode(bytes);
  }
}
export async function readResponseText(
res: Response,
options?: { maxBytes?: number },
@@ -113,10 +221,9 @@ export async function readResponseText(
typeof (body as { getReader: () => unknown }).getReader === "function"
) {
const reader = (body as ReadableStream<Uint8Array>).getReader();
const decoder = new TextDecoder();
let bytesRead = 0;
let truncated = false;
const parts: string[] = [];
const parts: Uint8Array[] = [];
try {
while (true) {
@@ -140,7 +247,7 @@ export async function readResponseText(
}
bytesRead += chunk.byteLength;
parts.push(decoder.decode(chunk, { stream: true }));
parts.push(chunk);
if (truncated || bytesRead >= maxBytes) {
truncated = true;
@@ -148,7 +255,7 @@ export async function readResponseText(
}
}
} catch {
// Best-effort: return whatever we decoded so far.
// Best-effort: return whatever we read so far.
} finally {
if (truncated) {
// Some mocked or non-compliant streams never settle cancel(); do not
@@ -157,8 +264,22 @@ export async function readResponseText(
}
}
parts.push(decoder.decode());
return { text: parts.join(""), truncated, bytesRead };
const bytes = concatBytes(parts, bytesRead);
return { text: decodeResponseBytes(res, bytes), truncated, bytesRead };
}
const readBytes = (res as { arrayBuffer?: () => Promise<ArrayBuffer> }).arrayBuffer;
if (typeof readBytes === "function") {
try {
const bytes = new Uint8Array(await readBytes.call(res));
return {
text: decodeResponseBytes(res, bytes),
truncated: false,
bytesRead: bytes.byteLength,
};
} catch {
// Fall back to text() for lightweight Response-like mocks that do not expose bytes.
}
}
try {

View File

@@ -231,6 +231,83 @@ describe("web_fetch extraction fallbacks", () => {
expect(details.truncated).toBe(true);
});
// Header-declared charset path: the body bytes 0x63 0x61 0x66 0xe9 are
// "café" in ISO-8859-1; a blind UTF-8 decode would mangle the final 0xe9.
it("decodes response bytes with a charset from Content-Type", async () => {
installMockFetch((input: RequestInfo | URL) => {
const response = new Response(new Uint8Array([0x63, 0x61, 0x66, 0xe9]), {
status: 200,
headers: { "content-type": "text/plain; charset=iso-8859-1" },
});
// executeFetch reads response.url; mirror the requested URL on the mock.
Object.defineProperty(response, "url", { value: resolveRequestUrl(input) });
return Promise.resolve(response);
});
const tool = createFetchTool({ firecrawl: { enabled: false } });
const result = await executeFetch(tool, {
url: "https://example.com/latin1",
extractMode: "text",
});
const details = result?.details as { text?: string };
expect(details.text).toContain("café");
// Guards against mojibake from decoding 0xe9 as UTF-8 (replacement char).
expect(details.text).not.toContain("caf<61>");
});
// In-document charset path: no charset in the Content-Type header, so the
// decoder must sniff the <meta http-equiv="Content-Type"> declaration.
it("decodes HTML using a meta http-equiv charset before extraction", async () => {
const encoder = new TextEncoder();
// Shift_JIS encoding of 日本語 (0x93FA 0x967B 0x8CEA) — intentionally not
// valid UTF-8, so a wrong decode would surface replacement characters.
const japanese = new Uint8Array([0x93, 0xfa, 0x96, 0x7b, 0x8c, 0xea]);
const responseBytes = new Uint8Array([
...encoder.encode(
'<!doctype html><html><head><meta http-equiv="Content-Type" content="text/html; charset=Shift_JIS"><title>',
),
...japanese,
...encoder.encode("</title></head><body><p>"),
...japanese,
...encoder.encode("</p></body></html>"),
]);
installMockFetch((input: RequestInfo | URL) => {
const response = new Response(responseBytes, {
status: 200,
headers: { "content-type": "text/html" },
});
// executeFetch reads response.url; mirror the requested URL on the mock.
Object.defineProperty(response, "url", { value: resolveRequestUrl(input) });
return Promise.resolve(response);
});
const tool = createFetchTool({ firecrawl: { enabled: false } });
const result = await executeFetch(tool, {
url: "https://example.com/shift-jis",
extractMode: "text",
});
const details = result?.details as { text?: string; title?: string };
// Extraction may place the text in title or body; check both together.
const output = `${details.title ?? ""}\n${details.text ?? ""}`;
expect(output).toContain("日本語");
// No UTF-8 replacement-character residue anywhere in the output.
expect(output).not.toContain("<22>");
});
// A charset token inside an unrelated meta "content" attribute (no
// http-equiv="Content-Type") must not override the default UTF-8 decode.
it("ignores charset text in unrelated meta content", async () => {
const html =
'<!doctype html><html><head><meta name="description" content="charset=Shift_JIS"><title>日本語</title></head><body>日本語</body></html>';
installMockFetch((input: RequestInfo | URL) => {
const mockResponse = new Response(new TextEncoder().encode(html), {
status: 200,
headers: { "content-type": "text/html" },
});
Object.defineProperty(mockResponse, "url", { value: resolveRequestUrl(input) });
return Promise.resolve(mockResponse);
});
const fetchTool = createFetchTool({ firecrawl: { enabled: false } });
const fetchResult = await executeFetch(fetchTool, {
url: "https://example.com/content-only-charset",
extractMode: "text",
});
const payload = fetchResult?.details as { text?: string; title?: string };
const combined = `${payload.title ?? ""}\n${payload.text ?? ""}`;
expect(combined).toContain("日本語");
});
it("caps response bytes and does not hang on endless streams", async () => {
const chunk = new TextEncoder().encode("<html><body><div>hi</div></body></html>");
const stream = new ReadableStream<Uint8Array>({