memory: strip inbound metadata envelopes from user messages in session corpus

Session ingestion was feeding raw Telegram/Discord/Slack inbound envelopes into the dreaming corpus. The 338-char Conversation info + Sender JSON prefix on every user message blew past SESSION_INGESTION_MAX_SNIPPET_CHARS (280), so the user's actual words never made it into the corpus and REM extraction latched onto envelope words like 'assistant' as top topics. Strip inbound metadata from user-role text blocks BEFORE normalizeSessionText collapses newlines. stripInboundMetadata needs the line structure and fenced-json markers to find sentinels, so the order matters. Assistant messages are left alone — they may legitimately discuss the envelope format. Fixes #63921.
2026-05-06 17:10:49 +00:00 · 2026-04-14 20:45:08 +08:00
parent 00d21d1b23
commit 8f84a54511
2 changed files with 95 additions and 4 deletions
--- a/src/memory-host-sdk/host/session-files.test.ts
+++ b/src/memory-host-sdk/host/session-files.test.ts
@@ -151,6 +151,71 @@ describe("buildSessionEntry", () => {
    ]);
  });

+  it("strips inbound metadata envelope from user messages before normalization", async () => {
+    // Telegram-style inbound envelope: "Conversation info" and "Sender" fenced
+    // JSON blocks prepended to the actual user text. Without stripping, the
+    // envelope dominates the corpus entry and the user's real words get
+    // truncated by the SESSION_INGESTION_MAX_SNIPPET_CHARS cap downstream.
+    // See: https://github.com/openclaw/openclaw/issues/63921
+    const envelopedUserText = [
+      "Conversation info (untrusted metadata):",
+      "```json",
+      '{"message_id":"msg-100","chat_id":"-100123","sender":"Chris"}',
+      "```",
+      "",
+      "Sender (untrusted metadata):",
+      "```json",
+      '{"label":"Chris","name":"Chris","id":"42"}',
+      "```",
+      "",
+      "帮我看看今天的 Oura 数据",
+    ].join("\n");
+
+    const jsonlLines = [
+      JSON.stringify({
+        type: "message",
+        message: { role: "user", content: envelopedUserText },
+      }),
+      JSON.stringify({
+        type: "message",
+        message: { role: "assistant", content: "好的,我来查一下" },
+      }),
+    ];
+    const filePath = path.join(tmpDir, "enveloped-session.jsonl");
+    await fs.writeFile(filePath, jsonlLines.join("\n"));
+
+    const entry = await buildSessionEntry(filePath);
+    expect(entry).not.toBeNull();
+
+    const contentLines = entry!.content.split("\n");
+    expect(contentLines).toHaveLength(2);
+    // Two messages yield two lines; the user line holds ONLY the real text.
+    expect(contentLines[0]).toBe("User: 帮我看看今天的 Oura 数据");
+    expect(contentLines[0]).not.toContain("untrusted metadata");
+    expect(contentLines[0]).not.toContain("message_id");
+    expect(contentLines[0]).not.toContain("```json");
+    expect(contentLines[1]).toBe("Assistant: 好的,我来查一下");
+  });
+
+  it("preserves assistant messages that happen to contain sentinel-like text", async () => {
+    // Assistant text must NOT be stripped, even when it quotes a sentinel: only
+    // user messages carry inbound envelopes; assistants may discuss the format.
+    const assistantText =
+      "The envelope format uses 'Conversation info (untrusted metadata):' as a sentinel";
+    const jsonlLines = [
+      JSON.stringify({
+        type: "message",
+        message: { role: "assistant", content: assistantText },
+      }),
+    ];
+    const filePath = path.join(tmpDir, "assistant-sentinel.jsonl");
+    await fs.writeFile(filePath, jsonlLines.join("\n"));
+
+    const entry = await buildSessionEntry(filePath);
+    expect(entry).not.toBeNull();
+    expect(entry!.content).toBe(`Assistant: ${assistantText}`);
+  });
+
  it("flags dreaming narrative transcripts from bootstrap metadata", async () => {
    const jsonlLines = [
      JSON.stringify({
--- a/src/memory-host-sdk/host/session-files.ts
+++ b/src/memory-host-sdk/host/session-files.ts
@@ -1,5 +1,6 @@
 import fs from "node:fs/promises";
 import path from "node:path";
+import { stripInboundMetadata } from "../../auto-reply/reply/strip-inbound-meta.js";
 import { isUsageCountedSessionTranscriptFileName } from "../../config/sessions/artifacts.js";
 import { resolveSessionTranscriptsDirForAgent } from "../../config/sessions/paths.js";
 import { loadSessionStore } from "../../config/sessions/store-load.js";
@@ -182,9 +183,33 @@ function normalizeSessionText(value: string): string {
    .trim();
 }

-export function extractSessionText(content: unknown): string | null {
+/**
+ * Strip OpenClaw-injected inbound metadata envelopes from a raw text block.
+ *
+ * User-role messages arriving from external channels (Telegram, Discord,
+ * Slack, …) are stored with a multi-line prefix containing Conversation info,
+ * Sender info, and other AI-facing metadata blocks. These envelopes must be
+ * removed BEFORE normalization, because `stripInboundMetadata` relies on
+ * newline structure and fenced `json` code blocks to locate sentinels; once
+ * `normalizeSessionText` collapses newlines into spaces, stripping is
+ * impossible. Text for any role other than "user" is returned unchanged.
+ *
+ * See: https://github.com/openclaw/openclaw/issues/63921
+ */
+function stripInboundMetadataForUserRole(text: string, role: "user" | "assistant"): string {
+  if (role !== "user") {
+    return text;
+  }
+  return stripInboundMetadata(text);
+}
+
+export function extractSessionText(
+  content: unknown,
+  role: "user" | "assistant" = "assistant",
+): string | null {
  if (typeof content === "string") {
-    const normalized = normalizeSessionText(content);
+    const stripped = stripInboundMetadataForUserRole(content, role);
+    const normalized = normalizeSessionText(stripped);
    return normalized ? normalized : null;
  }
  if (!Array.isArray(content)) {
@@ -199,7 +224,8 @@ export function extractSessionText(content: unknown): string | null {
    if (record.type !== "text" || typeof record.text !== "string") {
      continue;
    }
-    const normalized = normalizeSessionText(record.text);
+    const stripped = stripInboundMetadataForUserRole(record.text, role);
+    const normalized = normalizeSessionText(stripped);
    if (normalized) {
      parts.push(normalized);
    }
@@ -275,7 +301,7 @@ export async function buildSessionEntry(
      if (message.role !== "user" && message.role !== "assistant") {
        continue;
      }
-      const text = extractSessionText(message.content);
+      const text = extractSessionText(message.content, message.role);
      if (!text) {
        continue;
      }