diff --git a/src/memory-host-sdk/host/session-files.test.ts b/src/memory-host-sdk/host/session-files.test.ts index fd5d9a22efb..6df89081b99 100644 --- a/src/memory-host-sdk/host/session-files.test.ts +++ b/src/memory-host-sdk/host/session-files.test.ts @@ -151,6 +151,71 @@ describe("buildSessionEntry", () => { ]); }); + it("strips inbound metadata envelope from user messages before normalization", async () => { + // Real Telegram inbound envelope: Conversation info + Sender blocks prepended + // to the actual user text. Without stripping, the JSON envelope dominates + // the corpus entry and the user's real words get truncated by the + // SESSION_INGESTION_MAX_SNIPPET_CHARS cap downstream. + // See: https://github.com/openclaw/openclaw/issues/63921 + const envelopedUserText = [ + "Conversation info (untrusted metadata):", + "```json", + '{"message_id":"msg-100","chat_id":"-100123","sender":"Chris"}', + "```", + "", + "Sender (untrusted metadata):", + "```json", + '{"label":"Chris","name":"Chris","id":"42"}', + "```", + "", + "帮我看看今天的 Oura 数据", + ].join("\n"); + + const jsonlLines = [ + JSON.stringify({ + type: "message", + message: { role: "user", content: envelopedUserText }, + }), + JSON.stringify({ + type: "message", + message: { role: "assistant", content: "好的,我来查一下" }, + }), + ]; + const filePath = path.join(tmpDir, "enveloped-session.jsonl"); + await fs.writeFile(filePath, jsonlLines.join("\n")); + + const entry = await buildSessionEntry(filePath); + expect(entry).not.toBeNull(); + + const contentLines = entry!.content.split("\n"); + expect(contentLines).toHaveLength(2); + // User line should contain ONLY the real user text, not the JSON envelope. 
+ expect(contentLines[0]).toBe("User: 帮我看看今天的 Oura 数据"); + expect(contentLines[0]).not.toContain("untrusted metadata"); + expect(contentLines[0]).not.toContain("message_id"); + expect(contentLines[0]).not.toContain("```json"); + expect(contentLines[1]).toBe("Assistant: 好的,我来查一下"); + }); + + it("preserves assistant messages that happen to contain sentinel-like text", async () => { + // Assistant role must NOT be stripped — only user messages carry inbound + // envelopes, and assistants may legitimately discuss metadata formats. + const assistantText = + "The envelope format uses 'Conversation info (untrusted metadata):' as a sentinel"; + const jsonlLines = [ + JSON.stringify({ + type: "message", + message: { role: "assistant", content: assistantText }, + }), + ]; + const filePath = path.join(tmpDir, "assistant-sentinel.jsonl"); + await fs.writeFile(filePath, jsonlLines.join("\n")); + + const entry = await buildSessionEntry(filePath); + expect(entry).not.toBeNull(); + expect(entry!.content).toBe(`Assistant: ${assistantText}`); + }); + it("flags dreaming narrative transcripts from bootstrap metadata", async () => { const jsonlLines = [ JSON.stringify({ diff --git a/src/memory-host-sdk/host/session-files.ts b/src/memory-host-sdk/host/session-files.ts index 5865cf9bbb3..3263d8d0890 100644 --- a/src/memory-host-sdk/host/session-files.ts +++ b/src/memory-host-sdk/host/session-files.ts @@ -1,5 +1,6 @@ import fs from "node:fs/promises"; import path from "node:path"; +import { stripInboundMetadata } from "../../auto-reply/reply/strip-inbound-meta.js"; import { isUsageCountedSessionTranscriptFileName } from "../../config/sessions/artifacts.js"; import { resolveSessionTranscriptsDirForAgent } from "../../config/sessions/paths.js"; import { loadSessionStore } from "../../config/sessions/store-load.js"; @@ -182,9 +183,33 @@ function normalizeSessionText(value: string): string { .trim(); } -export function extractSessionText(content: unknown): string | null { +/** + * Strip 
OpenClaw-injected inbound metadata envelopes from a raw text block. + * + * User-role messages arriving from external channels (Telegram, Discord, + * Slack, …) are stored with a multi-line prefix containing Conversation info, + * Sender info, and other AI-facing metadata blocks. These envelopes must be + * removed BEFORE normalization, because `stripInboundMetadata` relies on + * newline structure and fenced `json` code blocks to locate sentinels; once + * `normalizeSessionText` collapses newlines into spaces, stripping is + * impossible. + * + * See: https://github.com/openclaw/openclaw/issues/63921 + */ +function stripInboundMetadataForUserRole(text: string, role: "user" | "assistant"): string { + if (role !== "user") { + return text; + } + return stripInboundMetadata(text); +} + +export function extractSessionText( + content: unknown, + role: "user" | "assistant" = "assistant", +): string | null { if (typeof content === "string") { - const normalized = normalizeSessionText(content); + const stripped = stripInboundMetadataForUserRole(content, role); + const normalized = normalizeSessionText(stripped); return normalized ? normalized : null; } if (!Array.isArray(content)) { @@ -199,7 +224,8 @@ export function extractSessionText(content: unknown): string | null { if (record.type !== "text" || typeof record.text !== "string") { continue; } - const normalized = normalizeSessionText(record.text); + const stripped = stripInboundMetadataForUserRole(record.text, role); + const normalized = normalizeSessionText(stripped); if (normalized) { parts.push(normalized); } @@ -275,7 +301,7 @@ export async function buildSessionEntry( if (message.role !== "user" && message.role !== "assistant") { continue; } - const text = extractSessionText(message.content); + const text = extractSessionText(message.content, message.role); if (!text) { continue; }