memory: strip inbound metadata envelopes from user messages in session corpus

Session ingestion was feeding raw Telegram/Discord/Slack inbound envelopes
into the dreaming corpus. The 338-char Conversation info + Sender JSON prefix
on every user message blew past SESSION_INGESTION_MAX_SNIPPET_CHARS (280),
so the user's actual words never made it in and REM extraction latched
onto envelope words like 'assistant' as top topics.

Strip inbound metadata on user-role text blocks BEFORE normalizeSessionText
collapses newlines. stripInboundMetadata needs the line structure and
fenced-json markers to find sentinels, so the order matters. Assistant
messages are left alone — they may legitimately discuss the envelope
format.

Fixes #63921
This commit is contained in:
Chris Zhang
2026-04-14 20:45:08 +08:00
committed by Josh Lehman
parent 00d21d1b23
commit 8f84a54511
2 changed files with 95 additions and 4 deletions

View File

@@ -151,6 +151,71 @@ describe("buildSessionEntry", () => {
]);
});
it("strips inbound metadata envelope from user messages before normalization", async () => {
  // Mirrors a real Telegram inbound message: "Conversation info" and "Sender"
  // metadata blocks are prepended to what the user actually typed. If the
  // envelope is not stripped it dominates the corpus entry, and the user's own
  // words are cut off by the SESSION_INGESTION_MAX_SNIPPET_CHARS cap downstream.
  // See: https://github.com/openclaw/openclaw/issues/63921
  const userWords = "帮我看看今天的 Oura 数据";
  const assistantReply = "好的,我来查一下";
  const envelopedUserText = [
    "Conversation info (untrusted metadata):",
    "```json",
    '{"message_id":"msg-100","chat_id":"-100123","sender":"Chris"}',
    "```",
    "",
    "Sender (untrusted metadata):",
    "```json",
    '{"label":"Chris","name":"Chris","id":"42"}',
    "```",
    "",
    userWords,
  ].join("\n");

  // One JSONL record per message, in conversation order.
  const transcript = [
    { role: "user", content: envelopedUserText },
    { role: "assistant", content: assistantReply },
  ]
    .map((message) => JSON.stringify({ type: "message", message }))
    .join("\n");
  const transcriptPath = path.join(tmpDir, "enveloped-session.jsonl");
  await fs.writeFile(transcriptPath, transcript);

  const entry = await buildSessionEntry(transcriptPath);
  expect(entry).not.toBeNull();

  const lines = (entry?.content ?? "").split("\n");
  expect(lines).toHaveLength(2);
  const [userLine, assistantLine] = lines;

  // The user line must carry ONLY the real user text, with no envelope residue.
  expect(userLine).toBe(`User: ${userWords}`);
  for (const envelopeFragment of ["untrusted metadata", "message_id", "```json"]) {
    expect(userLine).not.toContain(envelopeFragment);
  }
  expect(assistantLine).toBe(`Assistant: ${assistantReply}`);
});
it("preserves assistant messages that happen to contain sentinel-like text", async () => {
  // Stripping applies to user messages only: those are the ones that carry
  // inbound envelopes, while an assistant may legitimately talk about the
  // metadata format. Its text must therefore come through untouched.
  const assistantText =
    "The envelope format uses 'Conversation info (untrusted metadata):' as a sentinel";
  const transcript = [{ role: "assistant", content: assistantText }]
    .map((message) => JSON.stringify({ type: "message", message }))
    .join("\n");
  const transcriptPath = path.join(tmpDir, "assistant-sentinel.jsonl");
  await fs.writeFile(transcriptPath, transcript);

  const entry = await buildSessionEntry(transcriptPath);

  expect(entry).not.toBeNull();
  expect(entry?.content).toBe(`Assistant: ${assistantText}`);
});
it("flags dreaming narrative transcripts from bootstrap metadata", async () => {
const jsonlLines = [
JSON.stringify({

View File

@@ -1,5 +1,6 @@
import fs from "node:fs/promises";
import path from "node:path";
import { stripInboundMetadata } from "../../auto-reply/reply/strip-inbound-meta.js";
import { isUsageCountedSessionTranscriptFileName } from "../../config/sessions/artifacts.js";
import { resolveSessionTranscriptsDirForAgent } from "../../config/sessions/paths.js";
import { loadSessionStore } from "../../config/sessions/store-load.js";
@@ -182,9 +183,33 @@ function normalizeSessionText(value: string): string {
.trim();
}
export function extractSessionText(content: unknown): string | null {
/**
 * Remove the OpenClaw-injected inbound metadata envelope from a raw text
 * block, but only when the text belongs to a user-role message.
 *
 * User messages that arrive from external channels (Telegram, Discord,
 * Slack, …) are stored with a multi-line prefix of AI-facing metadata blocks
 * (Conversation info, Sender info, …) ahead of the user's own words.
 *
 * Ordering matters: call this on the raw text, BEFORE `normalizeSessionText`.
 * `stripInboundMetadata` finds its sentinels through newline structure and
 * fenced `json` blocks, and normalization collapses newlines into spaces,
 * after which the envelope can no longer be located.
 *
 * @param text - Raw, un-normalized message text.
 * @param role - Role of the message that `text` was taken from.
 * @returns The result of `stripInboundMetadata(text)` for `"user"`; `text`
 *   unchanged for any other role.
 *
 * See: https://github.com/openclaw/openclaw/issues/63921
 */
function stripInboundMetadataForUserRole(text: string, role: "user" | "assistant"): string {
  return role === "user" ? stripInboundMetadata(text) : text;
}
export function extractSessionText(
content: unknown,
role: "user" | "assistant" = "assistant",
): string | null {
if (typeof content === "string") {
const normalized = normalizeSessionText(content);
const stripped = stripInboundMetadataForUserRole(content, role);
const normalized = normalizeSessionText(stripped);
return normalized ? normalized : null;
}
if (!Array.isArray(content)) {
@@ -199,7 +224,8 @@ export function extractSessionText(content: unknown): string | null {
if (record.type !== "text" || typeof record.text !== "string") {
continue;
}
const normalized = normalizeSessionText(record.text);
const stripped = stripInboundMetadataForUserRole(record.text, role);
const normalized = normalizeSessionText(stripped);
if (normalized) {
parts.push(normalized);
}
@@ -275,7 +301,7 @@ export async function buildSessionEntry(
if (message.role !== "user" && message.role !== "assistant") {
continue;
}
const text = extractSessionText(message.content);
const text = extractSessionText(message.content, message.role);
if (!text) {
continue;
}