mirror of
https://github.com/openclaw/openclaw.git
synced 2026-05-06 17:10:49 +00:00
memory: strip inbound metadata envelopes from user messages in session corpus
Session ingestion was feeding raw Telegram/Discord/Slack inbound envelopes into the dreaming corpus. The 338-char Conversation info + Sender JSON prefix on every user message blew past SESSION_INGESTION_MAX_SNIPPET_CHARS (280), so the user's actual words never made it in and REM extraction latched onto envelope words like 'assistant' as top topics. Strip inbound metadata on user-role text blocks BEFORE normalizeSessionText collapses newlines. stripInboundMetadata needs the line structure and fenced-json markers to find sentinels, so the order matters. Assistant messages are left alone — they may legitimately discuss the envelope format. Fixes #63921
This commit is contained in:
@@ -151,6 +151,71 @@ describe("buildSessionEntry", () => {
|
||||
]);
|
||||
});
|
||||
|
||||
it("strips inbound metadata envelope from user messages before normalization", async () => {
|
||||
// Real Telegram inbound envelope: Conversation info + Sender blocks prepended
|
||||
// to the actual user text. Without stripping, the JSON envelope dominates
|
||||
// the corpus entry and the user's real words get truncated by the
|
||||
// SESSION_INGESTION_MAX_SNIPPET_CHARS cap downstream.
|
||||
// See: https://github.com/openclaw/openclaw/issues/63921
|
||||
const envelopedUserText = [
|
||||
"Conversation info (untrusted metadata):",
|
||||
"```json",
|
||||
'{"message_id":"msg-100","chat_id":"-100123","sender":"Chris"}',
|
||||
"```",
|
||||
"",
|
||||
"Sender (untrusted metadata):",
|
||||
"```json",
|
||||
'{"label":"Chris","name":"Chris","id":"42"}',
|
||||
"```",
|
||||
"",
|
||||
"帮我看看今天的 Oura 数据",
|
||||
].join("\n");
|
||||
|
||||
// Two-message transcript: the enveloped user message followed by a plain
// assistant reply, serialized as one JSON object per line.
const jsonlLines = [
|
||||
JSON.stringify({
|
||||
type: "message",
|
||||
message: { role: "user", content: envelopedUserText },
|
||||
}),
|
||||
JSON.stringify({
|
||||
type: "message",
|
||||
message: { role: "assistant", content: "好的,我来查一下" },
|
||||
}),
|
||||
];
|
||||
const filePath = path.join(tmpDir, "enveloped-session.jsonl");
|
||||
await fs.writeFile(filePath, jsonlLines.join("\n"));
|
||||
|
||||
const entry = await buildSessionEntry(filePath);
|
||||
expect(entry).not.toBeNull();
|
||||
|
||||
// Expect exactly one role-prefixed content line per message in the transcript.
const contentLines = entry!.content.split("\n");
|
||||
expect(contentLines).toHaveLength(2);
|
||||
// User line should contain ONLY the real user text, not the JSON envelope.
|
||||
expect(contentLines[0]).toBe("User: 帮我看看今天的 Oura 数据");
|
||||
expect(contentLines[0]).not.toContain("untrusted metadata");
|
||||
expect(contentLines[0]).not.toContain("message_id");
|
||||
expect(contentLines[0]).not.toContain("```json");
|
||||
expect(contentLines[1]).toBe("Assistant: 好的,我来查一下");
|
||||
});
|
||||
|
||||
it("preserves assistant messages that happen to contain sentinel-like text", async () => {
|
||||
// Assistant role must NOT be stripped — only user messages carry inbound
|
||||
// envelopes, and assistants may legitimately discuss metadata formats.
|
||||
const assistantText =
|
||||
"The envelope format uses 'Conversation info (untrusted metadata):' as a sentinel";
|
||||
// Single assistant-only message; no user message is present in this transcript.
const jsonlLines = [
|
||||
JSON.stringify({
|
||||
type: "message",
|
||||
message: { role: "assistant", content: assistantText },
|
||||
}),
|
||||
];
|
||||
const filePath = path.join(tmpDir, "assistant-sentinel.jsonl");
|
||||
await fs.writeFile(filePath, jsonlLines.join("\n"));
|
||||
|
||||
const entry = await buildSessionEntry(filePath);
|
||||
expect(entry).not.toBeNull();
|
||||
// The sentinel phrase must survive verbatim behind the "Assistant: " label.
expect(entry!.content).toBe(`Assistant: ${assistantText}`);
|
||||
});
|
||||
|
||||
it("flags dreaming narrative transcripts from bootstrap metadata", async () => {
|
||||
const jsonlLines = [
|
||||
JSON.stringify({
|
||||
|
||||
@@ -1,5 +1,6 @@
|
||||
import fs from "node:fs/promises";
|
||||
import path from "node:path";
|
||||
import { stripInboundMetadata } from "../../auto-reply/reply/strip-inbound-meta.js";
|
||||
import { isUsageCountedSessionTranscriptFileName } from "../../config/sessions/artifacts.js";
|
||||
import { resolveSessionTranscriptsDirForAgent } from "../../config/sessions/paths.js";
|
||||
import { loadSessionStore } from "../../config/sessions/store-load.js";
|
||||
@@ -182,9 +183,33 @@ function normalizeSessionText(value: string): string {
|
||||
.trim();
|
||||
}
|
||||
|
||||
export function extractSessionText(content: unknown): string | null {
|
||||
/**
|
||||
* Strip OpenClaw-injected inbound metadata envelopes from a raw text block.
|
||||
*
|
||||
* User-role messages arriving from external channels (Telegram, Discord,
|
||||
* Slack, …) are stored with a multi-line prefix containing Conversation info,
|
||||
* Sender info, and other AI-facing metadata blocks. These envelopes must be
|
||||
* removed BEFORE normalization, because `stripInboundMetadata` relies on
|
||||
* newline structure and fenced `json` code blocks to locate sentinels; once
|
||||
* `normalizeSessionText` collapses newlines into spaces, stripping is
|
||||
* impossible.
|
||||
*
|
||||
* See: https://github.com/openclaw/openclaw/issues/63921
|
||||
*
* @param text - Raw message text that has not yet been normalized.
* @param role - Role of the message the text belongs to.
* @returns `text` unchanged when `role` is not `"user"`; otherwise the result
*   of `stripInboundMetadata(text)`.
*/
|
||||
function stripInboundMetadataForUserRole(text: string, role: "user" | "assistant"): string {
|
||||
// Non-user text is passed through untouched: assistant messages may
// legitimately mention the envelope sentinels.
if (role !== "user") {
|
||||
return text;
|
||||
}
|
||||
return stripInboundMetadata(text);
|
||||
}
|
||||
|
||||
export function extractSessionText(
|
||||
content: unknown,
|
||||
role: "user" | "assistant" = "assistant",
|
||||
): string | null {
|
||||
if (typeof content === "string") {
|
||||
const normalized = normalizeSessionText(content);
|
||||
const stripped = stripInboundMetadataForUserRole(content, role);
|
||||
const normalized = normalizeSessionText(stripped);
|
||||
return normalized ? normalized : null;
|
||||
}
|
||||
if (!Array.isArray(content)) {
|
||||
@@ -199,7 +224,8 @@ export function extractSessionText(content: unknown): string | null {
|
||||
if (record.type !== "text" || typeof record.text !== "string") {
|
||||
continue;
|
||||
}
|
||||
const normalized = normalizeSessionText(record.text);
|
||||
const stripped = stripInboundMetadataForUserRole(record.text, role);
|
||||
const normalized = normalizeSessionText(stripped);
|
||||
if (normalized) {
|
||||
parts.push(normalized);
|
||||
}
|
||||
@@ -275,7 +301,7 @@ export async function buildSessionEntry(
|
||||
if (message.role !== "user" && message.role !== "assistant") {
|
||||
continue;
|
||||
}
|
||||
const text = extractSessionText(message.content);
|
||||
const text = extractSessionText(message.content, message.role);
|
||||
if (!text) {
|
||||
continue;
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user