memory: strip inbound metadata envelopes from user messages in session corpus (#66548)

Merged via squash.

Prepared head SHA: 98562b2a84
Co-authored-by: zqchris <4436110+zqchris@users.noreply.github.com>
Co-authored-by: jalehman <550978+jalehman@users.noreply.github.com>
Reviewed-by: @jalehman
This commit is contained in:
zqchris
2026-04-17 02:15:44 +08:00
committed by GitHub
parent 00d21d1b23
commit 82e349a48a
5 changed files with 253 additions and 24 deletions

View File

@@ -151,6 +151,101 @@ describe("buildSessionEntry", () => {
]);
});
it("strips inbound metadata envelope from user messages before normalization", async () => {
  // Mirrors a real Telegram inbound message: "Conversation info" and "Sender"
  // metadata blocks are prepended to what the user actually typed. If the
  // envelope is left in place, its JSON dominates the corpus entry and the
  // SESSION_INGESTION_MAX_SNIPPET_CHARS cap downstream truncates the user's
  // real words.
  // See: https://github.com/openclaw/openclaw/issues/63921
  const conversationInfoBlock = [
    "Conversation info (untrusted metadata):",
    "```json",
    '{"message_id":"msg-100","chat_id":"-100123","sender":"Chris"}',
    "```",
    "",
  ];
  const senderBlock = [
    "Sender (untrusted metadata):",
    "```json",
    '{"label":"Chris","name":"Chris","id":"42"}',
    "```",
    "",
  ];
  const envelopedUserText = [
    ...conversationInfoBlock,
    ...senderBlock,
    "帮我看看今天的 Oura 数据",
  ].join("\n");

  // One JSONL record per message, in conversation order.
  const transcript = [
    { role: "user", content: envelopedUserText },
    { role: "assistant", content: "好的,我来查一下" },
  ]
    .map((message) => JSON.stringify({ type: "message", message }))
    .join("\n");
  const filePath = path.join(tmpDir, "enveloped-session.jsonl");
  await fs.writeFile(filePath, transcript);

  const entry = await buildSessionEntry(filePath);
  expect(entry).not.toBeNull();

  const contentLines = entry!.content.split("\n");
  expect(contentLines).toHaveLength(2);
  const [userLine, assistantLine] = contentLines;

  // The user line must hold only the text the user typed; no trace of the
  // envelope (sentinel header, JSON keys, or code fence) may survive.
  expect(userLine).toBe("User: 帮我看看今天的 Oura 数据");
  expect(userLine).not.toContain("untrusted metadata");
  expect(userLine).not.toContain("message_id");
  expect(userLine).not.toContain("```json");
  expect(assistantLine).toBe("Assistant: 好的,我来查一下");
});
it("strips inbound metadata when a user envelope is split across text blocks", async () => {
  // Each envelope line is delivered as its own `text` content block, so the
  // envelope is only recognizable once the blocks have been rejoined.
  const blockTexts = [
    "Conversation info (untrusted metadata):",
    "```json",
    '{"message_id":"msg-100","chat_id":"-100123"}',
    "```",
    "",
    "Sender (untrusted metadata):",
    "```json",
    '{"label":"Chris","id":"42"}',
    "```",
    "",
    "Actual user text",
  ];
  const content = blockTexts.map((text) => ({ type: "text", text }));
  const record = JSON.stringify({
    type: "message",
    message: { role: "user", content },
  });
  const filePath = path.join(tmpDir, "enveloped-session-array.jsonl");
  await fs.writeFile(filePath, record);

  const entry = await buildSessionEntry(filePath);
  expect(entry).not.toBeNull();
  expect(entry!.content).toBe("User: Actual user text");
});
it("preserves assistant messages that happen to contain sentinel-like text", async () => {
  // Stripping applies to the user role only: inbound envelopes are carried by
  // user messages, while an assistant may legitimately quote the sentinel
  // when discussing metadata formats, so its text must come through intact.
  const assistantText =
    "The envelope format uses 'Conversation info (untrusted metadata):' as a sentinel";
  const record = JSON.stringify({
    type: "message",
    message: { role: "assistant", content: assistantText },
  });
  const filePath = path.join(tmpDir, "assistant-sentinel.jsonl");
  await fs.writeFile(filePath, record);

  const entry = await buildSessionEntry(filePath);
  expect(entry).not.toBeNull();
  expect(entry!.content).toBe("Assistant: " + assistantText);
});
it("flags dreaming narrative transcripts from bootstrap metadata", async () => {
const jsonlLines = [
JSON.stringify({

View File

@@ -1,5 +1,6 @@
import fs from "node:fs/promises";
import path from "node:path";
import { stripInboundMetadata } from "../../auto-reply/reply/strip-inbound-meta.js";
import { isUsageCountedSessionTranscriptFileName } from "../../config/sessions/artifacts.js";
import { resolveSessionTranscriptsDirForAgent } from "../../config/sessions/paths.js";
import { loadSessionStore } from "../../config/sessions/store-load.js";
@@ -182,10 +183,9 @@ function normalizeSessionText(value: string): string {
.trim();
}
export function extractSessionText(content: unknown): string | null {
function collectRawSessionText(content: unknown): string | null {
if (typeof content === "string") {
const normalized = normalizeSessionText(content);
return normalized ? normalized : null;
return content;
}
if (!Array.isArray(content)) {
return null;
@@ -196,18 +196,44 @@ export function extractSessionText(content: unknown): string | null {
continue;
}
const record = block as { type?: unknown; text?: unknown };
if (record.type !== "text" || typeof record.text !== "string") {
continue;
}
const normalized = normalizeSessionText(record.text);
if (normalized) {
parts.push(normalized);
if (record.type === "text" && typeof record.text === "string") {
parts.push(record.text);
}
}
if (parts.length === 0) {
return parts.length > 0 ? parts.join("\n") : null;
}
/**
 * Remove OpenClaw-injected inbound metadata envelopes from raw message text,
 * but only when the message was authored by the user.
 *
 * Messages that reach the user role from external channels (Telegram,
 * Discord, Slack, …) are stored with a multi-line prefix made of
 * "Conversation info", "Sender", and other AI-facing metadata blocks.
 *
 * Call this BEFORE `normalizeSessionText`: `stripInboundMetadata` finds its
 * sentinels through newline structure and fenced `json` blocks, and
 * normalization collapses newlines into spaces, after which the envelope can
 * no longer be located.
 *
 * See: https://github.com/openclaw/openclaw/issues/63921
 *
 * @param text - Raw, not-yet-normalized message text.
 * @param role - Author of the message; only `"user"` text is stripped.
 * @returns The text without its envelope for user messages, or `text`
 *   unchanged for assistant messages.
 */
function stripInboundMetadataForUserRole(text: string, role: "user" | "assistant"): string {
  // Assistant text passes through untouched.
  return role === "user" ? stripInboundMetadata(text) : text;
}
/**
 * Extract the indexable plain text of a session message.
 *
 * The raw text is first gathered from `content` by `collectRawSessionText`,
 * then, for user messages only, inbound metadata envelopes are stripped,
 * and finally the result is normalized with `normalizeSessionText`.
 *
 * @param content - The message's `content` payload, passed through to
 *   `collectRawSessionText`.
 * @param role - Author of the message. Defaults to `"assistant"`, so callers
 *   that do not pass a role get no envelope stripping.
 * @returns The normalized text, or `null` when no raw text could be collected
 *   or nothing is left after stripping and normalization.
 */
export function extractSessionText(
  content: unknown,
  role: "user" | "assistant" = "assistant",
): string | null {
  const rawText = collectRawSessionText(content);
  if (rawText === null) {
    return null;
  }
  // Order matters: the envelope must be stripped while the raw newline
  // structure is still intact, i.e. before normalization runs.
  const stripped = stripInboundMetadataForUserRole(rawText, role);
  const normalized = normalizeSessionText(stripped);
  // An empty string after normalization means there is nothing to index.
  return normalized ? normalized : null;
}
function parseSessionTimestampMs(
@@ -275,7 +301,7 @@ export async function buildSessionEntry(
if (message.role !== "user" && message.role !== "assistant") {
continue;
}
const text = extractSessionText(message.content);
const text = extractSessionText(message.content, message.role);
if (!text) {
continue;
}