mirror of
https://github.com/openclaw/openclaw.git
synced 2026-05-06 14:40:43 +00:00
memory: strip inbound metadata envelopes from user messages in session corpus (#66548)
Merged via squash.
Prepared head SHA: 98562b2a84
Co-authored-by: zqchris <4436110+zqchris@users.noreply.github.com>
Co-authored-by: jalehman <550978+jalehman@users.noreply.github.com>
Reviewed-by: @jalehman
This commit is contained in:
@@ -151,6 +151,101 @@ describe("buildSessionEntry", () => {
|
||||
]);
|
||||
});
|
||||
|
||||
it("strips inbound metadata envelope from user messages before normalization", async () => {
  // Regression test for https://github.com/openclaw/openclaw/issues/63921.
  // A real Telegram inbound message arrives with "Conversation info" and
  // "Sender" metadata blocks prepended to the actual user text. If the
  // envelope is left in place, its JSON dominates the corpus entry and the
  // user's real words get truncated by the downstream
  // SESSION_INGESTION_MAX_SNIPPET_CHARS cap.
  const metadataBlock = (heading: string, json: string): string[] => [
    heading,
    "```json",
    json,
    "```",
    "",
  ];
  const envelopedUserText = [
    ...metadataBlock(
      "Conversation info (untrusted metadata):",
      '{"message_id":"msg-100","chat_id":"-100123","sender":"Chris"}',
    ),
    ...metadataBlock("Sender (untrusted metadata):", '{"label":"Chris","name":"Chris","id":"42"}'),
    "帮我看看今天的 Oura 数据",
  ].join("\n");

  // One JSONL record per message: the enveloped user turn, then the reply.
  const transcript = [
    { role: "user", content: envelopedUserText },
    { role: "assistant", content: "好的,我来查一下" },
  ]
    .map((message) => JSON.stringify({ type: "message", message }))
    .join("\n");
  const filePath = path.join(tmpDir, "enveloped-session.jsonl");
  await fs.writeFile(filePath, transcript);

  const entry = await buildSessionEntry(filePath);
  expect(entry).not.toBeNull();

  const contentLines = entry!.content.split("\n");
  expect(contentLines).toHaveLength(2);
  const [userLine, assistantLine] = contentLines;
  // The user line must carry ONLY the real user text, with no envelope residue.
  expect(userLine).toBe("User: 帮我看看今天的 Oura 数据");
  for (const envelopeFragment of ["untrusted metadata", "message_id", "```json"]) {
    expect(userLine).not.toContain(envelopeFragment);
  }
  expect(assistantLine).toBe("Assistant: 好的,我来查一下");
});
|
||||
|
||||
it("strips inbound metadata when a user envelope is split across text blocks", async () => {
  // Same envelope as the string-content case, but every envelope line is
  // delivered as its own `text` content block. The expected corpus entry is
  // still just the trailing user text.
  const blockTexts = [
    "Conversation info (untrusted metadata):",
    "```json",
    '{"message_id":"msg-100","chat_id":"-100123"}',
    "```",
    "",
    "Sender (untrusted metadata):",
    "```json",
    '{"label":"Chris","id":"42"}',
    "```",
    "",
    "Actual user text",
  ];
  const userRecord = JSON.stringify({
    type: "message",
    message: {
      role: "user",
      content: blockTexts.map((text) => ({ type: "text", text })),
    },
  });
  const filePath = path.join(tmpDir, "enveloped-session-array.jsonl");
  // Single-record transcript, so the file holds exactly one JSONL line.
  await fs.writeFile(filePath, userRecord);

  const entry = await buildSessionEntry(filePath);
  expect(entry).not.toBeNull();
  expect(entry!.content).toBe("User: Actual user text");
});
|
||||
|
||||
it("preserves assistant messages that happen to contain sentinel-like text", async () => {
  // Stripping is scoped to the user role: only user messages carry inbound
  // envelopes, whereas an assistant may legitimately quote the sentinel while
  // discussing metadata formats. Such text must pass through untouched.
  const assistantText =
    "The envelope format uses 'Conversation info (untrusted metadata):' as a sentinel";
  const assistantRecord = JSON.stringify({
    type: "message",
    message: { role: "assistant", content: assistantText },
  });
  const filePath = path.join(tmpDir, "assistant-sentinel.jsonl");
  // Single-record transcript, so the file holds exactly one JSONL line.
  await fs.writeFile(filePath, assistantRecord);

  const entry = await buildSessionEntry(filePath);
  expect(entry).not.toBeNull();
  expect(entry!.content).toBe(`Assistant: ${assistantText}`);
});
|
||||
|
||||
it("flags dreaming narrative transcripts from bootstrap metadata", async () => {
|
||||
const jsonlLines = [
|
||||
JSON.stringify({
|
||||
|
||||
@@ -1,5 +1,6 @@
|
||||
import fs from "node:fs/promises";
|
||||
import path from "node:path";
|
||||
import { stripInboundMetadata } from "../../auto-reply/reply/strip-inbound-meta.js";
|
||||
import { isUsageCountedSessionTranscriptFileName } from "../../config/sessions/artifacts.js";
|
||||
import { resolveSessionTranscriptsDirForAgent } from "../../config/sessions/paths.js";
|
||||
import { loadSessionStore } from "../../config/sessions/store-load.js";
|
||||
@@ -182,10 +183,9 @@ function normalizeSessionText(value: string): string {
|
||||
.trim();
|
||||
}
|
||||
|
||||
export function extractSessionText(content: unknown): string | null {
|
||||
function collectRawSessionText(content: unknown): string | null {
|
||||
if (typeof content === "string") {
|
||||
const normalized = normalizeSessionText(content);
|
||||
return normalized ? normalized : null;
|
||||
return content;
|
||||
}
|
||||
if (!Array.isArray(content)) {
|
||||
return null;
|
||||
@@ -196,18 +196,44 @@ export function extractSessionText(content: unknown): string | null {
|
||||
continue;
|
||||
}
|
||||
const record = block as { type?: unknown; text?: unknown };
|
||||
if (record.type !== "text" || typeof record.text !== "string") {
|
||||
continue;
|
||||
}
|
||||
const normalized = normalizeSessionText(record.text);
|
||||
if (normalized) {
|
||||
parts.push(normalized);
|
||||
if (record.type === "text" && typeof record.text === "string") {
|
||||
parts.push(record.text);
|
||||
}
|
||||
}
|
||||
if (parts.length === 0) {
|
||||
return parts.length > 0 ? parts.join("\n") : null;
|
||||
}
|
||||
|
||||
/**
 * Remove OpenClaw-injected inbound metadata envelopes from raw message text,
 * for user-role messages only.
 *
 * Messages that reach the session transcript from external channels
 * (Telegram, Discord, Slack, …) are stored with a multi-line prefix holding
 * Conversation info, Sender info, and other AI-facing metadata blocks. This
 * must run BEFORE normalization: `stripInboundMetadata` locates its sentinels
 * through newline structure and fenced `json` code fences, and once
 * `normalizeSessionText` has collapsed newlines into spaces the envelope can
 * no longer be found.
 *
 * See: https://github.com/openclaw/openclaw/issues/63921
 *
 * @param text - Raw, not-yet-normalized message text.
 * @param role - Role of the message the text belongs to.
 * @returns `text` unchanged for any role other than `"user"`; otherwise the
 *   result of `stripInboundMetadata(text)`.
 */
function stripInboundMetadataForUserRole(text: string, role: "user" | "assistant"): string {
  return role === "user" ? stripInboundMetadata(text) : text;
}
|
||||
|
||||
/**
 * Extract the indexable text of a session message.
 *
 * The raw text is gathered first, the inbound metadata envelope is removed
 * for user-role messages, and only then is the text normalized.
 *
 * @param content - The message's `content` value, passed through to
 *   `collectRawSessionText`.
 * @param role - Role of the message. Defaults to `"assistant"`, so callers
 *   that omit it get no envelope stripping.
 * @returns The normalized text, or `null` when no raw text could be collected
 *   or nothing is left after stripping and normalization.
 */
export function extractSessionText(
  content: unknown,
  role: "user" | "assistant" = "assistant",
): string | null {
  const rawText = collectRawSessionText(content);
  if (rawText === null) {
    return null;
  }
  // Order matters: strip while the raw newline structure is still intact,
  // and normalize afterwards.
  const normalized = normalizeSessionText(stripInboundMetadataForUserRole(rawText, role));
  // An empty result is reported as "no text" rather than as "".
  return normalized || null;
}
|
||||
|
||||
function parseSessionTimestampMs(
|
||||
@@ -275,7 +301,7 @@ export async function buildSessionEntry(
|
||||
if (message.role !== "user" && message.role !== "assistant") {
|
||||
continue;
|
||||
}
|
||||
const text = extractSessionText(message.content);
|
||||
const text = extractSessionText(message.content, message.role);
|
||||
if (!text) {
|
||||
continue;
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user