memory: handle split inbound envelopes in session corpus

This commit is contained in:
Josh Lehman
2026-04-15 10:30:14 -07:00
parent d5fa7b0838
commit 62b50d482e
4 changed files with 110 additions and 50 deletions

View File

@@ -120,4 +120,34 @@ describe("buildSessionEntry", () => {
expect(entry).not.toBeNull();
expect(entry!.lineMap).toEqual([3, 5]);
});
it("strips inbound metadata when a user envelope is split across text blocks", async () => {
  // Every line of the inbound envelope arrives as its own text block, with
  // the real user message as the final block.
  const envelopeLines = [
    "Conversation info (untrusted metadata):",
    "```json",
    '{"message_id":"msg-100","chat_id":"-100123"}',
    "```",
    "",
    "Sender (untrusted metadata):",
    "```json",
    '{"label":"Chris","id":"42"}',
    "```",
    "",
    "Actual user text",
  ];
  const sessionRecord = {
    type: "message",
    message: {
      role: "user",
      content: envelopeLines.map((text) => ({ type: "text", text })),
    },
  };
  const filePath = path.join(tmpDir, "enveloped-session-array.jsonl");
  // Single-record JSONL file: one serialized message, no trailing newline.
  await fs.writeFile(filePath, JSON.stringify(sessionRecord));

  const entry = await buildSessionEntry(filePath);

  expect(entry).not.toBeNull();
  // Only the trailing user text should remain once the envelope is removed.
  expect(entry!.content).toBe("User: Actual user text");
});
});

View File

@@ -69,6 +69,26 @@ function normalizeSessionText(value: string): string {
.trim();
}
/**
 * Gather the raw text carried by a session message's `content` value.
 *
 * A string is returned unchanged. For an array, the `text` of every
 * `{ type: "text", text: string }` block is kept (empty strings included) and
 * the pieces are joined with newlines; no trimming or normalization happens
 * here.
 *
 * @param content - Message content of unknown shape.
 * @returns The raw text, or `null` when `content` is neither a string nor an
 *   array, or when the array contains no text blocks.
 */
function collectRawSessionText(content: unknown): string | null {
  if (typeof content === "string") {
    return content;
  }
  if (!Array.isArray(content)) {
    return null;
  }
  const texts = content.flatMap((entry: unknown): string[] => {
    if (typeof entry !== "object" || entry === null) {
      return [];
    }
    const candidate = entry as { type?: unknown; text?: unknown };
    if (candidate.type !== "text") {
      return [];
    }
    const text = candidate.text;
    return typeof text === "string" ? [text] : [];
  });
  // No text block at all yields null; an array holding only empty text
  // blocks still yields a (possibly empty) string.
  return texts.length === 0 ? null : texts.join("\n");
}
/**
* Strip OpenClaw-injected inbound metadata envelopes from a raw text block
* on user-role messages before normalization. See the authoritative
@@ -86,33 +106,13 @@ export function extractSessionText(
content: unknown,
role: "user" | "assistant" = "assistant",
): string | null {
if (typeof content === "string") {
const stripped = stripInboundMetadataForUserRole(content, role);
const normalized = normalizeSessionText(stripped);
return normalized ? normalized : null;
}
if (!Array.isArray(content)) {
const rawText = collectRawSessionText(content);
if (rawText === null) {
return null;
}
const parts: string[] = [];
for (const block of content) {
if (!block || typeof block !== "object") {
continue;
}
const record = block as { type?: unknown; text?: unknown };
if (record.type !== "text" || typeof record.text !== "string") {
continue;
}
const stripped = stripInboundMetadataForUserRole(record.text, role);
const normalized = normalizeSessionText(stripped);
if (normalized) {
parts.push(normalized);
}
}
if (parts.length === 0) {
return null;
}
return parts.join(" ");
const stripped = stripInboundMetadataForUserRole(rawText, role);
const normalized = normalizeSessionText(stripped);
return normalized ? normalized : null;
}
export async function buildSessionEntry(absPath: string): Promise<SessionFileEntry | null> {

View File

@@ -197,6 +197,36 @@ describe("buildSessionEntry", () => {
expect(contentLines[1]).toBe("Assistant: 好的,我来查一下");
});
it("strips inbound metadata when a user envelope is split across text blocks", async () => {
  // The envelope is delivered one line per text block; the genuine user
  // message is the last block in the array.
  const blockTexts = [
    "Conversation info (untrusted metadata):",
    "```json",
    '{"message_id":"msg-100","chat_id":"-100123"}',
    "```",
    "",
    "Sender (untrusted metadata):",
    "```json",
    '{"label":"Chris","id":"42"}',
    "```",
    "",
    "Actual user text",
  ];
  const userMessage = {
    type: "message",
    message: {
      role: "user",
      content: blockTexts.map((text) => ({ type: "text", text })),
    },
  };
  const filePath = path.join(tmpDir, "enveloped-session-array.jsonl");
  // The session file holds exactly one JSONL record.
  await fs.writeFile(filePath, JSON.stringify(userMessage));

  const entry = await buildSessionEntry(filePath);

  expect(entry).not.toBeNull();
  // All metadata blocks must be gone, leaving just the user's own text.
  expect(entry!.content).toBe("User: Actual user text");
});
it("preserves assistant messages that happen to contain sentinel-like text", async () => {
// Assistant role must NOT be stripped — only user messages carry inbound
// envelopes, and assistants may legitimately discuss metadata formats.

View File

@@ -183,6 +183,26 @@ function normalizeSessionText(value: string): string {
.trim();
}
/**
 * Flatten a session message's `content` into one raw string.
 *
 * String content passes through untouched. Array content contributes the
 * `text` of each block shaped like `{ type: "text", text: string }`, in
 * order and including empty strings, joined by newline characters. Nothing
 * is trimmed or normalized at this stage.
 *
 * @param content - Message content of unknown shape.
 * @returns The joined raw text, or `null` if `content` is not a string or
 *   array, or if no text block was found in the array.
 */
function collectRawSessionText(content: unknown): string | null {
  if (typeof content === "string") {
    return content;
  }
  if (!Array.isArray(content)) {
    return null;
  }
  const collected: string[] = [];
  content.forEach((item: unknown) => {
    // Primitives and null cannot be content blocks.
    if (item === null || typeof item !== "object") {
      return;
    }
    const block = item as { type?: unknown; text?: unknown };
    if (!(block.type === "text" && typeof block.text === "string")) {
      return;
    }
    collected.push(block.text);
  });
  if (collected.length === 0) {
    return null;
  }
  return collected.join("\n");
}
/**
* Strip OpenClaw-injected inbound metadata envelopes from a raw text block.
*
@@ -207,33 +227,13 @@ export function extractSessionText(
content: unknown,
role: "user" | "assistant" = "assistant",
): string | null {
if (typeof content === "string") {
const stripped = stripInboundMetadataForUserRole(content, role);
const normalized = normalizeSessionText(stripped);
return normalized ? normalized : null;
}
if (!Array.isArray(content)) {
const rawText = collectRawSessionText(content);
if (rawText === null) {
return null;
}
const parts: string[] = [];
for (const block of content) {
if (!block || typeof block !== "object") {
continue;
}
const record = block as { type?: unknown; text?: unknown };
if (record.type !== "text" || typeof record.text !== "string") {
continue;
}
const stripped = stripInboundMetadataForUserRole(record.text, role);
const normalized = normalizeSessionText(stripped);
if (normalized) {
parts.push(normalized);
}
}
if (parts.length === 0) {
return null;
}
return parts.join(" ");
const stripped = stripInboundMetadataForUserRole(rawText, role);
const normalized = normalizeSessionText(stripped);
return normalized ? normalized : null;
}
function parseSessionTimestampMs(