/**
 * Fails the current test when `value` contains a UTF-16 surrogate code unit
 * that is not part of a well-formed pair (a high surrogate immediately
 * followed by a low surrogate).
 */
function expectNoUnpairedSurrogates(value: string): void {
  let position = 0;
  while (position < value.length) {
    const unit = value.charCodeAt(position);
    const startsPair = unit >= 0xd800 && unit <= 0xdbff;
    if (!startsPair) {
      // A low surrogate seen here was not consumed as the second half of a pair.
      expect(unit < 0xdc00 || unit > 0xdfff).toBe(true);
      position += 1;
      continue;
    }
    // A high surrogate must be followed, inside the string, by a low surrogate.
    expect(position + 1).toBeLessThan(value.length);
    const follower = value.charCodeAt(position + 1);
    expect(follower).toBeGreaterThanOrEqual(0xdc00);
    expect(follower).toBeLessThanOrEqual(0xdfff);
    position += 2;
  }
}

it("wraps pathological long messages into multiple exported lines and repeats mappings", async () => {
  const isoTimestamp = "2026-04-05T10:00:00.000Z";
  const expectedTimestampMs = Date.parse(isoTimestamp);
  // 260 space-separated words: long enough to be wrapped, with plenty of soft break points.
  const words: string[] = [];
  for (let n = 0; n < 260; n += 1) {
    words.push(`segment-${n}`);
  }
  const transcriptPath = path.join(tmpDir, "wrapped-session.jsonl");
  await fs.writeFile(
    transcriptPath,
    JSON.stringify({
      type: "message",
      timestamp: isoTimestamp,
      message: { role: "user", content: words.join(" ") },
    }),
  );

  const entry = await buildSessionEntry(transcriptPath);
  expect(entry).not.toBeNull();

  const exported = entry!.content.split("\n");
  expect(exported.length).toBeGreaterThan(1);
  expect(exported.every((line) => line.startsWith("User: "))).toBe(true);
  expect(exported.every((line) => line.length <= 810)).toBe(true);
  // Every wrapped line must point back at JSONL line 1 and carry the message timestamp.
  expect(entry!.lineMap).toEqual(exported.map(() => 1));
  expect(entry!.messageTimestampsMs).toEqual(exported.map(() => expectedTimestampMs));
});

it("hard-wraps pathological long tokens without spaces", async () => {
  // A single 1800-character token offers no space to break on.
  const transcriptPath = path.join(tmpDir, "hard-wrapped-session.jsonl");
  await fs.writeFile(
    transcriptPath,
    JSON.stringify({
      type: "message",
      message: { role: "assistant", content: "x".repeat(1800) },
    }),
  );

  const entry = await buildSessionEntry(transcriptPath);
  expect(entry).not.toBeNull();

  const exported = entry!.content.split("\n");
  expect(exported.length).toBe(3);
  expect(exported.every((line) => line.startsWith("Assistant: "))).toBe(true);
  expect(exported[0].length).toBeLessThanOrEqual(811);
  expect(exported[1].length).toBeLessThanOrEqual(811);
  expect(entry!.lineMap).toEqual([1, 1, 1]);
  // The record has no timestamp field; the expected exported value is 0.
  expect(entry!.messageTimestampsMs).toEqual([0, 0, 0]);
});

it("does not split surrogate pairs when hard-wrapping astral unicode without spaces", async () => {
  // U+20000 occupies two UTF-16 code units, so a naive cut can land inside a pair.
  const astralText = "\u{20000}".repeat(1200);
  const transcriptPath = path.join(tmpDir, "surrogate-safe-session.jsonl");
  await fs.writeFile(
    transcriptPath,
    JSON.stringify({
      type: "message",
      message: { role: "assistant", content: astralText },
    }),
  );

  const entry = await buildSessionEntry(transcriptPath);
  expect(entry).not.toBeNull();

  const exported = entry!.content.split("\n");
  expect(exported.length).toBeGreaterThan(1);
  expect(entry!.lineMap).toEqual(exported.map(() => 1));
  expect(entry!.messageTimestampsMs).toEqual(exported.map(() => 0));
  exported.forEach((line) => {
    expect(line.startsWith("Assistant: ")).toBe(true);
    expectNoUnpairedSurrogates(line);
  });
});
role must NOT be stripped — only user messages carry inbound // envelopes, and assistants may legitimately discuss metadata formats. diff --git a/src/memory-host-sdk/host/session-files.ts b/src/memory-host-sdk/host/session-files.ts index bc30386061b..b24e08fb33f 100644 --- a/src/memory-host-sdk/host/session-files.ts +++ b/src/memory-host-sdk/host/session-files.ts @@ -10,6 +10,11 @@ import { hashText } from "./internal.js"; const log = createSubsystemLogger("memory"); const DREAMING_NARRATIVE_RUN_PREFIX = "dreaming-narrative-"; +// Keep the historical one-line-per-message export shape for normal turns, but +// wrap pathological long messages so downstream indexers never ingest a single +// toxic line. Wrapped continuation lines still map back to the same JSONL line. +// This limit applies to content only; the role label adds up to 11 chars. +const SESSION_EXPORT_CONTENT_WRAP_CHARS = 800; export type SessionFileEntry = { path: string; @@ -203,6 +208,65 @@ function collectRawSessionText(content: unknown): string | null { return parts.length > 0 ? 
/** Returns true when `code` is a UTF-16 high (leading) surrogate, U+D800..U+DBFF. */
function isHighSurrogate(code: number): boolean {
  return code >= 0xd800 && code <= 0xdbff;
}

/** Returns true when `code` is a UTF-16 low (trailing) surrogate, U+DC00..U+DFFF. */
function isLowSurrogate(code: number): boolean {
  return code >= 0xdc00 && code <= 0xdfff;
}

/**
 * Splits one message's text into segments of at most `maxChars` UTF-16 code
 * units so that no single exported line becomes pathologically long.
 *
 * Each segment is cut at the last space inside the window when there is one
 * (the spaces at the cut are dropped); otherwise the text is hard-split at the
 * limit. A hard split never lands between the two halves of a surrogate pair.
 *
 * @param text Raw message text; leading/trailing whitespace is trimmed first.
 * @param maxChars Maximum segment length in UTF-16 code units. Values below 1
 *   are clamped to 1, fractional values are floored, and `NaN` falls back to
 *   the default limit. When the limit is 1 and the next character is an astral
 *   character, that segment is two code units long: the pair is kept whole
 *   rather than split.
 * @returns The non-empty segments in order; `[]` when `text` is blank.
 */
function splitLongSessionLine(
  text: string,
  maxChars: number = SESSION_EXPORT_CONTENT_WRAP_CHARS,
): string[] {
  const normalized = text.trim();
  if (!normalized) {
    return [];
  }
  // Sanitize the limit: a non-positive limit used to leave the cursor stuck
  // (endless loop), and a fractional one made the space search miss every index.
  const limitChars = Number.isNaN(maxChars)
    ? SESSION_EXPORT_CONTENT_WRAP_CHARS
    : Math.max(1, Math.floor(maxChars));
  if (normalized.length <= limitChars) {
    return [normalized];
  }

  const segments: string[] = [];
  let cursor = 0;
  while (cursor < normalized.length) {
    if (normalized.length - cursor <= limitChars) {
      segments.push(normalized.slice(cursor).trim());
      break;
    }

    // Here `limit < normalized.length`, so every index probed below is in range.
    const limit = cursor + limitChars;
    let splitAt = limit;
    // Prefer the last space in (cursor, limit]; a space exactly at `limit`
    // yields a segment of exactly `limitChars` characters.
    for (let index = limit; index > cursor; index -= 1) {
      if (normalized[index] === " ") {
        splitAt = index;
        break;
      }
    }
    if (
      isHighSurrogate(normalized.charCodeAt(splitAt - 1)) &&
      isLowSurrogate(normalized.charCodeAt(splitAt))
    ) {
      // The cut would separate a surrogate pair. Back off by one code unit,
      // unless that would produce an empty segment (limit of 1); in that case
      // take the whole pair so the cursor always moves forward.
      splitAt = splitAt - 1 > cursor ? splitAt - 1 : splitAt + 1;
    }
    segments.push(normalized.slice(cursor, splitAt).trim());
    cursor = splitAt;
    // Drop the run of spaces at the break so the next segment starts on text.
    while (cursor < normalized.length && normalized[cursor] === " ") {
      cursor += 1;
    }
  }

  return segments.filter(Boolean);
}

/**
 * Renders one message as export lines: the text is wrapped with
 * `splitLongSessionLine` (default limit) and every resulting segment is
 * prefixed with `"<label>: "`. Blank text yields no lines.
 */
function renderSessionExportLines(label: string, text: string): string[] {
  return splitLongSessionLine(text).map((segment) => `${label}: ${segment}`);
}
"User" : "Assistant"; - collected.push(`${label}: ${safe}`); - lineMap.push(jsonlIdx + 1); - messageTimestampsMs.push( - parseSessionTimestampMs( - record as { timestamp?: unknown }, - message as { timestamp?: unknown }, - ), + const renderedLines = renderSessionExportLines(label, safe); + const timestampMs = parseSessionTimestampMs( + record as { timestamp?: unknown }, + message as { timestamp?: unknown }, ); + collected.push(...renderedLines); + lineMap.push(...renderedLines.map(() => jsonlIdx + 1)); + messageTimestampsMs.push(...renderedLines.map(() => timestampMs)); } const content = collected.join("\n"); return {