fix: wrap oversized session lines before JSONL write (#64494)

Updates the real session-export path so that a pathological transcript message is wrapped across several export lines instead of becoming a single toxic export line for downstream indexing.
This commit is contained in:
Bek
2026-04-21 18:18:22 -04:00
committed by GitHub
parent 66add9fcd9
commit 1acb094579
2 changed files with 158 additions and 7 deletions

View File

@@ -32,6 +32,21 @@ afterEach(() => {
}
});
/**
 * Test helper: walks `value` one UTF-16 code unit at a time and fails the
 * current test if a high surrogate is not immediately followed by a low
 * surrogate, or if a low surrogate appears without a preceding high one.
 */
function expectNoUnpairedSurrogates(value: string): void {
  let position = 0;
  while (position < value.length) {
    const unit = value.charCodeAt(position);
    const startsPair = unit >= 0xd800 && unit <= 0xdbff;
    if (!startsPair) {
      // Any unit reached here must not be a stray low surrogate.
      expect(!(unit >= 0xdc00 && unit <= 0xdfff)).toBe(true);
      position += 1;
    } else {
      // A high surrogate needs a partner, and that partner must be a low surrogate.
      const partnerAt = position + 1;
      expect(partnerAt).toBeLessThan(value.length);
      const partner = value.charCodeAt(partnerAt);
      expect(partner).toBeGreaterThanOrEqual(0xdc00);
      expect(partner).toBeLessThanOrEqual(0xdfff);
      // Skip both halves of the validated pair.
      position += 2;
    }
  }
}
describe("listSessionFilesForAgent", () => {
it("includes reset and deleted transcripts in session file listing", async () => {
const sessionsDir = path.join(tmpDir, "agents", "main", "sessions");
@@ -237,6 +252,78 @@ describe("buildSessionEntry", () => {
expect(entry!.content).toBe("User: Actual user text");
});
it("wraps pathological long messages into multiple exported lines and repeats mappings", async () => {
  // One user message made of 260 space-separated words: long enough to need
  // wrapping, with plenty of spaces for the wrapper to break on.
  const isoTimestamp = "2026-04-05T10:00:00.000Z";
  const words: string[] = [];
  for (let idx = 0; idx < 260; idx += 1) {
    words.push(`segment-${idx}`);
  }
  const longWordyLine = words.join(" ");
  const serializedRecord = JSON.stringify({
    type: "message",
    timestamp: isoTimestamp,
    message: { role: "user", content: longWordyLine },
  });

  // The transcript holds exactly one JSONL record.
  const filePath = path.join(tmpDir, "wrapped-session.jsonl");
  await fs.writeFile(filePath, serializedRecord);

  const entry = await buildSessionEntry(filePath);
  expect(entry).not.toBeNull();

  const contentLines = entry!.content.split("\n");
  expect(contentLines.length).toBeGreaterThan(1);
  // Every wrapped line repeats the role label and stays within the size bound.
  expect(contentLines.some((line) => !line.startsWith("User: "))).toBe(false);
  expect(contentLines.some((line) => line.length > 810)).toBe(false);

  // Each exported line maps back to JSONL line 1 and carries the record timestamp.
  const expectedTimestampMs = new Date(isoTimestamp).getTime();
  expect(entry!.lineMap).toEqual(new Array(contentLines.length).fill(1));
  expect(entry!.messageTimestampsMs).toEqual(
    new Array(contentLines.length).fill(expectedTimestampMs),
  );
});
it("hard-wraps pathological long tokens without spaces", async () => {
  // 1800 characters with no space anywhere, so the wrapper must cut mid-token.
  const giantToken = "x".repeat(1800);
  const serializedRecord = JSON.stringify({
    type: "message",
    message: { role: "assistant", content: giantToken },
  });

  const filePath = path.join(tmpDir, "hard-wrapped-session.jsonl");
  await fs.writeFile(filePath, serializedRecord);

  const entry = await buildSessionEntry(filePath);
  expect(entry).not.toBeNull();

  const contentLines = entry!.content.split("\n");
  expect(contentLines.length).toBe(3);
  expect(contentLines.some((line) => !line.startsWith("Assistant: "))).toBe(false);
  // The first two lines are full-width: at most 800 content chars plus the
  // 11-char "Assistant: " label.
  for (const fullLine of contentLines.slice(0, 2)) {
    expect(fullLine.length).toBeLessThanOrEqual(811);
  }

  // All three lines point at JSONL line 1; the record has no timestamp, so 0 is expected.
  expect(entry!.lineMap).toEqual([1, 1, 1]);
  expect(entry!.messageTimestampsMs).toEqual([0, 0, 0]);
});
it("does not split surrogate pairs when hard-wrapping astral unicode without spaces", async () => {
  // U+20000 takes two UTF-16 code units, so a naive cut at a fixed offset
  // could land between the two halves of a pair.
  const astralChar = "\u{20000}";
  const giantToken = astralChar.repeat(1200);
  const serializedRecord = JSON.stringify({
    type: "message",
    message: { role: "assistant", content: giantToken },
  });

  const filePath = path.join(tmpDir, "surrogate-safe-session.jsonl");
  await fs.writeFile(filePath, serializedRecord);

  const entry = await buildSessionEntry(filePath);
  expect(entry).not.toBeNull();

  const contentLines = entry!.content.split("\n");
  expect(contentLines.length).toBeGreaterThan(1);
  // Every wrapped line maps to JSONL line 1; no timestamp in the record, so 0 is expected.
  expect(entry!.lineMap).toEqual(new Array(contentLines.length).fill(1));
  expect(entry!.messageTimestampsMs).toEqual(new Array(contentLines.length).fill(0));

  contentLines.forEach((line) => {
    expect(line.startsWith("Assistant: ")).toBe(true);
    expectNoUnpairedSurrogates(line);
  });
});
it("preserves assistant messages that happen to contain sentinel-like text", async () => {
// Assistant role must NOT be stripped — only user messages carry inbound
// envelopes, and assistants may legitimately discuss metadata formats.

View File

@@ -10,6 +10,11 @@ import { hashText } from "./internal.js";
// Logger created for the "memory" subsystem.
const log = createSubsystemLogger("memory");
// NOTE(review): presumably the prefix that identifies dreaming-narrative runs;
// its use is outside this view — confirm against the call sites.
const DREAMING_NARRATIVE_RUN_PREFIX = "dreaming-narrative-";
// Keep the historical one-line-per-message export shape for normal turns, but
// wrap pathological long messages so downstream indexers never ingest a single
// toxic line. Wrapped continuation lines still map back to the same JSONL line.
// This limit applies to content only; the role label adds up to 11 chars
// (the longest label, "Assistant: ", is 11 characters). The limit is compared
// against String.length, so it counts UTF-16 code units, not code points.
const SESSION_EXPORT_CONTENT_WRAP_CHARS = 800;
export type SessionFileEntry = {
path: string;
@@ -203,6 +208,65 @@ function collectRawSessionText(content: unknown): string | null {
return parts.length > 0 ? parts.join("\n") : null;
}
/** Returns true when `code` is a UTF-16 high (leading) surrogate code unit. */
function isHighSurrogate(code: number): boolean {
  return code >= 0xd800 && code <= 0xdbff;
}

/** Returns true when `code` is a UTF-16 low (trailing) surrogate code unit. */
function isLowSurrogate(code: number): boolean {
  return code >= 0xdc00 && code <= 0xdfff;
}

/**
 * Splits one message into segments of at most `maxChars` UTF-16 code units.
 *
 * The text is trimmed first. Each cut prefers the last space at or before the
 * limit; when the window holds no space the text is hard-cut at the limit. A
 * hard cut never separates a surrogate pair. Spaces around a cut are dropped.
 *
 * @param text - Message content to wrap.
 * @param maxChars - Maximum segment length in UTF-16 code units. Values below
 *   1 are clamped to 1, fractional values are floored, and `NaN` falls back to
 *   the default, so the loop always advances. The one case that exceeds the
 *   limit is a surrogate pair with `maxChars` of 1, which is emitted whole.
 * @returns The non-empty segments in order; `[]` when `text` is blank.
 */
function splitLongSessionLine(
  text: string,
  maxChars: number = SESSION_EXPORT_CONTENT_WRAP_CHARS,
): string[] {
  const normalized = text.trim();
  if (!normalized) {
    return [];
  }
  // A non-positive width would leave the cursor in place and spin forever.
  const width = Number.isNaN(maxChars)
    ? SESSION_EXPORT_CONTENT_WRAP_CHARS
    : Math.max(1, Math.floor(maxChars));
  if (normalized.length <= width) {
    return [normalized];
  }
  const segments: string[] = [];
  let cursor = 0;
  while (cursor < normalized.length) {
    const remaining = normalized.length - cursor;
    if (remaining <= width) {
      segments.push(normalized.slice(cursor).trim());
      break;
    }
    const limit = cursor + width;
    let splitAt = limit;
    // Look backwards from the limit for the nearest space to break on.
    for (let index = limit; index > cursor; index -= 1) {
      if (normalized[index] === " ") {
        splitAt = index;
        break;
      }
    }
    if (
      splitAt < normalized.length &&
      splitAt > cursor &&
      isHighSurrogate(normalized.charCodeAt(splitAt - 1)) &&
      isLowSurrogate(normalized.charCodeAt(splitAt))
    ) {
      // The cut falls inside a surrogate pair. Normally back off by one unit so
      // the pair moves to the next segment. If that would leave an empty
      // segment (the pair starts at the cursor), step past the pair instead so
      // the loop still advances.
      splitAt = splitAt - 1 > cursor ? splitAt - 1 : splitAt + 1;
    }
    segments.push(normalized.slice(cursor, splitAt).trim());
    cursor = splitAt;
    // Skip the separator spaces so the next segment does not start with one.
    while (cursor < normalized.length && normalized[cursor] === " ") {
      cursor += 1;
    }
  }
  return segments.filter(Boolean);
}
/**
 * Wraps `text` with `splitLongSessionLine` and prefixes every resulting
 * segment with `"<label>: "`, so continuation lines repeat the role label.
 */
function renderSessionExportLines(label: string, text: string): string[] {
  const rendered: string[] = [];
  for (const segment of splitLongSessionLine(text)) {
    rendered.push(`${label}: ${segment}`);
  }
  return rendered;
}
/**
* Strip OpenClaw-injected inbound metadata envelopes from a raw text block.
*
@@ -310,14 +374,14 @@ export async function buildSessionEntry(
}
const safe = redactSensitiveText(text, { mode: "tools" });
const label = message.role === "user" ? "User" : "Assistant";
collected.push(`${label}: ${safe}`);
lineMap.push(jsonlIdx + 1);
messageTimestampsMs.push(
parseSessionTimestampMs(
record as { timestamp?: unknown },
message as { timestamp?: unknown },
),
const renderedLines = renderSessionExportLines(label, safe);
const timestampMs = parseSessionTimestampMs(
record as { timestamp?: unknown },
message as { timestamp?: unknown },
);
collected.push(...renderedLines);
lineMap.push(...renderedLines.map(() => jsonlIdx + 1));
messageTimestampsMs.push(...renderedLines.map(() => timestampMs));
}
const content = collected.join("\n");
return {