mirror of
https://github.com/openclaw/openclaw.git
synced 2026-05-06 06:40:44 +00:00
fix: wrap oversized session lines before JSONL write (#64494)
Updates the real session-export path so that a pathological transcript message no longer becomes a single toxic export line for downstream indexing.
This commit is contained in:
@@ -32,6 +32,21 @@ afterEach(() => {
|
||||
}
|
||||
});
|
||||
|
||||
/**
 * Test helper: asserts that `value` is well-formed UTF-16.
 *
 * Every lead surrogate (U+D800–U+DBFF) must be immediately followed by a
 * trail surrogate (U+DC00–U+DFFF), and no trail surrogate may appear on its
 * own. Any violation fails the surrounding test through `expect`.
 */
function expectNoUnpairedSurrogates(value: string): void {
  let position = 0;
  while (position < value.length) {
    const unit = value.charCodeAt(position);
    const isLead = unit >= 0xd800 && unit <= 0xdbff;

    if (!isLead) {
      // Anything that is not a lead must not be a stray trail surrogate.
      expect(unit < 0xdc00 || unit > 0xdfff).toBe(true);
      position += 1;
      continue;
    }

    // A lead surrogate needs a following code unit, and it must be a trail.
    expect(position + 1).toBeLessThan(value.length);
    const trail = value.charCodeAt(position + 1);
    expect(trail).toBeGreaterThanOrEqual(0xdc00);
    expect(trail).toBeLessThanOrEqual(0xdfff);

    // Step over both halves of the pair.
    position += 2;
  }
}
|
||||
|
||||
describe("listSessionFilesForAgent", () => {
|
||||
it("includes reset and deleted transcripts in session file listing", async () => {
|
||||
const sessionsDir = path.join(tmpDir, "agents", "main", "sessions");
|
||||
@@ -237,6 +252,78 @@ describe("buildSessionEntry", () => {
|
||||
expect(entry!.content).toBe("User: Actual user text");
|
||||
});
|
||||
|
||||
it("wraps pathological long messages into multiple exported lines and repeats mappings", async () => {
  // A single user message made of 260 space-separated words, so the exporter
  // has plenty of soft break points to wrap on.
  const words: string[] = [];
  for (let idx = 0; idx < 260; idx += 1) {
    words.push(`segment-${idx}`);
  }
  const isoTimestamp = "2026-04-05T10:00:00.000Z";
  const expectedTimestampMs = Date.parse(isoTimestamp);
  const record = JSON.stringify({
    type: "message",
    timestamp: isoTimestamp,
    message: { role: "user", content: words.join(" ") },
  });

  // The transcript consists of exactly one JSONL line.
  const sessionFile = path.join(tmpDir, "wrapped-session.jsonl");
  await fs.writeFile(sessionFile, record);

  const entry = await buildSessionEntry(sessionFile);
  expect(entry).not.toBeNull();

  const exported = entry!.content.split("\n");
  expect(exported.length).toBeGreaterThan(1);
  // Every wrapped line repeats the role label and stays within the width bound.
  expect(exported.every((line) => line.startsWith("User: "))).toBe(true);
  expect(exported.every((line) => line.length <= 810)).toBe(true);
  // Each exported line maps back to JSONL line 1 and carries the record timestamp.
  expect(entry!.lineMap).toEqual(Array.from({ length: exported.length }, () => 1));
  expect(entry!.messageTimestampsMs).toEqual(
    Array.from({ length: exported.length }, () => expectedTimestampMs),
  );
});
|
||||
|
||||
it("hard-wraps pathological long tokens without spaces", async () => {
  // 1800 characters with no whitespace at all: there is no soft break point,
  // so the exporter has to cut the token itself.
  const record = JSON.stringify({
    type: "message",
    message: { role: "assistant", content: "x".repeat(1800) },
  });
  const sessionFile = path.join(tmpDir, "hard-wrapped-session.jsonl");
  await fs.writeFile(sessionFile, record);

  const entry = await buildSessionEntry(sessionFile);
  expect(entry).not.toBeNull();

  const exported = entry!.content.split("\n");
  expect(exported.length).toBe(3);
  expect(exported.every((line) => line.startsWith("Assistant: "))).toBe(true);
  // The first two lines are checked against the 811-character bound.
  for (const line of exported.slice(0, 2)) {
    expect(line.length).toBeLessThanOrEqual(811);
  }
  // All three lines map to JSONL line 1; the record has no timestamp, and the
  // expected per-line timestamp is 0.
  expect(entry!.lineMap).toEqual([1, 1, 1]);
  expect(entry!.messageTimestampsMs).toEqual([0, 0, 0]);
});
|
||||
|
||||
it("does not split surrogate pairs when hard-wrapping astral unicode without spaces", async () => {
  // U+20000 lies outside the BMP, so each character is two UTF-16 code units.
  // 1200 of them without whitespace forces hard wraps between characters.
  const record = JSON.stringify({
    type: "message",
    message: { role: "assistant", content: "\u{20000}".repeat(1200) },
  });
  const sessionFile = path.join(tmpDir, "surrogate-safe-session.jsonl");
  await fs.writeFile(sessionFile, record);

  const entry = await buildSessionEntry(sessionFile);
  expect(entry).not.toBeNull();

  const exported = entry!.content.split("\n");
  expect(exported.length).toBeGreaterThan(1);
  expect(entry!.lineMap).toEqual(Array.from({ length: exported.length }, () => 1));
  expect(entry!.messageTimestampsMs).toEqual(Array.from({ length: exported.length }, () => 0));

  // Every exported line keeps its label and contains only complete pairs.
  exported.forEach((line) => {
    expect(line.startsWith("Assistant: ")).toBe(true);
    expectNoUnpairedSurrogates(line);
  });
});
|
||||
|
||||
it("preserves assistant messages that happen to contain sentinel-like text", async () => {
|
||||
// Assistant role must NOT be stripped — only user messages carry inbound
|
||||
// envelopes, and assistants may legitimately discuss metadata formats.
|
||||
|
||||
@@ -10,6 +10,11 @@ import { hashText } from "./internal.js";
|
||||
|
||||
const log = createSubsystemLogger("memory");
|
||||
const DREAMING_NARRATIVE_RUN_PREFIX = "dreaming-narrative-";
|
||||
// Keep the historical one-line-per-message export shape for normal turns, but
|
||||
// wrap pathological long messages so downstream indexers never ingest a single
|
||||
// toxic line. Wrapped continuation lines still map back to the same JSONL line.
|
||||
// This limit applies to the message content only; the role label prefix (at most 11 chars, "Assistant: ") is added on top, so an exported line can be up to 811 chars.
|
||||
const SESSION_EXPORT_CONTENT_WRAP_CHARS = 800;
|
||||
|
||||
export type SessionFileEntry = {
|
||||
path: string;
|
||||
@@ -203,6 +208,65 @@ function collectRawSessionText(content: unknown): string | null {
|
||||
return parts.length > 0 ? parts.join("\n") : null;
|
||||
}
|
||||
|
||||
/** Returns true when `code` is a UTF-16 high (lead) surrogate, U+D800–U+DBFF. */
function isHighSurrogate(code: number): boolean {
  return code >= 0xd800 && code <= 0xdbff;
}

/** Returns true when `code` is a UTF-16 low (trail) surrogate, U+DC00–U+DFFF. */
function isLowSurrogate(code: number): boolean {
  return code >= 0xdc00 && code <= 0xdfff;
}

/**
 * Splits `text` into segments of at most `maxChars` UTF-16 code units.
 *
 * The text is trimmed first. Text that already fits is returned as a single
 * segment, and blank text yields an empty array. Longer text is broken at the
 * last space at or before the limit; when the window contains no space the
 * text is hard-cut at the limit. A hard cut never separates a surrogate pair.
 * Segments are trimmed and empty segments are dropped.
 *
 * @param text - Raw message text to wrap.
 * @param maxChars - Maximum segment length in UTF-16 code units. Values below
 *   1 are treated as 1, fractional values are floored, and `NaN` falls back to
 *   the default. With a limit of 1, a surrogate pair is emitted whole (two
 *   code units) rather than being split.
 * @returns The wrapped segments in order.
 */
function splitLongSessionLine(
  text: string,
  maxChars: number = SESSION_EXPORT_CONTENT_WRAP_CHARS,
): string[] {
  const normalized = text.trim();
  if (!normalized) {
    return [];
  }

  // A budget below one code unit can never make progress, and NaN makes every
  // comparison false; normalise both so the loop below always terminates.
  const budget = Number.isNaN(maxChars)
    ? SESSION_EXPORT_CONTENT_WRAP_CHARS
    : Math.max(1, Math.floor(maxChars));

  if (normalized.length <= budget) {
    return [normalized];
  }

  const segments: string[] = [];
  let cursor = 0;
  while (cursor < normalized.length) {
    if (normalized.length - cursor <= budget) {
      segments.push(normalized.slice(cursor).trim());
      break;
    }

    // More than `budget` code units remain, so `limit` is a valid index and
    // `splitAt` always ends up in (cursor, limit].
    const limit = cursor + budget;
    let splitAt = limit;
    for (let index = limit; index > cursor; index -= 1) {
      if (normalized[index] === " ") {
        splitAt = index;
        break;
      }
    }

    if (
      isHighSurrogate(normalized.charCodeAt(splitAt - 1)) &&
      isLowSurrogate(normalized.charCodeAt(splitAt))
    ) {
      // The cut would land between the halves of a surrogate pair. Back off by
      // one code unit when that still leaves a non-empty segment; otherwise
      // (budget of one) take the whole pair so the cursor keeps advancing.
      splitAt = splitAt - 1 > cursor ? splitAt - 1 : splitAt + 1;
    }

    segments.push(normalized.slice(cursor, splitAt).trim());
    cursor = splitAt;
    // Skip the run of spaces at the break so the next segment starts on text.
    while (cursor < normalized.length && normalized[cursor] === " ") {
      cursor += 1;
    }
  }

  return segments.filter(Boolean);
}
|
||||
|
||||
/**
 * Renders one message as export lines: the text is wrapped with
 * `splitLongSessionLine` and every resulting segment is prefixed with
 * `"<label>: "`.
 *
 * @param label - Role label placed in front of each segment.
 * @param text - Message text to wrap and render.
 * @returns One rendered line per wrapped segment, in order.
 */
function renderSessionExportLines(label: string, text: string): string[] {
  const rendered: string[] = [];
  for (const segment of splitLongSessionLine(text)) {
    rendered.push(`${label}: ${segment}`);
  }
  return rendered;
}
|
||||
|
||||
/**
|
||||
* Strip OpenClaw-injected inbound metadata envelopes from a raw text block.
|
||||
*
|
||||
@@ -310,14 +374,14 @@ export async function buildSessionEntry(
|
||||
}
|
||||
const safe = redactSensitiveText(text, { mode: "tools" });
|
||||
const label = message.role === "user" ? "User" : "Assistant";
|
||||
collected.push(`${label}: ${safe}`);
|
||||
lineMap.push(jsonlIdx + 1);
|
||||
messageTimestampsMs.push(
|
||||
parseSessionTimestampMs(
|
||||
record as { timestamp?: unknown },
|
||||
message as { timestamp?: unknown },
|
||||
),
|
||||
const renderedLines = renderSessionExportLines(label, safe);
|
||||
const timestampMs = parseSessionTimestampMs(
|
||||
record as { timestamp?: unknown },
|
||||
message as { timestamp?: unknown },
|
||||
);
|
||||
collected.push(...renderedLines);
|
||||
lineMap.push(...renderedLines.map(() => jsonlIdx + 1));
|
||||
messageTimestampsMs.push(...renderedLines.map(() => timestampMs));
|
||||
}
|
||||
const content = collected.join("\n");
|
||||
return {
|
||||
|
||||
Reference in New Issue
Block a user