// Filesystem transcript indexer. // Streams JSONL transcript files into byte-offset indexes for history paging. import fs from "node:fs"; import { StringDecoder } from "node:string_decoder"; const TRANSCRIPT_INDEX_READ_CHUNK_BYTES = 64 * 1024; const MAX_TRANSCRIPT_INDEX_CACHE_ENTRIES = 256; const MAX_TRANSCRIPT_INDEX_PARSE_LINE_BYTES = 256 * 1024; const OVERSIZED_TRANSCRIPT_METADATA_PREFIX_CHARS = 64 * 1024; const TRANSCRIPT_OVERSIZED_MESSAGE_PLACEHOLDER = "[chat.history omitted: message too large]"; type ParsedTranscriptRecord = Record; /** Visible transcript entry plus its byte range in the JSONL file. */ export type IndexedTranscriptEntry = { seq: number; id?: string; offset: number; byteLength: number; record: ParsedTranscriptRecord; }; type SessionTranscriptIndex = { filePath: string; mtimeMs: number; size: number; hasTreeEntries: boolean; leafId?: string; entries: IndexedTranscriptEntry[]; }; type IndexedRawEntry = { id?: string; parentId?: string | null; offset: number; byteLength: number; record: ParsedTranscriptRecord; }; type CacheEntry = { mtimeMs: number; size: number; index: SessionTranscriptIndex; }; type ReadSessionTranscriptIndexOptions = { cache?: "reuse" | "skip"; }; const transcriptIndexCache = new Map(); const transcriptIndexBuilds = new Map< string, { mtimeMs: number; size: number; promise: Promise; } >(); function normalizeOptionalString(value: unknown): string | undefined { return typeof value === "string" && value.trim().length > 0 ? value : undefined; } function escapeRegExp(value: string): string { return value.replace(/[.*+?^${}()|[\]\\]/g, "\\$&"); } function extractJsonStringFieldPrefix(prefix: string, field: string): string | undefined { const match = new RegExp(`"${escapeRegExp(field)}"\\s*:\\s*"((?:\\\\.|[^"\\\\])*)"`).exec(prefix); if (!match) { return undefined; } try { const decoded = JSON.parse(`"${match[1]}"`) as unknown; return normalizeOptionalString(decoded); } catch { return undefined; } } function extractJsonNullableStringFieldPrefix( prefix: string, field: string, ): string | null | undefined { if (new RegExp(`"${escapeRegExp(field)}"\\s*:\\s*null`).test(prefix)) { return null; } return extractJsonStringFieldPrefix(prefix, field); } function extractJsonNumberFieldPrefix(prefix: string, field: string): number | undefined { const match = new RegExp( `"${escapeRegExp(field)}"\\s*:\\s*(-?\\d+(?:\\.\\d+)?(?:[eE][+-]?\\d+)?)`, ).exec(prefix); if (!match) { return undefined; } const decoded = Number(match[1]); return Number.isFinite(decoded) ? decoded : undefined; } async function yieldTranscriptIndexScan(): Promise { await new Promise((resolve) => { setImmediate(resolve); }); } function touchCachedIndex(filePath: string, entry: CacheEntry): SessionTranscriptIndex { transcriptIndexCache.delete(filePath); transcriptIndexCache.set(filePath, entry); return entry.index; } function setCachedIndex(filePath: string, entry: CacheEntry): void { transcriptIndexCache.set(filePath, entry); while (transcriptIndexCache.size > MAX_TRANSCRIPT_INDEX_CACHE_ENTRIES) { const oldestKey = transcriptIndexCache.keys().next().value; if (typeof oldestKey !== "string" || !oldestKey) { break; } transcriptIndexCache.delete(oldestKey); } } /** Clears transcript index caches and in-flight builds between tests. */ export function clearSessionTranscriptIndexCache(): void { transcriptIndexCache.clear(); transcriptIndexBuilds.clear(); } function isIndexableTranscriptRecord(record: unknown): record is ParsedTranscriptRecord { return Boolean(record && typeof record === "object" && !Array.isArray(record)); } function isVisibleTranscriptRecord(record: ParsedTranscriptRecord): boolean { return Boolean(record.message) || record.type === "compaction"; } function isTreeTranscriptRecord(record: ParsedTranscriptRecord): boolean { return record.type !== "session" && typeof record.id === "string" && "parentId" in record; } function buildOversizedIndexedRawEntry(params: { line: string; offset: number; byteLength: number; }): IndexedRawEntry | null { // Oversized lines may contain huge message arrays, so recover only metadata // from a bounded prefix and synthesize a visible placeholder record. const prefix = params.line.slice(0, OVERSIZED_TRANSCRIPT_METADATA_PREFIX_CHARS); const messageMatch = /"message"\s*:/.exec(prefix); const recordPrefix = messageMatch ? prefix.slice(0, messageMatch.index) : prefix; const id = extractJsonStringFieldPrefix(prefix, "id"); const parentId = extractJsonNullableStringFieldPrefix(prefix, "parentId"); const type = extractJsonStringFieldPrefix(prefix, "type"); const timestamp = extractJsonStringFieldPrefix(recordPrefix, "timestamp") ?? extractJsonNumberFieldPrefix(recordPrefix, "timestamp"); const role = extractJsonStringFieldPrefix(prefix, "role") ?? "assistant"; const record: ParsedTranscriptRecord = { ...(type ? { type } : {}), ...(id ? { id } : {}), ...(parentId !== undefined ? { parentId } : {}), ...(timestamp !== undefined ? { timestamp } : {}), message: { role, content: [{ type: "text", text: TRANSCRIPT_OVERSIZED_MESSAGE_PLACEHOLDER }], __openclaw: { truncated: true, reason: "oversized" }, }, }; return { ...(id ? { id } : {}), ...(parentId !== undefined ? { parentId } : {}), offset: params.offset, byteLength: params.byteLength, record, }; } async function visitTranscriptJsonLines( filePath: string, visit: (line: string, offset: number, byteLength: number) => void, ): Promise { const handle = await fs.promises.open(filePath, "r"); try { const decoder = new StringDecoder("utf8"); const buffer = Buffer.allocUnsafe(TRANSCRIPT_INDEX_READ_CHUNK_BYTES); let carry = ""; let carryOffset = 0; let nextOffset = 0; while (true) { const { bytesRead } = await handle.read(buffer, 0, buffer.length, null); if (bytesRead <= 0) { break; } const chunk = buffer.subarray(0, bytesRead); const text = carry + decoder.write(chunk); const lines = text.split("\n"); carry = lines.pop() ?? ""; let lineOffset = carryOffset; for (const rawLine of lines) { const line = rawLine.endsWith("\r") ? rawLine.slice(0, -1) : rawLine; const byteLength = Buffer.byteLength(line, "utf8"); visit(line, lineOffset, byteLength); lineOffset += Buffer.byteLength(rawLine, "utf8") + 1; } nextOffset += bytesRead; carryOffset = nextOffset - Buffer.byteLength(carry, "utf8"); // Yield between chunks so a large transcript scan does not monopolize the // gateway event loop while chat/session traffic is still flowing. await yieldTranscriptIndexScan(); } const tail = carry + decoder.end(); if (tail) { const line = tail.endsWith("\r") ? tail.slice(0, -1) : tail; visit(line, carryOffset, Buffer.byteLength(line, "utf8")); } } finally { await handle.close(); } } function buildActiveTreeEntries(params: { byId: Map; leafId?: string; }): IndexedRawEntry[] { const out: IndexedRawEntry[] = []; const seen = new Set(); let currentId = params.leafId; while (currentId) { if (seen.has(currentId)) { return []; } seen.add(currentId); const entry = params.byId.get(currentId); if (!entry) { break; } out.push(entry); currentId = entry.parentId ?? undefined; } return out.toReversed(); } function toIndexedEntries(rawEntries: IndexedRawEntry[]): IndexedTranscriptEntry[] { const entries: IndexedTranscriptEntry[] = []; let seq = 0; for (const entry of rawEntries) { if (!isVisibleTranscriptRecord(entry.record)) { continue; } seq += 1; entries.push({ seq, ...(entry.id ? { id: entry.id } : {}), offset: entry.offset, byteLength: entry.byteLength, record: entry.record, }); } return entries; } async function buildSessionTranscriptIndex( filePath: string, stat: fs.Stats, ): Promise { const rawEntries: IndexedRawEntry[] = []; const byId = new Map(); let hasTreeEntries = false; let leafId: string | undefined; await visitTranscriptJsonLines(filePath, (line, offset, byteLength) => { if (!line.trim()) { return; } if (byteLength > MAX_TRANSCRIPT_INDEX_PARSE_LINE_BYTES) { const rawEntry = buildOversizedIndexedRawEntry({ line, offset, byteLength }); if (!rawEntry) { return; } rawEntries.push(rawEntry); if (rawEntry.id) { byId.set(rawEntry.id, rawEntry); if (isTreeTranscriptRecord(rawEntry.record)) { hasTreeEntries = true; leafId = rawEntry.id; } } return; } let parsed: unknown; try { parsed = JSON.parse(line); } catch { return; } if (!isIndexableTranscriptRecord(parsed)) { return; } const id = normalizeOptionalString(parsed.id); const parentId = parsed.parentId === null ? null : (normalizeOptionalString(parsed.parentId) ?? undefined); const rawEntry: IndexedRawEntry = { ...(id ? { id } : {}), ...(parentId !== undefined ? { parentId } : {}), offset, byteLength, record: parsed, }; rawEntries.push(rawEntry); if (id) { byId.set(id, rawEntry); if (isTreeTranscriptRecord(parsed)) { hasTreeEntries = true; leafId = id; } } }); const activeRawEntries = hasTreeEntries ? buildActiveTreeEntries({ byId, leafId }) : rawEntries; return { filePath, mtimeMs: stat.mtimeMs, size: stat.size, hasTreeEntries, ...(leafId ? { leafId } : {}), entries: toIndexedEntries(activeRawEntries), }; } /** Reads or builds the visible transcript index for a JSONL session file. */ export async function readSessionTranscriptIndex( filePath: string, opts: ReadSessionTranscriptIndexOptions = {}, ): Promise { let stat: fs.Stats; try { stat = await fs.promises.stat(filePath); } catch { transcriptIndexCache.delete(filePath); return null; } if (!stat.isFile()) { transcriptIndexCache.delete(filePath); return null; } if (opts.cache === "skip") { return await buildSessionTranscriptIndex(filePath, stat); } const cached = transcriptIndexCache.get(filePath); if (cached && cached.mtimeMs === stat.mtimeMs && cached.size === stat.size) { return touchCachedIndex(filePath, cached); } const inFlight = transcriptIndexBuilds.get(filePath); if (inFlight && inFlight.mtimeMs === stat.mtimeMs && inFlight.size === stat.size) { return await inFlight.promise; } const promise = buildSessionTranscriptIndex(filePath, stat); transcriptIndexBuilds.set(filePath, { mtimeMs: stat.mtimeMs, size: stat.size, promise, }); const index = await promise.finally(() => { const current = transcriptIndexBuilds.get(filePath); if (current?.promise === promise) { transcriptIndexBuilds.delete(filePath); } }); setCachedIndex(filePath, { mtimeMs: stat.mtimeMs, size: stat.size, index, }); return index; }