mirror of
https://github.com/openclaw/openclaw.git
synced 2026-05-12 16:00:42 +00:00
Implements #78051 — oc:// addressing substrate for workspace files. New src/oc-path/ substrate (parser/formatter, per-kind parse+emit for md/jsonc/jsonl/yaml, universal resolveOcPath/setOcPath/findOcPaths verbs, sentinel emit guard) + openclaw path resolve|find|set|validate|emit CLI + docs/cli/path.md reference page + CHANGELOG entry. Co-authored-by: giodl73-repo <235387111+giodl73-repo@users.noreply.github.com> Co-authored-by: galiniliev <5711535+galiniliev@users.noreply.github.com>
295 lines
8.2 KiB
TypeScript
295 lines
8.2 KiB
TypeScript
/**
 * Generic markdown-flavored parser for the 8 workspace files.
 *
 * Produces a `MdAst` addressing index over `raw` bytes:
 * frontmatter (if present), preamble (prose before first H2), and an
 * H2-block tree with items/tables/code-blocks extracted for OcPath
 * resolution.
 *
 * **No file-kind discrimination.** Same parse path for SOUL.md /
 * AGENTS.md / MEMORY.md / TOOLS.md / IDENTITY.md / USER.md /
 * HEARTBEAT.md / SKILL.md. Per-file lint opinions ride downstream
 * (`@openclaw/oc-lint` rule packs).
 *
 * **Byte-fidelity contract**: `raw` is preserved on the AST root so
 * `emitMd(parse(raw)) === raw` for every input the parser accepts.
 *
 * @module @openclaw/oc-path/parse
 */
|
||
|
||
import type {
|
||
AstBlock,
|
||
AstCodeBlock,
|
||
AstItem,
|
||
AstTable,
|
||
Diagnostic,
|
||
FrontmatterEntry,
|
||
ParseResult,
|
||
MdAst,
|
||
} from './ast.js';
|
||
import { slugify } from './slug.js';
|
||
|
||
/** Line that opens and closes a frontmatter section. */
const FENCE = '---';

/**
 * Byte-order mark as it appears at the start of a decoded string (U+FEFF).
 * Written as an escape sequence so the character stays visible in source and
 * cannot be silently dropped by editors or copy/paste, which would turn the
 * constant into an empty string and make BOM stripping a no-op.
 */
const BOM = '\uFEFF';
|
||
|
||
/**
 * Parse raw markdown text into a `MdAst` plus a list of diagnostics.
 *
 * Soft-error policy: the parser is intended never to throw; recoverable
 * oddities are reported through `diagnostics` instead. In this file the only
 * diagnostic emitted is `OC_FRONTMATTER_UNCLOSED`.
 *
 * @param raw - Full file content. Stored unchanged on `ast.raw` so an emitter
 *   can reproduce the input exactly, including a leading BOM and CRLF line
 *   endings that are normalised away for parsing only.
 * @returns The AST (frontmatter entries, preamble, H2 blocks) and diagnostics.
 */
export function parseMd(raw: string): ParseResult {
  const diagnostics: Diagnostic[] = [];

  // The BOM is dropped only from the working copy; `raw` itself goes on the
  // AST untouched.
  const text = raw.startsWith(BOM) ? raw.slice(BOM.length) : raw;
  const lines = text.split(/\r?\n/);

  const fm = detectFrontmatter(lines, diagnostics);
  // Body starts on the line after the closing fence, or at line 0 when there
  // is no frontmatter.
  const bodyStartLine = fm === null ? 0 : fm.endLine + 1;

  // `splitH2Blocks` takes a 1-based line number, hence the `+ 1`.
  const { preamble, blocks } = splitH2Blocks(
    lines.slice(bodyStartLine),
    bodyStartLine + 1,
    diagnostics,
  );

  const ast: MdAst = {
    kind: 'md',
    raw,
    frontmatter: fm?.entries ?? [],
    preamble,
    blocks,
  };

  return { ast, diagnostics };
}
|
||
|
||
// ---------- Frontmatter ---------------------------------------------------
|
||
|
||
/** Outcome of frontmatter detection: the parsed entries and where the section ends. */
interface FrontmatterRange {
  /** `key: value` entries found between the two fences, in file order. */
  readonly entries: readonly FrontmatterEntry[];
  /** 0-based line index of the closing `---`. */
  readonly endLine: number;
}
|
||
|
||
/**
 * Detect a `---` fenced frontmatter section at the very top of the file and
 * extract its single-line `key: value` entries.
 *
 * @param lines - All lines of the file (BOM already removed, split on newlines).
 * @param diagnostics - Receives an `OC_FRONTMATTER_UNCLOSED` warning when the
 *   first line is a fence but no closing fence follows.
 * @returns The entries and the closing fence index, or `null` when the file
 *   has no frontmatter (including the unclosed case).
 */
function detectFrontmatter(
  lines: readonly string[],
  diagnostics: Diagnostic[],
): FrontmatterRange | null {
  // Frontmatter needs at least an opening and a closing line, and the opening
  // fence must be exactly the first line.
  if (lines.length < 2 || lines[0] !== FENCE) {
    return null;
  }

  // First exact `---` line after the opener closes the section.
  const closeIndex = lines.indexOf(FENCE, 1);
  if (closeIndex === -1) {
    diagnostics.push({
      line: 1,
      message: 'frontmatter opens with --- but never closes',
      severity: 'warning',
      code: 'OC_FRONTMATTER_UNCLOSED',
    });
    return null;
  }

  const entryPattern = /^([a-zA-Z_][a-zA-Z0-9_-]*)\s*:\s*(.*)$/;
  const entries: FrontmatterEntry[] = [];

  // `offset` counts from the line after the opening fence, so the 1-based file
  // line number is `offset + 2`.
  for (const [offset, line] of lines.slice(1, closeIndex).entries()) {
    if (line.trim().length === 0) {
      continue;
    }
    const m = entryPattern.exec(line);
    if (m === null) {
      // Could be a list-style continuation (` - item`) for the previous key;
      // lists are not structurally modelled in frontmatter at this layer.
      // Skipped silently to keep the parser opinion-free.
      continue;
    }
    entries.push({
      key: m[1],
      value: unquote(m[2].trim()),
      line: offset + 2,
    });
  }

  return { entries, endLine: closeIndex };
}
|
||
|
||
/**
 * Strip one pair of matching surrounding quotes from a frontmatter value.
 *
 * Only removes the quotes when the first and last characters are the same
 * quote character (`"` or `'`); mismatched or single quotes are left alone.
 * Escape sequences inside the quotes are not interpreted.
 *
 * @param value - Already-trimmed value text.
 * @returns The value without its enclosing quotes, or unchanged.
 */
function unquote(value: string): string {
  // A lone quote character cannot be both opener and closer.
  if (value.length < 2) {
    return value;
  }

  const opener = value.charAt(0);
  const closer = value.charAt(value.length - 1);
  const isQuote = opener === '"' || opener === "'";

  return isQuote && opener === closer ? value.slice(1, -1) : value;
}
|
||
|
||
// ---------- H2 block split -------------------------------------------------
|
||
|
||
/**
 * Split the document body into a preamble and one block per H2 heading.
 *
 * A heading is a line matching `## <text>`; lines inside ``` fenced code are
 * never treated as headings. Each block spans from its heading to the line
 * before the next H2 (or the end of the body) and carries the items, tables
 * and code blocks extracted from that span.
 *
 * @param bodyLines - Lines of the file after any frontmatter.
 * @param bodyStartLineNum - 1-based line number of `bodyLines[0]` in the original file.
 * @param diagnostics - Forwarded to item extraction.
 * @returns `preamble` (everything before the first H2, joined with `\n`) and
 *   the blocks in document order. With no H2 at all, the whole body is the
 *   preamble.
 */
function splitH2Blocks(
  bodyLines: readonly string[],
  /** 1-based line number of `bodyLines[0]` in the original file. */
  bodyStartLineNum: number,
  diagnostics: Diagnostic[],
): { preamble: string; blocks: AstBlock[] } {
  // Pass 1: locate H2 headings, tracking fence state so `##` inside a fenced
  // block is not parsed as a heading.
  const headings: { line: number; text: string }[] = [];
  let inCode = false;

  for (const [index, line] of bodyLines.entries()) {
    if (line.startsWith('```')) {
      inCode = !inCode;
      continue;
    }
    if (inCode) {
      continue;
    }
    const m = /^##\s+(\S.*?)\s*$/.exec(line);
    if (m !== null) {
      headings.push({ line: index, text: m[1] });
    }
  }

  if (headings.length === 0) {
    return { preamble: bodyLines.join('\n'), blocks: [] };
  }

  const preamble = bodyLines.slice(0, headings[0].line).join('\n');

  // Pass 2: carve out each block's body and extract its structured content.
  const blocks: AstBlock[] = headings.map((heading, h) => {
    const end = h + 1 < headings.length ? headings[h + 1].line : bodyLines.length;
    const blockBodyLines = bodyLines.slice(heading.line + 1, end);
    const headingLineNum = bodyStartLineNum + heading.line;
    // First body line sits directly below the heading.
    const firstBodyLineNum = headingLineNum + 1;

    return {
      heading: heading.text,
      slug: slugify(heading.text),
      line: headingLineNum,
      bodyText: blockBodyLines.join('\n'),
      items: extractItems(blockBodyLines, firstBodyLineNum, diagnostics),
      tables: extractTables(blockBodyLines, firstBodyLineNum),
      codeBlocks: extractCodeBlocks(blockBodyLines, firstBodyLineNum),
    };
  });

  return { preamble, blocks };
}
|
||
|
||
// ---------- Items ----------------------------------------------------------
|
||
|
||
/**
 * Bullet line starting at column 0 with `-`, `*` or `+` followed by
 * whitespace. Group 1 is the bullet text without trailing whitespace.
 */
const BULLET_RE = /^[-*+]\s+(.+?)\s*$/;

/**
 * `key: value` shape inside bullet text. Group 1 is the key (everything up to
 * the first colon), group 2 is the non-empty remainder after the colon.
 */
const KV_RE = /^([^:]+?)\s*:\s*(.+)$/;
|
||
|
||
/**
 * Collect the bullet items of one H2 block.
 *
 * Only bullets matching `BULLET_RE` are recognised, and bullets inside ```
 * fenced code are skipped. When the bullet text has a `key: value` shape the
 * item also carries `kv`, and its slug is derived from the key alone.
 *
 * @param blockBodyLines - Lines of the block, excluding the heading line.
 * @param startLineNum - 1-based file line number of `blockBodyLines[0]`.
 * @param _diagnostics - Accepted for signature parity; nothing is reported here.
 * @returns Items in document order.
 */
function extractItems(
  blockBodyLines: readonly string[],
  startLineNum: number,
  _diagnostics: Diagnostic[],
): AstItem[] {
  const items: AstItem[] = [];
  let inCode = false;

  for (const [offset, line] of blockBodyLines.entries()) {
    if (line.startsWith('```')) {
      inCode = !inCode;
      continue;
    }
    if (inCode) {
      continue;
    }

    const bullet = BULLET_RE.exec(line);
    if (bullet === null) {
      continue;
    }

    const text = bullet[1];
    const lineNum = startLineNum + offset;
    const kvMatch = KV_RE.exec(text);

    if (kvMatch === null) {
      items.push({ text, slug: slugify(text), line: lineNum });
      continue;
    }

    items.push({
      text,
      slug: slugify(kvMatch[1]),
      line: lineNum,
      kv: { key: kvMatch[1].trim(), value: kvMatch[2].trim() },
    });
  }

  return items;
}
|
||
|
||
// ---------- Tables ---------------------------------------------------------
|
||
|
||
/**
 * Collect the pipe tables of one H2 block.
 *
 * A table is a line starting with `|` (after trimming) that is immediately
 * followed by a separator row such as `|---|:--|`; every following line that
 * starts with `|` is a data row. Lines inside ``` fenced code are ignored, in
 * line with how headings and bullet items are detected, so a table quoted in
 * a code sample is not reported as a real table.
 *
 * @param blockBodyLines - Lines of the block, excluding the heading line.
 * @param startLineNum - 1-based file line number of `blockBodyLines[0]`.
 * @returns Tables in document order; `line` is the header row's line number.
 */
function extractTables(
  blockBodyLines: readonly string[],
  startLineNum: number,
): AstTable[] {
  const separatorPattern = /^\s*\|\s*[:-]+(?:\s*\|\s*[:-]+)*\s*\|?\s*$/;
  const tables: AstTable[] = [];
  let inCode = false;
  let i = 0;

  while (i < blockBodyLines.length) {
    const headerLine = blockBodyLines[i];

    if (headerLine.startsWith('```')) {
      inCode = !inCode;
      i++;
      continue;
    }

    const sepLine = blockBodyLines[i + 1];
    const startsTable =
      !inCode &&
      headerLine.trim().startsWith('|') &&
      sepLine !== undefined &&
      separatorPattern.test(sepLine);

    if (!startsTable) {
      i++;
      continue;
    }

    // Data rows begin after the header and separator lines.
    const rows: string[][] = [];
    let j = i + 2;
    while (j < blockBodyLines.length && blockBodyLines[j].trim().startsWith('|')) {
      rows.push(splitTableRow(blockBodyLines[j]));
      j++;
    }

    tables.push({ headers: splitTableRow(headerLine), rows, line: startLineNum + i });
    // Resume scanning at the first line that is not part of this table.
    i = j;
  }

  return tables;
}

/**
 * Split one table row into trimmed cell strings.
 *
 * One leading and one trailing unescaped `|` are treated as the row border.
 * A backslash escapes the next character, so `\|` stays inside its cell
 * instead of starting a new one; the escape is kept verbatim in the cell text.
 *
 * @param line - A header or data row.
 * @returns The cells in order; always at least one element.
 */
function splitTableRow(line: string): string[] {
  const trimmed = line.trim();
  const inner = trimmed.startsWith('|') ? trimmed.slice(1) : trimmed;

  const cells: string[] = [];
  let cell = '';
  // True when the last character consumed was an unescaped `|`, i.e. the row
  // ended on its closing border and no further cell follows.
  let endedOnDelimiter = false;

  for (let k = 0; k < inner.length; k++) {
    const ch = inner.charAt(k);
    endedOnDelimiter = false;

    if (ch === '\\' && k + 1 < inner.length) {
      // Copy the escape pair as-is and skip the escaped character.
      cell += ch + inner.charAt(k + 1);
      k++;
    } else if (ch === '|') {
      cells.push(cell.trim());
      cell = '';
      endedOnDelimiter = true;
    } else {
      cell += ch;
    }
  }

  if (!endedOnDelimiter) {
    cells.push(cell.trim());
  }

  return cells;
}
|
||
|
||
// ---------- Code blocks ---------------------------------------------------
|
||
|
||
/**
 * Collect the ``` fenced code blocks of one H2 block.
 *
 * A fence opens on any line starting with three backticks; the text after
 * the backticks (trimmed) becomes `lang`, or `null` when empty. The block
 * runs until the next line starting with three backticks. An unclosed fence
 * extends to the end of the block body.
 *
 * @param blockBodyLines - Lines of the block, excluding the heading line.
 * @param startLineNum - 1-based file line number of `blockBodyLines[0]`.
 * @returns Code blocks in document order; `line` is the opening fence's line
 *   number and `text` is the content between the fences joined with `\n`.
 */
function extractCodeBlocks(
  blockBodyLines: readonly string[],
  startLineNum: number,
): AstCodeBlock[] {
  const codeBlocks: AstCodeBlock[] = [];
  let i = 0;

  while (i < blockBodyLines.length) {
    const open = blockBodyLines[i];
    if (!open.startsWith('```')) {
      i++;
      continue;
    }

    const info = open.slice(3).trim();

    // Find the closing fence, or run off the end if there is none.
    let close = i + 1;
    while (close < blockBodyLines.length && !blockBodyLines[close].startsWith('```')) {
      close++;
    }

    codeBlocks.push({
      lang: info.length > 0 ? info : null,
      text: blockBodyLines.slice(i + 1, close).join('\n'),
      line: startLineNum + i,
    });

    // Continue after the closing fence.
    i = close + 1;
  }

  return codeBlocks;
}
|