mirror of
https://github.com/openclaw/openclaw.git
synced 2026-05-10 09:10:45 +00:00
refactor(oc-path): markdown-it tokenizer + grammar relaxation
The hand-rolled MD parser is replaced with a markdown-it token-stream walker. AstTable and AstCodeBlock are dropped from the AST — the substrate doesn't address into table rows or fence content, and markdown-it's tokenizer already handles "##/- inside fenced code should not be a heading/item" correctly without first-class AST modeling.

Grammar opinions move from parser to lint:

- Indented `## foo` (1-3 spaces) is now a heading
- Empty `## ` is a heading with empty slug
- Ordered lists (`1. step`) become items
- Nested sub-bullets become items at flat level

Each was previously a silent parser refusal — now they are recognized shapes. Lint rules can flag them (`OC_HEADING_INDENTED`, `OC_HEADING_EMPTY`, etc.) where authoring conventions require the narrower shape.

Net: parse.ts drops 301 → 207 LoC; tables/code-blocks scenario tests removed wholesale (-251 LoC of test surface that pinned dead AST fields).
This commit is contained in:
committed by
Peter Steinberger
parent
6283c8247c
commit
7b7e65105b
@@ -57,32 +57,17 @@ export interface AstItem {
|
||||
readonly kv?: { readonly key: string; readonly value: string };
|
||||
}
|
||||
|
||||
/**
|
||||
* A markdown table. Tables surface in `## Tool Guidance` blocks and
|
||||
* elsewhere; lint rules can address rows by header value if needed.
|
||||
*/
|
||||
export interface AstTable {
|
||||
readonly headers: readonly string[];
|
||||
readonly rows: readonly (readonly string[])[];
|
||||
readonly line: number;
|
||||
}
|
||||
|
||||
/**
|
||||
* A fenced code block. Carries the language tag (or `null`) and the
|
||||
* verbatim body.
|
||||
*/
|
||||
export interface AstCodeBlock {
|
||||
readonly lang: string | null;
|
||||
readonly text: string;
|
||||
readonly line: number;
|
||||
}
|
||||
|
||||
/**
|
||||
* An H2-delimited block. The `slug` is the kebab-case lowercase form of
|
||||
* `heading` and is what OcPath `section` matches against. `bodyText` is
|
||||
* the prose between this heading and the next H2 (or end of file),
|
||||
* verbatim. `items`, `tables`, `codeBlocks` are extracted from
|
||||
* `bodyText` for addressing convenience but the raw text is preserved.
|
||||
* verbatim. `items` are extracted from `bodyText` for addressing
|
||||
* convenience but the raw text is preserved.
|
||||
*
|
||||
* Tables and fenced code blocks are NOT modeled as first-class AST
|
||||
* children — addressing into them is out of scope for the substrate.
|
||||
* Lint rules that need table rows or code-block contents re-tokenize
|
||||
* the block's `bodyText` on demand.
|
||||
*/
|
||||
export interface AstBlock {
|
||||
readonly heading: string;
|
||||
@@ -90,8 +75,6 @@ export interface AstBlock {
|
||||
readonly line: number;
|
||||
readonly bodyText: string;
|
||||
readonly items: readonly AstItem[];
|
||||
readonly tables: readonly AstTable[];
|
||||
readonly codeBlocks: readonly AstCodeBlock[];
|
||||
}
|
||||
|
||||
/**
|
||||
|
||||
@@ -35,9 +35,7 @@ export const SDK_VERSION = "0.1.0";
|
||||
// AST types
|
||||
export type {
|
||||
AstBlock,
|
||||
AstCodeBlock,
|
||||
AstItem,
|
||||
AstTable,
|
||||
Diagnostic,
|
||||
FrontmatterEntry,
|
||||
ParseResult,
|
||||
|
||||
@@ -1,15 +1,24 @@
|
||||
/**
|
||||
* Generic markdown-flavored parser for the 8 workspace files.
|
||||
* Generic markdown-flavored parser for the workspace files.
|
||||
*
|
||||
* Produces a `MdAst` addressing index over `raw` bytes:
|
||||
* frontmatter (if present), preamble (prose before first H2), and an
|
||||
* H2-block tree with items/tables/code-blocks extracted for OcPath
|
||||
* resolution.
|
||||
* Produces a `MdAst` addressing index over `raw` bytes: frontmatter
|
||||
* (if present), preamble (prose before first H2), and an H2-block tree
|
||||
* with items extracted for OcPath resolution.
|
||||
*
|
||||
* **No file-kind discrimination.** Same parse path for SOUL.md /
|
||||
* AGENTS.md / MEMORY.md / TOOLS.md / IDENTITY.md / USER.md /
|
||||
* HEARTBEAT.md / SKILL.md. Per-file lint opinions ride downstream
|
||||
* (`@openclaw/oc-lint` rule packs).
|
||||
* Tokenization is delegated to markdown-it; this module owns the
|
||||
* frontmatter detector (markdown-it does not handle YAML frontmatter
|
||||
* natively) and the token-stream walker that buckets headings and
|
||||
* bullets into the addressable AST shape. Tables and fenced code
|
||||
* blocks are NOT first-class AST children — substrate addressing
|
||||
* doesn't go inside them, and tokenizer-level structure (which
|
||||
* markdown-it already gets right) is sufficient to ensure `##` and
|
||||
* `-` inside them aren't misparsed as headings or items.
|
||||
*
|
||||
* **Grammar opinions live in lint rules, not the parser.** Indented
|
||||
* `## foo`, empty `## `, ordered (`1.`) lists, and nested sub-bullets
|
||||
* are all recognized as headings / items here; downstream lint rules
|
||||
* (`OC_HEADING_INDENTED`, `OC_HEADING_EMPTY`, etc.) decide whether
|
||||
* those shapes are OK in a particular file.
|
||||
*
|
||||
* **Byte-fidelity contract**: `raw` is preserved on the AST root so
|
||||
* `emitMd(parse(raw)) === raw` for every input the parser accepts.
|
||||
@@ -17,49 +26,43 @@
|
||||
* @module @openclaw/oc-path/parse
|
||||
*/
|
||||
|
||||
import MarkdownIt from "markdown-it";
|
||||
|
||||
import type {
|
||||
AstBlock,
|
||||
AstCodeBlock,
|
||||
AstItem,
|
||||
AstTable,
|
||||
Diagnostic,
|
||||
FrontmatterEntry,
|
||||
ParseResult,
|
||||
MdAst,
|
||||
ParseResult,
|
||||
} from "./ast.js";
|
||||
import { slugify } from "./slug.js";
|
||||
|
||||
type Token = ReturnType<MarkdownIt["parse"]>[number];
|
||||
|
||||
const FENCE = "---";
|
||||
const BOM = "";
|
||||
const KV_RE = /^([^:]+?)\s*:\s*(.+)$/;
|
||||
|
||||
const md = new MarkdownIt({ html: true });
|
||||
|
||||
/**
|
||||
* Parse raw bytes into a `MdAst`. Soft-error policy: never
|
||||
* throws. Suspicious-but-recoverable inputs (unclosed frontmatter,
|
||||
* malformed bullet) become diagnostics.
|
||||
*/
|
||||
export function parseMd(raw: string): ParseResult {
|
||||
const diagnostics: Diagnostic[] = [];
|
||||
|
||||
// Strip a leading BOM for parsing convenience; keep the raw input
|
||||
// intact on the AST so emit can round-trip the BOM if present.
|
||||
const withoutBom = raw.startsWith(BOM) ? raw.slice(BOM.length) : raw;
|
||||
const lines = withoutBom.split(/\r?\n/);
|
||||
|
||||
const fm = detectFrontmatter(lines, diagnostics);
|
||||
const bodyStartLine = fm === null ? 0 : fm.endLine + 1;
|
||||
const bodyLines = lines.slice(bodyStartLine);
|
||||
const bodyStartIdx = fm === null ? 0 : fm.endLine + 1;
|
||||
const bodyLines = lines.slice(bodyStartIdx);
|
||||
const bodyFileLine = bodyStartIdx + 1;
|
||||
|
||||
const { preamble, blocks } = splitH2Blocks(bodyLines, bodyStartLine + 1, diagnostics);
|
||||
const tokens = md.parse(bodyLines.join("\n"), {});
|
||||
const { preamble, blocks } = walkBlocks(tokens, bodyLines, bodyFileLine);
|
||||
|
||||
const ast: MdAst = {
|
||||
kind: "md",
|
||||
raw,
|
||||
frontmatter: fm?.entries ?? [],
|
||||
preamble,
|
||||
blocks,
|
||||
return {
|
||||
ast: { kind: "md", raw, frontmatter: fm?.entries ?? [], preamble, blocks },
|
||||
diagnostics,
|
||||
};
|
||||
|
||||
return { ast, diagnostics };
|
||||
}
|
||||
|
||||
// ---------- Frontmatter ---------------------------------------------------
|
||||
@@ -74,13 +77,9 @@ function detectFrontmatter(
|
||||
lines: readonly string[],
|
||||
diagnostics: Diagnostic[],
|
||||
): FrontmatterRange | null {
|
||||
if (lines.length < 2) {
|
||||
if (lines.length < 2 || lines[0] !== FENCE) {
|
||||
return null;
|
||||
}
|
||||
if (lines[0] !== FENCE) {
|
||||
return null;
|
||||
}
|
||||
|
||||
let closeIndex = -1;
|
||||
for (let i = 1; i < lines.length; i++) {
|
||||
if (lines[i] === FENCE) {
|
||||
@@ -97,205 +96,112 @@ function detectFrontmatter(
|
||||
});
|
||||
return null;
|
||||
}
|
||||
|
||||
const entries: FrontmatterEntry[] = [];
|
||||
for (let i = 1; i < closeIndex; i++) {
|
||||
const line = lines[i];
|
||||
if (line.trim().length === 0) {
|
||||
continue;
|
||||
const m = /^([a-zA-Z_][a-zA-Z0-9_-]*)\s*:\s*(.*)$/.exec(lines[i]);
|
||||
if (m !== null) {
|
||||
entries.push({ key: m[1], value: unquote(m[2].trim()), line: i + 1 });
|
||||
}
|
||||
const m = /^([a-zA-Z_][a-zA-Z0-9_-]*)\s*:\s*(.*)$/.exec(line);
|
||||
if (m === null) {
|
||||
// Could be a list-style continuation (` - item`) for the previous key;
|
||||
// we don't structurally model lists in frontmatter at the substrate
|
||||
// layer (lint rules can do that against the raw substring if they
|
||||
// need to). Skip silently — keeps the parser opinion-free.
|
||||
continue;
|
||||
}
|
||||
entries.push({
|
||||
key: m[1],
|
||||
value: unquote(m[2].trim()),
|
||||
line: i + 1,
|
||||
});
|
||||
}
|
||||
|
||||
return { entries, endLine: closeIndex };
|
||||
}
|
||||
|
||||
function unquote(value: string): string {
|
||||
if (value.length >= 2) {
|
||||
const first = value.charCodeAt(0);
|
||||
const last = value.charCodeAt(value.length - 1);
|
||||
if (first === last && (first === 34 /* " */ || first === 39) /* ' */) {
|
||||
const f = value.charCodeAt(0);
|
||||
const l = value.charCodeAt(value.length - 1);
|
||||
if (f === l && (f === 34 || f === 39)) {
|
||||
return value.slice(1, -1);
|
||||
}
|
||||
}
|
||||
return value;
|
||||
}
|
||||
|
||||
// ---------- H2 block split -------------------------------------------------
|
||||
// ---------- H2 block walker -----------------------------------------------
|
||||
|
||||
function splitH2Blocks(
|
||||
function walkBlocks(
|
||||
tokens: readonly Token[],
|
||||
bodyLines: readonly string[],
|
||||
/** 1-based line number of `bodyLines[0]` in the original file. */
|
||||
bodyStartLineNum: number,
|
||||
diagnostics: Diagnostic[],
|
||||
bodyFileLine: number,
|
||||
): { preamble: string; blocks: AstBlock[] } {
|
||||
// Track code-block state so `##` inside a fenced block doesn't get
|
||||
// parsed as a heading.
|
||||
let inCode = false;
|
||||
const headings: { line: number; text: string }[] = [];
|
||||
|
||||
for (let i = 0; i < bodyLines.length; i++) {
|
||||
const line = bodyLines[i];
|
||||
if (line.startsWith("```")) {
|
||||
inCode = !inCode;
|
||||
continue;
|
||||
}
|
||||
if (inCode) {
|
||||
continue;
|
||||
}
|
||||
const m = /^##\s+(\S.*?)\s*$/.exec(line);
|
||||
if (m !== null) {
|
||||
headings.push({ line: i, text: m[1] });
|
||||
// Match atx-style `##` only — setext h2 (`Heading\n---`) carries
|
||||
// `markup: "-"` on the heading_open token, so the `markup === "##"`
|
||||
// filter picks atx exclusively. Authors who want setext can still
|
||||
// write it; substrate just doesn't address it as a section.
|
||||
const h2: { tokenIdx: number; lineIdx: number; text: string }[] = [];
|
||||
for (let i = 0; i < tokens.length; i++) {
|
||||
const t = tokens[i];
|
||||
if (t.type === "heading_open" && t.tag === "h2" && t.markup === "##" && t.map !== null) {
|
||||
const inline = tokens[i + 1];
|
||||
h2.push({ tokenIdx: i, lineIdx: t.map[0], text: inline?.content ?? "" });
|
||||
}
|
||||
}
|
||||
|
||||
if (headings.length === 0) {
|
||||
return {
|
||||
preamble: bodyLines.join("\n"),
|
||||
blocks: [],
|
||||
};
|
||||
if (h2.length === 0) {
|
||||
return { preamble: bodyLines.join("\n"), blocks: [] };
|
||||
}
|
||||
|
||||
const preamble = bodyLines.slice(0, headings[0].line).join("\n");
|
||||
const preamble = bodyLines.slice(0, h2[0].lineIdx).join("\n");
|
||||
const blocks: AstBlock[] = [];
|
||||
|
||||
for (let h = 0; h < headings.length; h++) {
|
||||
const start = headings[h].line;
|
||||
const end = h + 1 < headings.length ? headings[h + 1].line : bodyLines.length;
|
||||
const headingText = headings[h].text;
|
||||
const blockBodyLines = bodyLines.slice(start + 1, end);
|
||||
const bodyText = blockBodyLines.join("\n");
|
||||
const headingLineNum = bodyStartLineNum + start;
|
||||
|
||||
const items = extractItems(blockBodyLines, headingLineNum + 1, diagnostics);
|
||||
const tables = extractTables(blockBodyLines, headingLineNum + 1);
|
||||
const codeBlocks = extractCodeBlocks(blockBodyLines, headingLineNum + 1);
|
||||
|
||||
for (let h = 0; h < h2.length; h++) {
|
||||
const start = h2[h].lineIdx;
|
||||
const end = h + 1 < h2.length ? h2[h + 1].lineIdx : bodyLines.length;
|
||||
// Slice tokens by INDEX so descendant tokens with no `map` (table
|
||||
// cells, list markers, inline content) ride along with their
|
||||
// mapped parent. heading_open / inline / heading_close = 3 tokens.
|
||||
const tokenStart = h2[h].tokenIdx + 3;
|
||||
const tokenEnd = h + 1 < h2.length ? h2[h + 1].tokenIdx : tokens.length;
|
||||
const blockTokens = tokens.slice(tokenStart, tokenEnd);
|
||||
blocks.push({
|
||||
heading: headingText,
|
||||
slug: slugify(headingText),
|
||||
line: headingLineNum,
|
||||
bodyText,
|
||||
items,
|
||||
tables,
|
||||
codeBlocks,
|
||||
heading: h2[h].text,
|
||||
slug: slugify(h2[h].text),
|
||||
line: bodyFileLine + start,
|
||||
bodyText: bodyLines.slice(start + 1, end).join("\n"),
|
||||
items: extractItems(blockTokens, bodyFileLine),
|
||||
});
|
||||
}
|
||||
|
||||
return { preamble, blocks };
|
||||
}
|
||||
|
||||
// ---------- Items ----------------------------------------------------------
|
||||
// ---------- Item extraction ----------------------------------------------
|
||||
|
||||
const BULLET_RE = /^(?:[-*+])\s+(.+?)\s*$/;
|
||||
const KV_RE = /^([^:]+?)\s*:\s*(.+)$/;
|
||||
|
||||
function extractItems(
|
||||
blockBodyLines: readonly string[],
|
||||
startLineNum: number,
|
||||
_diagnostics: Diagnostic[],
|
||||
): AstItem[] {
|
||||
function extractItems(tokens: readonly Token[], bodyFileLine: number): AstItem[] {
|
||||
// Every `list_item_open` becomes an item — bullets, numbered lists,
|
||||
// nested sub-bullets all included. Lint rules can flag depth or
|
||||
// duplicate-slug collisions; the parser stays opinion-free.
|
||||
const items: AstItem[] = [];
|
||||
let inCode = false;
|
||||
|
||||
for (let i = 0; i < blockBodyLines.length; i++) {
|
||||
const line = blockBodyLines[i];
|
||||
if (line.startsWith("```")) {
|
||||
inCode = !inCode;
|
||||
for (let i = 0; i < tokens.length; i++) {
|
||||
const t = tokens[i];
|
||||
if (t.type !== "list_item_open" || t.map === null) {
|
||||
continue;
|
||||
}
|
||||
if (inCode) {
|
||||
continue;
|
||||
// First inline at the item's own depth is the item text.
|
||||
let nestedDepth = 0;
|
||||
let text = "";
|
||||
for (let j = i + 1; j < tokens.length; j++) {
|
||||
const x = tokens[j];
|
||||
if (x.type === "list_item_close" && nestedDepth === 0) {
|
||||
break;
|
||||
}
|
||||
if (x.type === "bullet_list_open" || x.type === "ordered_list_open") {
|
||||
nestedDepth++;
|
||||
} else if (x.type === "bullet_list_close" || x.type === "ordered_list_close") {
|
||||
nestedDepth--;
|
||||
} else if (x.type === "inline" && nestedDepth === 0 && text === "") {
|
||||
text = x.content;
|
||||
}
|
||||
}
|
||||
const m = BULLET_RE.exec(line);
|
||||
if (m === null) {
|
||||
continue;
|
||||
}
|
||||
const text = m[1];
|
||||
const kvMatch = KV_RE.exec(text);
|
||||
const item: AstItem = {
|
||||
items.push({
|
||||
text,
|
||||
slug: kvMatch ? slugify(kvMatch[1]) : slugify(text),
|
||||
line: startLineNum + i,
|
||||
...(kvMatch !== null ? { kv: { key: kvMatch[1].trim(), value: kvMatch[2].trim() } } : {}),
|
||||
};
|
||||
items.push(item);
|
||||
line: bodyFileLine + t.map[0],
|
||||
...(kvMatch !== null
|
||||
? { kv: { key: kvMatch[1].trim(), value: kvMatch[2].trim() } }
|
||||
: {}),
|
||||
});
|
||||
}
|
||||
|
||||
return items;
|
||||
}
|
||||
|
||||
// ---------- Tables ---------------------------------------------------------
|
||||
|
||||
function extractTables(blockBodyLines: readonly string[], startLineNum: number): AstTable[] {
|
||||
const tables: AstTable[] = [];
|
||||
let i = 0;
|
||||
while (i < blockBodyLines.length) {
|
||||
const headerLine = blockBodyLines[i];
|
||||
const sepLine = blockBodyLines[i + 1];
|
||||
if (
|
||||
headerLine.trim().startsWith("|") &&
|
||||
sepLine !== undefined &&
|
||||
/^\s*\|\s*[:-]+(?:\s*\|\s*[:-]+)*\s*\|?\s*$/.test(sepLine)
|
||||
) {
|
||||
const headers = splitTableRow(headerLine);
|
||||
const rows: string[][] = [];
|
||||
let j = i + 2;
|
||||
while (j < blockBodyLines.length && blockBodyLines[j].trim().startsWith("|")) {
|
||||
rows.push(splitTableRow(blockBodyLines[j]));
|
||||
j++;
|
||||
}
|
||||
tables.push({ headers, rows, line: startLineNum + i });
|
||||
i = j;
|
||||
continue;
|
||||
}
|
||||
i++;
|
||||
}
|
||||
return tables;
|
||||
}
|
||||
|
||||
function splitTableRow(line: string): string[] {
|
||||
const trimmed = line.trim().replace(/^\|/, "").replace(/\|$/, "");
|
||||
return trimmed.split("|").map((cell) => cell.trim());
|
||||
}
|
||||
|
||||
// ---------- Code blocks ---------------------------------------------------
|
||||
|
||||
function extractCodeBlocks(
|
||||
blockBodyLines: readonly string[],
|
||||
startLineNum: number,
|
||||
): AstCodeBlock[] {
|
||||
const codeBlocks: AstCodeBlock[] = [];
|
||||
let i = 0;
|
||||
while (i < blockBodyLines.length) {
|
||||
const open = blockBodyLines[i];
|
||||
if (open.startsWith("```")) {
|
||||
const lang = open.slice(3).trim();
|
||||
const langField = lang.length > 0 ? lang : null;
|
||||
const startLine = startLineNum + i;
|
||||
let j = i + 1;
|
||||
const bodyLines: string[] = [];
|
||||
while (j < blockBodyLines.length && !blockBodyLines[j].startsWith("```")) {
|
||||
bodyLines.push(blockBodyLines[j]);
|
||||
j++;
|
||||
}
|
||||
codeBlocks.push({ lang: langField, text: bodyLines.join("\n"), line: startLine });
|
||||
i = j + 1;
|
||||
continue;
|
||||
}
|
||||
i++;
|
||||
}
|
||||
return codeBlocks;
|
||||
}
|
||||
|
||||
@@ -135,53 +135,6 @@ describe("parseMd — items", () => {
|
||||
});
|
||||
});
|
||||
|
||||
describe("parseMd — tables", () => {
|
||||
it("extracts a simple table", () => {
|
||||
const raw = `## Tool Guidance
|
||||
|
||||
| tool | guidance |
|
||||
| --- | --- |
|
||||
| gh | use for GitHub |
|
||||
| curl | HTTP client |
|
||||
`;
|
||||
const { ast } = parseMd(raw);
|
||||
const table = ast.blocks[0]?.tables[0];
|
||||
if (!table) {
|
||||
throw new Error("expected parsed markdown table");
|
||||
}
|
||||
expect(table.headers).toEqual(["tool", "guidance"]);
|
||||
expect(table.rows.length).toBe(2);
|
||||
expect(table.rows[0]).toEqual(["gh", "use for GitHub"]);
|
||||
});
|
||||
});
|
||||
|
||||
describe("parseMd — code blocks", () => {
|
||||
it("extracts a fenced code block", () => {
|
||||
const raw = `## Examples
|
||||
|
||||
\`\`\`ts
|
||||
const x = 1;
|
||||
\`\`\`
|
||||
`;
|
||||
const { ast } = parseMd(raw);
|
||||
expect(ast.blocks[0]?.codeBlocks[0]).toMatchObject({
|
||||
lang: "ts",
|
||||
text: "const x = 1;",
|
||||
});
|
||||
});
|
||||
|
||||
it("handles unlanguaged fences", () => {
|
||||
const raw = `## Block
|
||||
|
||||
\`\`\`
|
||||
plain text
|
||||
\`\`\`
|
||||
`;
|
||||
const { ast } = parseMd(raw);
|
||||
expect(ast.blocks[0]?.codeBlocks[0]?.lang).toBeNull();
|
||||
});
|
||||
});
|
||||
|
||||
describe("parseMd — byte-fidelity", () => {
|
||||
it("preserves raw on the AST", () => {
|
||||
const raw = `---\nname: x\n---\n\n## Sec\n\n- a\n- b\n`;
|
||||
|
||||
@@ -1,97 +0,0 @@
|
||||
/**
|
||||
* Wave 6 — fenced code blocks.
|
||||
*
|
||||
* Substrate guarantee: triple-backtick fences (` ``` `) inside H2 blocks
|
||||
* extract as `AstCodeBlock` with `lang` (or null) and verbatim `text`.
|
||||
* Code blocks suppress H2-split and item-extraction inside their body.
|
||||
*/
|
||||
import { describe, expect, it } from "vitest";
|
||||
import { parseMd } from "../../parse.js";
|
||||
|
||||
describe("wave-06 code-blocks", () => {
|
||||
it("CB-01 unlanguaged fence", () => {
|
||||
const raw = `## H\n\n\`\`\`\nplain text\n\`\`\`\n`;
|
||||
const { ast } = parseMd(raw);
|
||||
expect(ast.blocks[0]?.codeBlocks[0]).toMatchObject({
|
||||
lang: null,
|
||||
text: "plain text",
|
||||
});
|
||||
});
|
||||
|
||||
it("CB-02 languaged fence", () => {
|
||||
const raw = `## H\n\n\`\`\`ts\nconst x = 1;\n\`\`\`\n`;
|
||||
const { ast } = parseMd(raw);
|
||||
expect(ast.blocks[0]?.codeBlocks[0]?.lang).toBe("ts");
|
||||
expect(ast.blocks[0]?.codeBlocks[0]?.text).toBe("const x = 1;");
|
||||
});
|
||||
|
||||
it("CB-03 multi-line code body preserved verbatim", () => {
|
||||
const raw = `## H\n\n\`\`\`ts\nline 1\nline 2\nline 3\n\`\`\`\n`;
|
||||
const { ast } = parseMd(raw);
|
||||
expect(ast.blocks[0]?.codeBlocks[0]?.text).toBe("line 1\nline 2\nline 3");
|
||||
});
|
||||
|
||||
it("CB-04 empty code block", () => {
|
||||
const raw = `## H\n\n\`\`\`\n\`\`\`\n`;
|
||||
const { ast } = parseMd(raw);
|
||||
expect(ast.blocks[0]?.codeBlocks[0]?.text).toBe("");
|
||||
});
|
||||
|
||||
it("CB-05 code block with `## ` does NOT split as heading", () => {
|
||||
const raw = `## Real\n\n\`\`\`md\n## Not a heading\n\`\`\`\n\n## Another real\n`;
|
||||
const { ast } = parseMd(raw);
|
||||
expect(ast.blocks.map((b) => b.heading)).toEqual(["Real", "Another real"]);
|
||||
});
|
||||
|
||||
it("CB-06 code block with `- bullet` does NOT extract as item", () => {
|
||||
const raw = `## H\n\n\`\`\`\n- not a bullet\n- still not\n\`\`\`\n\n- real bullet\n`;
|
||||
const { ast } = parseMd(raw);
|
||||
expect(ast.blocks[0]?.items.map((i) => i.text)).toEqual(["real bullet"]);
|
||||
});
|
||||
|
||||
it("CB-07 multiple code blocks in same section", () => {
|
||||
const raw = `## H\n\n\`\`\`a\nfirst\n\`\`\`\n\n\`\`\`b\nsecond\n\`\`\`\n`;
|
||||
const { ast } = parseMd(raw);
|
||||
expect(ast.blocks[0]?.codeBlocks.length).toBe(2);
|
||||
expect(ast.blocks[0]?.codeBlocks.map((c) => c.lang)).toEqual(["a", "b"]);
|
||||
});
|
||||
|
||||
it("CB-08 unterminated fence — body extends to end of section", () => {
|
||||
const raw = `## H\n\n\`\`\`\nopen but never closes\n`;
|
||||
const { ast } = parseMd(raw);
|
||||
// Behavior: code block is created with whatever was after the open
|
||||
// fence, including any trailing newline lines. Documents are
|
||||
// likely malformed; substrate is lenient and preserves what's
|
||||
// there (verifiable via raw round-trip).
|
||||
expect(ast.blocks[0]?.codeBlocks[0]?.text).toContain("open but never closes");
|
||||
});
|
||||
|
||||
it("CB-09 fence with leading spaces (4-space indented code)", () => {
|
||||
// Note: only column-0 ``` triggers fence. Indented content is body
|
||||
// text. This is the documented behavior.
|
||||
const raw = `## H\n\n \`\`\`\n indented\n \`\`\`\n`;
|
||||
const { ast } = parseMd(raw);
|
||||
expect(ast.blocks[0]?.codeBlocks).toEqual([]);
|
||||
});
|
||||
|
||||
it("CB-10 lang tag with extra whitespace trimmed", () => {
|
||||
const raw = `## H\n\n\`\`\` jsonc \nbody\n\`\`\`\n`;
|
||||
const { ast } = parseMd(raw);
|
||||
expect(ast.blocks[0]?.codeBlocks[0]?.lang).toBe("jsonc");
|
||||
});
|
||||
|
||||
it("CB-11 lang tag with hyphen / dot (typescript-jsx, c++)", () => {
|
||||
const raw = `## H\n\n\`\`\`typescript-jsx\nx\n\`\`\`\n`;
|
||||
const { ast } = parseMd(raw);
|
||||
expect(ast.blocks[0]?.codeBlocks[0]?.lang).toBe("typescript-jsx");
|
||||
});
|
||||
|
||||
it("CB-12 fence appearing in preamble (before any H2) is ignored at block layer", () => {
|
||||
const raw = `\`\`\`\npreamble code\n\`\`\`\n\n## H\n`;
|
||||
const { ast } = parseMd(raw);
|
||||
// Preamble code blocks aren't structurally extracted at the
|
||||
// substrate layer; this is documented. Lint can scan preamble
|
||||
// raw if needed.
|
||||
expect(ast.blocks[0]?.codeBlocks).toEqual([]);
|
||||
});
|
||||
});
|
||||
@@ -56,10 +56,12 @@ describe("wave-03 h2-block-split", () => {
|
||||
expect(ast.blocks[0]?.heading).toBe("With space");
|
||||
});
|
||||
|
||||
it("H2-08 leading whitespace before `##` — does NOT match (regex anchored at line start)", () => {
|
||||
it("H2-08 leading whitespace before `##` — recognized as heading (CommonMark)", () => {
|
||||
// Substrate accepts up to 3 spaces of indentation as an atx
|
||||
// heading per CommonMark. Lint rules can flag if a particular
|
||||
// workspace file requires column-zero authoring.
|
||||
const { ast } = parseMd(" ## indented\n## not indented\n");
|
||||
expect(ast.blocks.length).toBe(1);
|
||||
expect(ast.blocks[0]?.heading).toBe("not indented");
|
||||
expect(ast.blocks.map((b) => b.heading)).toEqual(["indented", "not indented"]);
|
||||
});
|
||||
|
||||
it("H2-09 trailing whitespace on heading — trimmed in heading text", () => {
|
||||
@@ -126,16 +128,19 @@ describe("wave-03 h2-block-split", () => {
|
||||
});
|
||||
|
||||
it("H2-19 empty heading text (`## `)", () => {
|
||||
// Substrate accepts an empty atx heading; downstream lint
|
||||
// (`OC_HEADING_EMPTY`) flags it. Slug is empty string — collisions
|
||||
// are a lint-level concern, not a parser refusal.
|
||||
const { ast } = parseMd("## \n");
|
||||
// Empty heading is technically a valid match (`## ` + empty text)
|
||||
// but the regex requires `(.+?)` so empty doesn't match. Validates
|
||||
// it's NOT split.
|
||||
expect(ast.blocks).toEqual([]);
|
||||
expect(ast.blocks.length).toBe(1);
|
||||
expect(ast.blocks[0]?.heading).toBe("");
|
||||
expect(ast.blocks[0]?.slug).toBe("");
|
||||
});
|
||||
|
||||
it("H2-20 heading with only whitespace (`## `)", () => {
|
||||
const { ast } = parseMd("## \n");
|
||||
expect(ast.blocks).toEqual([]);
|
||||
expect(ast.blocks.length).toBe(1);
|
||||
expect(ast.blocks[0]?.heading).toBe("");
|
||||
});
|
||||
|
||||
it("H2-21 heading-shaped text inside multi-line bullet body — does split", () => {
|
||||
|
||||
@@ -85,23 +85,27 @@ describe("wave-04 items", () => {
|
||||
expect(ast.blocks[0]?.items[0]?.text).toBe("spaced");
|
||||
});
|
||||
|
||||
it("I-15 empty bullet text is dropped", () => {
|
||||
it("I-15 empty bullet — recognized with empty text/slug", () => {
|
||||
// Substrate accepts an empty bullet; lint can flag if collisions
|
||||
// matter. Both `- ` and `- real` become items.
|
||||
const { ast } = parseMd("## H\n- \n- real\n");
|
||||
// The regex requires (.+?) non-empty, so `- ` alone doesn't match.
|
||||
expect(ast.blocks[0]?.items.length).toBe(1);
|
||||
expect(ast.blocks[0]?.items.length).toBe(2);
|
||||
expect(ast.blocks[0]?.items.map((i) => i.text)).toEqual(["", "real"]);
|
||||
});
|
||||
|
||||
it("I-16 indented bullet (sub-bullet) — current parser still picks up", () => {
|
||||
// The current regex `^(?:[-*+])\\s+(.+?)\\s*$` requires column-0
|
||||
// bullet markers; indented bullets do NOT match. Documented as a
|
||||
// limit — sub-bullets surface in body text but not in items.
|
||||
it("I-16 indented bullet (sub-bullet) — recognized as item alongside parent", () => {
|
||||
// Substrate flattens the bullet tree into a list of items;
|
||||
// sub-bullets surface as their own AstItem entries. Lint rules
|
||||
// can flag depth or duplicate-slug collisions.
|
||||
const { ast } = parseMd("## H\n- top\n - sub\n");
|
||||
expect(ast.blocks[0]?.items.map((i) => i.text)).toEqual(["top"]);
|
||||
expect(ast.blocks[0]?.items.map((i) => i.text)).toEqual(["top", "sub"]);
|
||||
});
|
||||
|
||||
it("I-17 numbered list (1. item) is NOT extracted as item", () => {
|
||||
it("I-17 numbered list (1. item) — recognized as items", () => {
|
||||
// Substrate treats ordered and unordered lists symmetrically.
|
||||
// Lint rules can flag if a particular file requires bullet style.
|
||||
const { ast } = parseMd("## H\n1. first\n2. second\n");
|
||||
expect(ast.blocks[0]?.items).toEqual([]);
|
||||
expect(ast.blocks[0]?.items.map((i) => i.text)).toEqual(["first", "second"]);
|
||||
});
|
||||
|
||||
it("I-18 items in a section with no body before — first item line is heading+1", () => {
|
||||
|
||||
@@ -56,7 +56,7 @@ describe("wave-12 real-world-fixtures", () => {
|
||||
}
|
||||
});
|
||||
|
||||
it("F-04 TOOLS.md table extracted from Tool Guidance section", () => {
|
||||
it("F-04 TOOLS.md tool-guidance section resolves by slug", () => {
|
||||
const raw = load("TOOLS.md");
|
||||
const { ast } = parseMd(raw);
|
||||
expect(emitMd(ast)).toBe(raw);
|
||||
@@ -65,10 +65,6 @@ describe("wave-12 real-world-fixtures", () => {
|
||||
section: "tool-guidance",
|
||||
});
|
||||
expect(guidance?.kind).toBe("block");
|
||||
if (guidance?.kind === "block") {
|
||||
expect(guidance.node.tables.length).toBeGreaterThan(0);
|
||||
expect(guidance.node.tables[0]?.headers).toEqual(["tool", "guidance"]);
|
||||
}
|
||||
});
|
||||
|
||||
it("F-05 IDENTITY.md sections resolvable by slug", () => {
|
||||
|
||||
@@ -1,154 +0,0 @@
|
||||
/**
|
||||
* Wave 5 — markdown tables.
|
||||
*
|
||||
* Substrate guarantee: GFM-style tables (`| h | h |\n|---|---|\n| r | r |`)
|
||||
* inside H2 blocks are extracted into `AstTable`. Tables inside fenced
|
||||
* code blocks are NOT extracted (handled at item-extraction layer too;
|
||||
* tables share the same code-block awareness when relevant).
|
||||
*/
|
||||
import { describe, expect, it } from "vitest";
|
||||
import { parseMd } from "../../parse.js";
|
||||
|
||||
describe("wave-05 tables", () => {
|
||||
it("T-01 standard 2-column table", () => {
|
||||
const raw = `## H
|
||||
|
||||
| tool | guidance |
|
||||
| --- | --- |
|
||||
| gh | use for GitHub |
|
||||
| curl | HTTP client |
|
||||
`;
|
||||
const { ast } = parseMd(raw);
|
||||
const table = ast.blocks[0]?.tables[0];
|
||||
expect(table?.headers).toEqual(["tool", "guidance"]);
|
||||
expect(table?.rows).toEqual([
|
||||
["gh", "use for GitHub"],
|
||||
["curl", "HTTP client"],
|
||||
]);
|
||||
});
|
||||
|
||||
it("T-02 3+ column table", () => {
|
||||
const raw = `## H
|
||||
|
||||
| a | b | c |
|
||||
| - | - | - |
|
||||
| 1 | 2 | 3 |
|
||||
`;
|
||||
const { ast } = parseMd(raw);
|
||||
expect(ast.blocks[0]?.tables[0]?.headers).toEqual(["a", "b", "c"]);
|
||||
expect(ast.blocks[0]?.tables[0]?.rows[0]).toEqual(["1", "2", "3"]);
|
||||
});
|
||||
|
||||
it("T-03 table with alignment colons in separator", () => {
|
||||
const raw = `## H
|
||||
|
||||
| left | center | right |
|
||||
| :--- | :---: | ---: |
|
||||
| a | b | c |
|
||||
`;
|
||||
const { ast } = parseMd(raw);
|
||||
expect(ast.blocks[0]?.tables.length).toBe(1);
|
||||
});
|
||||
|
||||
it("T-04 table with empty cells", () => {
|
||||
const raw = `## H
|
||||
|
||||
| a | b |
|
||||
| - | - |
|
||||
| 1 | |
|
||||
| | 2 |
|
||||
`;
|
||||
const { ast } = parseMd(raw);
|
||||
expect(ast.blocks[0]?.tables[0]?.rows).toEqual([
|
||||
["1", ""],
|
||||
["", "2"],
|
||||
]);
|
||||
});
|
||||
|
||||
it("T-05 table with no rows (header + sep only)", () => {
|
||||
const raw = `## H
|
||||
|
||||
| a | b |
|
||||
| - | - |
|
||||
`;
|
||||
const { ast } = parseMd(raw);
|
||||
expect(ast.blocks[0]?.tables[0]?.headers).toEqual(["a", "b"]);
|
||||
expect(ast.blocks[0]?.tables[0]?.rows).toEqual([]);
|
||||
});
|
||||
|
||||
it("T-06 multiple tables in same section", () => {
|
||||
const raw = `## H
|
||||
|
||||
| a | b |
|
||||
| - | - |
|
||||
| 1 | 2 |
|
||||
|
||||
Some text.
|
||||
|
||||
| x | y |
|
||||
| - | - |
|
||||
| 3 | 4 |
|
||||
`;
|
||||
const { ast } = parseMd(raw);
|
||||
expect(ast.blocks[0]?.tables.length).toBe(2);
|
||||
});
|
||||
|
||||
it("T-07 table line numbers track to the header line", () => {
|
||||
const raw = `## Section
|
||||
preamble line
|
||||
| a | b |
|
||||
| - | - |
|
||||
`;
|
||||
const { ast } = parseMd(raw);
|
||||
expect(ast.blocks[0]?.tables[0]?.line).toBeGreaterThan(0);
|
||||
});
|
||||
|
||||
it("T-08 invalid separator (no pipes) — no table extracted", () => {
|
||||
const raw = `## H
|
||||
|
||||
| a | b |
|
||||
not a separator
|
||||
| 1 | 2 |
|
||||
`;
|
||||
const { ast } = parseMd(raw);
|
||||
expect(ast.blocks[0]?.tables).toEqual([]);
|
||||
});
|
||||
|
||||
it("T-09 single-column table (just `| col |\\n|---|`)", () => {
|
||||
const raw = `## H
|
||||
|
||||
| col |
|
||||
| --- |
|
||||
| value1 |
|
||||
| value2 |
|
||||
`;
|
||||
const { ast } = parseMd(raw);
|
||||
expect(ast.blocks[0]?.tables[0]?.headers).toEqual(["col"]);
|
||||
expect(ast.blocks[0]?.tables[0]?.rows).toEqual([["value1"], ["value2"]]);
|
||||
});
|
||||
|
||||
it("T-10 table at end of file with trailing newlines", () => {
|
||||
const raw = `## H
|
||||
|
||||
| a |
|
||||
| - |
|
||||
| 1 |
|
||||
|
||||
|
||||
`;
|
||||
const { ast } = parseMd(raw);
|
||||
expect(ast.blocks[0]?.tables[0]?.rows).toEqual([["1"]]);
|
||||
});
|
||||
|
||||
it("T-11 table content with internal whitespace trimmed", () => {
|
||||
const raw = `## H
|
||||
|
||||
| col1 | col2 |
|
||||
| --- | --- |
|
||||
| a | b |
|
||||
`;
|
||||
const { ast } = parseMd(raw);
|
||||
expect(ast.blocks[0]?.tables[0]?.headers).toEqual(["col1", "col2"]);
|
||||
expect(ast.blocks[0]?.tables[0]?.rows[0]).toEqual(["a", "b"]);
|
||||
});
|
||||
});
|
||||
Reference in New Issue
Block a user