refactor(oc-path): markdown-it tokenizer + grammar relaxation

The hand-rolled MD parser is replaced with a markdown-it token-stream
walker. AstTable and AstCodeBlock are dropped from the AST — the
substrate doesn't address into table rows or fence content, and
markdown-it's tokenizer already handles "##/- inside fenced code
should not be a heading/item" correctly without first-class AST
modeling.

Grammar opinions move from parser to lint:
  - Indented `## foo` (1-3 spaces) is now a heading
  - Empty `## ` is a heading with empty slug
  - Ordered lists (`1. step`) become items
  - Nested sub-bullets become items at flat level

Each was previously a silent parser refusal — now they are recognized
shapes. Lint rules can flag them (`OC_HEADING_INDENTED`,
`OC_HEADING_EMPTY`, etc.) where authoring conventions require the
narrower shape.

Net: parse.ts drops 301 → 207 LoC; tables/code-blocks scenario tests
removed wholesale (-251 LoC of test surface that pinned dead AST
fields).
This commit is contained in:
Gio Della-Libera
2026-05-08 18:40:56 -07:00
committed by Peter Steinberger
parent 6283c8247c
commit 7b7e65105b
9 changed files with 136 additions and 542 deletions

View File

@@ -57,32 +57,17 @@ export interface AstItem {
readonly kv?: { readonly key: string; readonly value: string };
}
/**
* A markdown table. Tables surface in `## Tool Guidance` blocks and
* elsewhere; lint rules can address rows by header value if needed.
*/
export interface AstTable {
readonly headers: readonly string[];
readonly rows: readonly (readonly string[])[];
readonly line: number;
}
/**
* A fenced code block. Carries the language tag (or `null`) and the
* verbatim body.
*/
export interface AstCodeBlock {
readonly lang: string | null;
readonly text: string;
readonly line: number;
}
/**
* An H2-delimited block. The `slug` is the kebab-case lowercase form of
* `heading` and is what OcPath `section` matches against. `bodyText` is
* the prose between this heading and the next H2 (or end of file),
* verbatim. `items`, `tables`, `codeBlocks` are extracted from
* `bodyText` for addressing convenience but the raw text is preserved.
* verbatim. `items` are extracted from `bodyText` for addressing
* convenience but the raw text is preserved.
*
* Tables and fenced code blocks are NOT modeled as first-class AST
* children — addressing into them is out of scope for the substrate.
* Lint rules that need table rows or code-block contents re-tokenize
* the block's `bodyText` on demand.
*/
export interface AstBlock {
readonly heading: string;
@@ -90,8 +75,6 @@ export interface AstBlock {
readonly line: number;
readonly bodyText: string;
readonly items: readonly AstItem[];
readonly tables: readonly AstTable[];
readonly codeBlocks: readonly AstCodeBlock[];
}
/**

View File

@@ -35,9 +35,7 @@ export const SDK_VERSION = "0.1.0";
// AST types
export type {
AstBlock,
AstCodeBlock,
AstItem,
AstTable,
Diagnostic,
FrontmatterEntry,
ParseResult,

View File

@@ -1,15 +1,24 @@
/**
* Generic markdown-flavored parser for the 8 workspace files.
* Generic markdown-flavored parser for the workspace files.
*
* Produces a `MdAst` addressing index over `raw` bytes:
* frontmatter (if present), preamble (prose before first H2), and an
* H2-block tree with items/tables/code-blocks extracted for OcPath
* resolution.
* Produces a `MdAst` addressing index over `raw` bytes: frontmatter
* (if present), preamble (prose before first H2), and an H2-block tree
* with items extracted for OcPath resolution.
*
* **No file-kind discrimination.** Same parse path for SOUL.md /
* AGENTS.md / MEMORY.md / TOOLS.md / IDENTITY.md / USER.md /
* HEARTBEAT.md / SKILL.md. Per-file lint opinions ride downstream
* (`@openclaw/oc-lint` rule packs).
* Tokenization is delegated to markdown-it; this module owns the
* frontmatter detector (markdown-it does not handle YAML frontmatter
* natively) and the token-stream walker that buckets headings and
* bullets into the addressable AST shape. Tables and fenced code
* blocks are NOT first-class AST children — substrate addressing
* doesn't go inside them, and tokenizer-level structure (which
* markdown-it already gets right) is sufficient to ensure `##` and
* `-` inside them aren't misparsed as headings or items.
*
* **Grammar opinions live in lint rules, not the parser.** Indented
* `## foo`, empty `## `, ordered (`1.`) lists, and nested sub-bullets
* are all recognized as headings / items here; downstream lint rules
* (`OC_HEADING_INDENTED`, `OC_HEADING_EMPTY`, etc.) decide whether
* those shapes are OK in a particular file.
*
* **Byte-fidelity contract**: `raw` is preserved on the AST root so
* `emitMd(parseMd(raw).ast) === raw` for every input the parser accepts.
@@ -17,49 +26,43 @@
* @module @openclaw/oc-path/parse
*/
import MarkdownIt from "markdown-it";
import type {
AstBlock,
AstCodeBlock,
AstItem,
AstTable,
Diagnostic,
FrontmatterEntry,
ParseResult,
MdAst,
ParseResult,
} from "./ast.js";
import { slugify } from "./slug.js";
type Token = ReturnType<MarkdownIt["parse"]>[number];
const FENCE = "---";
// Byte-order mark (U+FEFF). Written as an escape sequence rather than a raw
// literal: the character is invisible, so editors, formatters and copy/paste
// can silently drop it, which would turn the BOM strip in `parseMd` into a
// no-op and let a leading BOM defeat the `---` frontmatter fence check.
const BOM = "\uFEFF";
const KV_RE = /^([^:]+?)\s*:\s*(.+)$/;
const md = new MarkdownIt({ html: true });
/**
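* Parse raw bytes into a `MdAst`. Soft-error policy: never
* throws. Suspicious-but-recoverable frontmatter (e.g. an unclosed
* fence) becomes a diagnostic. The token-stream walker emits no
* diagnostics: unusual heading / list shapes are accepted as-is and
* left for downstream lint rules to flag.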
*/
export function parseMd(raw: string): ParseResult {
const diagnostics: Diagnostic[] = [];
// Strip a leading BOM for parsing convenience; keep the raw input
// intact on the AST so emit can round-trip the BOM if present.
const withoutBom = raw.startsWith(BOM) ? raw.slice(BOM.length) : raw;
const lines = withoutBom.split(/\r?\n/);
const fm = detectFrontmatter(lines, diagnostics);
const bodyStartLine = fm === null ? 0 : fm.endLine + 1;
const bodyLines = lines.slice(bodyStartLine);
const bodyStartIdx = fm === null ? 0 : fm.endLine + 1;
const bodyLines = lines.slice(bodyStartIdx);
const bodyFileLine = bodyStartIdx + 1;
const { preamble, blocks } = splitH2Blocks(bodyLines, bodyStartLine + 1, diagnostics);
const tokens = md.parse(bodyLines.join("\n"), {});
const { preamble, blocks } = walkBlocks(tokens, bodyLines, bodyFileLine);
const ast: MdAst = {
kind: "md",
raw,
frontmatter: fm?.entries ?? [],
preamble,
blocks,
return {
ast: { kind: "md", raw, frontmatter: fm?.entries ?? [], preamble, blocks },
diagnostics,
};
return { ast, diagnostics };
}
// ---------- Frontmatter ---------------------------------------------------
@@ -74,13 +77,9 @@ function detectFrontmatter(
lines: readonly string[],
diagnostics: Diagnostic[],
): FrontmatterRange | null {
if (lines.length < 2) {
if (lines.length < 2 || lines[0] !== FENCE) {
return null;
}
if (lines[0] !== FENCE) {
return null;
}
let closeIndex = -1;
for (let i = 1; i < lines.length; i++) {
if (lines[i] === FENCE) {
@@ -97,205 +96,112 @@ function detectFrontmatter(
});
return null;
}
const entries: FrontmatterEntry[] = [];
for (let i = 1; i < closeIndex; i++) {
const line = lines[i];
if (line.trim().length === 0) {
continue;
const m = /^([a-zA-Z_][a-zA-Z0-9_-]*)\s*:\s*(.*)$/.exec(lines[i]);
if (m !== null) {
entries.push({ key: m[1], value: unquote(m[2].trim()), line: i + 1 });
}
const m = /^([a-zA-Z_][a-zA-Z0-9_-]*)\s*:\s*(.*)$/.exec(line);
if (m === null) {
// Could be a list-style continuation (` - item`) for the previous key;
// we don't structurally model lists in frontmatter at the substrate
// layer (lint rules can do that against the raw substring if they
// need to). Skip silently — keeps the parser opinion-free.
continue;
}
entries.push({
key: m[1],
value: unquote(m[2].trim()),
line: i + 1,
});
}
return { entries, endLine: closeIndex };
}
function unquote(value: string): string {
if (value.length >= 2) {
const first = value.charCodeAt(0);
const last = value.charCodeAt(value.length - 1);
if (first === last && (first === 34 /* " */ || first === 39) /* ' */) {
const f = value.charCodeAt(0);
const l = value.charCodeAt(value.length - 1);
if (f === l && (f === 34 || f === 39)) {
return value.slice(1, -1);
}
}
return value;
}
// ---------- H2 block split -------------------------------------------------
// ---------- H2 block walker -----------------------------------------------
function splitH2Blocks(
function walkBlocks(
tokens: readonly Token[],
bodyLines: readonly string[],
/** 1-based line number of `bodyLines[0]` in the original file. */
bodyStartLineNum: number,
diagnostics: Diagnostic[],
bodyFileLine: number,
): { preamble: string; blocks: AstBlock[] } {
// Track code-block state so `##` inside a fenced block doesn't get
// parsed as a heading.
let inCode = false;
const headings: { line: number; text: string }[] = [];
for (let i = 0; i < bodyLines.length; i++) {
const line = bodyLines[i];
if (line.startsWith("```")) {
inCode = !inCode;
continue;
}
if (inCode) {
continue;
}
const m = /^##\s+(\S.*?)\s*$/.exec(line);
if (m !== null) {
headings.push({ line: i, text: m[1] });
// Match atx-style `##` only — setext h2 (`Heading\n---`) carries
// `markup: "-"` on the heading_open token, so the `markup === "##"`
// filter picks atx exclusively. Authors who want setext can still
// write it; substrate just doesn't address it as a section.
const h2: { tokenIdx: number; lineIdx: number; text: string }[] = [];
for (let i = 0; i < tokens.length; i++) {
const t = tokens[i];
if (t.type === "heading_open" && t.tag === "h2" && t.markup === "##" && t.map !== null) {
const inline = tokens[i + 1];
h2.push({ tokenIdx: i, lineIdx: t.map[0], text: inline?.content ?? "" });
}
}
if (headings.length === 0) {
return {
preamble: bodyLines.join("\n"),
blocks: [],
};
if (h2.length === 0) {
return { preamble: bodyLines.join("\n"), blocks: [] };
}
const preamble = bodyLines.slice(0, headings[0].line).join("\n");
const preamble = bodyLines.slice(0, h2[0].lineIdx).join("\n");
const blocks: AstBlock[] = [];
for (let h = 0; h < headings.length; h++) {
const start = headings[h].line;
const end = h + 1 < headings.length ? headings[h + 1].line : bodyLines.length;
const headingText = headings[h].text;
const blockBodyLines = bodyLines.slice(start + 1, end);
const bodyText = blockBodyLines.join("\n");
const headingLineNum = bodyStartLineNum + start;
const items = extractItems(blockBodyLines, headingLineNum + 1, diagnostics);
const tables = extractTables(blockBodyLines, headingLineNum + 1);
const codeBlocks = extractCodeBlocks(blockBodyLines, headingLineNum + 1);
for (let h = 0; h < h2.length; h++) {
const start = h2[h].lineIdx;
const end = h + 1 < h2.length ? h2[h + 1].lineIdx : bodyLines.length;
// Slice tokens by INDEX so descendant tokens with no `map` (table
// cells, list markers, inline content) ride along with their
// mapped parent. heading_open / inline / heading_close = 3 tokens.
const tokenStart = h2[h].tokenIdx + 3;
const tokenEnd = h + 1 < h2.length ? h2[h + 1].tokenIdx : tokens.length;
const blockTokens = tokens.slice(tokenStart, tokenEnd);
blocks.push({
heading: headingText,
slug: slugify(headingText),
line: headingLineNum,
bodyText,
items,
tables,
codeBlocks,
heading: h2[h].text,
slug: slugify(h2[h].text),
line: bodyFileLine + start,
bodyText: bodyLines.slice(start + 1, end).join("\n"),
items: extractItems(blockTokens, bodyFileLine),
});
}
return { preamble, blocks };
}
// ---------- Items ----------------------------------------------------------
// ---------- Item extraction ----------------------------------------------
const BULLET_RE = /^(?:[-*+])\s+(.+?)\s*$/;
const KV_RE = /^([^:]+?)\s*:\s*(.+)$/;
function extractItems(
blockBodyLines: readonly string[],
startLineNum: number,
_diagnostics: Diagnostic[],
): AstItem[] {
function extractItems(tokens: readonly Token[], bodyFileLine: number): AstItem[] {
// Every `list_item_open` becomes an item — bullets, numbered lists,
// nested sub-bullets all included. Lint rules can flag depth or
// duplicate-slug collisions; the parser stays opinion-free.
const items: AstItem[] = [];
let inCode = false;
for (let i = 0; i < blockBodyLines.length; i++) {
const line = blockBodyLines[i];
if (line.startsWith("```")) {
inCode = !inCode;
for (let i = 0; i < tokens.length; i++) {
const t = tokens[i];
if (t.type !== "list_item_open" || t.map === null) {
continue;
}
if (inCode) {
continue;
// First inline at the item's own depth is the item text.
let nestedDepth = 0;
let text = "";
for (let j = i + 1; j < tokens.length; j++) {
const x = tokens[j];
if (x.type === "list_item_close" && nestedDepth === 0) {
break;
}
if (x.type === "bullet_list_open" || x.type === "ordered_list_open") {
nestedDepth++;
} else if (x.type === "bullet_list_close" || x.type === "ordered_list_close") {
nestedDepth--;
} else if (x.type === "inline" && nestedDepth === 0 && text === "") {
text = x.content;
}
}
const m = BULLET_RE.exec(line);
if (m === null) {
continue;
}
const text = m[1];
const kvMatch = KV_RE.exec(text);
const item: AstItem = {
items.push({
text,
slug: kvMatch ? slugify(kvMatch[1]) : slugify(text),
line: startLineNum + i,
...(kvMatch !== null ? { kv: { key: kvMatch[1].trim(), value: kvMatch[2].trim() } } : {}),
};
items.push(item);
line: bodyFileLine + t.map[0],
...(kvMatch !== null
? { kv: { key: kvMatch[1].trim(), value: kvMatch[2].trim() } }
: {}),
});
}
return items;
}
// ---------- Tables ---------------------------------------------------------
function extractTables(blockBodyLines: readonly string[], startLineNum: number): AstTable[] {
const tables: AstTable[] = [];
let i = 0;
while (i < blockBodyLines.length) {
const headerLine = blockBodyLines[i];
const sepLine = blockBodyLines[i + 1];
if (
headerLine.trim().startsWith("|") &&
sepLine !== undefined &&
/^\s*\|\s*[:-]+(?:\s*\|\s*[:-]+)*\s*\|?\s*$/.test(sepLine)
) {
const headers = splitTableRow(headerLine);
const rows: string[][] = [];
let j = i + 2;
while (j < blockBodyLines.length && blockBodyLines[j].trim().startsWith("|")) {
rows.push(splitTableRow(blockBodyLines[j]));
j++;
}
tables.push({ headers, rows, line: startLineNum + i });
i = j;
continue;
}
i++;
}
return tables;
}
function splitTableRow(line: string): string[] {
const trimmed = line.trim().replace(/^\|/, "").replace(/\|$/, "");
return trimmed.split("|").map((cell) => cell.trim());
}
// ---------- Code blocks ---------------------------------------------------
function extractCodeBlocks(
blockBodyLines: readonly string[],
startLineNum: number,
): AstCodeBlock[] {
const codeBlocks: AstCodeBlock[] = [];
let i = 0;
while (i < blockBodyLines.length) {
const open = blockBodyLines[i];
if (open.startsWith("```")) {
const lang = open.slice(3).trim();
const langField = lang.length > 0 ? lang : null;
const startLine = startLineNum + i;
let j = i + 1;
const bodyLines: string[] = [];
while (j < blockBodyLines.length && !blockBodyLines[j].startsWith("```")) {
bodyLines.push(blockBodyLines[j]);
j++;
}
codeBlocks.push({ lang: langField, text: bodyLines.join("\n"), line: startLine });
i = j + 1;
continue;
}
i++;
}
return codeBlocks;
}

View File

@@ -135,53 +135,6 @@ describe("parseMd — items", () => {
});
});
describe("parseMd — tables", () => {
it("extracts a simple table", () => {
const raw = `## Tool Guidance
| tool | guidance |
| --- | --- |
| gh | use for GitHub |
| curl | HTTP client |
`;
const { ast } = parseMd(raw);
const table = ast.blocks[0]?.tables[0];
if (!table) {
throw new Error("expected parsed markdown table");
}
expect(table.headers).toEqual(["tool", "guidance"]);
expect(table.rows.length).toBe(2);
expect(table.rows[0]).toEqual(["gh", "use for GitHub"]);
});
});
describe("parseMd — code blocks", () => {
it("extracts a fenced code block", () => {
const raw = `## Examples
\`\`\`ts
const x = 1;
\`\`\`
`;
const { ast } = parseMd(raw);
expect(ast.blocks[0]?.codeBlocks[0]).toMatchObject({
lang: "ts",
text: "const x = 1;",
});
});
it("handles unlanguaged fences", () => {
const raw = `## Block
\`\`\`
plain text
\`\`\`
`;
const { ast } = parseMd(raw);
expect(ast.blocks[0]?.codeBlocks[0]?.lang).toBeNull();
});
});
describe("parseMd — byte-fidelity", () => {
it("preserves raw on the AST", () => {
const raw = `---\nname: x\n---\n\n## Sec\n\n- a\n- b\n`;

View File

@@ -1,97 +0,0 @@
/**
* Wave 6 — fenced code blocks.
*
* Substrate guarantee: triple-backtick fences (` ``` `) inside H2 blocks
* extract as `AstCodeBlock` with `lang` (or null) and verbatim `text`.
* Code blocks suppress H2-split and item-extraction inside their body.
*/
import { describe, expect, it } from "vitest";
import { parseMd } from "../../parse.js";
describe("wave-06 code-blocks", () => {
it("CB-01 unlanguaged fence", () => {
const raw = `## H\n\n\`\`\`\nplain text\n\`\`\`\n`;
const { ast } = parseMd(raw);
expect(ast.blocks[0]?.codeBlocks[0]).toMatchObject({
lang: null,
text: "plain text",
});
});
it("CB-02 languaged fence", () => {
const raw = `## H\n\n\`\`\`ts\nconst x = 1;\n\`\`\`\n`;
const { ast } = parseMd(raw);
expect(ast.blocks[0]?.codeBlocks[0]?.lang).toBe("ts");
expect(ast.blocks[0]?.codeBlocks[0]?.text).toBe("const x = 1;");
});
it("CB-03 multi-line code body preserved verbatim", () => {
const raw = `## H\n\n\`\`\`ts\nline 1\nline 2\nline 3\n\`\`\`\n`;
const { ast } = parseMd(raw);
expect(ast.blocks[0]?.codeBlocks[0]?.text).toBe("line 1\nline 2\nline 3");
});
it("CB-04 empty code block", () => {
const raw = `## H\n\n\`\`\`\n\`\`\`\n`;
const { ast } = parseMd(raw);
expect(ast.blocks[0]?.codeBlocks[0]?.text).toBe("");
});
it("CB-05 code block with `## ` does NOT split as heading", () => {
const raw = `## Real\n\n\`\`\`md\n## Not a heading\n\`\`\`\n\n## Another real\n`;
const { ast } = parseMd(raw);
expect(ast.blocks.map((b) => b.heading)).toEqual(["Real", "Another real"]);
});
it("CB-06 code block with `- bullet` does NOT extract as item", () => {
const raw = `## H\n\n\`\`\`\n- not a bullet\n- still not\n\`\`\`\n\n- real bullet\n`;
const { ast } = parseMd(raw);
expect(ast.blocks[0]?.items.map((i) => i.text)).toEqual(["real bullet"]);
});
it("CB-07 multiple code blocks in same section", () => {
const raw = `## H\n\n\`\`\`a\nfirst\n\`\`\`\n\n\`\`\`b\nsecond\n\`\`\`\n`;
const { ast } = parseMd(raw);
expect(ast.blocks[0]?.codeBlocks.length).toBe(2);
expect(ast.blocks[0]?.codeBlocks.map((c) => c.lang)).toEqual(["a", "b"]);
});
it("CB-08 unterminated fence — body extends to end of section", () => {
const raw = `## H\n\n\`\`\`\nopen but never closes\n`;
const { ast } = parseMd(raw);
// Behavior: code block is created with whatever was after the open
// fence, including any trailing newline lines. Documents are
// likely malformed; substrate is lenient and preserves what's
// there (verifiable via raw round-trip).
expect(ast.blocks[0]?.codeBlocks[0]?.text).toContain("open but never closes");
});
it("CB-09 fence with leading spaces (4-space indented code)", () => {
// Note: only column-0 ``` triggers fence. Indented content is body
// text. This is the documented behavior.
const raw = `## H\n\n \`\`\`\n indented\n \`\`\`\n`;
const { ast } = parseMd(raw);
expect(ast.blocks[0]?.codeBlocks).toEqual([]);
});
it("CB-10 lang tag with extra whitespace trimmed", () => {
const raw = `## H\n\n\`\`\` jsonc \nbody\n\`\`\`\n`;
const { ast } = parseMd(raw);
expect(ast.blocks[0]?.codeBlocks[0]?.lang).toBe("jsonc");
});
it("CB-11 lang tag with hyphen / dot (typescript-jsx, c++)", () => {
const raw = `## H\n\n\`\`\`typescript-jsx\nx\n\`\`\`\n`;
const { ast } = parseMd(raw);
expect(ast.blocks[0]?.codeBlocks[0]?.lang).toBe("typescript-jsx");
});
it("CB-12 fence appearing in preamble (before any H2) is ignored at block layer", () => {
const raw = `\`\`\`\npreamble code\n\`\`\`\n\n## H\n`;
const { ast } = parseMd(raw);
// Preamble code blocks aren't structurally extracted at the
// substrate layer; this is documented. Lint can scan preamble
// raw if needed.
expect(ast.blocks[0]?.codeBlocks).toEqual([]);
});
});

View File

@@ -56,10 +56,12 @@ describe("wave-03 h2-block-split", () => {
expect(ast.blocks[0]?.heading).toBe("With space");
});
it("H2-08 leading whitespace before `##` — does NOT match (regex anchored at line start)", () => {
it("H2-08 leading whitespace before `##` — recognized as heading (CommonMark)", () => {
// Substrate accepts up to 3 spaces of indentation as an atx
// heading per CommonMark. Lint rules can flag if a particular
// workspace file requires column-zero authoring.
const { ast } = parseMd(" ## indented\n## not indented\n");
expect(ast.blocks.length).toBe(1);
expect(ast.blocks[0]?.heading).toBe("not indented");
expect(ast.blocks.map((b) => b.heading)).toEqual(["indented", "not indented"]);
});
it("H2-09 trailing whitespace on heading — trimmed in heading text", () => {
@@ -126,16 +128,19 @@ describe("wave-03 h2-block-split", () => {
});
it("H2-19 empty heading text (`## `)", () => {
// Substrate accepts an empty atx heading; downstream lint
// (`OC_HEADING_EMPTY`) flags it. Slug is empty string — collisions
// are a lint-level concern, not a parser refusal.
const { ast } = parseMd("## \n");
// Empty heading is technically a valid match (`## ` + empty text)
// but the regex requires `(.+?)` so empty doesn't match. Validates
// it's NOT split.
expect(ast.blocks).toEqual([]);
expect(ast.blocks.length).toBe(1);
expect(ast.blocks[0]?.heading).toBe("");
expect(ast.blocks[0]?.slug).toBe("");
});
it("H2-20 heading with only whitespace (`## `)", () => {
const { ast } = parseMd("## \n");
expect(ast.blocks).toEqual([]);
expect(ast.blocks.length).toBe(1);
expect(ast.blocks[0]?.heading).toBe("");
});
it("H2-21 heading-shaped text inside multi-line bullet body — does split", () => {

View File

@@ -85,23 +85,27 @@ describe("wave-04 items", () => {
expect(ast.blocks[0]?.items[0]?.text).toBe("spaced");
});
it("I-15 empty bullet text is dropped", () => {
it("I-15 empty bullet — recognized with empty text/slug", () => {
// Substrate accepts an empty bullet; lint can flag if collisions
// matter. Both `- ` and `- real` become items.
const { ast } = parseMd("## H\n- \n- real\n");
// The regex requires (.+?) non-empty, so `- ` alone doesn't match.
expect(ast.blocks[0]?.items.length).toBe(1);
expect(ast.blocks[0]?.items.length).toBe(2);
expect(ast.blocks[0]?.items.map((i) => i.text)).toEqual(["", "real"]);
});
it("I-16 indented bullet (sub-bullet) — current parser still picks up", () => {
// The current regex `^(?:[-*+])\\s+(.+?)\\s*$` requires column-0
// bullet markers; indented bullets do NOT match. Documented as a
// limit — sub-bullets surface in body text but not in items.
it("I-16 indented bullet (sub-bullet) — recognized as item alongside parent", () => {
// Substrate flattens the bullet tree into a list of items;
// sub-bullets surface as their own AstItem entries. Lint rules
// can flag depth or duplicate-slug collisions.
const { ast } = parseMd("## H\n- top\n - sub\n");
expect(ast.blocks[0]?.items.map((i) => i.text)).toEqual(["top"]);
expect(ast.blocks[0]?.items.map((i) => i.text)).toEqual(["top", "sub"]);
});
it("I-17 numbered list (1. item) is NOT extracted as item", () => {
it("I-17 numbered list (1. item) — recognized as items", () => {
// Substrate treats ordered and unordered lists symmetrically.
// Lint rules can flag if a particular file requires bullet style.
const { ast } = parseMd("## H\n1. first\n2. second\n");
expect(ast.blocks[0]?.items).toEqual([]);
expect(ast.blocks[0]?.items.map((i) => i.text)).toEqual(["first", "second"]);
});
it("I-18 items in a section with no body before — first item line is heading+1", () => {

View File

@@ -56,7 +56,7 @@ describe("wave-12 real-world-fixtures", () => {
}
});
it("F-04 TOOLS.md table extracted from Tool Guidance section", () => {
it("F-04 TOOLS.md tool-guidance section resolves by slug", () => {
const raw = load("TOOLS.md");
const { ast } = parseMd(raw);
expect(emitMd(ast)).toBe(raw);
@@ -65,10 +65,6 @@ describe("wave-12 real-world-fixtures", () => {
section: "tool-guidance",
});
expect(guidance?.kind).toBe("block");
if (guidance?.kind === "block") {
expect(guidance.node.tables.length).toBeGreaterThan(0);
expect(guidance.node.tables[0]?.headers).toEqual(["tool", "guidance"]);
}
});
it("F-05 IDENTITY.md sections resolvable by slug", () => {

View File

@@ -1,154 +0,0 @@
/**
* Wave 5 — markdown tables.
*
* Substrate guarantee: GFM-style tables (`| h | h |\n|---|---|\n| r | r |`)
* inside H2 blocks are extracted into `AstTable`. Tables inside fenced
* code blocks are NOT extracted (handled at item-extraction layer too;
* tables share the same code-block awareness when relevant).
*/
import { describe, expect, it } from "vitest";
import { parseMd } from "../../parse.js";
describe("wave-05 tables", () => {
it("T-01 standard 2-column table", () => {
const raw = `## H
| tool | guidance |
| --- | --- |
| gh | use for GitHub |
| curl | HTTP client |
`;
const { ast } = parseMd(raw);
const table = ast.blocks[0]?.tables[0];
expect(table?.headers).toEqual(["tool", "guidance"]);
expect(table?.rows).toEqual([
["gh", "use for GitHub"],
["curl", "HTTP client"],
]);
});
it("T-02 3+ column table", () => {
const raw = `## H
| a | b | c |
| - | - | - |
| 1 | 2 | 3 |
`;
const { ast } = parseMd(raw);
expect(ast.blocks[0]?.tables[0]?.headers).toEqual(["a", "b", "c"]);
expect(ast.blocks[0]?.tables[0]?.rows[0]).toEqual(["1", "2", "3"]);
});
it("T-03 table with alignment colons in separator", () => {
const raw = `## H
| left | center | right |
| :--- | :---: | ---: |
| a | b | c |
`;
const { ast } = parseMd(raw);
expect(ast.blocks[0]?.tables.length).toBe(1);
});
it("T-04 table with empty cells", () => {
const raw = `## H
| a | b |
| - | - |
| 1 | |
| | 2 |
`;
const { ast } = parseMd(raw);
expect(ast.blocks[0]?.tables[0]?.rows).toEqual([
["1", ""],
["", "2"],
]);
});
it("T-05 table with no rows (header + sep only)", () => {
const raw = `## H
| a | b |
| - | - |
`;
const { ast } = parseMd(raw);
expect(ast.blocks[0]?.tables[0]?.headers).toEqual(["a", "b"]);
expect(ast.blocks[0]?.tables[0]?.rows).toEqual([]);
});
it("T-06 multiple tables in same section", () => {
const raw = `## H
| a | b |
| - | - |
| 1 | 2 |
Some text.
| x | y |
| - | - |
| 3 | 4 |
`;
const { ast } = parseMd(raw);
expect(ast.blocks[0]?.tables.length).toBe(2);
});
it("T-07 table line numbers track to the header line", () => {
const raw = `## Section
preamble line
| a | b |
| - | - |
`;
const { ast } = parseMd(raw);
expect(ast.blocks[0]?.tables[0]?.line).toBeGreaterThan(0);
});
it("T-08 invalid separator (no pipes) — no table extracted", () => {
const raw = `## H
| a | b |
not a separator
| 1 | 2 |
`;
const { ast } = parseMd(raw);
expect(ast.blocks[0]?.tables).toEqual([]);
});
it("T-09 single-column table (just `| col |\\n|---|`)", () => {
const raw = `## H
| col |
| --- |
| value1 |
| value2 |
`;
const { ast } = parseMd(raw);
expect(ast.blocks[0]?.tables[0]?.headers).toEqual(["col"]);
expect(ast.blocks[0]?.tables[0]?.rows).toEqual([["value1"], ["value2"]]);
});
it("T-10 table at end of file with trailing newlines", () => {
const raw = `## H
| a |
| - |
| 1 |
`;
const { ast } = parseMd(raw);
expect(ast.blocks[0]?.tables[0]?.rows).toEqual([["1"]]);
});
it("T-11 table content with internal whitespace trimmed", () => {
const raw = `## H
| col1 | col2 |
| --- | --- |
| a | b |
`;
const { ast } = parseMd(raw);
expect(ast.blocks[0]?.tables[0]?.headers).toEqual(["col1", "col2"]);
expect(ast.blocks[0]?.tables[0]?.rows[0]).toEqual(["a", "b"]);
});
});