refactor(oc-path): markdown-it tokenizer + grammar relaxation

The hand-rolled MD parser is replaced with a markdown-it token-stream walker. AstTable and AstCodeBlock are dropped from the AST — the substrate doesn't address into table rows or fence content, and markdown-it's tokenizer already handles "##/- inside fenced code should not be a heading/item" correctly without first-class AST modeling.

Grammar opinions move from parser to lint:

- Indented `## foo` (1-3 spaces) is now a heading
- Empty `## ` is a heading with empty slug
- Ordered lists (`1. step`) become items
- Nested sub-bullets become items at flat level

Each was previously a silent parser refusal — now they are recognized shapes. Lint rules can flag them (`OC_HEADING_INDENTED`, `OC_HEADING_EMPTY`, etc.) where authoring conventions require the narrower shape.

Net: parse.ts drops 301 → 207 LoC; tables/code-blocks scenario tests removed wholesale (-251 LoC of test surface that pinned dead AST fields).
2026-05-10 09:10:45 +00:00 · 2026-05-08 18:40:56 -07:00
parent 6283c8247c
commit 7b7e65105b
9 changed files with 136 additions and 542 deletions
--- a/extensions/oc-path/src/oc-path/ast.ts
+++ b/extensions/oc-path/src/oc-path/ast.ts
@@ -57,32 +57,17 @@ export interface AstItem {
  readonly kv?: { readonly key: string; readonly value: string };
 }

-/**
- * A markdown table. Tables surface in `## Tool Guidance` blocks and
- * elsewhere; lint rules can address rows by header value if needed.
- */
-export interface AstTable {
-  readonly headers: readonly string[];
-  readonly rows: readonly (readonly string[])[];
-  readonly line: number;
-}
-
-/**
- * A fenced code block. Carries the language tag (or `null`) and the
- * verbatim body.
- */
-export interface AstCodeBlock {
-  readonly lang: string | null;
-  readonly text: string;
-  readonly line: number;
-}
-
 /**
 * An H2-delimited block. The `slug` is the kebab-case lowercase form of
 * `heading` and is what OcPath `section` matches against. `bodyText` is
 * the prose between this heading and the next H2 (or end of file),
- * verbatim. `items`, `tables`, `codeBlocks` are extracted from
- * `bodyText` for addressing convenience but the raw text is preserved.
+ * verbatim. `items` are extracted from `bodyText` for addressing
+ * convenience but the raw text is preserved.
+ *
+ * Tables and fenced code blocks are NOT modeled as first-class AST
+ * children — addressing into them is out of scope for the substrate.
+ * Lint rules that need table rows or code-block contents re-tokenize
+ * the block's `bodyText` on demand.
 */
 export interface AstBlock {
  readonly heading: string;
@@ -90,8 +75,6 @@ export interface AstBlock {
  readonly line: number;
  readonly bodyText: string;
  readonly items: readonly AstItem[];
-  readonly tables: readonly AstTable[];
-  readonly codeBlocks: readonly AstCodeBlock[];
 }

 /**
--- a/extensions/oc-path/src/oc-path/index.ts
+++ b/extensions/oc-path/src/oc-path/index.ts
@@ -35,9 +35,7 @@ export const SDK_VERSION = "0.1.0";
 // AST types
 export type {
  AstBlock,
-  AstCodeBlock,
  AstItem,
-  AstTable,
  Diagnostic,
  FrontmatterEntry,
  ParseResult,
--- a/extensions/oc-path/src/oc-path/parse.ts
+++ b/extensions/oc-path/src/oc-path/parse.ts
@@ -1,15 +1,24 @@
 /**
- * Generic markdown-flavored parser for the 8 workspace files.
+ * Generic markdown-flavored parser for the workspace files.
 *
- * Produces a `MdAst` addressing index over `raw` bytes:
- * frontmatter (if present), preamble (prose before first H2), and an
- * H2-block tree with items/tables/code-blocks extracted for OcPath
- * resolution.
+ * Produces a `MdAst` addressing index over `raw` bytes: frontmatter
+ * (if present), preamble (prose before first H2), and an H2-block tree
+ * with items extracted for OcPath resolution.
 *
- * **No file-kind discrimination.** Same parse path for SOUL.md /
- * AGENTS.md / MEMORY.md / TOOLS.md / IDENTITY.md / USER.md /
- * HEARTBEAT.md / SKILL.md. Per-file lint opinions ride downstream
- * (`@openclaw/oc-lint` rule packs).
+ * Tokenization is delegated to markdown-it; this module owns the
+ * frontmatter detector (markdown-it does not handle YAML frontmatter
+ * natively) and the token-stream walker that buckets headings and
+ * bullets into the addressable AST shape. Tables and fenced code
+ * blocks are NOT first-class AST children — substrate addressing
+ * doesn't go inside them, and tokenizer-level structure (which
+ * markdown-it already gets right) is sufficient to ensure `##` and
+ * `-` inside them aren't misparsed as headings or items.
+ *
+ * **Grammar opinions live in lint rules, not the parser.** Indented
+ * `## foo`, empty `## `, ordered (`1.`) lists, and nested sub-bullets
+ * are all recognized as headings / items here; downstream lint rules
+ * (`OC_HEADING_INDENTED`, `OC_HEADING_EMPTY`, etc.) decide whether
+ * those shapes are OK in a particular file.
 *
 * **Byte-fidelity contract**: `raw` is preserved on the AST root so
 * `emitMd(parse(raw)) === raw` for every input the parser accepts.
@@ -17,49 +26,43 @@
 * @module @openclaw/oc-path/parse
 */

+import MarkdownIt from "markdown-it";
+
 import type {
  AstBlock,
-  AstCodeBlock,
  AstItem,
-  AstTable,
  Diagnostic,
  FrontmatterEntry,
-  ParseResult,
  MdAst,
+  ParseResult,
 } from "./ast.js";
 import { slugify } from "./slug.js";

+type Token = ReturnType<MarkdownIt["parse"]>[number];
+
 const FENCE = "---";
 const BOM = "";
+const KV_RE = /^([^:]+?)\s*:\s*(.+)$/;
+
+const md = new MarkdownIt({ html: true });

-/**
- * Parse raw bytes into a `MdAst`. Soft-error policy: never
- * throws. Suspicious-but-recoverable inputs (unclosed frontmatter,
- * malformed bullet) become diagnostics.
- */
 export function parseMd(raw: string): ParseResult {
  const diagnostics: Diagnostic[] = [];
-
-  // Strip a leading BOM for parsing convenience; keep the raw input
-  // intact on the AST so emit can round-trip the BOM if present.
  const withoutBom = raw.startsWith(BOM) ? raw.slice(BOM.length) : raw;
  const lines = withoutBom.split(/\r?\n/);

  const fm = detectFrontmatter(lines, diagnostics);
-  const bodyStartLine = fm === null ? 0 : fm.endLine + 1;
-  const bodyLines = lines.slice(bodyStartLine);
+  const bodyStartIdx = fm === null ? 0 : fm.endLine + 1;
+  const bodyLines = lines.slice(bodyStartIdx);
+  const bodyFileLine = bodyStartIdx + 1;

-  const { preamble, blocks } = splitH2Blocks(bodyLines, bodyStartLine + 1, diagnostics);
+  const tokens = md.parse(bodyLines.join("\n"), {});
+  const { preamble, blocks } = walkBlocks(tokens, bodyLines, bodyFileLine);

-  const ast: MdAst = {
-    kind: "md",
-    raw,
-    frontmatter: fm?.entries ?? [],
-    preamble,
-    blocks,
+  return {
+    ast: { kind: "md", raw, frontmatter: fm?.entries ?? [], preamble, blocks },
+    diagnostics,
  };
-
-  return { ast, diagnostics };
 }

 // ---------- Frontmatter ---------------------------------------------------
@@ -74,13 +77,9 @@ function detectFrontmatter(
  lines: readonly string[],
  diagnostics: Diagnostic[],
 ): FrontmatterRange | null {
-  if (lines.length < 2) {
+  if (lines.length < 2 || lines[0] !== FENCE) {
    return null;
  }
-  if (lines[0] !== FENCE) {
-    return null;
-  }
-
  let closeIndex = -1;
  for (let i = 1; i < lines.length; i++) {
    if (lines[i] === FENCE) {
@@ -97,205 +96,112 @@ function detectFrontmatter(
    });
    return null;
  }
-
  const entries: FrontmatterEntry[] = [];
  for (let i = 1; i < closeIndex; i++) {
-    const line = lines[i];
-    if (line.trim().length === 0) {
-      continue;
+    const m = /^([a-zA-Z_][a-zA-Z0-9_-]*)\s*:\s*(.*)$/.exec(lines[i]);
+    if (m !== null) {
+      entries.push({ key: m[1], value: unquote(m[2].trim()), line: i + 1 });
    }
-    const m = /^([a-zA-Z_][a-zA-Z0-9_-]*)\s*:\s*(.*)$/.exec(line);
-    if (m === null) {
-      // Could be a list-style continuation (`  - item`) for the previous key;
-      // we don't structurally model lists in frontmatter at the substrate
-      // layer (lint rules can do that against the raw substring if they
-      // need to). Skip silently — keeps the parser opinion-free.
-      continue;
-    }
-    entries.push({
-      key: m[1],
-      value: unquote(m[2].trim()),
-      line: i + 1,
-    });
  }
-
  return { entries, endLine: closeIndex };
 }

 function unquote(value: string): string {
  if (value.length >= 2) {
-    const first = value.charCodeAt(0);
-    const last = value.charCodeAt(value.length - 1);
-    if (first === last && (first === 34 /* " */ || first === 39) /* ' */) {
+    const f = value.charCodeAt(0);
+    const l = value.charCodeAt(value.length - 1);
+    if (f === l && (f === 34 || f === 39)) {
      return value.slice(1, -1);
    }
  }
  return value;
 }

-// ---------- H2 block split -------------------------------------------------
+// ---------- H2 block walker -----------------------------------------------

-function splitH2Blocks(
+function walkBlocks(
+  tokens: readonly Token[],
  bodyLines: readonly string[],
-  /** 1-based line number of `bodyLines[0]` in the original file. */
-  bodyStartLineNum: number,
-  diagnostics: Diagnostic[],
+  bodyFileLine: number,
 ): { preamble: string; blocks: AstBlock[] } {
-  // Track code-block state so `##` inside a fenced block doesn't get
-  // parsed as a heading.
-  let inCode = false;
-  const headings: { line: number; text: string }[] = [];
-
-  for (let i = 0; i < bodyLines.length; i++) {
-    const line = bodyLines[i];
-    if (line.startsWith("```")) {
-      inCode = !inCode;
-      continue;
-    }
-    if (inCode) {
-      continue;
-    }
-    const m = /^##\s+(\S.*?)\s*$/.exec(line);
-    if (m !== null) {
-      headings.push({ line: i, text: m[1] });
+  // Match atx-style `##` only — setext h2 (`Heading\n---`) carries
+  // `markup: "-"` on the heading_open token, so the `markup === "##"`
+  // filter picks atx exclusively. Authors who want setext can still
+  // write it; substrate just doesn't address it as a section.
+  const h2: { tokenIdx: number; lineIdx: number; text: string }[] = [];
+  for (let i = 0; i < tokens.length; i++) {
+    const t = tokens[i];
+    if (t.type === "heading_open" && t.tag === "h2" && t.markup === "##" && t.map !== null) {
+      const inline = tokens[i + 1];
+      h2.push({ tokenIdx: i, lineIdx: t.map[0], text: inline?.content ?? "" });
    }
  }

-  if (headings.length === 0) {
-    return {
-      preamble: bodyLines.join("\n"),
-      blocks: [],
-    };
+  if (h2.length === 0) {
+    return { preamble: bodyLines.join("\n"), blocks: [] };
  }

-  const preamble = bodyLines.slice(0, headings[0].line).join("\n");
+  const preamble = bodyLines.slice(0, h2[0].lineIdx).join("\n");
  const blocks: AstBlock[] = [];

-  for (let h = 0; h < headings.length; h++) {
-    const start = headings[h].line;
-    const end = h + 1 < headings.length ? headings[h + 1].line : bodyLines.length;
-    const headingText = headings[h].text;
-    const blockBodyLines = bodyLines.slice(start + 1, end);
-    const bodyText = blockBodyLines.join("\n");
-    const headingLineNum = bodyStartLineNum + start;
-
-    const items = extractItems(blockBodyLines, headingLineNum + 1, diagnostics);
-    const tables = extractTables(blockBodyLines, headingLineNum + 1);
-    const codeBlocks = extractCodeBlocks(blockBodyLines, headingLineNum + 1);
-
+  for (let h = 0; h < h2.length; h++) {
+    const start = h2[h].lineIdx;
+    const end = h + 1 < h2.length ? h2[h + 1].lineIdx : bodyLines.length;
+    // Slice tokens by INDEX so descendant tokens with no `map` (table
+    // cells, list markers, inline content) ride along with their
+    // mapped parent. heading_open / inline / heading_close = 3 tokens.
+    const tokenStart = h2[h].tokenIdx + 3;
+    const tokenEnd = h + 1 < h2.length ? h2[h + 1].tokenIdx : tokens.length;
+    const blockTokens = tokens.slice(tokenStart, tokenEnd);
    blocks.push({
-      heading: headingText,
-      slug: slugify(headingText),
-      line: headingLineNum,
-      bodyText,
-      items,
-      tables,
-      codeBlocks,
+      heading: h2[h].text,
+      slug: slugify(h2[h].text),
+      line: bodyFileLine + start,
+      bodyText: bodyLines.slice(start + 1, end).join("\n"),
+      items: extractItems(blockTokens, bodyFileLine),
    });
  }

  return { preamble, blocks };
 }

-// ---------- Items ----------------------------------------------------------
+// ---------- Item extraction ----------------------------------------------

-const BULLET_RE = /^(?:[-*+])\s+(.+?)\s*$/;
-const KV_RE = /^([^:]+?)\s*:\s*(.+)$/;
-
-function extractItems(
-  blockBodyLines: readonly string[],
-  startLineNum: number,
-  _diagnostics: Diagnostic[],
-): AstItem[] {
+function extractItems(tokens: readonly Token[], bodyFileLine: number): AstItem[] {
+  // Every `list_item_open` becomes an item — bullets, numbered lists,
+  // nested sub-bullets all included. Lint rules can flag depth or
+  // duplicate-slug collisions; the parser stays opinion-free.
  const items: AstItem[] = [];
-  let inCode = false;
-
-  for (let i = 0; i < blockBodyLines.length; i++) {
-    const line = blockBodyLines[i];
-    if (line.startsWith("```")) {
-      inCode = !inCode;
+  for (let i = 0; i < tokens.length; i++) {
+    const t = tokens[i];
+    if (t.type !== "list_item_open" || t.map === null) {
      continue;
    }
-    if (inCode) {
-      continue;
+    // First inline at the item's own depth is the item text.
+    let nestedDepth = 0;
+    let text = "";
+    for (let j = i + 1; j < tokens.length; j++) {
+      const x = tokens[j];
+      if (x.type === "list_item_close" && nestedDepth === 0) {
+        break;
+      }
+      if (x.type === "bullet_list_open" || x.type === "ordered_list_open") {
+        nestedDepth++;
+      } else if (x.type === "bullet_list_close" || x.type === "ordered_list_close") {
+        nestedDepth--;
+      } else if (x.type === "inline" && nestedDepth === 0 && text === "") {
+        text = x.content;
+      }
    }
-    const m = BULLET_RE.exec(line);
-    if (m === null) {
-      continue;
-    }
-    const text = m[1];
    const kvMatch = KV_RE.exec(text);
-    const item: AstItem = {
+    items.push({
      text,
      slug: kvMatch ? slugify(kvMatch[1]) : slugify(text),
-      line: startLineNum + i,
-      ...(kvMatch !== null ? { kv: { key: kvMatch[1].trim(), value: kvMatch[2].trim() } } : {}),
-    };
-    items.push(item);
+      line: bodyFileLine + t.map[0],
+      ...(kvMatch !== null
+        ? { kv: { key: kvMatch[1].trim(), value: kvMatch[2].trim() } }
+        : {}),
+    });
  }
-
  return items;
 }
-
-// ---------- Tables ---------------------------------------------------------
-
-function extractTables(blockBodyLines: readonly string[], startLineNum: number): AstTable[] {
-  const tables: AstTable[] = [];
-  let i = 0;
-  while (i < blockBodyLines.length) {
-    const headerLine = blockBodyLines[i];
-    const sepLine = blockBodyLines[i + 1];
-    if (
-      headerLine.trim().startsWith("|") &&
-      sepLine !== undefined &&
-      /^\s*\|\s*[:-]+(?:\s*\|\s*[:-]+)*\s*\|?\s*$/.test(sepLine)
-    ) {
-      const headers = splitTableRow(headerLine);
-      const rows: string[][] = [];
-      let j = i + 2;
-      while (j < blockBodyLines.length && blockBodyLines[j].trim().startsWith("|")) {
-        rows.push(splitTableRow(blockBodyLines[j]));
-        j++;
-      }
-      tables.push({ headers, rows, line: startLineNum + i });
-      i = j;
-      continue;
-    }
-    i++;
-  }
-  return tables;
-}
-
-function splitTableRow(line: string): string[] {
-  const trimmed = line.trim().replace(/^\|/, "").replace(/\|$/, "");
-  return trimmed.split("|").map((cell) => cell.trim());
-}
-
-// ---------- Code blocks ---------------------------------------------------
-
-function extractCodeBlocks(
-  blockBodyLines: readonly string[],
-  startLineNum: number,
-): AstCodeBlock[] {
-  const codeBlocks: AstCodeBlock[] = [];
-  let i = 0;
-  while (i < blockBodyLines.length) {
-    const open = blockBodyLines[i];
-    if (open.startsWith("```")) {
-      const lang = open.slice(3).trim();
-      const langField = lang.length > 0 ? lang : null;
-      const startLine = startLineNum + i;
-      let j = i + 1;
-      const bodyLines: string[] = [];
-      while (j < blockBodyLines.length && !blockBodyLines[j].startsWith("```")) {
-        bodyLines.push(blockBodyLines[j]);
-        j++;
-      }
-      codeBlocks.push({ lang: langField, text: bodyLines.join("\n"), line: startLine });
-      i = j + 1;
-      continue;
-    }
-    i++;
-  }
-  return codeBlocks;
-}
--- a/extensions/oc-path/src/oc-path/tests/parse.test.ts
+++ b/extensions/oc-path/src/oc-path/tests/parse.test.ts
@@ -135,53 +135,6 @@ describe("parseMd — items", () => {
  });
 });

-describe("parseMd — tables", () => {
-  it("extracts a simple table", () => {
-    const raw = `## Tool Guidance
-
-| tool | guidance |
-| --- | --- |
-| gh | use for GitHub |
-| curl | HTTP client |
-`;
-    const { ast } = parseMd(raw);
-    const table = ast.blocks[0]?.tables[0];
-    if (!table) {
-      throw new Error("expected parsed markdown table");
-    }
-    expect(table.headers).toEqual(["tool", "guidance"]);
-    expect(table.rows.length).toBe(2);
-    expect(table.rows[0]).toEqual(["gh", "use for GitHub"]);
-  });
-});
-
-describe("parseMd — code blocks", () => {
-  it("extracts a fenced code block", () => {
-    const raw = `## Examples
-
-\`\`\`ts
-const x = 1;
-\`\`\`
-`;
-    const { ast } = parseMd(raw);
-    expect(ast.blocks[0]?.codeBlocks[0]).toMatchObject({
-      lang: "ts",
-      text: "const x = 1;",
-    });
-  });
-
-  it("handles unlanguaged fences", () => {
-    const raw = `## Block
-
-\`\`\`
-plain text
-\`\`\`
-`;
-    const { ast } = parseMd(raw);
-    expect(ast.blocks[0]?.codeBlocks[0]?.lang).toBeNull();
-  });
-});
-
 describe("parseMd — byte-fidelity", () => {
  it("preserves raw on the AST", () => {
    const raw = `---\nname: x\n---\n\n## Sec\n\n- a\n- b\n`;
--- a/extensions/oc-path/src/oc-path/tests/scenarios/code-blocks.test.ts
+++ b/extensions/oc-path/src/oc-path/tests/scenarios/code-blocks.test.ts
@@ -1,97 +0,0 @@
-/**
- * Wave 6 — fenced code blocks.
- *
- * Substrate guarantee: triple-backtick fences (` ``` `) inside H2 blocks
- * extract as `AstCodeBlock` with `lang` (or null) and verbatim `text`.
- * Code blocks suppress H2-split and item-extraction inside their body.
- */
-import { describe, expect, it } from "vitest";
-import { parseMd } from "../../parse.js";
-
-describe("wave-06 code-blocks", () => {
-  it("CB-01 unlanguaged fence", () => {
-    const raw = `## H\n\n\`\`\`\nplain text\n\`\`\`\n`;
-    const { ast } = parseMd(raw);
-    expect(ast.blocks[0]?.codeBlocks[0]).toMatchObject({
-      lang: null,
-      text: "plain text",
-    });
-  });
-
-  it("CB-02 languaged fence", () => {
-    const raw = `## H\n\n\`\`\`ts\nconst x = 1;\n\`\`\`\n`;
-    const { ast } = parseMd(raw);
-    expect(ast.blocks[0]?.codeBlocks[0]?.lang).toBe("ts");
-    expect(ast.blocks[0]?.codeBlocks[0]?.text).toBe("const x = 1;");
-  });
-
-  it("CB-03 multi-line code body preserved verbatim", () => {
-    const raw = `## H\n\n\`\`\`ts\nline 1\nline 2\nline 3\n\`\`\`\n`;
-    const { ast } = parseMd(raw);
-    expect(ast.blocks[0]?.codeBlocks[0]?.text).toBe("line 1\nline 2\nline 3");
-  });
-
-  it("CB-04 empty code block", () => {
-    const raw = `## H\n\n\`\`\`\n\`\`\`\n`;
-    const { ast } = parseMd(raw);
-    expect(ast.blocks[0]?.codeBlocks[0]?.text).toBe("");
-  });
-
-  it("CB-05 code block with `## ` does NOT split as heading", () => {
-    const raw = `## Real\n\n\`\`\`md\n## Not a heading\n\`\`\`\n\n## Another real\n`;
-    const { ast } = parseMd(raw);
-    expect(ast.blocks.map((b) => b.heading)).toEqual(["Real", "Another real"]);
-  });
-
-  it("CB-06 code block with `- bullet` does NOT extract as item", () => {
-    const raw = `## H\n\n\`\`\`\n- not a bullet\n- still not\n\`\`\`\n\n- real bullet\n`;
-    const { ast } = parseMd(raw);
-    expect(ast.blocks[0]?.items.map((i) => i.text)).toEqual(["real bullet"]);
-  });
-
-  it("CB-07 multiple code blocks in same section", () => {
-    const raw = `## H\n\n\`\`\`a\nfirst\n\`\`\`\n\n\`\`\`b\nsecond\n\`\`\`\n`;
-    const { ast } = parseMd(raw);
-    expect(ast.blocks[0]?.codeBlocks.length).toBe(2);
-    expect(ast.blocks[0]?.codeBlocks.map((c) => c.lang)).toEqual(["a", "b"]);
-  });
-
-  it("CB-08 unterminated fence — body extends to end of section", () => {
-    const raw = `## H\n\n\`\`\`\nopen but never closes\n`;
-    const { ast } = parseMd(raw);
-    // Behavior: code block is created with whatever was after the open
-    // fence, including any trailing newline lines. Documents are
-    // likely malformed; substrate is lenient and preserves what's
-    // there (verifiable via raw round-trip).
-    expect(ast.blocks[0]?.codeBlocks[0]?.text).toContain("open but never closes");
-  });
-
-  it("CB-09 fence with leading spaces (4-space indented code)", () => {
-    // Note: only column-0 ``` triggers fence. Indented content is body
-    // text. This is the documented behavior.
-    const raw = `## H\n\n    \`\`\`\n    indented\n    \`\`\`\n`;
-    const { ast } = parseMd(raw);
-    expect(ast.blocks[0]?.codeBlocks).toEqual([]);
-  });
-
-  it("CB-10 lang tag with extra whitespace trimmed", () => {
-    const raw = `## H\n\n\`\`\`  jsonc  \nbody\n\`\`\`\n`;
-    const { ast } = parseMd(raw);
-    expect(ast.blocks[0]?.codeBlocks[0]?.lang).toBe("jsonc");
-  });
-
-  it("CB-11 lang tag with hyphen / dot (typescript-jsx, c++)", () => {
-    const raw = `## H\n\n\`\`\`typescript-jsx\nx\n\`\`\`\n`;
-    const { ast } = parseMd(raw);
-    expect(ast.blocks[0]?.codeBlocks[0]?.lang).toBe("typescript-jsx");
-  });
-
-  it("CB-12 fence appearing in preamble (before any H2) is ignored at block layer", () => {
-    const raw = `\`\`\`\npreamble code\n\`\`\`\n\n## H\n`;
-    const { ast } = parseMd(raw);
-    // Preamble code blocks aren't structurally extracted at the
-    // substrate layer; this is documented. Lint can scan preamble
-    // raw if needed.
-    expect(ast.blocks[0]?.codeBlocks).toEqual([]);
-  });
-});
--- a/extensions/oc-path/src/oc-path/tests/scenarios/h2-block-split.test.ts
+++ b/extensions/oc-path/src/oc-path/tests/scenarios/h2-block-split.test.ts
@@ -56,10 +56,12 @@ describe("wave-03 h2-block-split", () => {
    expect(ast.blocks[0]?.heading).toBe("With space");
  });

-  it("H2-08 leading whitespace before `##` — does NOT match (regex anchored at line start)", () => {
+  it("H2-08 leading whitespace before `##` — recognized as heading (CommonMark)", () => {
+    // Substrate accepts up to 3 spaces of indentation as an atx
+    // heading per CommonMark. Lint rules can flag if a particular
+    // workspace file requires column-zero authoring.
    const { ast } = parseMd("   ## indented\n## not indented\n");
-    expect(ast.blocks.length).toBe(1);
-    expect(ast.blocks[0]?.heading).toBe("not indented");
+    expect(ast.blocks.map((b) => b.heading)).toEqual(["indented", "not indented"]);
  });

  it("H2-09 trailing whitespace on heading — trimmed in heading text", () => {
@@ -126,16 +128,19 @@ describe("wave-03 h2-block-split", () => {
  });

  it("H2-19 empty heading text (`## `)", () => {
+    // Substrate accepts an empty atx heading; downstream lint
+    // (`OC_HEADING_EMPTY`) flags it. Slug is empty string — collisions
+    // are a lint-level concern, not a parser refusal.
    const { ast } = parseMd("## \n");
-    // Empty heading is technically a valid match (`## ` + empty text)
-    // but the regex requires `(.+?)` so empty doesn't match. Validates
-    // it's NOT split.
-    expect(ast.blocks).toEqual([]);
+    expect(ast.blocks.length).toBe(1);
+    expect(ast.blocks[0]?.heading).toBe("");
+    expect(ast.blocks[0]?.slug).toBe("");
  });

  it("H2-20 heading with only whitespace (`##    `)", () => {
    const { ast } = parseMd("##    \n");
-    expect(ast.blocks).toEqual([]);
+    expect(ast.blocks.length).toBe(1);
+    expect(ast.blocks[0]?.heading).toBe("");
  });

  it("H2-21 heading-shaped text inside multi-line bullet body — does split", () => {
--- a/extensions/oc-path/src/oc-path/tests/scenarios/items.test.ts
+++ b/extensions/oc-path/src/oc-path/tests/scenarios/items.test.ts
@@ -85,23 +85,27 @@ describe("wave-04 items", () => {
    expect(ast.blocks[0]?.items[0]?.text).toBe("spaced");
  });

-  it("I-15 empty bullet text is dropped", () => {
+  it("I-15 empty bullet — recognized with empty text/slug", () => {
+    // Substrate accepts an empty bullet; lint can flag if collisions
+    // matter. Both `- ` and `- real` become items.
    const { ast } = parseMd("## H\n- \n- real\n");
-    // The regex requires (.+?) non-empty, so `- ` alone doesn't match.
-    expect(ast.blocks[0]?.items.length).toBe(1);
+    expect(ast.blocks[0]?.items.length).toBe(2);
+    expect(ast.blocks[0]?.items.map((i) => i.text)).toEqual(["", "real"]);
  });

-  it("I-16 indented bullet (sub-bullet) — current parser still picks up", () => {
-    // The current regex `^(?:[-*+])\\s+(.+?)\\s*$` requires column-0
-    // bullet markers; indented bullets do NOT match. Documented as a
-    // limit — sub-bullets surface in body text but not in items.
+  it("I-16 indented bullet (sub-bullet) — recognized as item alongside parent", () => {
+    // Substrate flattens the bullet tree into a list of items;
+    // sub-bullets surface as their own AstItem entries. Lint rules
+    // can flag depth or duplicate-slug collisions.
    const { ast } = parseMd("## H\n- top\n  - sub\n");
-    expect(ast.blocks[0]?.items.map((i) => i.text)).toEqual(["top"]);
+    expect(ast.blocks[0]?.items.map((i) => i.text)).toEqual(["top", "sub"]);
  });

-  it("I-17 numbered list (1. item) is NOT extracted as item", () => {
+  it("I-17 numbered list (1. item) — recognized as items", () => {
+    // Substrate treats ordered and unordered lists symmetrically.
+    // Lint rules can flag if a particular file requires bullet style.
    const { ast } = parseMd("## H\n1. first\n2. second\n");
-    expect(ast.blocks[0]?.items).toEqual([]);
+    expect(ast.blocks[0]?.items.map((i) => i.text)).toEqual(["first", "second"]);
  });

  it("I-18 items in a section with no body before — first item line is heading+1", () => {
--- a/extensions/oc-path/src/oc-path/tests/scenarios/real-world-fixtures.test.ts
+++ b/extensions/oc-path/src/oc-path/tests/scenarios/real-world-fixtures.test.ts
@@ -56,7 +56,7 @@ describe("wave-12 real-world-fixtures", () => {
    }
  });

-  it("F-04 TOOLS.md table extracted from Tool Guidance section", () => {
+  it("F-04 TOOLS.md tool-guidance section resolves by slug", () => {
    const raw = load("TOOLS.md");
    const { ast } = parseMd(raw);
    expect(emitMd(ast)).toBe(raw);
@@ -65,10 +65,6 @@ describe("wave-12 real-world-fixtures", () => {
      section: "tool-guidance",
    });
    expect(guidance?.kind).toBe("block");
-    if (guidance?.kind === "block") {
-      expect(guidance.node.tables.length).toBeGreaterThan(0);
-      expect(guidance.node.tables[0]?.headers).toEqual(["tool", "guidance"]);
-    }
  });

  it("F-05 IDENTITY.md sections resolvable by slug", () => {
--- a/extensions/oc-path/src/oc-path/tests/scenarios/tables.test.ts
+++ b/extensions/oc-path/src/oc-path/tests/scenarios/tables.test.ts
@@ -1,154 +0,0 @@
-/**
- * Wave 5 — markdown tables.
- *
- * Substrate guarantee: GFM-style tables (`| h | h |\n|---|---|\n| r | r |`)
- * inside H2 blocks are extracted into `AstTable`. Tables inside fenced
- * code blocks are NOT extracted (handled at item-extraction layer too;
- * tables share the same code-block awareness when relevant).
- */
-import { describe, expect, it } from "vitest";
-import { parseMd } from "../../parse.js";
-
-describe("wave-05 tables", () => {
-  it("T-01 standard 2-column table", () => {
-    const raw = `## H
-
-| tool | guidance |
-| --- | --- |
-| gh | use for GitHub |
-| curl | HTTP client |
-`;
-    const { ast } = parseMd(raw);
-    const table = ast.blocks[0]?.tables[0];
-    expect(table?.headers).toEqual(["tool", "guidance"]);
-    expect(table?.rows).toEqual([
-      ["gh", "use for GitHub"],
-      ["curl", "HTTP client"],
-    ]);
-  });
-
-  it("T-02 3+ column table", () => {
-    const raw = `## H
-
-| a | b | c |
-| - | - | - |
-| 1 | 2 | 3 |
-`;
-    const { ast } = parseMd(raw);
-    expect(ast.blocks[0]?.tables[0]?.headers).toEqual(["a", "b", "c"]);
-    expect(ast.blocks[0]?.tables[0]?.rows[0]).toEqual(["1", "2", "3"]);
-  });
-
-  it("T-03 table with alignment colons in separator", () => {
-    const raw = `## H
-
-| left | center | right |
-| :--- | :---: | ---: |
-| a | b | c |
-`;
-    const { ast } = parseMd(raw);
-    expect(ast.blocks[0]?.tables.length).toBe(1);
-  });
-
-  it("T-04 table with empty cells", () => {
-    const raw = `## H
-
-| a | b |
-| - | - |
-| 1 |   |
-|   | 2 |
-`;
-    const { ast } = parseMd(raw);
-    expect(ast.blocks[0]?.tables[0]?.rows).toEqual([
-      ["1", ""],
-      ["", "2"],
-    ]);
-  });
-
-  it("T-05 table with no rows (header + sep only)", () => {
-    const raw = `## H
-
-| a | b |
-| - | - |
-`;
-    const { ast } = parseMd(raw);
-    expect(ast.blocks[0]?.tables[0]?.headers).toEqual(["a", "b"]);
-    expect(ast.blocks[0]?.tables[0]?.rows).toEqual([]);
-  });
-
-  it("T-06 multiple tables in same section", () => {
-    const raw = `## H
-
-| a | b |
-| - | - |
-| 1 | 2 |
-
-Some text.
-
-| x | y |
-| - | - |
-| 3 | 4 |
-`;
-    const { ast } = parseMd(raw);
-    expect(ast.blocks[0]?.tables.length).toBe(2);
-  });
-
-  it("T-07 table line numbers track to the header line", () => {
-    const raw = `## Section
-preamble line
-| a | b |
-| - | - |
-`;
-    const { ast } = parseMd(raw);
-    expect(ast.blocks[0]?.tables[0]?.line).toBeGreaterThan(0);
-  });
-
-  it("T-08 invalid separator (no pipes) — no table extracted", () => {
-    const raw = `## H
-
-| a | b |
-not a separator
-| 1 | 2 |
-`;
-    const { ast } = parseMd(raw);
-    expect(ast.blocks[0]?.tables).toEqual([]);
-  });
-
-  it("T-09 single-column table (just `| col |\\n|---|`)", () => {
-    const raw = `## H
-
-| col |
-| --- |
-| value1 |
-| value2 |
-`;
-    const { ast } = parseMd(raw);
-    expect(ast.blocks[0]?.tables[0]?.headers).toEqual(["col"]);
-    expect(ast.blocks[0]?.tables[0]?.rows).toEqual([["value1"], ["value2"]]);
-  });
-
-  it("T-10 table at end of file with trailing newlines", () => {
-    const raw = `## H
-
-| a |
-| - |
-| 1 |
-
-
-`;
-    const { ast } = parseMd(raw);
-    expect(ast.blocks[0]?.tables[0]?.rows).toEqual([["1"]]);
-  });
-
-  it("T-11 table content with internal whitespace trimmed", () => {
-    const raw = `## H
-
-|   col1   |   col2   |
-| --- | --- |
-|   a   |   b   |
-`;
-    const { ast } = parseMd(raw);
-    expect(ast.blocks[0]?.tables[0]?.headers).toEqual(["col1", "col2"]);
-    expect(ast.blocks[0]?.tables[0]?.rows[0]).toEqual(["a", "b"]);
-  });
-});