From 7b7e65105b0d486342ac4eaa6b004816c5bfd71d Mon Sep 17 00:00:00 2001 From: Gio Della-Libera Date: Fri, 8 May 2026 18:40:56 -0700 Subject: [PATCH] refactor(oc-path): markdown-it tokenizer + grammar relaxation MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The hand-rolled MD parser is replaced with a markdown-it token-stream walker. AstTable and AstCodeBlock are dropped from the AST — the substrate doesn't address into table rows or fence content, and markdown-it's tokenizer already handles "##/- inside fenced code should not be a heading/item" correctly without first-class AST modeling. Grammar opinions move from parser to lint: - Indented `## foo` (1-3 spaces) is now a heading - Empty `## ` is a heading with empty slug - Ordered lists (`1. step`) become items - Nested sub-bullets become items at flat level Each was previously a silent parser refusal — now they are recognized shapes. Lint rules can flag them (`OC_HEADING_INDENTED`, `OC_HEADING_EMPTY`, etc.) where authoring conventions require the narrower shape. Net: parse.ts drops 301 → 207 LoC; tables/code-blocks scenario tests removed wholesale (-251 LoC of test surface that pinned dead AST fields). 
--- extensions/oc-path/src/oc-path/ast.ts | 31 +- extensions/oc-path/src/oc-path/index.ts | 2 - extensions/oc-path/src/oc-path/parse.ts | 296 ++++++------------ .../oc-path/src/oc-path/tests/parse.test.ts | 47 --- .../tests/scenarios/code-blocks.test.ts | 97 ------ .../tests/scenarios/h2-block-split.test.ts | 21 +- .../src/oc-path/tests/scenarios/items.test.ts | 24 +- .../scenarios/real-world-fixtures.test.ts | 6 +- .../oc-path/tests/scenarios/tables.test.ts | 154 --------- 9 files changed, 136 insertions(+), 542 deletions(-) delete mode 100644 extensions/oc-path/src/oc-path/tests/scenarios/code-blocks.test.ts delete mode 100644 extensions/oc-path/src/oc-path/tests/scenarios/tables.test.ts diff --git a/extensions/oc-path/src/oc-path/ast.ts b/extensions/oc-path/src/oc-path/ast.ts index 97633d2b95a..512c742e589 100644 --- a/extensions/oc-path/src/oc-path/ast.ts +++ b/extensions/oc-path/src/oc-path/ast.ts @@ -57,32 +57,17 @@ export interface AstItem { readonly kv?: { readonly key: string; readonly value: string }; } -/** - * A markdown table. Tables surface in `## Tool Guidance` blocks and - * elsewhere; lint rules can address rows by header value if needed. - */ -export interface AstTable { - readonly headers: readonly string[]; - readonly rows: readonly (readonly string[])[]; - readonly line: number; -} - -/** - * A fenced code block. Carries the language tag (or `null`) and the - * verbatim body. - */ -export interface AstCodeBlock { - readonly lang: string | null; - readonly text: string; - readonly line: number; -} - /** * An H2-delimited block. The `slug` is the kebab-case lowercase form of * `heading` and is what OcPath `section` matches against. `bodyText` is * the prose between this heading and the next H2 (or end of file), - * verbatim. `items`, `tables`, `codeBlocks` are extracted from - * `bodyText` for addressing convenience but the raw text is preserved. + * verbatim. 
`items` are extracted from `bodyText` for addressing + * convenience but the raw text is preserved. + * + * Tables and fenced code blocks are NOT modeled as first-class AST + * children — addressing into them is out of scope for the substrate. + * Lint rules that need table rows or code-block contents re-tokenize + * the block's `bodyText` on demand. */ export interface AstBlock { readonly heading: string; @@ -90,8 +75,6 @@ export interface AstBlock { readonly line: number; readonly bodyText: string; readonly items: readonly AstItem[]; - readonly tables: readonly AstTable[]; - readonly codeBlocks: readonly AstCodeBlock[]; } /** diff --git a/extensions/oc-path/src/oc-path/index.ts b/extensions/oc-path/src/oc-path/index.ts index f5b734aba3f..3477ab5c7e9 100644 --- a/extensions/oc-path/src/oc-path/index.ts +++ b/extensions/oc-path/src/oc-path/index.ts @@ -35,9 +35,7 @@ export const SDK_VERSION = "0.1.0"; // AST types export type { AstBlock, - AstCodeBlock, AstItem, - AstTable, Diagnostic, FrontmatterEntry, ParseResult, diff --git a/extensions/oc-path/src/oc-path/parse.ts b/extensions/oc-path/src/oc-path/parse.ts index de07f3be537..8ebfc5ea7fe 100644 --- a/extensions/oc-path/src/oc-path/parse.ts +++ b/extensions/oc-path/src/oc-path/parse.ts @@ -1,15 +1,24 @@ /** - * Generic markdown-flavored parser for the 8 workspace files. + * Generic markdown-flavored parser for the workspace files. * - * Produces a `MdAst` addressing index over `raw` bytes: - * frontmatter (if present), preamble (prose before first H2), and an - * H2-block tree with items/tables/code-blocks extracted for OcPath - * resolution. + * Produces a `MdAst` addressing index over `raw` bytes: frontmatter + * (if present), preamble (prose before first H2), and an H2-block tree + * with items extracted for OcPath resolution. * - * **No file-kind discrimination.** Same parse path for SOUL.md / - * AGENTS.md / MEMORY.md / TOOLS.md / IDENTITY.md / USER.md / - * HEARTBEAT.md / SKILL.md. 
Per-file lint opinions ride downstream - * (`@openclaw/oc-lint` rule packs). + * Tokenization is delegated to markdown-it; this module owns the + * frontmatter detector (markdown-it does not handle YAML frontmatter + * natively) and the token-stream walker that buckets headings and + * bullets into the addressable AST shape. Tables and fenced code + * blocks are NOT first-class AST children — substrate addressing + * doesn't go inside them, and tokenizer-level structure (which + * markdown-it already gets right) is sufficient to ensure `##` and + * `-` inside them aren't misparsed as headings or items. + * + * **Grammar opinions live in lint rules, not the parser.** Indented + * `## foo`, empty `## `, ordered (`1.`) lists, and nested sub-bullets + * are all recognized as headings / items here; downstream lint rules + * (`OC_HEADING_INDENTED`, `OC_HEADING_EMPTY`, etc.) decide whether + * those shapes are OK in a particular file. * * **Byte-fidelity contract**: `raw` is preserved on the AST root so * `emitMd(parse(raw)) === raw` for every input the parser accepts. @@ -17,49 +26,43 @@ * @module @openclaw/oc-path/parse */ +import MarkdownIt from "markdown-it"; + import type { AstBlock, - AstCodeBlock, AstItem, - AstTable, Diagnostic, FrontmatterEntry, - ParseResult, MdAst, + ParseResult, } from "./ast.js"; import { slugify } from "./slug.js"; +type Token = ReturnType<MarkdownIt["parse"]>[number]; + const FENCE = "---"; const BOM = "\uFEFF"; +const KV_RE = /^([^:]+?)\s*:\s*(.+)$/; + +const md = new MarkdownIt({ html: true }); -/** - * Parse raw bytes into a `MdAst`. Soft-error policy: never - * throws. Suspicious-but-recoverable inputs (unclosed frontmatter, - * malformed bullet) become diagnostics. - */ export function parseMd(raw: string): ParseResult { const diagnostics: Diagnostic[] = []; - - // Strip a leading BOM for parsing convenience; keep the raw input - // intact on the AST so emit can round-trip the BOM if present. const withoutBom = raw.startsWith(BOM) ? 
raw.slice(BOM.length) : raw; const lines = withoutBom.split(/\r?\n/); const fm = detectFrontmatter(lines, diagnostics); - const bodyStartLine = fm === null ? 0 : fm.endLine + 1; - const bodyLines = lines.slice(bodyStartLine); + const bodyStartIdx = fm === null ? 0 : fm.endLine + 1; + const bodyLines = lines.slice(bodyStartIdx); + const bodyFileLine = bodyStartIdx + 1; - const { preamble, blocks } = splitH2Blocks(bodyLines, bodyStartLine + 1, diagnostics); + const tokens = md.parse(bodyLines.join("\n"), {}); + const { preamble, blocks } = walkBlocks(tokens, bodyLines, bodyFileLine); - const ast: MdAst = { - kind: "md", - raw, - frontmatter: fm?.entries ?? [], - preamble, - blocks, + return { + ast: { kind: "md", raw, frontmatter: fm?.entries ?? [], preamble, blocks }, + diagnostics, }; - - return { ast, diagnostics }; } // ---------- Frontmatter --------------------------------------------------- @@ -74,13 +77,9 @@ function detectFrontmatter( lines: readonly string[], diagnostics: Diagnostic[], ): FrontmatterRange | null { - if (lines.length < 2) { + if (lines.length < 2 || lines[0] !== FENCE) { return null; } - if (lines[0] !== FENCE) { - return null; - } - let closeIndex = -1; for (let i = 1; i < lines.length; i++) { if (lines[i] === FENCE) { @@ -97,205 +96,112 @@ function detectFrontmatter( }); return null; } - const entries: FrontmatterEntry[] = []; for (let i = 1; i < closeIndex; i++) { - const line = lines[i]; - if (line.trim().length === 0) { - continue; + const m = /^([a-zA-Z_][a-zA-Z0-9_-]*)\s*:\s*(.*)$/.exec(lines[i]); + if (m !== null) { + entries.push({ key: m[1], value: unquote(m[2].trim()), line: i + 1 }); } - const m = /^([a-zA-Z_][a-zA-Z0-9_-]*)\s*:\s*(.*)$/.exec(line); - if (m === null) { - // Could be a list-style continuation (` - item`) for the previous key; - // we don't structurally model lists in frontmatter at the substrate - // layer (lint rules can do that against the raw substring if they - // need to). 
Skip silently — keeps the parser opinion-free. - continue; - } - entries.push({ - key: m[1], - value: unquote(m[2].trim()), - line: i + 1, - }); } - return { entries, endLine: closeIndex }; } function unquote(value: string): string { if (value.length >= 2) { - const first = value.charCodeAt(0); - const last = value.charCodeAt(value.length - 1); - if (first === last && (first === 34 /* " */ || first === 39) /* ' */) { + const f = value.charCodeAt(0); + const l = value.charCodeAt(value.length - 1); + if (f === l && (f === 34 || f === 39)) { return value.slice(1, -1); } } return value; } -// ---------- H2 block split ------------------------------------------------- +// ---------- H2 block walker ----------------------------------------------- -function splitH2Blocks( +function walkBlocks( + tokens: readonly Token[], bodyLines: readonly string[], - /** 1-based line number of `bodyLines[0]` in the original file. */ - bodyStartLineNum: number, - diagnostics: Diagnostic[], + bodyFileLine: number, ): { preamble: string; blocks: AstBlock[] } { - // Track code-block state so `##` inside a fenced block doesn't get - // parsed as a heading. - let inCode = false; - const headings: { line: number; text: string }[] = []; - - for (let i = 0; i < bodyLines.length; i++) { - const line = bodyLines[i]; - if (line.startsWith("```")) { - inCode = !inCode; - continue; - } - if (inCode) { - continue; - } - const m = /^##\s+(\S.*?)\s*$/.exec(line); - if (m !== null) { - headings.push({ line: i, text: m[1] }); + // Match atx-style `##` only — setext h2 (`Heading\n---`) carries + // `markup: "-"` on the heading_open token, so the `markup === "##"` + // filter picks atx exclusively. Authors who want setext can still + // write it; substrate just doesn't address it as a section. 
+ const h2: { tokenIdx: number; lineIdx: number; text: string }[] = []; + for (let i = 0; i < tokens.length; i++) { + const t = tokens[i]; + if (t.type === "heading_open" && t.tag === "h2" && t.markup === "##" && t.map !== null) { + const inline = tokens[i + 1]; + h2.push({ tokenIdx: i, lineIdx: t.map[0], text: inline?.content ?? "" }); } } - if (headings.length === 0) { - return { - preamble: bodyLines.join("\n"), - blocks: [], - }; + if (h2.length === 0) { + return { preamble: bodyLines.join("\n"), blocks: [] }; } - const preamble = bodyLines.slice(0, headings[0].line).join("\n"); + const preamble = bodyLines.slice(0, h2[0].lineIdx).join("\n"); const blocks: AstBlock[] = []; - for (let h = 0; h < headings.length; h++) { - const start = headings[h].line; - const end = h + 1 < headings.length ? headings[h + 1].line : bodyLines.length; - const headingText = headings[h].text; - const blockBodyLines = bodyLines.slice(start + 1, end); - const bodyText = blockBodyLines.join("\n"); - const headingLineNum = bodyStartLineNum + start; - - const items = extractItems(blockBodyLines, headingLineNum + 1, diagnostics); - const tables = extractTables(blockBodyLines, headingLineNum + 1); - const codeBlocks = extractCodeBlocks(blockBodyLines, headingLineNum + 1); - + for (let h = 0; h < h2.length; h++) { + const start = h2[h].lineIdx; + const end = h + 1 < h2.length ? h2[h + 1].lineIdx : bodyLines.length; + // Slice tokens by INDEX so descendant tokens with no `map` (table + // cells, list markers, inline content) ride along with their + // mapped parent. heading_open / inline / heading_close = 3 tokens. + const tokenStart = h2[h].tokenIdx + 3; + const tokenEnd = h + 1 < h2.length ? 
h2[h + 1].tokenIdx : tokens.length; + const blockTokens = tokens.slice(tokenStart, tokenEnd); blocks.push({ - heading: headingText, - slug: slugify(headingText), - line: headingLineNum, - bodyText, - items, - tables, - codeBlocks, + heading: h2[h].text, + slug: slugify(h2[h].text), + line: bodyFileLine + start, + bodyText: bodyLines.slice(start + 1, end).join("\n"), + items: extractItems(blockTokens, bodyFileLine), }); } return { preamble, blocks }; } -// ---------- Items ---------------------------------------------------------- +// ---------- Item extraction ---------------------------------------------- -const BULLET_RE = /^(?:[-*+])\s+(.+?)\s*$/; -const KV_RE = /^([^:]+?)\s*:\s*(.+)$/; - -function extractItems( - blockBodyLines: readonly string[], - startLineNum: number, - _diagnostics: Diagnostic[], -): AstItem[] { +function extractItems(tokens: readonly Token[], bodyFileLine: number): AstItem[] { + // Every `list_item_open` becomes an item — bullets, numbered lists, + // nested sub-bullets all included. Lint rules can flag depth or + // duplicate-slug collisions; the parser stays opinion-free. const items: AstItem[] = []; - let inCode = false; - - for (let i = 0; i < blockBodyLines.length; i++) { - const line = blockBodyLines[i]; - if (line.startsWith("```")) { - inCode = !inCode; + for (let i = 0; i < tokens.length; i++) { + const t = tokens[i]; + if (t.type !== "list_item_open" || t.map === null) { continue; } - if (inCode) { - continue; + // First inline at the item's own depth is the item text. 
+ let nestedDepth = 0; + let text = ""; + for (let j = i + 1; j < tokens.length; j++) { + const x = tokens[j]; + if (x.type === "list_item_close" && nestedDepth === 0) { + break; + } + if (x.type === "bullet_list_open" || x.type === "ordered_list_open") { + nestedDepth++; + } else if (x.type === "bullet_list_close" || x.type === "ordered_list_close") { + nestedDepth--; + } else if (x.type === "inline" && nestedDepth === 0 && text === "") { + text = x.content; + } } - const m = BULLET_RE.exec(line); - if (m === null) { - continue; - } - const text = m[1]; const kvMatch = KV_RE.exec(text); - const item: AstItem = { + items.push({ text, slug: kvMatch ? slugify(kvMatch[1]) : slugify(text), - line: startLineNum + i, - ...(kvMatch !== null ? { kv: { key: kvMatch[1].trim(), value: kvMatch[2].trim() } } : {}), - }; - items.push(item); + line: bodyFileLine + t.map[0], + ...(kvMatch !== null + ? { kv: { key: kvMatch[1].trim(), value: kvMatch[2].trim() } } + : {}), + }); } - return items; } - -// ---------- Tables --------------------------------------------------------- - -function extractTables(blockBodyLines: readonly string[], startLineNum: number): AstTable[] { - const tables: AstTable[] = []; - let i = 0; - while (i < blockBodyLines.length) { - const headerLine = blockBodyLines[i]; - const sepLine = blockBodyLines[i + 1]; - if ( - headerLine.trim().startsWith("|") && - sepLine !== undefined && - /^\s*\|\s*[:-]+(?:\s*\|\s*[:-]+)*\s*\|?\s*$/.test(sepLine) - ) { - const headers = splitTableRow(headerLine); - const rows: string[][] = []; - let j = i + 2; - while (j < blockBodyLines.length && blockBodyLines[j].trim().startsWith("|")) { - rows.push(splitTableRow(blockBodyLines[j])); - j++; - } - tables.push({ headers, rows, line: startLineNum + i }); - i = j; - continue; - } - i++; - } - return tables; -} - -function splitTableRow(line: string): string[] { - const trimmed = line.trim().replace(/^\|/, "").replace(/\|$/, ""); - return trimmed.split("|").map((cell) => 
cell.trim()); -} - -// ---------- Code blocks --------------------------------------------------- - -function extractCodeBlocks( - blockBodyLines: readonly string[], - startLineNum: number, -): AstCodeBlock[] { - const codeBlocks: AstCodeBlock[] = []; - let i = 0; - while (i < blockBodyLines.length) { - const open = blockBodyLines[i]; - if (open.startsWith("```")) { - const lang = open.slice(3).trim(); - const langField = lang.length > 0 ? lang : null; - const startLine = startLineNum + i; - let j = i + 1; - const bodyLines: string[] = []; - while (j < blockBodyLines.length && !blockBodyLines[j].startsWith("```")) { - bodyLines.push(blockBodyLines[j]); - j++; - } - codeBlocks.push({ lang: langField, text: bodyLines.join("\n"), line: startLine }); - i = j + 1; - continue; - } - i++; - } - return codeBlocks; -} diff --git a/extensions/oc-path/src/oc-path/tests/parse.test.ts b/extensions/oc-path/src/oc-path/tests/parse.test.ts index 0fa4f9754ba..0e5bb1e36df 100644 --- a/extensions/oc-path/src/oc-path/tests/parse.test.ts +++ b/extensions/oc-path/src/oc-path/tests/parse.test.ts @@ -135,53 +135,6 @@ describe("parseMd — items", () => { }); }); -describe("parseMd — tables", () => { - it("extracts a simple table", () => { - const raw = `## Tool Guidance - -| tool | guidance | -| --- | --- | -| gh | use for GitHub | -| curl | HTTP client | -`; - const { ast } = parseMd(raw); - const table = ast.blocks[0]?.tables[0]; - if (!table) { - throw new Error("expected parsed markdown table"); - } - expect(table.headers).toEqual(["tool", "guidance"]); - expect(table.rows.length).toBe(2); - expect(table.rows[0]).toEqual(["gh", "use for GitHub"]); - }); -}); - -describe("parseMd — code blocks", () => { - it("extracts a fenced code block", () => { - const raw = `## Examples - -\`\`\`ts -const x = 1; -\`\`\` -`; - const { ast } = parseMd(raw); - expect(ast.blocks[0]?.codeBlocks[0]).toMatchObject({ - lang: "ts", - text: "const x = 1;", - }); - }); - - it("handles unlanguaged fences", () => 
{ - const raw = `## Block - -\`\`\` -plain text -\`\`\` -`; - const { ast } = parseMd(raw); - expect(ast.blocks[0]?.codeBlocks[0]?.lang).toBeNull(); - }); -}); - describe("parseMd — byte-fidelity", () => { it("preserves raw on the AST", () => { const raw = `---\nname: x\n---\n\n## Sec\n\n- a\n- b\n`; diff --git a/extensions/oc-path/src/oc-path/tests/scenarios/code-blocks.test.ts b/extensions/oc-path/src/oc-path/tests/scenarios/code-blocks.test.ts deleted file mode 100644 index a9addad2ec2..00000000000 --- a/extensions/oc-path/src/oc-path/tests/scenarios/code-blocks.test.ts +++ /dev/null @@ -1,97 +0,0 @@ -/** - * Wave 6 — fenced code blocks. - * - * Substrate guarantee: triple-backtick fences (` ``` `) inside H2 blocks - * extract as `AstCodeBlock` with `lang` (or null) and verbatim `text`. - * Code blocks suppress H2-split and item-extraction inside their body. - */ -import { describe, expect, it } from "vitest"; -import { parseMd } from "../../parse.js"; - -describe("wave-06 code-blocks", () => { - it("CB-01 unlanguaged fence", () => { - const raw = `## H\n\n\`\`\`\nplain text\n\`\`\`\n`; - const { ast } = parseMd(raw); - expect(ast.blocks[0]?.codeBlocks[0]).toMatchObject({ - lang: null, - text: "plain text", - }); - }); - - it("CB-02 languaged fence", () => { - const raw = `## H\n\n\`\`\`ts\nconst x = 1;\n\`\`\`\n`; - const { ast } = parseMd(raw); - expect(ast.blocks[0]?.codeBlocks[0]?.lang).toBe("ts"); - expect(ast.blocks[0]?.codeBlocks[0]?.text).toBe("const x = 1;"); - }); - - it("CB-03 multi-line code body preserved verbatim", () => { - const raw = `## H\n\n\`\`\`ts\nline 1\nline 2\nline 3\n\`\`\`\n`; - const { ast } = parseMd(raw); - expect(ast.blocks[0]?.codeBlocks[0]?.text).toBe("line 1\nline 2\nline 3"); - }); - - it("CB-04 empty code block", () => { - const raw = `## H\n\n\`\`\`\n\`\`\`\n`; - const { ast } = parseMd(raw); - expect(ast.blocks[0]?.codeBlocks[0]?.text).toBe(""); - }); - - it("CB-05 code block with `## ` does NOT split as heading", () => { - 
const raw = `## Real\n\n\`\`\`md\n## Not a heading\n\`\`\`\n\n## Another real\n`; - const { ast } = parseMd(raw); - expect(ast.blocks.map((b) => b.heading)).toEqual(["Real", "Another real"]); - }); - - it("CB-06 code block with `- bullet` does NOT extract as item", () => { - const raw = `## H\n\n\`\`\`\n- not a bullet\n- still not\n\`\`\`\n\n- real bullet\n`; - const { ast } = parseMd(raw); - expect(ast.blocks[0]?.items.map((i) => i.text)).toEqual(["real bullet"]); - }); - - it("CB-07 multiple code blocks in same section", () => { - const raw = `## H\n\n\`\`\`a\nfirst\n\`\`\`\n\n\`\`\`b\nsecond\n\`\`\`\n`; - const { ast } = parseMd(raw); - expect(ast.blocks[0]?.codeBlocks.length).toBe(2); - expect(ast.blocks[0]?.codeBlocks.map((c) => c.lang)).toEqual(["a", "b"]); - }); - - it("CB-08 unterminated fence — body extends to end of section", () => { - const raw = `## H\n\n\`\`\`\nopen but never closes\n`; - const { ast } = parseMd(raw); - // Behavior: code block is created with whatever was after the open - // fence, including any trailing newline lines. Documents are - // likely malformed; substrate is lenient and preserves what's - // there (verifiable via raw round-trip). - expect(ast.blocks[0]?.codeBlocks[0]?.text).toContain("open but never closes"); - }); - - it("CB-09 fence with leading spaces (4-space indented code)", () => { - // Note: only column-0 ``` triggers fence. Indented content is body - // text. This is the documented behavior. 
- const raw = `## H\n\n \`\`\`\n indented\n \`\`\`\n`; - const { ast } = parseMd(raw); - expect(ast.blocks[0]?.codeBlocks).toEqual([]); - }); - - it("CB-10 lang tag with extra whitespace trimmed", () => { - const raw = `## H\n\n\`\`\` jsonc \nbody\n\`\`\`\n`; - const { ast } = parseMd(raw); - expect(ast.blocks[0]?.codeBlocks[0]?.lang).toBe("jsonc"); - }); - - it("CB-11 lang tag with hyphen / dot (typescript-jsx, c++)", () => { - const raw = `## H\n\n\`\`\`typescript-jsx\nx\n\`\`\`\n`; - const { ast } = parseMd(raw); - expect(ast.blocks[0]?.codeBlocks[0]?.lang).toBe("typescript-jsx"); - }); - - it("CB-12 fence appearing in preamble (before any H2) is ignored at block layer", () => { - const raw = `\`\`\`\npreamble code\n\`\`\`\n\n## H\n`; - const { ast } = parseMd(raw); - // Preamble code blocks aren't structurally extracted at the - // substrate layer; this is documented. Lint can scan preamble - // raw if needed. - expect(ast.blocks[0]?.codeBlocks).toEqual([]); - }); -}); diff --git a/extensions/oc-path/src/oc-path/tests/scenarios/h2-block-split.test.ts b/extensions/oc-path/src/oc-path/tests/scenarios/h2-block-split.test.ts index 8e58a8c7a40..ce0918ee902 100644 --- a/extensions/oc-path/src/oc-path/tests/scenarios/h2-block-split.test.ts +++ b/extensions/oc-path/src/oc-path/tests/scenarios/h2-block-split.test.ts @@ -56,10 +56,12 @@ describe("wave-03 h2-block-split", () => { expect(ast.blocks[0]?.heading).toBe("With space"); }); - it("H2-08 leading whitespace before `##` — does NOT match (regex anchored at line start)", () => { + it("H2-08 leading whitespace before `##` — recognized as heading (CommonMark)", () => { + // Substrate accepts up to 3 spaces of indentation as an atx + // heading per CommonMark. Lint rules can flag if a particular + // workspace file requires column-zero authoring. 
const { ast } = parseMd(" ## indented\n## not indented\n"); - expect(ast.blocks.length).toBe(1); - expect(ast.blocks[0]?.heading).toBe("not indented"); + expect(ast.blocks.map((b) => b.heading)).toEqual(["indented", "not indented"]); }); it("H2-09 trailing whitespace on heading — trimmed in heading text", () => { @@ -126,16 +128,19 @@ describe("wave-03 h2-block-split", () => { }); it("H2-19 empty heading text (`## `)", () => { + // Substrate accepts an empty atx heading; downstream lint + // (`OC_HEADING_EMPTY`) flags it. Slug is empty string — collisions + // are a lint-level concern, not a parser refusal. const { ast } = parseMd("## \n"); - // Empty heading is technically a valid match (`## ` + empty text) - // but the regex requires `(.+?)` so empty doesn't match. Validates - // it's NOT split. - expect(ast.blocks).toEqual([]); + expect(ast.blocks.length).toBe(1); + expect(ast.blocks[0]?.heading).toBe(""); + expect(ast.blocks[0]?.slug).toBe(""); }); it("H2-20 heading with only whitespace (`## `)", () => { const { ast } = parseMd("## \n"); - expect(ast.blocks).toEqual([]); + expect(ast.blocks.length).toBe(1); + expect(ast.blocks[0]?.heading).toBe(""); }); it("H2-21 heading-shaped text inside multi-line bullet body — does split", () => { diff --git a/extensions/oc-path/src/oc-path/tests/scenarios/items.test.ts b/extensions/oc-path/src/oc-path/tests/scenarios/items.test.ts index a6fd16d48a6..2628b9be097 100644 --- a/extensions/oc-path/src/oc-path/tests/scenarios/items.test.ts +++ b/extensions/oc-path/src/oc-path/tests/scenarios/items.test.ts @@ -85,23 +85,27 @@ describe("wave-04 items", () => { expect(ast.blocks[0]?.items[0]?.text).toBe("spaced"); }); - it("I-15 empty bullet text is dropped", () => { + it("I-15 empty bullet — recognized with empty text/slug", () => { + // Substrate accepts an empty bullet; lint can flag if collisions + // matter. Both `- ` and `- real` become items. const { ast } = parseMd("## H\n- \n- real\n"); - // The regex requires (.+?) 
non-empty, so `- ` alone doesn't match. - expect(ast.blocks[0]?.items.length).toBe(1); + expect(ast.blocks[0]?.items.length).toBe(2); + expect(ast.blocks[0]?.items.map((i) => i.text)).toEqual(["", "real"]); }); - it("I-16 indented bullet (sub-bullet) — current parser still picks up", () => { - // The current regex `^(?:[-*+])\\s+(.+?)\\s*$` requires column-0 - // bullet markers; indented bullets do NOT match. Documented as a - // limit — sub-bullets surface in body text but not in items. + it("I-16 indented bullet (sub-bullet) — recognized as item alongside parent", () => { + // Substrate flattens the bullet tree into a list of items; + // sub-bullets surface as their own AstItem entries. Lint rules + // can flag depth or duplicate-slug collisions. const { ast } = parseMd("## H\n- top\n - sub\n"); - expect(ast.blocks[0]?.items.map((i) => i.text)).toEqual(["top"]); + expect(ast.blocks[0]?.items.map((i) => i.text)).toEqual(["top", "sub"]); }); - it("I-17 numbered list (1. item) is NOT extracted as item", () => { + it("I-17 numbered list (1. item) — recognized as items", () => { + // Substrate treats ordered and unordered lists symmetrically. + // Lint rules can flag if a particular file requires bullet style. const { ast } = parseMd("## H\n1. first\n2. 
second\n"); - expect(ast.blocks[0]?.items).toEqual([]); + expect(ast.blocks[0]?.items.map((i) => i.text)).toEqual(["first", "second"]); }); it("I-18 items in a section with no body before — first item line is heading+1", () => { diff --git a/extensions/oc-path/src/oc-path/tests/scenarios/real-world-fixtures.test.ts b/extensions/oc-path/src/oc-path/tests/scenarios/real-world-fixtures.test.ts index 87c53f1ed20..8d779101edf 100644 --- a/extensions/oc-path/src/oc-path/tests/scenarios/real-world-fixtures.test.ts +++ b/extensions/oc-path/src/oc-path/tests/scenarios/real-world-fixtures.test.ts @@ -56,7 +56,7 @@ describe("wave-12 real-world-fixtures", () => { } }); - it("F-04 TOOLS.md table extracted from Tool Guidance section", () => { + it("F-04 TOOLS.md tool-guidance section resolves by slug", () => { const raw = load("TOOLS.md"); const { ast } = parseMd(raw); expect(emitMd(ast)).toBe(raw); @@ -65,10 +65,6 @@ describe("wave-12 real-world-fixtures", () => { section: "tool-guidance", }); expect(guidance?.kind).toBe("block"); - if (guidance?.kind === "block") { - expect(guidance.node.tables.length).toBeGreaterThan(0); - expect(guidance.node.tables[0]?.headers).toEqual(["tool", "guidance"]); - } }); it("F-05 IDENTITY.md sections resolvable by slug", () => { diff --git a/extensions/oc-path/src/oc-path/tests/scenarios/tables.test.ts b/extensions/oc-path/src/oc-path/tests/scenarios/tables.test.ts deleted file mode 100644 index 3272f55dae7..00000000000 --- a/extensions/oc-path/src/oc-path/tests/scenarios/tables.test.ts +++ /dev/null @@ -1,154 +0,0 @@ -/** - * Wave 5 — markdown tables. - * - * Substrate guarantee: GFM-style tables (`| h | h |\n|---|---|\n| r | r |`) - * inside H2 blocks are extracted into `AstTable`. Tables inside fenced - * code blocks are NOT extracted (handled at item-extraction layer too; - * tables share the same code-block awareness when relevant). 
- */ -import { describe, expect, it } from "vitest"; -import { parseMd } from "../../parse.js"; - -describe("wave-05 tables", () => { - it("T-01 standard 2-column table", () => { - const raw = `## H - -| tool | guidance | -| --- | --- | -| gh | use for GitHub | -| curl | HTTP client | -`; - const { ast } = parseMd(raw); - const table = ast.blocks[0]?.tables[0]; - expect(table?.headers).toEqual(["tool", "guidance"]); - expect(table?.rows).toEqual([ - ["gh", "use for GitHub"], - ["curl", "HTTP client"], - ]); - }); - - it("T-02 3+ column table", () => { - const raw = `## H - -| a | b | c | -| - | - | - | -| 1 | 2 | 3 | -`; - const { ast } = parseMd(raw); - expect(ast.blocks[0]?.tables[0]?.headers).toEqual(["a", "b", "c"]); - expect(ast.blocks[0]?.tables[0]?.rows[0]).toEqual(["1", "2", "3"]); - }); - - it("T-03 table with alignment colons in separator", () => { - const raw = `## H - -| left | center | right | -| :--- | :---: | ---: | -| a | b | c | -`; - const { ast } = parseMd(raw); - expect(ast.blocks[0]?.tables.length).toBe(1); - }); - - it("T-04 table with empty cells", () => { - const raw = `## H - -| a | b | -| - | - | -| 1 | | -| | 2 | -`; - const { ast } = parseMd(raw); - expect(ast.blocks[0]?.tables[0]?.rows).toEqual([ - ["1", ""], - ["", "2"], - ]); - }); - - it("T-05 table with no rows (header + sep only)", () => { - const raw = `## H - -| a | b | -| - | - | -`; - const { ast } = parseMd(raw); - expect(ast.blocks[0]?.tables[0]?.headers).toEqual(["a", "b"]); - expect(ast.blocks[0]?.tables[0]?.rows).toEqual([]); - }); - - it("T-06 multiple tables in same section", () => { - const raw = `## H - -| a | b | -| - | - | -| 1 | 2 | - -Some text. 
- -| x | y | -| - | - | -| 3 | 4 | -`; - const { ast } = parseMd(raw); - expect(ast.blocks[0]?.tables.length).toBe(2); - }); - - it("T-07 table line numbers track to the header line", () => { - const raw = `## Section -preamble line -| a | b | -| - | - | -`; - const { ast } = parseMd(raw); - expect(ast.blocks[0]?.tables[0]?.line).toBeGreaterThan(0); - }); - - it("T-08 invalid separator (no pipes) — no table extracted", () => { - const raw = `## H - -| a | b | -not a separator -| 1 | 2 | -`; - const { ast } = parseMd(raw); - expect(ast.blocks[0]?.tables).toEqual([]); - }); - - it("T-09 single-column table (just `| col |\\n|---|`)", () => { - const raw = `## H - -| col | -| --- | -| value1 | -| value2 | -`; - const { ast } = parseMd(raw); - expect(ast.blocks[0]?.tables[0]?.headers).toEqual(["col"]); - expect(ast.blocks[0]?.tables[0]?.rows).toEqual([["value1"], ["value2"]]); - }); - - it("T-10 table at end of file with trailing newlines", () => { - const raw = `## H - -| a | -| - | -| 1 | - - -`; - const { ast } = parseMd(raw); - expect(ast.blocks[0]?.tables[0]?.rows).toEqual([["1"]]); - }); - - it("T-11 table content with internal whitespace trimmed", () => { - const raw = `## H - -| col1 | col2 | -| --- | --- | -| a | b | -`; - const { ast } = parseMd(raw); - expect(ast.blocks[0]?.tables[0]?.headers).toEqual(["col1", "col2"]); - expect(ast.blocks[0]?.tables[0]?.rows[0]).toEqual(["a", "b"]); - }); -});