diff --git a/src/infra/command-explainer/extract.test.ts b/src/infra/command-explainer/extract.test.ts index cc4157a0705..6cfc2fa3532 100644 --- a/src/infra/command-explainer/extract.test.ts +++ b/src/infra/command-explainer/extract.test.ts @@ -1,5 +1,5 @@ import { afterEach, describe, expect, it, vi } from "vitest"; -import type { Parser } from "web-tree-sitter"; +import type { Node as TreeSitterNode, Parser, Tree } from "web-tree-sitter"; import { explainShellCommand } from "./extract.js"; import { getBashParserForCommandExplanation, @@ -15,6 +15,119 @@ function setParserLoaderForTest(loader: () => Promise): void { setBashParserLoaderForCommandExplanationForTest(loader); } +type FakeNodeInit = { + type: string; + text: string; + startIndex: number; + endIndex: number; + startPosition: TreeSitterNode["startPosition"]; + endPosition: TreeSitterNode["endPosition"]; + namedChildren?: TreeSitterNode[]; + fieldChildren?: Record; + hasError?: boolean; +}; + +function fakeNode(init: FakeNodeInit): TreeSitterNode { + const named = init.namedChildren ?? []; + const children = named; + return { + type: init.type, + text: init.text, + startIndex: init.startIndex, + endIndex: init.endIndex, + startPosition: init.startPosition, + endPosition: init.endPosition, + childCount: children.length, + namedChildCount: named.length, + hasError: init.hasError ?? false, + child(index: number): TreeSitterNode | null { + return children[index] ?? null; + }, + namedChild(index: number): TreeSitterNode | null { + return named[index] ?? null; + }, + childForFieldName(name: string): TreeSitterNode | null { + return init.fieldChildren?.[name] ?? null; + }, + } as unknown as TreeSitterNode; +} + +function createByteIndexedUnicodeCommandTree(source: string): Tree { + const firstCommand = "echo café"; + const separator = " && "; + const secondCommand = "echo ok"; + const firstCommandEnd = Buffer.byteLength(firstCommand, "utf8"); + const secondCommandStart = Buffer.byteLength(firstCommand + separator, "utf8"); + const sourceEnd = Buffer.byteLength(source, "utf8"); + + const firstName = fakeNode({ + type: "command_name", + text: "echo", + startIndex: 0, + endIndex: 4, + startPosition: { row: 0, column: 0 }, + endPosition: { row: 0, column: 4 }, + }); + const firstArgument = fakeNode({ + type: "word", + text: "café", + startIndex: 5, + endIndex: firstCommandEnd, + startPosition: { row: 0, column: 5 }, + endPosition: { row: 0, column: firstCommandEnd }, + }); + const first = fakeNode({ + type: "command", + text: firstCommand, + startIndex: 0, + endIndex: firstCommandEnd, + startPosition: { row: 0, column: 0 }, + endPosition: { row: 0, column: firstCommandEnd }, + namedChildren: [firstName, firstArgument], + fieldChildren: { name: firstName }, + }); + + const secondName = fakeNode({ + type: "command_name", + text: "echo", + startIndex: secondCommandStart, + endIndex: secondCommandStart + 4, + startPosition: { row: 0, column: secondCommandStart }, + endPosition: { row: 0, column: secondCommandStart + 4 }, + }); + const secondArgument = fakeNode({ + type: "word", + text: "ok", + startIndex: secondCommandStart + 5, + endIndex: sourceEnd, + startPosition: { row: 0, column: secondCommandStart + 5 }, + endPosition: { row: 0, column: sourceEnd }, + }); + const second = fakeNode({ + type: "command", + text: secondCommand, + startIndex: secondCommandStart, + endIndex: sourceEnd, + startPosition: { row: 0, column: secondCommandStart }, + endPosition: { row: 0, column: sourceEnd }, + namedChildren: [secondName, secondArgument], + fieldChildren: { name: secondName }, + }); + + return { + rootNode: fakeNode({ + type: "program", + text: source, + startIndex: 0, + endIndex: sourceEnd, + startPosition: { row: 0, column: 0 }, + endPosition: { row: 0, column: sourceEnd }, + namedChildren: [first, second], + }), + delete: vi.fn(), + } as unknown as Tree; +} + afterEach(() => { if (parserLoaderOverridden) { setBashParserLoaderForCommandExplanationForTest(); @@ -94,6 +207,34 @@ describe("command explainer tree-sitter runtime", () => { expect(reset).toHaveBeenCalledOnce(); }); + it("maps parser byte offsets to JavaScript string spans for Unicode source", async () => { + const source = "echo café && echo ok"; + const parser = { + parse: vi.fn(() => createByteIndexedUnicodeCommandTree(source)), + reset: vi.fn(), + }; + setParserLoaderForTest(async () => parser as unknown as Parser); + + const explanation = await explainShellCommand(source); + + expect(explanation.topLevelCommands).toEqual([ + expect.objectContaining({ + executable: "echo", + argv: ["echo", "café"], + span: expect.objectContaining({ startIndex: 0, endIndex: 9 }), + }), + expect.objectContaining({ + executable: "echo", + argv: ["echo", "ok"], + span: expect.objectContaining({ startIndex: 13, endIndex: 20 }), + }), + ]); + for (const command of explanation.topLevelCommands) { + expect(source.slice(command.span.startIndex, command.span.endIndex)).toBe(command.text); + expect(command.span.endPosition.column).toBe(command.span.endIndex); + } + }); + it("explains a pipeline with python inline eval", async () => { const explanation = await explainShellCommand('ls | grep "stuff" | python -c \'print("hi")\''); @@ -566,7 +707,7 @@ describe("command explainer tree-sitter runtime", () => { 'find . -name "*.ts" -exec grep -n TODO {} +', 'bash -lc "echo hi | wc -c"', ]; - const iterations = 10; + const iterations = 3; for (let index = 0; index < iterations; index += 1) { for (const command of corpus) { const explanation = await explainShellCommand(command); diff --git a/src/infra/command-explainer/extract.ts b/src/infra/command-explainer/extract.ts index 239a5afc57e..4d659065358 100644 --- a/src/infra/command-explainer/extract.ts +++ b/src/infra/command-explainer/extract.ts @@ -149,6 +149,92 @@ function advancePosition( return { row, column }; } +function utf8ByteLengthForCodePoint(codePoint: number): number { + if (codePoint <= 0x7f) { + return 1; + } + if (codePoint <= 0x7ff) { + return 2; + } + if (codePoint <= 0xffff) { + return 3; + } + return 4; +} + +function utf8ByteLength(text: string): number { + let length = 0; + for (let index = 0; index < text.length; index += 1) { + const codePoint = text.codePointAt(index); + if (codePoint === undefined) { + continue; + } + length += utf8ByteLengthForCodePoint(codePoint); + if (codePoint > 0xffff) { + index += 1; + } + } + return length; +} + +function utf8ByteOffsetToStringIndex(text: string, byteOffset: number): number { + if (byteOffset <= 0) { + return 0; + } + let currentByteOffset = 0; + for (let index = 0; index < text.length; index += 1) { + const codePoint = text.codePointAt(index); + if (codePoint === undefined) { + return text.length; + } + const codePointLength = utf8ByteLengthForCodePoint(codePoint); + if (currentByteOffset + codePointLength > byteOffset) { + return index; + } + currentByteOffset += codePointLength; + if (currentByteOffset === byteOffset) { + return codePoint > 0xffff ? index + 2 : index + 1; + } + if (codePoint > 0xffff) { + index += 1; + } + } + return text.length; +} + +function parserOffsetToStringIndex( + source: string, + rootNode: TreeSitterNode, +): (offset: number) => number { + const utf8Length = utf8ByteLength(source); + if (utf8Length !== source.length && rootNode.endIndex === utf8Length) { + return (offset) => utf8ByteOffsetToStringIndex(source, offset); + } + return (offset) => offset; +} + +function spanBaseForParserSource( + source: string, + rootNode: TreeSitterNode, + base: SpanBase, +): SpanBase { + const offsetToStringIndex = parserOffsetToStringIndex(source, rootNode); + return { + startIndex: base.startIndex, + startPosition: base.startPosition, + mapOffset(offset) { + const sourceIndex = offsetToStringIndex(offset); + if (base.mapOffset) { + return base.mapOffset(sourceIndex); + } + return { + index: base.startIndex + sourceIndex, + position: advancePosition(base.startPosition, source.slice(0, sourceIndex)), + }; + }, + }; +} + function valuePrefixLength(node: TreeSitterNode): number { if (node.type === "string" || node.type === "raw_string") { return 1; @@ -1051,18 +1137,23 @@ async function walk( ); if (wrapperPayload && state.wrapperPayloadDepth < MAX_WRAPPER_PAYLOAD_DEPTH) { const wrapperTree = await parseBashForCommandExplanation(wrapperPayload.command); + const wrapperSpanBase = spanBaseForParserSource( + wrapperPayload.command, + wrapperTree.rootNode, + wrapperPayload.spanBase, + ); try { if (wrapperTree.rootNode.hasError) { output.hasParseError = true; output.risks.push({ kind: "syntax-error", text: wrapperPayload.command, - span: spanFromNode(wrapperTree.rootNode, wrapperPayload.spanBase), + span: spanFromNode(wrapperTree.rootNode, wrapperSpanBase), }); } await walk(wrapperTree.rootNode, output, "wrapper-payload", { wrapperPayloadDepth: state.wrapperPayloadDepth + 1, - spanBase: wrapperPayload.spanBase, + spanBase: wrapperSpanBase, }); } finally { wrapperTree.delete(); @@ -1079,6 +1170,7 @@ async function walk( export async function explainShellCommand(source: string): Promise { const tree = await parseBashForCommandExplanation(source); try { + const spanBase = spanBaseForParserSource(source, tree.rootNode, ROOT_SPAN_BASE); const output: MutableExplanation = { shapes: new Set(), commands: [], @@ -1087,7 +1179,7 @@ export async function explainShellCommand(source: string): Promise command.context === "top-level"); return {