diff --git a/CHANGELOG.md b/CHANGELOG.md index f392c792a2f..430439bd83e 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -15,6 +15,7 @@ Docs: https://docs.openclaw.ai - Discord/status: add degraded Discord transport and gateway event-loop starvation signals to `openclaw channels status`, `openclaw status --deep`, and fetch-timeout logs so intermittent socket resets do not look like a healthy running channel. (#76327) Thanks @joshavant. - Plugins/update: on the beta OpenClaw update channel, default-line npm and ClawHub plugin updates try `@beta` first and fall back to default/latest when no plugin beta release exists. - Channels/WhatsApp: support explicit WhatsApp Channel/Newsletter `@newsletter` outbound message targets with channel session metadata instead of DM routing. Fixes #13417; carries forward the narrow outbound target idea from #13424. Thanks @vincentkoc and @agentz-manfred. +- Exec approvals: add a tree-sitter-backed shell command explainer for future approval and command-review surfaces. (#75004) Thanks @jesse-merhi. 
### Fixes diff --git a/knip.config.ts b/knip.config.ts index 5679fcb88f3..d511e6d2342 100644 --- a/knip.config.ts +++ b/knip.config.ts @@ -10,6 +10,7 @@ const rootEntries = [ "src/entry.ts!", "src/cli/daemon-cli.ts!", "src/infra/warning-filter.ts!", + "src/infra/command-explainer/index.ts!", bundledPluginFile("telegram", "src/audit.ts", "!"), bundledPluginFile("telegram", "src/token.ts", "!"), "src/hooks/bundled/*/handler.ts!", @@ -139,6 +140,7 @@ const config = { "@openclaw/*", "playwright-core", "sqlite-vec", + "tree-sitter-bash", ...rootBundledPluginRuntimeDependencies, ], project: [ diff --git a/package.json b/package.json index 357088fb5ad..0c51ce473c2 100644 --- a/package.json +++ b/package.json @@ -1700,10 +1700,12 @@ "sqlite-vec": "0.1.9", "tar": "7.5.13", "tokenjuice": "0.7.0", + "tree-sitter-bash": "^0.25.1", "tslog": "^4.10.2", "typebox": "1.1.37", "undici": "8.1.0", "web-push": "^3.6.7", + "web-tree-sitter": "^0.26.8", "ws": "^8.20.0", "yaml": "^2.8.3", "zod": "^4.4.1" @@ -1784,7 +1786,8 @@ "sharp" ], "ignoredBuiltDependencies": [ - "koffi" + "koffi", + "tree-sitter-bash" ], "packageExtensions": { "@mariozechner/pi-coding-agent": { diff --git a/pnpm-lock.yaml b/pnpm-lock.yaml index 9c4965b293e..b9cc6b075f5 100644 --- a/pnpm-lock.yaml +++ b/pnpm-lock.yaml @@ -193,6 +193,9 @@ importers: tokenjuice: specifier: 0.7.0 version: 0.7.0 + tree-sitter-bash: + specifier: ^0.25.1 + version: 0.25.1 tslog: specifier: ^4.10.2 version: 4.10.2 @@ -205,6 +208,9 @@ importers: web-push: specifier: ^3.6.7 version: 3.6.7 + web-tree-sitter: + specifier: ^0.26.8 + version: 0.26.8 ws: specifier: ^8.20.0 version: 8.20.0 @@ -6379,6 +6385,10 @@ packages: resolution: {integrity: sha512-9MdFxmkKaOYVTV+XVRG8ArDwwQ77XIgIPyKASB1k3JPq3M8fGQQQE3YpMOrKm6g//Ktx8ivZr8xo1Qmtqub+GA==} engines: {node: ^18 || ^20 || >= 21} + node-gyp-build@4.8.4: + resolution: {integrity: sha512-LA4ZjwlnUblHVgq0oBF3Jl/6h/Nvs5fzBLwdEF4nuxnFdsfajde4WfxtJr3CaiH+F6ewcIB/q4jQ4UzPyid+CQ==} + hasBin: true + 
node-downloader-helper@2.1.11: resolution: {integrity: sha512-882fH2C9AWdiPCwz/2beq5t8FGMZK9Dx8TJUOIxzMCbvG7XUKM5BuJwN5f0NKo4SCQK6jR4p2TPm54mYGdGchQ==} engines: {node: '>=14.18'} @@ -7388,6 +7398,14 @@ packages: resolution: {integrity: sha512-L0Orpi8qGpRG//Nd+H90vFB+3iHnue1zSSGmNOOCh1GLJ7rUKVwV2HvijphGQS2UmhUZewS9VgvxYIdgr+fG1A==} hasBin: true + tree-sitter-bash@0.25.1: + resolution: {integrity: sha512-7hMytuYIMoXOq24yRulgIxthE9YmggZIOHCyPTTuJcu6EU54tYD+4G39cUb28kxC6jMf/AbPfWGLQtgPTdh3xw==} + peerDependencies: + tree-sitter: ^0.25.0 + peerDependenciesMeta: + tree-sitter: + optional: true + trim-lines@3.0.1: resolution: {integrity: sha512-kRj8B+YHZCc9kQYdWfJB2/oUl9rA99qbowYYBtr4ui4mZyAQ2JpvVBd/6U2YloATfqBhBTSMhTpgBHtU0Mf3Rg==} @@ -7665,6 +7683,9 @@ packages: resolution: {integrity: sha512-d2JWLCivmZYTSIoge9MsgFCZrt571BikcWGYkjC1khllbTeDlGqZ2D8vD8E/lJa8WGWbb7Plm8/XJYV7IJHZZw==} engines: {node: '>= 8'} + web-tree-sitter@0.26.8: + resolution: {integrity: sha512-4sUwi7ZyOrIk5KLgYLkc2A/F0LFMQnBhfb+2Cdl7ik4ePJ6JD+fk4ofI2sA5eGawBKBaK4Vntt7Ww5KcEsay4A==} + webidl-conversions@3.0.1: resolution: {integrity: sha512-2JAn3z8AR6rjK8Sm8orRC0h/bcl/DqL7tRPdGZ4I1CjdF+EaMLmYxBHyXuKL849eucPFhvBoxMsflfOb8kxaeQ==} @@ -13555,8 +13576,9 @@ snapshots: netmask@2.1.1: {} - node-addon-api@8.7.0: - optional: true + node-addon-api@8.7.0: {} + + node-gyp-build@4.8.4: {} node-downloader-helper@2.1.11: {} @@ -14755,6 +14777,11 @@ snapshots: tree-kill@1.2.2: {} + tree-sitter-bash@0.25.1: + dependencies: + node-addon-api: 8.7.0 + node-gyp-build: 4.8.4 + trim-lines@3.0.1: {} trough@2.2.0: {} @@ -14976,6 +15003,8 @@ snapshots: web-streams-polyfill@3.3.3: {} + web-tree-sitter@0.26.8: {} + webidl-conversions@3.0.1: {} webidl-conversions@8.0.1: {} diff --git a/pnpm-workspace.yaml b/pnpm-workspace.yaml index 4e9cad27335..f32590086c0 100644 --- a/pnpm-workspace.yaml +++ b/pnpm-workspace.yaml @@ -48,3 +48,4 @@ onlyBuiltDependencies: ignoredBuiltDependencies: - koffi + - tree-sitter-bash diff --git 
// Tracks whether a test replaced the parser loader so the afterEach hook only
// restores it when needed.
let parserLoaderOverridden = false;

/**
 * Installs a replacement bash-parser loader for the current test and records
 * that the override must be undone by the afterEach hook.
 *
 * @param loader Factory that resolves to the parser the runtime should use.
 */
function setParserLoaderForTest(loader: () => Promise<Parser>): void {
  parserLoaderOverridden = true;
  setBashParserLoaderForCommandExplanationForTest(loader);
}

/** Fields needed to build a fake tree-sitter node (see `fakeNode`). */
type FakeNodeInit = {
  type: string;
  text: string;
  startIndex: number;
  endIndex: number;
  startPosition: TreeSitterNode["startPosition"];
  endPosition: TreeSitterNode["endPosition"];
  namedChildren?: TreeSitterNode[];
  /** Children reachable through `childForFieldName`, keyed by field name. */
  fieldChildren?: Record<string, TreeSitterNode>;
  hasError?: boolean;
};

/**
 * Builds a minimal stand-in for a web-tree-sitter node.
 *
 * Only the members listed in the returned object are implemented; the value
 * is cast to `TreeSitterNode`, so any other member is `undefined` at runtime.
 *
 * @param init Node fields; `namedChildren` defaults to `[]`, `hasError` to `false`.
 * @returns An object usable wherever the listed node members are sufficient.
 */
function fakeNode(init: FakeNodeInit): TreeSitterNode {
  const named = init.namedChildren ?? [];
  // The fake has no anonymous children, so `child*` and `namedChild*` share one list.
  const children = named;
  return {
    type: init.type,
    text: init.text,
    startIndex: init.startIndex,
    endIndex: init.endIndex,
    startPosition: init.startPosition,
    endPosition: init.endPosition,
    childCount: children.length,
    namedChildCount: named.length,
    hasError: init.hasError ?? false,
    child(index: number): TreeSitterNode | null {
      return children[index] ?? null;
    },
    namedChild(index: number): TreeSitterNode | null {
      return named[index] ?? null;
    },
    childForFieldName(name: string): TreeSitterNode | null {
      return init.fieldChildren?.[name] ?? null;
    },
  } as unknown as TreeSitterNode;
}

/**
 * Builds a fake parse tree for two `&&`-joined commands whose node indices
 * and columns are UTF-8 byte offsets rather than JavaScript string indices.
 *
 * The command texts are hard-coded (`echo café`, ` && `, `echo ok`); `source`
 * only supplies the root node's text and its end offset, so callers are
 * expected to pass the concatenation of those three pieces.
 *
 * @param source Full command text used for the root `program` node.
 * @returns A fake `Tree` with a `rootNode` and a mocked `delete`.
 */
function createByteIndexedUnicodeCommandTree(source: string): Tree {
  const firstCommand = "echo café";
  const separator = " && ";
  const secondCommand = "echo ok";
  // Byte offsets: "é" occupies two bytes, so these differ from string indices.
  const firstCommandEnd = Buffer.byteLength(firstCommand, "utf8");
  const secondCommandStart = Buffer.byteLength(firstCommand + separator, "utf8");
  const sourceEnd = Buffer.byteLength(source, "utf8");

  const firstName = fakeNode({
    type: "command_name",
    text: "echo",
    startIndex: 0,
    endIndex: 4,
    startPosition: { row: 0, column: 0 },
    endPosition: { row: 0, column: 4 },
  });
  const firstArgument = fakeNode({
    type: "word",
    text: "café",
    startIndex: 5,
    endIndex: firstCommandEnd,
    startPosition: { row: 0, column: 5 },
    endPosition: { row: 0, column: firstCommandEnd },
  });
  const first = fakeNode({
    type: "command",
    text: firstCommand,
    startIndex: 0,
    endIndex: firstCommandEnd,
    startPosition: { row: 0, column: 0 },
    endPosition: { row: 0, column: firstCommandEnd },
    namedChildren: [firstName, firstArgument],
    fieldChildren: { name: firstName },
  });

  const secondName = fakeNode({
    type: "command_name",
    text: "echo",
    startIndex: secondCommandStart,
    endIndex: secondCommandStart + 4,
    startPosition: { row: 0, column: secondCommandStart },
    endPosition: { row: 0, column: secondCommandStart + 4 },
  });
  const secondArgument = fakeNode({
    type: "word",
    text: "ok",
    startIndex: secondCommandStart + 5,
    endIndex: sourceEnd,
    startPosition: { row: 0, column: secondCommandStart + 5 },
    endPosition: { row: 0, column: sourceEnd },
  });
  const second = fakeNode({
    type: "command",
    text: secondCommand,
    startIndex: secondCommandStart,
    endIndex: sourceEnd,
    startPosition: { row: 0, column: secondCommandStart },
    endPosition: { row: 0, column: sourceEnd },
    namedChildren: [secondName, secondArgument],
    fieldChildren: { name: secondName },
  });

  return {
    rootNode: fakeNode({
      type: "program",
      text: source,
      startIndex: 0,
      endIndex: sourceEnd,
      startPosition: { row: 0, column: 0 },
      endPosition: { row: 0, column: sourceEnd },
      namedChildren: [first, second],
    }),
    delete: vi.fn(),
  } as unknown as Tree;
}

// Restore the real parser loader (only if a test replaced it) and undo spies.
afterEach(() => {
  if (parserLoaderOverridden) {
    setBashParserLoaderForCommandExplanationForTest();
    parserLoaderOverridden = false;
  }
  vi.restoreAllMocks();
});
parser progress cancellation as a timeout", async () => { + const reset = vi.fn(); + const parser = { + parse: ( + _source: string, + _oldTree: unknown, + options?: { progressCallback?: (state: unknown) => boolean }, + ) => { + options?.progressCallback?.({ currentOffset: 0, hasError: false }); + return null; + }, + reset, + } as unknown as Parser; + vi.spyOn(performance, "now").mockReturnValueOnce(0).mockReturnValue(501); + setParserLoaderForTest(async () => parser); + + await expect(parseBashForCommandExplanation("echo hi")).rejects.toThrow( + "tree-sitter-bash timed out after 500ms while parsing shell command", + ); + expect(reset).toHaveBeenCalledOnce(); + }); + + it("maps parser byte offsets to JavaScript string spans for Unicode source", async () => { + const source = "echo café && echo ok"; + const parser = { + parse: vi.fn(() => createByteIndexedUnicodeCommandTree(source)), + reset: vi.fn(), + }; + setParserLoaderForTest(async () => parser as unknown as Parser); + + const explanation = await explainShellCommand(source); + + expect(explanation.topLevelCommands).toEqual([ + expect.objectContaining({ + executable: "echo", + argv: ["echo", "café"], + span: expect.objectContaining({ startIndex: 0, endIndex: 9 }), + }), + expect.objectContaining({ + executable: "echo", + argv: ["echo", "ok"], + span: expect.objectContaining({ startIndex: 13, endIndex: 20 }), + }), + ]); + for (const command of explanation.topLevelCommands) { + expect(source.slice(command.span.startIndex, command.span.endIndex)).toBe(command.text); + expect(command.span.endPosition.column).toBe(command.span.endIndex); + } + }); + + it("explains a pipeline with python inline eval", async () => { + const explanation = await explainShellCommand('ls | grep "stuff" | python -c \'print("hi")\''); + + expect(explanation.ok).toBe(true); + expect(explanation.shapes).toContain("pipeline"); + expect(explanation.topLevelCommands.map((step) => step.executable)).toEqual([ + "ls", + "grep", + "python", + ]); + 
expect(explanation.topLevelCommands[2]?.argv).toEqual(["python", "-c", 'print("hi")']); + expect(explanation.nestedCommands).toEqual([]); + expect(explanation.topLevelCommands[2]?.span).toEqual( + expect.objectContaining({ startIndex: expect.any(Number), endIndex: expect.any(Number) }), + ); + expect(explanation.risks).toContainEqual( + expect.objectContaining({ + kind: "inline-eval", + command: "python", + flag: "-c", + text: "python -c 'print(\"hi\")'", + }), + ); + }); + + it("separates command substitution in an argument", async () => { + const explanation = await explainShellCommand("echo $(whoami)"); + + expect(explanation.topLevelCommands.map((step) => step.executable)).toEqual(["echo"]); + expect(explanation.nestedCommands).toEqual([ + expect.objectContaining({ context: "command-substitution", executable: "whoami" }), + ]); + expect(explanation.risks).toContainEqual( + expect.objectContaining({ kind: "command-substitution", text: "$(whoami)" }), + ); + }); + + it("marks command substitution in executable position as dynamic", async () => { + const explanation = await explainShellCommand("$(whoami) --help"); + + expect(explanation.topLevelCommands).toEqual([]); + expect(explanation.nestedCommands).toEqual([ + expect.objectContaining({ context: "command-substitution", executable: "whoami" }), + ]); + expect(explanation.risks).toContainEqual( + expect.objectContaining({ kind: "dynamic-executable", text: "$(whoami)" }), + ); + }); + + it("separates process substitution commands", async () => { + const explanation = await explainShellCommand("diff <(ls a) <(ls b)"); + + expect(explanation.topLevelCommands.map((step) => step.executable)).toEqual(["diff"]); + expect(explanation.nestedCommands.map((step) => `${step.context}:${step.executable}`)).toEqual([ + "process-substitution:ls", + "process-substitution:ls", + ]); + expect(explanation.risks.map((risk) => risk.kind)).toContain("process-substitution"); + }); + + it("detects AND OR and sequence shapes", async () 
=> { + const explanation = await explainShellCommand("pnpm test && pnpm build || echo failed; pwd"); + + expect(explanation.shapes).toEqual(expect.arrayContaining(["and", "or", "sequence"])); + expect(explanation.topLevelCommands.map((step) => step.executable)).toEqual([ + "pnpm", + "pnpm", + "echo", + "pwd", + ]); + }); + + it("detects newline sequences and background commands", async () => { + const newlineSequence = await explainShellCommand("echo a\necho b"); + expect(newlineSequence.shapes).toContain("sequence"); + expect(newlineSequence.topLevelCommands.map((step) => step.executable)).toEqual([ + "echo", + "echo", + ]); + + const background = await explainShellCommand("echo a & echo b"); + expect(background.shapes).toEqual(expect.arrayContaining(["background", "sequence"])); + expect(background.topLevelCommands.map((step) => step.executable)).toEqual(["echo", "echo"]); + }); + + it("detects conditionals", async () => { + const explanation = await explainShellCommand( + "if test -f package.json; then pnpm test; else echo missing; fi", + ); + + expect(explanation.shapes).toContain("if"); + expect(explanation.topLevelCommands.map((step) => step.executable)).toEqual([ + "test", + "pnpm", + "echo", + ]); + }); + + it("detects declaration and test command forms", async () => { + const declaration = await explainShellCommand("export A=$(whoami)"); + + expect(declaration.topLevelCommands).toEqual([ + expect.objectContaining({ executable: "export", argv: ["export", "A=$(whoami)"] }), + ]); + expect(declaration.nestedCommands).toEqual([ + expect.objectContaining({ context: "command-substitution", executable: "whoami" }), + ]); + + const testCommand = await explainShellCommand("[ -f package.json ]"); + expect(testCommand.topLevelCommands).toEqual([ + expect.objectContaining({ executable: "[", argv: ["[", "-f", "package.json"] }), + ]); + + const doubleBracket = await explainShellCommand("[[ -f package.json ]]"); + expect(doubleBracket.topLevelCommands).toEqual([ + 
expect.objectContaining({ executable: "[[", argv: ["[[", "-f", "package.json"] }), + ]); + }); + + it("detects shell wrappers", async () => { + const explanation = await explainShellCommand('bash -lc "echo hi | wc -c"'); + + expect(explanation.topLevelCommands.map((step) => step.executable)).toEqual(["bash"]); + expect(explanation.nestedCommands).toEqual([ + expect.objectContaining({ context: "wrapper-payload", executable: "echo" }), + expect.objectContaining({ context: "wrapper-payload", executable: "wc" }), + ]); + const [wrappedEcho, wrappedWc] = explanation.nestedCommands; + expect(explanation.source.slice(wrappedEcho?.span.startIndex, wrappedEcho?.span.endIndex)).toBe( + "echo hi", + ); + expect(explanation.source.slice(wrappedWc?.span.startIndex, wrappedWc?.span.endIndex)).toBe( + "wc -c", + ); + expect(explanation.shapes).toContain("pipeline"); + expect(explanation.risks).toContainEqual( + expect.objectContaining({ + kind: "shell-wrapper", + executable: "bash", + flag: "-lc", + payload: "echo hi | wc -c", + text: 'bash -lc "echo hi | wc -c"', + }), + ); + + const combinedFlags = await explainShellCommand('bash -euxc "echo hi"'); + expect(combinedFlags.risks).toContainEqual( + expect.objectContaining({ + kind: "shell-wrapper", + executable: "bash", + flag: "-euxc", + payload: "echo hi", + }), + ); + + const combinedInline = await explainShellCommand('bash -c"echo hi"'); + expect(combinedInline.risks).toContainEqual( + expect.objectContaining({ + kind: "shell-wrapper", + executable: "bash", + payload: "echo hi", + }), + ); + + const powershell = await explainShellCommand('pwsh -Command "Get-ChildItem"'); + expect(powershell.risks).toContainEqual( + expect.objectContaining({ + kind: "shell-wrapper", + executable: "pwsh", + flag: "-Command", + payload: "Get-ChildItem", + }), + ); + + const powershellWithOptions = await explainShellCommand( + "pwsh -ExecutionPolicy Bypass -Command Get-ChildItem", + ); + expect(powershellWithOptions.risks).toContainEqual( + 
expect.objectContaining({ + kind: "shell-wrapper", + executable: "pwsh", + flag: "-Command", + payload: "Get-ChildItem", + }), + ); + + const dynamicPayload = await explainShellCommand('bash -lc "$CMD"'); + expect(dynamicPayload.nestedCommands).toEqual([]); + expect(dynamicPayload.risks).toContainEqual( + expect.objectContaining({ + kind: "shell-wrapper", + executable: "bash", + flag: "-lc", + payload: "$CMD", + }), + ); + + const invalidPayload = await explainShellCommand("bash -lc 'echo &&'"); + expect(invalidPayload.ok).toBe(false); + expect(invalidPayload.risks).toContainEqual(expect.objectContaining({ kind: "syntax-error" })); + + const powershellPipeline = await explainShellCommand( + 'pwsh -Command "Get-ChildItem | Select Name"', + ); + expect(powershellPipeline.nestedCommands).toEqual([]); + expect(powershellPipeline.risks).toContainEqual( + expect.objectContaining({ + kind: "shell-wrapper", + executable: "pwsh", + flag: "-Command", + payload: "Get-ChildItem | Select Name", + }), + ); + + for (const [command, carrier] of [ + ["time bash -lc 'id'", "time"], + ["nice bash -lc 'id'", "nice"], + ["timeout 1 bash -lc 'id'", "timeout"], + ["caffeinate -d -w 42 bash -lc 'id'", "caffeinate"], + ] as const) { + const wrapped = await explainShellCommand(command); + expect(wrapped.risks).toContainEqual( + expect.objectContaining({ + kind: "shell-wrapper-through-carrier", + command: carrier, + }), + ); + expect(wrapped.nestedCommands).toContainEqual( + expect.objectContaining({ context: "wrapper-payload", executable: "id" }), + ); + const wrappedId = wrapped.nestedCommands.find((step) => step.executable === "id"); + expect(wrapped.source.slice(wrappedId?.span.startIndex, wrappedId?.span.endIndex)).toBe("id"); + } + }); + + it("maps decoded shell-wrapper payload spans back to original source escapes", async () => { + const explanation = await explainShellCommand('bash -lc "printf \\"hi\\" | wc -c"'); + + const wrappedPrintf = explanation.nestedCommands.find((step) => 
step.executable === "printf"); + const wrappedWc = explanation.nestedCommands.find((step) => step.executable === "wc"); + + expect(wrappedPrintf).toEqual( + expect.objectContaining({ + context: "wrapper-payload", + text: 'printf "hi"', + }), + ); + expect( + explanation.source.slice(wrappedPrintf?.span.startIndex, wrappedPrintf?.span.endIndex), + ).toBe('printf \\"hi\\"'); + expect(explanation.source.slice(wrappedWc?.span.startIndex, wrappedWc?.span.endIndex)).toBe( + "wc -c", + ); + }); + + it("normalizes static shell words before classifying commands", async () => { + const quotedCommand = await explainShellCommand("e'c'ho a\\ b \"c d\""); + expect(quotedCommand.topLevelCommands).toEqual([ + expect.objectContaining({ executable: "echo", argv: ["echo", "a b", "c d"] }), + ]); + + const ansiCString = await explainShellCommand("$'ec\\x68o' hi"); + expect(ansiCString.topLevelCommands).toEqual([ + expect.objectContaining({ executable: "echo", argv: ["echo", "hi"] }), + ]); + + const wrappedShell = await explainShellCommand("b'a'sh -lc 'echo hi'"); + expect(wrappedShell.risks).toContainEqual( + expect.objectContaining({ + kind: "shell-wrapper", + executable: "bash", + flag: "-lc", + payload: "echo hi", + }), + ); + }); + + it("does not normalize dynamic executable names into trusted commands", async () => { + const dynamicPrefix = await explainShellCommand("e${CMD}ho hi"); + expect(dynamicPrefix.topLevelCommands).toEqual([]); + expect(dynamicPrefix.risks).toContainEqual( + expect.objectContaining({ kind: "dynamic-executable", text: "e${CMD}ho" }), + ); + + const dynamicQuoted = await explainShellCommand('"${CMD}" hi'); + expect(dynamicQuoted.topLevelCommands).toEqual([]); + expect(dynamicQuoted.risks).toContainEqual( + expect.objectContaining({ kind: "dynamic-executable", text: '"${CMD}"' }), + ); + + const dynamicGlob = await explainShellCommand("./ec* hi"); + expect(dynamicGlob.topLevelCommands).toEqual([]); + expect(dynamicGlob.risks).toContainEqual( + 
expect.objectContaining({ kind: "dynamic-executable", text: "./ec*" }), + ); + + const dynamicBraceExpansion = await explainShellCommand("./{echo,printf} hi"); + expect(dynamicBraceExpansion.topLevelCommands).toEqual([]); + expect(dynamicBraceExpansion.risks).toContainEqual( + expect.objectContaining({ kind: "dynamic-executable", text: "./{echo,printf}" }), + ); + + const dynamicArgument = await explainShellCommand("echo ./ec*"); + expect(dynamicArgument.topLevelCommands).toEqual([ + expect.objectContaining({ executable: "echo", argv: ["echo", "./ec*"] }), + ]); + expect(dynamicArgument.risks).toContainEqual( + expect.objectContaining({ + kind: "dynamic-argument", + command: "echo", + argumentIndex: 1, + text: "./ec*", + }), + ); + + const dynamicShellFlag = await explainShellCommand("bash $FLAGS id"); + expect(dynamicShellFlag.risks).toContainEqual( + expect.objectContaining({ + kind: "dynamic-argument", + command: "bash", + argumentIndex: 1, + text: "$FLAGS", + }), + ); + + const lineContinuation = await explainShellCommand("ec\\\nho hi"); + expect(lineContinuation.topLevelCommands).toEqual([]); + expect(lineContinuation.risks).toContainEqual( + expect.objectContaining({ kind: "line-continuation" }), + ); + expect(lineContinuation.risks).toContainEqual( + expect.objectContaining({ kind: "dynamic-executable" }), + ); + + const continuedArgument = await explainShellCommand("pnpm test \\\n --filter foo"); + expect(continuedArgument.topLevelCommands).toEqual([ + expect.objectContaining({ + executable: "pnpm", + argv: ["pnpm", "test", "--filter", "foo"], + }), + ]); + expect(continuedArgument.risks).toContainEqual( + expect.objectContaining({ kind: "line-continuation" }), + ); + + const invalidObfuscation = await explainShellCommand("e'c'h'o hi"); + expect(invalidObfuscation.ok).toBe(false); + expect(invalidObfuscation.risks).toContainEqual( + expect.objectContaining({ kind: "syntax-error" }), + ); + }); + + it("detects command carriers", async () => { + const find = 
await explainShellCommand('find . -name "*.ts" -exec grep -n TODO {} +'); + expect(find.risks).toContainEqual( + expect.objectContaining({ kind: "command-carrier", command: "find", flag: "-exec" }), + ); + + const xargs = await explainShellCommand('printf "%s\\n" a b | xargs -I{} sh -c "echo {}"'); + expect(xargs.risks).toContainEqual( + expect.objectContaining({ kind: "command-carrier", command: "xargs" }), + ); + + const envSplitString = await explainShellCommand("env -S 'sh -c \"id\"'"); + expect(envSplitString.risks).toContainEqual( + expect.objectContaining({ kind: "command-carrier", command: "env", flag: "-S" }), + ); + + for (const command of [ + 'env python -c "print(1)"', + 'sudo python -c "print(1)"', + 'command python -c "print(1)"', + ]) { + const explanation = await explainShellCommand(command); + expect(explanation.risks).toContainEqual( + expect.objectContaining({ + kind: "inline-eval", + command: "python", + flag: "-c", + }), + ); + } + }); + + it("detects eval, source, aliases, and carrier shell wrappers", async () => { + const evalCommand = await explainShellCommand('eval "$OPENCLAW_CMD"'); + expect(evalCommand.risks).toContainEqual(expect.objectContaining({ kind: "eval" })); + + const builtinEval = await explainShellCommand("builtin eval 'echo hi'"); + expect(builtinEval.risks).toContainEqual(expect.objectContaining({ kind: "eval" })); + + const sourceCommand = await explainShellCommand(". ./some-script.sh"); + expect(sourceCommand.risks).toContainEqual( + expect.objectContaining({ kind: "source", command: "." 
}), + ); + + const aliasCommand = await explainShellCommand("alias ll='ls -l'"); + expect(aliasCommand.risks).toContainEqual(expect.objectContaining({ kind: "alias" })); + + const sudoShell = await explainShellCommand('sudo sh -c "id && whoami"'); + expect(sudoShell.risks).toContainEqual( + expect.objectContaining({ kind: "shell-wrapper-through-carrier", command: "sudo" }), + ); + + const commandShell = await explainShellCommand("command bash -lc 'id && whoami'"); + expect(commandShell.risks).toContainEqual( + expect.objectContaining({ kind: "shell-wrapper-through-carrier", command: "command" }), + ); + + const sudoCombinedFlags = await explainShellCommand('sudo bash -euxc "id && whoami"'); + expect(sudoCombinedFlags.risks).toContainEqual( + expect.objectContaining({ kind: "shell-wrapper-through-carrier", command: "sudo" }), + ); + }); + + it("treats function bodies as nested command context", async () => { + const explanation = await explainShellCommand("ls() { echo hi; }; ls /tmp"); + + expect(explanation.topLevelCommands).toEqual([ + expect.objectContaining({ context: "top-level", executable: "ls", argv: ["ls", "/tmp"] }), + ]); + expect(explanation.nestedCommands).toEqual([ + expect.objectContaining({ context: "function-definition", executable: "echo" }), + ]); + expect(explanation.risks).toContainEqual( + expect.objectContaining({ kind: "function-definition", name: "ls" }), + ); + }); + + it("does not treat literal operator text as command shapes", async () => { + const quotedSemicolon = await explainShellCommand('echo ";"'); + expect(quotedSemicolon.shapes).not.toContain("sequence"); + + const heredoc = await explainShellCommand("cat < { + const redirect = await explainShellCommand("echo hi > out.txt"); + const redirectRisks = redirect.risks.filter((risk) => risk.kind === "redirect"); + expect(redirectRisks).toEqual([expect.objectContaining({ text: "> out.txt" })]); + + const heredoc = await explainShellCommand("cat < { + const explanation = await 
explainShellCommand("echo 'unterminated"); + + expect(explanation.ok).toBe(false); + expect(explanation.risks).toContainEqual( + expect.objectContaining({ + kind: "syntax-error", + span: expect.objectContaining({ + startIndex: expect.any(Number), + endIndex: expect.any(Number), + }), + }), + ); + }); + + it("parses and extracts a repeated approval-sized corpus without parser state leakage", async () => { + const corpus = [ + 'ls | grep "stuff" | python -c \'print("hi")\'', + "echo $(whoami)", + "diff <(ls a) <(ls b)", + 'find . -name "*.ts" -exec grep -n TODO {} +', + 'bash -lc "echo hi | wc -c"', + ]; + const iterations = 3; + for (let index = 0; index < iterations; index += 1) { + for (const command of corpus) { + const explanation = await explainShellCommand(command); + expect(explanation.risks.length + explanation.topLevelCommands.length).toBeGreaterThan(0); + } + } + }); +}); diff --git a/src/infra/command-explainer/extract.ts b/src/infra/command-explainer/extract.ts new file mode 100644 index 00000000000..4d659065358 --- /dev/null +++ b/src/infra/command-explainer/extract.ts @@ -0,0 +1,1196 @@ +import type { Node as TreeSitterNode } from "web-tree-sitter"; +import { unwrapKnownDispatchWrapperInvocation } from "../dispatch-wrapper-resolution.js"; +import { detectInterpreterInlineEvalArgv } from "../exec-inline-eval.js"; +import { normalizeExecutableToken } from "../exec-wrapper-resolution.js"; +import { + extractShellWrapperCommand, + isShellWrapperExecutable, + POSIX_SHELL_WRAPPERS, + resolveShellWrapperTransportArgv, +} from "../shell-wrapper-resolution.js"; +import { parseBashForCommandExplanation } from "./tree-sitter-runtime.js"; +import type { + CommandContext, + CommandExplanation, + CommandRisk, + CommandShape, + CommandStep, + SourceSpan, +} from "./types.js"; + +type MutableExplanation = { + shapes: Set; + commands: CommandStep[]; + risks: CommandRisk[]; + hasParseError: boolean; +}; + +type DynamicArgument = { + index: number; + text: string; + 
/**
 * Describes how spans measured inside a (possibly nested) parsed source map
 * back to the outermost source.
 */
type SpanBase = {
  /** Index in the outer source at which the parsed text starts. */
  startIndex: number;
  /** Row/column in the outer source at which the parsed text starts. */
  startPosition: SourceSpan["startPosition"];
  /** When present, replaces the additive translation in `translateSpan`. */
  mapOffset?: (offset: number) => { index: number; position: SourceSpan["startPosition"] };
};

const ROOT_SPAN_BASE: SpanBase = {
  startIndex: 0,
  startPosition: { row: 0, column: 0 },
};

/** Returns all non-null children of `node`, named and anonymous, in order. */
function children(node: TreeSitterNode): TreeSitterNode[] {
  return Array.from({ length: node.childCount }, (_, index) => node.child(index)).filter(
    (child): child is TreeSitterNode => child !== null,
  );
}

/** Returns the non-null named children of `node`, in order. */
function namedChildren(node: TreeSitterNode): TreeSitterNode[] {
  return Array.from({ length: node.namedChildCount }, (_, index) => node.namedChild(index)).filter(
    (child): child is TreeSitterNode => child !== null,
  );
}

/** Reports whether any direct child of `node` has the given node type. */
function hasDirectChildType(node: TreeSitterNode, type: string): boolean {
  return children(node).some((child) => child.type === type);
}

/**
 * Shifts a position that is relative to `base` into the outer coordinate
 * system. Rows always add; the base column only applies on the first row.
 */
function translatePosition(
  position: SourceSpan["startPosition"],
  base: SourceSpan["startPosition"],
): SourceSpan["startPosition"] {
  return {
    row: base.row + position.row,
    column: position.row === 0 ? base.column + position.column : position.column,
  };
}

/**
 * Translates a span into the coordinate system described by `base`.
 *
 * With `base.mapOffset`, both ends are derived purely from the span's indices
 * (the span's own positions are ignored); otherwise indices and positions are
 * shifted by the base start.
 */
function translateSpan(span: SourceSpan, base: SpanBase): SourceSpan {
  if (base.mapOffset) {
    const start = base.mapOffset(span.startIndex);
    const end = base.mapOffset(span.endIndex);
    return {
      startIndex: start.index,
      endIndex: end.index,
      startPosition: start.position,
      endPosition: end.position,
    };
  }
  return {
    startIndex: base.startIndex + span.startIndex,
    endIndex: base.startIndex + span.endIndex,
    startPosition: translatePosition(span.startPosition, base.startPosition),
    endPosition: translatePosition(span.endPosition, base.startPosition),
  };
}

/** Copies a node's index/position range and translates it through `base`. */
function spanFromNode(node: TreeSitterNode, base: SpanBase = ROOT_SPAN_BASE): SourceSpan {
  const span = {
    startIndex: node.startIndex,
    endIndex: node.endIndex,
    startPosition: { row: node.startPosition.row, column: node.startPosition.column },
    endPosition: { row: node.endPosition.row, column: node.endPosition.column },
  };
  return translateSpan(span, base);
}

/**
 * Returns the position reached after reading `text` starting at `position`.
 *
 * `\n`, `\r` and `\r\n` each count as one line break; columns advance by one
 * per UTF-16 code unit.
 */
function advancePosition(
  position: SourceSpan["startPosition"],
  text: string,
): SourceSpan["startPosition"] {
  let row = position.row;
  let column = position.column;
  for (let index = 0; index < text.length; index += 1) {
    const ch = text[index];
    if (ch === "\r") {
      // Treat "\r\n" as a single break by consuming the "\n" as well.
      if (text[index + 1] === "\n") {
        index += 1;
      }
      row += 1;
      column = 0;
      continue;
    }
    if (ch === "\n") {
      row += 1;
      column = 0;
      continue;
    }
    column += 1;
  }
  return { row, column };
}

/** Number of bytes UTF-8 needs for one code point (1 to 4). */
function utf8ByteLengthForCodePoint(codePoint: number): number {
  if (codePoint <= 0x7f) {
    return 1;
  }
  if (codePoint <= 0x7ff) {
    return 2;
  }
  if (codePoint <= 0xffff) {
    return 3;
  }
  return 4;
}

/** UTF-8 byte length of `text`; an unpaired surrogate counts as three bytes. */
function utf8ByteLength(text: string): number {
  let length = 0;
  for (let index = 0; index < text.length; index += 1) {
    const codePoint = text.codePointAt(index);
    if (codePoint === undefined) {
      continue;
    }
    length += utf8ByteLengthForCodePoint(codePoint);
    // Astral code points occupy two UTF-16 units; skip the low surrogate.
    if (codePoint > 0xffff) {
      index += 1;
    }
  }
  return length;
}

/**
 * Converts a UTF-8 byte offset into a string index by scanning from the start.
 *
 * Offsets inside a multi-byte code point resolve to the start of that code
 * point; offsets `<= 0` give 0 and offsets past the end give `text.length`.
 * This is the reference implementation; `parserOffsetToStringIndex` uses it
 * only for offsets its lookup table does not cover.
 */
function utf8ByteOffsetToStringIndex(text: string, byteOffset: number): number {
  if (byteOffset <= 0) {
    return 0;
  }
  let currentByteOffset = 0;
  for (let index = 0; index < text.length; index += 1) {
    const codePoint = text.codePointAt(index);
    if (codePoint === undefined) {
      return text.length;
    }
    const codePointLength = utf8ByteLengthForCodePoint(codePoint);
    if (currentByteOffset + codePointLength > byteOffset) {
      return index;
    }
    currentByteOffset += codePointLength;
    if (currentByteOffset === byteOffset) {
      return codePoint > 0xffff ? index + 2 : index + 1;
    }
    if (codePoint > 0xffff) {
      index += 1;
    }
  }
  return text.length;
}

/**
 * Precomputes, for every byte offset `0..utf8Length`, the string index that
 * `utf8ByteOffsetToStringIndex` would return for it.
 */
function buildUtf8ByteToStringIndexTable(text: string, utf8Length: number): Uint32Array {
  const table = new Uint32Array(utf8Length + 1);
  let byteOffset = 0;
  let index = 0;
  while (index < text.length) {
    const codePoint = text.codePointAt(index) ?? 0;
    const byteLength = utf8ByteLengthForCodePoint(codePoint);
    // Every byte of a code point maps back to where that code point starts.
    table.fill(index, byteOffset, byteOffset + byteLength);
    byteOffset += byteLength;
    index += codePoint > 0xffff ? 2 : 1;
  }
  table[byteOffset] = text.length;
  return table;
}

/** Per-prefix line information used to answer position lookups in O(1). */
type PrefixPositionTable = {
  /** rows[k]: line breaks contained in the first k UTF-16 units. */
  rows: Uint32Array;
  /** lineStarts[k]: index just past the last break character in the first k units, or -1. */
  lineStarts: Int32Array;
};

/**
 * Precomputes what `advancePosition` would report for every prefix of `text`,
 * using the same rule that `\r\n` is a single line break.
 */
function buildPrefixPositionTable(text: string): PrefixPositionTable {
  const rows = new Uint32Array(text.length + 1);
  const lineStarts = new Int32Array(text.length + 1);
  let row = 0;
  let lineStart = -1;
  lineStarts[0] = lineStart;
  for (let index = 0; index < text.length; index += 1) {
    const ch = text[index];
    if (ch === "\r" || ch === "\n") {
      // A "\n" directly after "\r" belongs to the break already counted.
      if (ch === "\r" || text[index - 1] !== "\r") {
        row += 1;
      }
      lineStart = index + 1;
    }
    rows[index + 1] = row;
    lineStarts[index + 1] = lineStart;
  }
  return { rows, lineStarts };
}

/**
 * Returns a function converting parser offsets into string indices of `source`.
 *
 * Offsets are treated as UTF-8 byte offsets only when `source` contains
 * multi-byte text and the root node ends exactly at the UTF-8 length;
 * otherwise they are assumed to be string indices already and pass through.
 * The byte mapping is tabulated once so each lookup is constant time instead
 * of a rescan of `source`.
 */
function parserOffsetToStringIndex(
  source: string,
  rootNode: TreeSitterNode,
): (offset: number) => number {
  const utf8Length = utf8ByteLength(source);
  if (utf8Length !== source.length && rootNode.endIndex === utf8Length) {
    const byteToIndex = buildUtf8ByteToStringIndexTable(source, utf8Length);
    // Negative, fractional or out-of-range offsets miss the table and take the scan.
    return (offset) => byteToIndex[offset] ?? utf8ByteOffsetToStringIndex(source, offset);
  }
  return (offset) => offset;
}

/**
 * Builds the `SpanBase` for spans reported by a parse of `source`, layered on
 * top of `base` (the location of `source` inside the outer text).
 *
 * `mapOffset` first normalises the parser offset to a string index, then
 * either delegates to `base.mapOffset` or adds the base start and derives the
 * row/column from the text preceding that index. Positions come from a table
 * built on first use, so a lookup no longer re-reads the whole prefix.
 */
function spanBaseForParserSource(
  source: string,
  rootNode: TreeSitterNode,
  base: SpanBase,
): SpanBase {
  const offsetToStringIndex = parserOffsetToStringIndex(source, rootNode);
  let prefixPositions: PrefixPositionTable | undefined;
  return {
    startIndex: base.startIndex,
    startPosition: base.startPosition,
    mapOffset(offset) {
      const sourceIndex = offsetToStringIndex(offset);
      if (base.mapOffset) {
        return base.mapOffset(sourceIndex);
      }
      if (prefixPositions === undefined) {
        prefixPositions = buildPrefixPositionTable(source);
      }
      const row = prefixPositions.rows[sourceIndex];
      const lineStart = prefixPositions.lineStarts[sourceIndex];
      let position: SourceSpan["startPosition"];
      if (row === undefined || lineStart === undefined) {
        // Index outside the table: keep the original slice-and-scan behaviour.
        position = advancePosition(base.startPosition, source.slice(0, sourceIndex));
      } else if (lineStart < 0) {
        // No line break before this index: still on the base's own row.
        position = {
          row: base.startPosition.row,
          column: base.startPosition.column + sourceIndex,
        };
      } else {
        position = { row: base.startPosition.row + row, column: sourceIndex - lineStart };
      }
      return { index: base.startIndex + sourceIndex, position };
    },
  };
}

/**
 * Length of the opening delimiter that precedes a node's value: 1 for
 * `string` and `raw_string` nodes, 2 for `ansi_c_string`, otherwise 0.
 */
function valuePrefixLength(node: TreeSitterNode): number {
  if (node.type === "string" || node.type === "raw_string") {
    return 1;
  }
  if (node.type === "ansi_c_string") {
    return 2;
  }
  return 0;
}
sourceEndOffset: number, +): void { + decoded.value += value; + for (let index = 0; index < value.length; index += 1) { + decoded.sourceOffsets.push(sourceEndOffset); + } +} + +function identityDecodedShellText(text: string, sourceOffset = 0): DecodedShellText { + return { + value: text, + sourceOffsets: Array.from({ length: text.length + 1 }, (_, index) => sourceOffset + index), + }; +} + +function decodedSourceOffsetsForNode(node: TreeSitterNode, value: string): number[] { + let decoded: DecodedShellText; + switch (node.type) { + case "raw_string": + decoded = identityDecodedShellText(node.text.slice(1, -1), 1); + break; + case "string": + decoded = decodeDoubleQuotedTextWithOffsets(node.text); + break; + case "ansi_c_string": + decoded = decodeAnsiCStringWithOffsets(node.text); + break; + default: + decoded = decodeUnquotedShellTextWithOffsets(node.text); + break; + } + if (decoded.value === value && decoded.sourceOffsets.length === value.length + 1) { + return decoded.sourceOffsets; + } + const prefixLength = valuePrefixLength(node); + return Array.from({ length: value.length + 1 }, (_, index) => prefixLength + index); +} + +function argumentFromNode( + index: number, + node: TreeSitterNode, + value: ShellWordValue, + base: SpanBase, +): CommandArgument { + const span = spanFromNode(node, base); + const decodedSourceOffsets = decodedSourceOffsetsForNode(node, value.value); + return { + index, + text: node.text, + value: value.value, + span, + decodedSourceOffsets, + }; +} + +type ShellWordValue = { kind: "literal"; value: string } | { kind: "dynamic"; value: string }; + +const DYNAMIC_WORD_NODE_TYPES = new Set([ + "arithmetic_expansion", + "command_substitution", + "expansion", + "process_substitution", + "simple_expansion", +]); + +const COMMAND_ARGUMENT_NODE_TYPES = new Set([ + "ansi_c_string", + "arithmetic_expansion", + "command_substitution", + "concatenation", + "expansion", + "number", + "process_substitution", + "raw_string", + "simple_expansion", + 
"string", + "word", +]); + +function hasEscapedLineContinuation(text: string): boolean { + return /\\(?:\r\n|[\r\n])/.test(text); +} + +function hasExecutableLineContinuation(text: string): boolean { + return /^[^\s]*\\(?:\r\n|[\r\n])/.test(text); +} + +function hasUnescapedDynamicPattern(text: string): boolean { + for (let index = 0; index < text.length; index += 1) { + const ch = text[index]; + if (ch === "\\") { + index += 1; + continue; + } + if (ch === "*" || ch === "?") { + return true; + } + if (ch === "[" && text.indexOf("]", index + 1) > index + 1) { + return true; + } + if (ch === "{" && text.indexOf("}", index + 1) > index + 1) { + return true; + } + } + return false; +} + +function decodeUnquotedShellTextWithOffsets(text: string): DecodedShellText { + const decoded: DecodedShellText = { value: "", sourceOffsets: [0] }; + for (let index = 0; index < text.length; index += 1) { + const ch = text[index]; + const next = text[index + 1]; + if (ch === "\\" && next !== undefined) { + if (next === "\r" && text[index + 2] === "\n") { + decoded.sourceOffsets[decoded.value.length] = index + 3; + index += 2; + continue; + } + if (next === "\n" || next === "\r") { + decoded.sourceOffsets[decoded.value.length] = index + 2; + index += 1; + continue; + } + appendDecodedText(decoded, next, index + 2); + index += 1; + continue; + } + appendDecodedText(decoded, ch, index + 1); + } + return decoded; +} + +function decodeUnquotedShellText(text: string): string { + return decodeUnquotedShellTextWithOffsets(text).value; +} + +function decodeDoubleQuotedTextWithOffsets(text: string): DecodedShellText { + const hasQuotes = text.startsWith('"') && text.endsWith('"'); + const bodyStart = hasQuotes ? 1 : 0; + const body = hasQuotes ? 
text.slice(1, -1) : text; + const decoded: DecodedShellText = { value: "", sourceOffsets: [bodyStart] }; + for (let index = 0; index < body.length; index += 1) { + const ch = body[index]; + const next = body[index + 1]; + const sourceOffset = bodyStart + index; + if (ch === "\\" && next !== undefined) { + if (next === "\r" && body[index + 2] === "\n") { + decoded.sourceOffsets[decoded.value.length] = sourceOffset + 3; + index += 2; + continue; + } + if (["\\", '"', "$", "`", "\n", "\r"].includes(next)) { + if (next !== "\n" && next !== "\r") { + appendDecodedText(decoded, next, sourceOffset + 2); + } else { + decoded.sourceOffsets[decoded.value.length] = sourceOffset + 2; + } + index += 1; + continue; + } + } + appendDecodedText(decoded, ch, sourceOffset + 1); + } + return decoded; +} + +function decodeDoubleQuotedText(text: string): string { + return decodeDoubleQuotedTextWithOffsets(text).value; +} + +const ANSI_C_SIMPLE_ESCAPES: Record = { + "'": "'", + '"': '"', + "?": "?", + "\\": "\\", + a: "\u0007", + b: "\b", + e: "\u001B", + E: "\u001B", + f: "\f", + n: "\n", + r: "\r", + t: "\t", + v: "\v", +}; + +function decodeAnsiCStringWithOffsets(text: string): DecodedShellText { + const hasQuotes = text.startsWith("$'") && text.endsWith("'"); + const bodyStart = hasQuotes ? 2 : 0; + const body = hasQuotes ? 
text.slice(2, -1) : text; + const decoded: DecodedShellText = { value: "", sourceOffsets: [bodyStart] }; + for (let index = 0; index < body.length; index += 1) { + const ch = body[index]; + const sourceOffset = bodyStart + index; + if (ch !== "\\") { + appendDecodedText(decoded, ch, sourceOffset + 1); + continue; + } + + const next = body[index + 1]; + if (next === undefined) { + appendDecodedText(decoded, "\\", sourceOffset + 1); + continue; + } + + const simple = ANSI_C_SIMPLE_ESCAPES[next]; + if (simple !== undefined) { + appendDecodedText(decoded, simple, sourceOffset + 2); + index += 1; + continue; + } + + if (next === "x") { + const hex = body.slice(index + 2).match(/^[0-9A-Fa-f]{1,2}/)?.[0] ?? ""; + if (hex) { + appendDecodedText( + decoded, + String.fromCodePoint(Number.parseInt(hex, 16)), + sourceOffset + 2 + hex.length, + ); + index += 1 + hex.length; + continue; + } + } + + if (next === "u" || next === "U") { + const maxLength = next === "u" ? 4 : 8; + const hex = + body.slice(index + 2).match(new RegExp(`^[0-9A-Fa-f]{1,${maxLength}}`))?.[0] ?? ""; + if (hex) { + const codePoint = Number.parseInt(hex, 16); + try { + appendDecodedText( + decoded, + String.fromCodePoint(codePoint), + sourceOffset + 2 + hex.length, + ); + } catch { + appendDecodedText(decoded, `\\${next}${hex}`, sourceOffset + 2 + hex.length); + } + index += 1 + hex.length; + continue; + } + } + + if (/^[0-7]$/.test(next)) { + const octal = body.slice(index + 1).match(/^[0-7]{1,3}/)?.[0] ?? 
""; + if (octal) { + appendDecodedText( + decoded, + String.fromCodePoint(Number.parseInt(octal, 8)), + sourceOffset + 1 + octal.length, + ); + index += octal.length; + continue; + } + } + + appendDecodedText(decoded, next, sourceOffset + 2); + index += 1; + } + return decoded; +} + +function decodeAnsiCString(text: string): string { + return decodeAnsiCStringWithOffsets(text).value; +} + +function hasDynamicWordPart(node: TreeSitterNode): boolean { + return ( + DYNAMIC_WORD_NODE_TYPES.has(node.type) || + namedChildren(node).some((child) => hasDynamicWordPart(child)) + ); +} + +function shellWordValue(node: TreeSitterNode): ShellWordValue { + if (DYNAMIC_WORD_NODE_TYPES.has(node.type)) { + return { kind: "dynamic", value: node.text }; + } + if ( + node.type !== "command_name" && + node.type !== "concatenation" && + namedChildren(node).some((child) => hasDynamicWordPart(child)) + ) { + return { + kind: "dynamic", + value: node.type === "string" ? decodeDoubleQuotedText(node.text) : node.text, + }; + } + + switch (node.type) { + case "command_name": { + const parts = namedChildren(node); + if (parts.length === 0) { + return hasUnescapedDynamicPattern(node.text) + ? { kind: "dynamic", value: decodeUnquotedShellText(node.text) } + : { kind: "literal", value: decodeUnquotedShellText(node.text) }; + } + let value = ""; + for (const part of parts) { + const partValue = shellWordValue(part); + value += partValue.value; + if (partValue.kind !== "literal") { + return { kind: "dynamic", value }; + } + } + return { kind: "literal", value }; + } + case "word": + return hasUnescapedDynamicPattern(node.text) + ? 
{ kind: "dynamic", value: decodeUnquotedShellText(node.text) } + : { kind: "literal", value: decodeUnquotedShellText(node.text) }; + case "raw_string": + return { kind: "literal", value: node.text.slice(1, -1) }; + case "string": + return { kind: "literal", value: decodeDoubleQuotedText(node.text) }; + case "ansi_c_string": + return { kind: "literal", value: decodeAnsiCString(node.text) }; + case "concatenation": { + if (hasUnescapedDynamicPattern(node.text)) { + return { kind: "dynamic", value: decodeUnquotedShellText(node.text) }; + } + let value = ""; + let dynamic = false; + for (const child of namedChildren(node)) { + const childValue = shellWordValue(child); + value += childValue.value; + if (childValue.kind !== "literal") { + dynamic = true; + } + } + return dynamic ? { kind: "dynamic", value } : { kind: "literal", value }; + } + default: + return namedChildren(node).some((child) => shellWordValue(child).kind === "dynamic") + ? { kind: "dynamic", value: decodeUnquotedShellText(node.text) } + : { kind: "literal", value: decodeUnquotedShellText(node.text) }; + } +} + +function commandNameNode(node: TreeSitterNode): TreeSitterNode | null { + return ( + node.childForFieldName("name") ?? + namedChildren(node).find((child) => child.type === "command_name") ?? 
+ null + ); +} + +function argvFromCommand( + node: TreeSitterNode, + nameNode: TreeSitterNode, + state: WalkState, +): CommandArgv | null { + if (hasEscapedLineContinuation(nameNode.text) || hasExecutableLineContinuation(node.text)) { + return null; + } + const executable = shellWordValue(nameNode); + if (executable.kind !== "literal") { + return null; + } + + const skipped = new Set([nameNode, ...namedChildren(nameNode)]); + const argv = [executable.value]; + const argumentsList: CommandArgument[] = []; + const dynamicArguments: DynamicArgument[] = []; + for (const child of namedChildren(node)) { + if ( + skipped.has(child) || + child.type === "command_name" || + child.type === "variable_assignment" || + !COMMAND_ARGUMENT_NODE_TYPES.has(child.type) + ) { + continue; + } + const value = shellWordValue(child); + const argument = argumentFromNode(argv.length, child, value, state.spanBase); + argumentsList.push(argument); + if (value.kind === "dynamic") { + dynamicArguments.push({ + index: argument.index, + text: argument.text, + value: argument.value, + span: argument.span, + }); + } + argv.push(value.value); + } + return { argv, arguments: argumentsList, dynamicArguments }; +} + +function firstShellToken(text: string): string { + return text.trimStart().match(/^\S+/)?.[0] ?? 
""; +} + +function argvFromDeclarationCommand(node: TreeSitterNode, state: WalkState): CommandArgv | null { + const executable = firstShellToken(node.text); + if (!executable) { + return null; + } + const argv = [executable]; + const argumentsList: CommandArgument[] = []; + const dynamicArguments: DynamicArgument[] = []; + for (const child of namedChildren(node)) { + if (!COMMAND_ARGUMENT_NODE_TYPES.has(child.type) && child.type !== "variable_assignment") { + continue; + } + const value = shellWordValue(child); + const argument = argumentFromNode(argv.length, child, value, state.spanBase); + argumentsList.push(argument); + if (value.kind === "dynamic") { + dynamicArguments.push({ + index: argument.index, + text: argument.text, + value: argument.value, + span: argument.span, + }); + } + argv.push(value.value); + } + return { argv, arguments: argumentsList, dynamicArguments }; +} + +function appendTestCommandArguments( + node: TreeSitterNode, + argv: string[], + argumentsList: CommandArgument[], + dynamicArguments: DynamicArgument[], + state: WalkState, +): void { + if (node.type === "test_operator" || COMMAND_ARGUMENT_NODE_TYPES.has(node.type)) { + const value = shellWordValue(node); + const argument = argumentFromNode(argv.length, node, value, state.spanBase); + argumentsList.push(argument); + if (value.kind === "dynamic") { + dynamicArguments.push({ + index: argument.index, + text: argument.text, + value: argument.value, + span: argument.span, + }); + } + argv.push(value.value); + return; + } + for (const child of namedChildren(node)) { + appendTestCommandArguments(child, argv, argumentsList, dynamicArguments, state); + } +} + +function argvFromTestCommand(node: TreeSitterNode, state: WalkState): CommandArgv | null { + const trimmed = node.text.trimStart(); + const executable = trimmed.startsWith("[[") ? "[[" : trimmed.startsWith("[") ? 
"[" : ""; + if (!executable) { + return null; + } + const argv = [executable]; + const argumentsList: CommandArgument[] = []; + const dynamicArguments: DynamicArgument[] = []; + for (const child of namedChildren(node)) { + appendTestCommandArguments(child, argv, argumentsList, dynamicArguments, state); + } + return { argv, arguments: argumentsList, dynamicArguments }; +} + +function isCommandLikeNode(node: TreeSitterNode): boolean { + return ( + node.type === "command" || node.type === "declaration_command" || node.type === "test_command" + ); +} + +function recordShape(node: TreeSitterNode, output: MutableExplanation): void { + if ( + (node.type === "program" || node.type === "list") && + (hasDirectChildType(node, ";") || namedChildren(node).filter(isCommandLikeNode).length > 1) + ) { + output.shapes.add("sequence"); + } + if (hasDirectChildType(node, "&")) { + output.shapes.add("background"); + } + if (node.type === "pipeline") { + output.shapes.add("pipeline"); + } + if (node.type === "list") { + if (hasDirectChildType(node, "&&")) { + output.shapes.add("and"); + } + if (hasDirectChildType(node, "||")) { + output.shapes.add("or"); + } + } + if (node.type === "if_statement") { + output.shapes.add("if"); + } + if (node.type === "for_statement") { + output.shapes.add("for"); + } + if (node.type === "while_statement") { + output.shapes.add("while"); + } + if (node.type === "case_statement") { + output.shapes.add("case"); + } + if (node.type === "subshell") { + output.shapes.add("subshell"); + } + if (node.type === "compound_statement") { + output.shapes.add("group"); + } +} + +function shellCommandFlag( + argv: string[], + startIndex: number, +): { flag: string; index: number } | null { + const shell = normalizeExecutableToken(argv[startIndex - 1] ?? argv[0] ?? 
""); + for (let index = startIndex; index < argv.length; index += 1) { + const token = argv[index]?.trim(); + if (!token) { + continue; + } + if (token === "--") { + break; + } + const lower = token.toLowerCase(); + if (shell === "cmd") { + if (lower === "/c" || lower === "/k") { + return { flag: token, index }; + } + continue; + } + if (shell === "powershell" || shell === "pwsh") { + if ( + lower === "-c" || + lower === "-command" || + lower === "--command" || + lower === "-encodedcommand" || + lower === "-enc" || + lower === "-e" || + lower === "-f" || + lower === "-file" + ) { + return { flag: token, index }; + } + continue; + } + if (lower === "-c" || lower === "--command") { + return { flag: token, index }; + } + if (token.startsWith("-") && !token.startsWith("--") && lower.slice(1).includes("c")) { + return { flag: token, index }; + } + } + return null; +} + +function canParseShellWrapperPayload(transportArgv: string[], commandFlag: string | null): boolean { + const shellExecutable = normalizeExecutableToken(transportArgv[0] ?? ""); + if (!PARSEABLE_SHELL_WRAPPERS.has(shellExecutable)) { + return false; + } + const lowerFlag = commandFlag?.toLowerCase() ?? 
""; + return lowerFlag === "-c" || lowerFlag === "--command" || /^-[^-]*c[^-]*$/i.test(lowerFlag); +} + +function isDynamicPayload(payload: string, dynamicArguments: DynamicArgument[]): boolean { + return dynamicArguments.some((argument) => argument.value === payload); +} + +function payloadBaseFromArgument(argument: CommandArgument, payload: string): SpanBase | null { + const payloadOffset = argument.value.indexOf(payload); + if (payloadOffset < 0) { + return null; + } + const rawPayloadOffset = argument.decodedSourceOffsets[payloadOffset]; + if (rawPayloadOffset === undefined) { + return null; + } + const prefix = argument.text.slice(0, rawPayloadOffset); + return { + startIndex: argument.span.startIndex + rawPayloadOffset, + startPosition: advancePosition(argument.span.startPosition, prefix), + mapOffset(offset) { + const rawOffset = argument.decodedSourceOffsets[payloadOffset + offset]; + const mappedRawOffset = rawOffset ?? rawPayloadOffset + offset; + return { + index: argument.span.startIndex + mappedRawOffset, + position: advancePosition( + argument.span.startPosition, + argument.text.slice(0, mappedRawOffset), + ), + }; + }, + }; +} + +function payloadBaseFromArguments( + payload: string, + argumentsList: CommandArgument[], +): SpanBase | null { + const exactArgument = argumentsList.find((argument) => argument.value === payload); + if (exactArgument) { + return payloadBaseFromArgument(exactArgument, payload); + } + for (const argument of argumentsList) { + const base = payloadBaseFromArgument(argument, payload); + if (base) { + return base; + } + } + return null; +} + +function shellWrapperPayloadForParsing( + argv: string[], + argumentsList: CommandArgument[], + dynamicArguments: DynamicArgument[], +): { command: string; spanBase: SpanBase } | null { + const shellWrapper = extractShellWrapperCommand(argv); + if ( + !shellWrapper.isWrapper || + !shellWrapper.command || + isDynamicPayload(shellWrapper.command, dynamicArguments) + ) { + return null; + } + 
const spanBase = payloadBaseFromArguments(shellWrapper.command, argumentsList); + if (!spanBase) { + return null; + } + const transportArgv = resolveShellWrapperTransportArgv(argv) ?? argv; + const commandFlag = shellCommandFlag(transportArgv, 1) ?? shellCommandFlag(argv, 1); + if (!canParseShellWrapperPayload(transportArgv, commandFlag?.flag ?? null)) { + return null; + } + return { command: shellWrapper.command, spanBase }; +} + +type InlineEvalHit = NonNullable>; + +function detectCarrierInlineEvalArgv(argv: string[]): InlineEvalHit | null { + const dispatchUnwrap = unwrapKnownDispatchWrapperInvocation(argv); + if (dispatchUnwrap.kind === "unwrapped") { + return detectInterpreterInlineEvalArgv(dispatchUnwrap.argv); + } + + const executable = normalizeExecutableToken(argv[0] ?? ""); + if (!SHELL_CARRIER_EXECUTABLES.has(executable)) { + return null; + } + for (let index = 1; index < argv.length; index += 1) { + const hit = detectInterpreterInlineEvalArgv(argv.slice(index)); + if (hit) { + return hit; + } + } + return null; +} + +function envSplitStringFlag(argv: string[]): string | null { + if (normalizeExecutableToken(argv[0] ?? "") !== "env") { + return null; + } + for (const arg of argv.slice(1)) { + const token = arg.trim(); + if (token === "-S" || token === "--split-string") { + return token; + } + if (token.startsWith("--split-string=") || (token.startsWith("-S") && token.length > 2)) { + return token.startsWith("--") ? 
"--split-string" : "-S"; + } + } + return null; +} + +function recordInlineEvalRisk( + inlineEval: InlineEvalHit, + text: string, + span: SourceSpan, + output: MutableExplanation, +): void { + output.risks.push({ + kind: "inline-eval", + command: inlineEval.normalizedExecutable, + flag: inlineEval.flag, + text, + span, + }); +} + +function recordDynamicArgumentRisks( + command: string, + dynamicArguments: DynamicArgument[], + output: MutableExplanation, +): void { + for (const argument of dynamicArguments) { + output.risks.push({ + kind: "dynamic-argument", + command, + argumentIndex: argument.index, + text: argument.text, + span: argument.span, + }); + } +} + +function recordCommandRisks( + argv: string[], + dynamicArguments: DynamicArgument[], + text: string, + span: SourceSpan, + output: MutableExplanation, +): void { + const executable = argv[0]; + if (!executable) { + return; + } + const normalizedExecutable = normalizeExecutableToken(executable); + recordDynamicArgumentRisks(normalizedExecutable, dynamicArguments, output); + const inlineEval = detectInterpreterInlineEvalArgv(argv) ?? detectCarrierInlineEvalArgv(argv); + if (inlineEval) { + recordInlineEvalRisk(inlineEval, text, span, output); + } + + const shellWrapper = extractShellWrapperCommand(argv); + if (shellWrapper.isWrapper && shellWrapper.command) { + const transportArgv = resolveShellWrapperTransportArgv(argv) ?? argv; + const shellExecutable = transportArgv[0] ?? executable; + const commandFlag = shellCommandFlag(transportArgv, 1) ?? shellCommandFlag(argv, 1); + if (isShellWrapperExecutable(executable)) { + output.risks.push({ + kind: "shell-wrapper", + executable: shellExecutable, + flag: commandFlag?.flag ?? 
"-c", + payload: shellWrapper.command, + text, + span, + }); + } else { + output.risks.push({ + kind: "shell-wrapper-through-carrier", + command: normalizedExecutable, + text, + span, + }); + } + } + + if (normalizedExecutable === "find") { + const flag = argv.find((arg) => ["-exec", "-execdir", "-ok", "-okdir"].includes(arg)); + if (flag) { + output.risks.push({ kind: "command-carrier", command: executable, flag, text, span }); + } + } + if (normalizedExecutable === "xargs") { + output.risks.push({ kind: "command-carrier", command: normalizedExecutable, text, span }); + } + const splitStringFlag = envSplitStringFlag(argv); + if (splitStringFlag) { + output.risks.push({ + kind: "command-carrier", + command: normalizedExecutable, + flag: splitStringFlag, + text, + span, + }); + } + if (normalizedExecutable === "eval") { + output.risks.push({ kind: "eval", text, span }); + } + if (SOURCE_EXECUTABLES.has(normalizedExecutable)) { + output.risks.push({ kind: "source", command: normalizedExecutable, text, span }); + } + if (normalizedExecutable === "alias") { + output.risks.push({ kind: "alias", text, span }); + } + if (!shellWrapper.isWrapper && SHELL_CARRIER_EXECUTABLES.has(normalizedExecutable)) { + const shellIndex = argv.findIndex((arg) => isShellWrapperExecutable(arg)); + if (shellIndex >= 0 && shellCommandFlag(argv, shellIndex + 1)) { + output.risks.push({ + kind: "shell-wrapper-through-carrier", + command: normalizedExecutable, + text, + span, + }); + } + + const carriedCommand = argv.slice(1).find((arg) => { + const normalized = normalizeExecutableToken(arg); + return normalized === "eval" || SOURCE_EXECUTABLES.has(normalized); + }); + const normalizedCarriedCommand = carriedCommand + ? 
normalizeExecutableToken(carriedCommand) + : undefined; + if (normalizedCarriedCommand === "eval") { + output.risks.push({ kind: "eval", text, span }); + } else if (normalizedCarriedCommand && SOURCE_EXECUTABLES.has(normalizedCarriedCommand)) { + output.risks.push({ + kind: "source", + command: normalizedCarriedCommand, + text, + span, + }); + } + } +} + +async function walk( + node: TreeSitterNode, + output: MutableExplanation, + context: CommandContext, + state: WalkState, +): Promise { + recordShape(node, output); + + const span = spanFromNode(node, state.spanBase); + let childContext = context; + if (node.type === "program" && hasEscapedLineContinuation(node.text)) { + output.risks.push({ kind: "line-continuation", text: node.text, span }); + } + + if (node.type === "function_definition") { + const nameNode = node.childForFieldName("name"); + output.risks.push({ + kind: "function-definition", + name: nameNode?.text ?? "", + text: node.text, + span, + }); + childContext = "function-definition"; + } else if (node.type === "command_substitution") { + output.risks.push({ kind: "command-substitution", text: node.text, span }); + childContext = "command-substitution"; + } else if (node.type === "process_substitution") { + output.risks.push({ kind: "process-substitution", text: node.text, span }); + childContext = "process-substitution"; + } else if (node.type === "heredoc_redirect") { + output.risks.push({ kind: "heredoc", text: node.text, span }); + } else if (node.type === "herestring_redirect") { + output.risks.push({ kind: "here-string", text: node.text, span }); + } else if (node.type === "file_redirect") { + output.risks.push({ kind: "redirect", text: node.text, span }); + } else if (node.type === "ERROR") { + output.risks.push({ kind: "syntax-error", text: node.text, span }); + } + + if ( + node.type === "command" || + node.type === "declaration_command" || + node.type === "test_command" + ) { + const nameNode = node.type === "command" ? 
commandNameNode(node) : null; + const parsed = + node.type === "command" + ? nameNode + ? argvFromCommand(node, nameNode, state) + : null + : node.type === "declaration_command" + ? argvFromDeclarationCommand(node, state) + : argvFromTestCommand(node, state); + if (node.type === "command" && nameNode && !parsed) { + output.risks.push({ + kind: "dynamic-executable", + text: nameNode.text, + span: spanFromNode(nameNode, state.spanBase), + }); + } else if (parsed) { + const step: CommandStep = { + context, + executable: parsed.argv[0] ?? "", + argv: parsed.argv, + text: node.text, + span, + }; + if (step.executable) { + output.commands.push(step); + recordCommandRisks(parsed.argv, parsed.dynamicArguments, node.text, span, output); + const wrapperPayload = shellWrapperPayloadForParsing( + parsed.argv, + parsed.arguments, + parsed.dynamicArguments, + ); + if (wrapperPayload && state.wrapperPayloadDepth < MAX_WRAPPER_PAYLOAD_DEPTH) { + const wrapperTree = await parseBashForCommandExplanation(wrapperPayload.command); + const wrapperSpanBase = spanBaseForParserSource( + wrapperPayload.command, + wrapperTree.rootNode, + wrapperPayload.spanBase, + ); + try { + if (wrapperTree.rootNode.hasError) { + output.hasParseError = true; + output.risks.push({ + kind: "syntax-error", + text: wrapperPayload.command, + span: spanFromNode(wrapperTree.rootNode, wrapperSpanBase), + }); + } + await walk(wrapperTree.rootNode, output, "wrapper-payload", { + wrapperPayloadDepth: state.wrapperPayloadDepth + 1, + spanBase: wrapperSpanBase, + }); + } finally { + wrapperTree.delete(); + } + } + } + } + } + for (const child of namedChildren(node)) { + await walk(child, output, childContext, state); + } +} + +export async function explainShellCommand(source: string): Promise { + const tree = await parseBashForCommandExplanation(source); + try { + const spanBase = spanBaseForParserSource(source, tree.rootNode, ROOT_SPAN_BASE); + const output: MutableExplanation = { + shapes: new Set(), + commands: [], 
+ risks: [], + hasParseError: tree.rootNode.hasError, + }; + await walk(tree.rootNode, output, "top-level", { + wrapperPayloadDepth: 0, + spanBase, + }); + const topLevelCommands = output.commands.filter((command) => command.context === "top-level"); + return { + ok: !output.hasParseError, + source, + shapes: [...output.shapes], + topLevelCommands, + nestedCommands: output.commands.filter((command) => command.context !== "top-level"), + risks: output.risks, + }; + } finally { + tree.delete(); + } +} diff --git a/src/infra/command-explainer/index.ts b/src/infra/command-explainer/index.ts new file mode 100644 index 00000000000..57414ac1430 --- /dev/null +++ b/src/infra/command-explainer/index.ts @@ -0,0 +1,9 @@ +export { explainShellCommand } from "./extract.js"; +export type { + CommandContext, + CommandExplanation, + CommandRisk, + CommandShape, + CommandStep, + SourceSpan, +} from "./types.js"; diff --git a/src/infra/command-explainer/tree-sitter-runtime.ts b/src/infra/command-explainer/tree-sitter-runtime.ts new file mode 100644 index 00000000000..08792a7851a --- /dev/null +++ b/src/infra/command-explainer/tree-sitter-runtime.ts @@ -0,0 +1,107 @@ +import fs from "node:fs"; +import { createRequire } from "node:module"; +import path from "node:path"; +import * as TreeSitter from "web-tree-sitter"; + +const require = createRequire(import.meta.url); + +let parserPromise: Promise | null = null; +let parserLoader: () => Promise = loadParser; +const MAX_COMMAND_EXPLANATION_SOURCE_CHARS = 128 * 1024; +const MAX_COMMAND_EXPLANATION_PARSE_MS = 500; + +export function resolvePackageFileForCommandExplanation( + packageName: string, + fileName: string, +): string { + let packageEntry: string; + try { + packageEntry = require.resolve(packageName); + } catch (error) { + throw new Error( + `Unable to resolve ${packageName} while loading the shell command explainer parser`, + { cause: error }, + ); + } + + let directory = path.dirname(packageEntry); + const searched: string[] = 
/**
 * Return the shared bash parser, starting the load on first use.
 *
 * The load promise is memoized in `parserPromise`, so every caller awaits the
 * same promise until it is cleared. A rejected load evicts itself so that the
 * next call retries instead of replaying the cached failure.
 *
 * @returns Promise for the parser produced by the current `parserLoader`.
 */
export function getBashParserForCommandExplanation(): Promise<TreeSitter.Parser> {
  if (parserPromise !== null) {
    return parserPromise;
  }
  const pending: Promise<TreeSitter.Parser> = parserLoader().catch((error: unknown) => {
    // Evict only our own cache entry. The test hook can clear the cache while
    // this load is still in flight, and a later call may already have stored a
    // newer promise that this stale rejection must not wipe out.
    if (parserPromise === pending) {
      parserPromise = null;
    }
    throw error;
  });
  parserPromise = pending;
  return pending;
}
/**
 * Low-level parser access for tests and parser diagnostics.
 * Callers own the returned Tree and must call tree.delete().
 * Prefer explainShellCommand for normal command-explainer use.
 *
 * @param source - Shell text to parse; rejected up front when longer than
 *   MAX_COMMAND_EXPLANATION_SOURCE_CHARS.
 * @returns The parsed tree. It may still contain error nodes; this function
 *   only fails when no tree is produced at all.
 * @throws Error when the source is too large, when parsing exceeds
 *   MAX_COMMAND_EXPLANATION_PARSE_MS, or when the parser returns no tree.
 */
export async function parseBashForCommandExplanation(source: string): Promise<TreeSitter.Tree> {
  if (source.length > MAX_COMMAND_EXPLANATION_SOURCE_CHARS) {
    throw new Error("Shell command is too large to explain");
  }

  const parser = await getBashParserForCommandExplanation();
  const deadlineMs = performance.now() + MAX_COMMAND_EXPLANATION_PARSE_MS;
  let timedOut = false;
  // Returning true from the progress callback asks the parser to stop; the
  // flag is kept so the failure below can be reported as a timeout.
  const progressCallback = (): boolean => {
    timedOut = performance.now() > deadlineMs;
    return timedOut;
  };

  const tree = parser.parse(source, null, { progressCallback });
  if (tree) {
    return tree;
  }

  // The parser instance is shared across calls, so reset it before reporting
  // the failure rather than leaving it mid-parse for the next caller.
  parser.reset();
  if (timedOut) {
    throw new Error(
      `tree-sitter-bash timed out after ${MAX_COMMAND_EXPLANATION_PARSE_MS}ms while parsing shell command`,
    );
  }
  throw new Error("tree-sitter-bash returned no parse tree");
}
/** Result of explaining one shell command, as returned by explainShellCommand. */
export type CommandExplanation = {
  /** False when a parse error was recorded for the source or during the walk. */
  ok: boolean;
  /** The original command text that was explained, unchanged. */
  source: string;
  /** Distinct control-flow shapes collected while walking the tree. */
  shapes: CommandShape[];
  /** Recorded command steps whose context is "top-level". */
  topLevelCommands: CommandStep[];
  /** Recorded command steps from every other context. */
  nestedCommands: CommandStep[];
  /** Risks recorded during the walk, in the order they were recorded. */
  risks: CommandRisk[];
};