diff --git a/CHANGELOG.md b/CHANGELOG.md index 58a4c921196..f544749c59d 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -24,6 +24,7 @@ Docs: https://docs.openclaw.ai ### Changes - Gateway/nodes: add disabled-by-default `gateway.nodes.pairing.autoApproveCidrs` for first-time node pairing from explicit trusted CIDRs, while keeping operator/browser pairing and all upgrade flows manual. Fixes #60800. Thanks @sahilsatralkar. +- Browser: add viewport coordinate clicks for managed and existing-session automation, plus `openclaw browser click-coords` for CLI use. (#54452) Thanks @dluttz. - Browser/config: support per-profile `browser.profiles.<name>.headless` overrides for locally launched browser profiles, so one profile can run headless without forcing all browser profiles headless. Thanks @nakamotoliu. - Plugins/PDF: move local PDF extraction into a bundled `document-extract` plugin so core no longer owns `pdfjs-dist` or PDF image-rendering dependencies. Thanks @vincentkoc. - Matrix: require full cross-signing identity trust for self-device verification and add `openclaw matrix verify self` so operators can establish that trust from the CLI. (#70401) Thanks @gumadeiras. 
diff --git a/docs/cli/browser.md b/docs/cli/browser.md index 032ed879290..38ab18d4a99 100644 --- a/docs/cli/browser.md +++ b/docs/cli/browser.md @@ -164,6 +164,7 @@ Navigate/click/type (ref-based UI automation): ```bash openclaw browser navigate https://example.com openclaw browser click <ref> +openclaw browser click-coords 120 340 openclaw browser type <ref> "hello" openclaw browser press Enter openclaw browser hover <ref> diff --git a/docs/tools/browser-control.md b/docs/tools/browser-control.md index 7cc5878b3b5..31164ec16ae 100644 --- a/docs/tools/browser-control.md +++ b/docs/tools/browser-control.md @@ -165,6 +165,7 @@ openclaw browser responsebody "**/api" --max-chars 5000 openclaw browser navigate https://example.com openclaw browser resize 1280 720 openclaw browser click 12 --double # or e12 for role refs +openclaw browser click-coords 120 340 # viewport coordinates openclaw browser type 23 "hello" --submit openclaw browser press Enter openclaw browser hover 44 @@ -212,7 +213,7 @@ openclaw browser set device "iPhone 14" Notes: - `upload` and `dialog` are **arming** calls; run them before the click/press that triggers the chooser/dialog. -- `click`/`type`/etc require a `ref` from `snapshot` (numeric `12` or role ref `e12`). CSS selectors are intentionally not supported for actions. +- `click`/`type`/etc require a `ref` from `snapshot` (numeric `12` or role ref `e12`). CSS selectors are intentionally not supported for actions. Use `click-coords` when the visible viewport position is the only reliable target. - Download, trace, and upload paths are constrained to OpenClaw temp roots: `/tmp/openclaw{,/downloads,/uploads}` (fallback: `${os.tmpdir()}/openclaw/...`). - `upload` can also set file inputs directly via `--input-ref` or `--element`. 
diff --git a/docs/tools/browser.md b/docs/tools/browser.md index aa3ad316d1c..50962d93fb4 100644 --- a/docs/tools/browser.md +++ b/docs/tools/browser.md @@ -529,7 +529,7 @@ Notes: Compared to the managed `openclaw` profile, existing-session drivers are more constrained: - **Screenshots** — page captures and `--ref` element captures work; CSS `--element` selectors do not. `--full-page` cannot combine with `--ref` or `--element`. Playwright is not required for page or ref-based element screenshots. -- **Actions** — `click`, `type`, `hover`, `scrollIntoView`, `drag`, and `select` require snapshot refs (no CSS selectors). `click` is left-button only. `type` does not support `slowly=true`; use `fill` or `press`. `press` does not support `delayMs`. `type`, `hover`, `scrollIntoView`, `drag`, `select`, `fill`, and `evaluate` do not support per-call timeouts. `select` accepts a single value. +- **Actions** — `click`, `type`, `hover`, `scrollIntoView`, `drag`, and `select` require snapshot refs (no CSS selectors). `click-coords` clicks visible viewport coordinates and does not require a snapshot ref. `click` is left-button only. `type` does not support `slowly=true`; use `fill` or `press`. `press` does not support `delayMs`. `type`, `hover`, `scrollIntoView`, `drag`, `select`, `fill`, and `evaluate` do not support per-call timeouts. `select` accepts a single value. - **Wait / upload / dialog** — `wait --url` supports exact, substring, and glob patterns; `wait --load networkidle` is not supported. Upload hooks require `ref` or `inputRef`, one file at a time, no CSS `element`. Dialog hooks do not support timeout overrides. - **Managed-only features** — batch actions, PDF export, download interception, and `responsebody` still require the managed browser path. 
diff --git a/extensions/browser/src/browser-tool.schema.ts b/extensions/browser/src/browser-tool.schema.ts index a1c336f04f4..1750da81610 100644 --- a/extensions/browser/src/browser-tool.schema.ts +++ b/extensions/browser/src/browser-tool.schema.ts @@ -3,6 +3,7 @@ import { Type } from "typebox"; const BROWSER_ACT_KINDS = [ "click", + "clickCoords", "type", "press", "hover", @@ -55,6 +56,8 @@ const BrowserActSchema = Type.Object({ doubleClick: Type.Optional(Type.Boolean()), button: Type.Optional(Type.String()), modifiers: Type.Optional(Type.Array(Type.String())), + x: Type.Optional(Type.Number()), + y: Type.Optional(Type.Number()), // type text: Type.Optional(Type.String()), submit: Type.Optional(Type.Boolean()), @@ -122,6 +125,8 @@ export const BrowserToolSchema = Type.Object({ doubleClick: Type.Optional(Type.Boolean()), button: Type.Optional(Type.String()), modifiers: Type.Optional(Type.Array(Type.String())), + x: Type.Optional(Type.Number()), + y: Type.Optional(Type.Number()), text: Type.Optional(Type.String()), submit: Type.Optional(Type.Boolean()), slowly: Type.Optional(Type.Boolean()), diff --git a/extensions/browser/src/browser-tool.ts b/extensions/browser/src/browser-tool.ts index f40cee6dc29..1cb6f4930b5 100644 --- a/extensions/browser/src/browser-tool.ts +++ b/extensions/browser/src/browser-tool.ts @@ -147,6 +147,8 @@ const LEGACY_BROWSER_ACT_REQUEST_KEYS = [ "doubleClick", "button", "modifiers", + "x", + "y", "text", "submit", "slowly", diff --git a/extensions/browser/src/browser/chrome-mcp.ts b/extensions/browser/src/browser/chrome-mcp.ts index a940bd0fbb8..3853f982eca 100644 --- a/extensions/browser/src/browser/chrome-mcp.ts +++ b/extensions/browser/src/browser/chrome-mcp.ts @@ -824,6 +824,65 @@ export async function clickChromeMcpElement(params: { ); } +export async function clickChromeMcpCoords(params: { + profileName: string; + userDataDir?: string; + targetId: string; + x: number; + y: number; + doubleClick?: boolean; + button?: "left" | "right" | 
"middle"; + delayMs?: number; +}): Promise<void> { + const button = params.button ?? "left"; + const buttonCode = button === "middle" ? 1 : button === "right" ? 2 : 0; + const pressedButtons = button === "middle" ? 4 : button === "right" ? 2 : 1; + const x = JSON.stringify(params.x); + const y = JSON.stringify(params.y); + const delayMs = JSON.stringify(Math.max(0, Math.floor(params.delayMs ?? 0))); + const doubleClick = params.doubleClick ? "true" : "false"; + await evaluateChromeMcpScript({ + profileName: params.profileName, + userDataDir: params.userDataDir, + targetId: params.targetId, + fn: `async () => { + const x = ${x}; + const y = ${y}; + const delayMs = ${delayMs}; + const doubleClick = ${doubleClick}; + const target = document.elementFromPoint(x, y) ?? document.body ?? document.documentElement ?? document; + const base = { + bubbles: true, + cancelable: true, + view: window, + clientX: x, + clientY: y, + screenX: window.screenX + x, + screenY: window.screenY + y, + button: ${buttonCode}, + }; + const pressedButtons = ${pressedButtons}; + const dispatch = (type, buttons, detail) => { + target.dispatchEvent(new MouseEvent(type, { ...base, buttons, detail })); + }; + dispatch("mousemove", 0, 0); + dispatch("mousedown", pressedButtons, 1); + if (delayMs > 0) { + await new Promise((resolve) => setTimeout(resolve, delayMs)); + } + dispatch("mouseup", 0, 1); + dispatch("click", 0, 1); + if (doubleClick) { + dispatch("mousedown", pressedButtons, 2); + dispatch("mouseup", 0, 2); + dispatch("click", 0, 2); + dispatch("dblclick", 0, 2); + } + return true; + }`, + }); +} + export async function fillChromeMcpElement(params: { profileName: string; userDataDir?: string; diff --git a/extensions/browser/src/browser/client-actions.types.ts b/extensions/browser/src/browser/client-actions.types.ts index 167e9c9469c..b4aa244a28d 100644 --- a/extensions/browser/src/browser/client-actions.types.ts +++ b/extensions/browser/src/browser/client-actions.types.ts @@ -16,6 +16,16 @@ export 
type BrowserActRequest = delayMs?: number; timeoutMs?: number; } + | { + kind: "clickCoords"; + x: number; + y: number; + targetId?: string; + doubleClick?: boolean; + button?: string; + delayMs?: number; + timeoutMs?: number; + } | { kind: "type"; ref?: string; diff --git a/extensions/browser/src/browser/pw-tools-core.interactions.ts b/extensions/browser/src/browser/pw-tools-core.interactions.ts index ba2af27ad0e..5463889dc96 100644 --- a/extensions/browser/src/browser/pw-tools-core.interactions.ts +++ b/extensions/browser/src/browser/pw-tools-core.interactions.ts @@ -592,6 +592,35 @@ export async function clickViaPlaywright(opts: { } } +export async function clickCoordsViaPlaywright(opts: { + cdpUrl: string; + targetId?: string; + x: number; + y: number; + doubleClick?: boolean; + button?: "left" | "right" | "middle"; + delayMs?: number; + timeoutMs?: number; + ssrfPolicy?: SsrFPolicy; +}): Promise<void> { + const page = await getRestoredPageForTarget(opts); + const previousUrl = page.url(); + await assertInteractionNavigationCompletedSafely({ + action: async () => { + await page.mouse.click(opts.x, opts.y, { + button: opts.button, + clickCount: opts.doubleClick ? 
2 : 1, + delay: resolveBoundedDelayMs(opts.delayMs, "clickCoords delayMs", ACT_MAX_CLICK_DELAY_MS), + }); + }, + cdpUrl: opts.cdpUrl, + page, + previousUrl, + ssrfPolicy: opts.ssrfPolicy, + targetId: opts.targetId, + }); +} + export async function hoverViaPlaywright(opts: { cdpUrl: string; targetId?: string; @@ -1244,6 +1273,19 @@ async function executeSingleAction( ssrfPolicy, }); break; + case "clickCoords": + await clickCoordsViaPlaywright({ + cdpUrl, + targetId: effectiveTargetId, + x: action.x, + y: action.y, + doubleClick: action.doubleClick, + button: action.button as "left" | "right" | "middle" | undefined, + delayMs: action.delayMs, + timeoutMs: action.timeoutMs, + ssrfPolicy, + }); + break; case "type": await typeViaPlaywright({ cdpUrl, diff --git a/extensions/browser/src/browser/routes/agent.act.normalize.ts b/extensions/browser/src/browser/routes/agent.act.normalize.ts index 50a16e85b25..7f8d33afd32 100644 --- a/extensions/browser/src/browser/routes/agent.act.normalize.ts +++ b/extensions/browser/src/browser/routes/agent.act.normalize.ts @@ -114,6 +114,36 @@ export function normalizeActRequest( ...(timeoutMs !== undefined ? { timeoutMs } : {}), }; } + case "clickCoords": { + const x = toNumber(body.x); + const y = toNumber(body.y); + if (x === undefined || y === undefined || x < 0 || y < 0) { + throw new Error("clickCoords requires non-negative x and y"); + } + const buttonRaw = toStringOrEmpty(body.button); + const button = buttonRaw ? parseClickButton(buttonRaw) : undefined; + if (buttonRaw && !button) { + throw new Error("clickCoords button must be left|right|middle"); + } + const doubleClick = toBoolean(body.doubleClick); + const delayMs = normalizeActBoundedNonNegativeMs( + toNumber(body.delayMs), + "clickCoords delayMs", + ACT_MAX_CLICK_DELAY_MS, + ); + const timeoutMs = toNumber(body.timeoutMs); + const targetId = toStringOrEmpty(body.targetId) || undefined; + return { + kind, + x, + y, + ...(targetId ? 
{ targetId } : {}), + ...(doubleClick !== undefined ? { doubleClick } : {}), + ...(button ? { button } : {}), + ...(delayMs !== undefined ? { delayMs } : {}), + ...(timeoutMs !== undefined ? { timeoutMs } : {}), + }; + } case "type": { const ref = toStringOrEmpty(body.ref) || undefined; const selector = toStringOrEmpty(body.selector) || undefined; diff --git a/extensions/browser/src/browser/routes/agent.act.shared.ts b/extensions/browser/src/browser/routes/agent.act.shared.ts index b22f35e7ef2..bfa04d65588 100644 --- a/extensions/browser/src/browser/routes/agent.act.shared.ts +++ b/extensions/browser/src/browser/routes/agent.act.shared.ts @@ -1,6 +1,7 @@ export const ACT_KINDS = [ "batch", "click", + "clickCoords", "close", "drag", "evaluate", diff --git a/extensions/browser/src/browser/routes/agent.act.ts b/extensions/browser/src/browser/routes/agent.act.ts index 63356a1afa3..c75dafb0b40 100644 --- a/extensions/browser/src/browser/routes/agent.act.ts +++ b/extensions/browser/src/browser/routes/agent.act.ts @@ -1,6 +1,7 @@ import { formatErrorMessage } from "../../infra/errors.js"; import { clickChromeMcpElement, + clickChromeMcpCoords, closeChromeMcpTab, dragChromeMcpElement, evaluateChromeMcpScript, @@ -279,6 +280,8 @@ function getExistingSessionUnsupportedMessage(action: BrowserActRequest): string return EXISTING_SESSION_LIMITS.act.clickButtonOrModifiers; } return null; + case "clickCoords": + return null; case "type": if (action.selector) { return EXISTING_SESSION_LIMITS.act.typeSelector; @@ -425,6 +428,22 @@ export function registerBrowserAgentActRoutes( guard: existingSessionNavigationGuard, }); return res.json({ ok: true, targetId: tab.targetId, url: tab.url }); + case "clickCoords": + await runExistingSessionActionWithNavigationGuard({ + execute: () => + clickChromeMcpCoords({ + profileName, + userDataDir: profileCtx.profile.userDataDir, + targetId: tab.targetId, + x: action.x, + y: action.y, + doubleClick: action.doubleClick ?? 
false, + button: action.button as "left" | "right" | "middle" | undefined, + delayMs: action.delayMs, + }), + guard: existingSessionNavigationGuard, + }); + return res.json({ ok: true, targetId: tab.targetId, url: tab.url }); case "type": await runExistingSessionActionWithNavigationGuard({ execute: async () => { @@ -610,6 +629,7 @@ export function registerBrowserAgentActRoutes( result: result.result, }); case "click": + case "clickCoords": case "resize": return res.json({ ok: true, targetId: tab.targetId, url: tab.url }); default: diff --git a/extensions/browser/src/browser/routes/agent.existing-session.test.ts b/extensions/browser/src/browser/routes/agent.existing-session.test.ts index 118ab6857fd..d48a70dfb8c 100644 --- a/extensions/browser/src/browser/routes/agent.existing-session.test.ts +++ b/extensions/browser/src/browser/routes/agent.existing-session.test.ts @@ -8,6 +8,7 @@ import { createBrowserRouteApp, createBrowserRouteResponse } from "./test-helper const routeState = existingSessionRouteState; const chromeMcpMocks = vi.hoisted(() => ({ + clickChromeMcpCoords: vi.fn(async () => {}), clickChromeMcpElement: vi.fn(async () => {}), evaluateChromeMcpScript: vi.fn( async (_params: { profileName: string; targetId: string; fn: string }) => true, @@ -30,6 +31,7 @@ const navigationGuardMocks = vi.hoisted(() => ({ })); vi.mock("../chrome-mcp.js", () => ({ + clickChromeMcpCoords: chromeMcpMocks.clickChromeMcpCoords, clickChromeMcpElement: chromeMcpMocks.clickChromeMcpElement, closeChromeMcpTab: vi.fn(async () => {}), dragChromeMcpElement: vi.fn(async () => {}), @@ -108,6 +110,7 @@ describe("existing-session browser routes", () => { beforeEach(() => { routeState.profileCtx.ensureTabAvailable.mockClear(); routeState.profileCtx.listTabs.mockClear(); + chromeMcpMocks.clickChromeMcpCoords.mockClear(); chromeMcpMocks.clickChromeMcpElement.mockClear(); chromeMcpMocks.evaluateChromeMcpScript.mockReset(); chromeMcpMocks.fillChromeMcpElement.mockClear(); @@ -313,4 +316,31 @@ 
describe("existing-session browser routes", () => { signal: ctrl.signal, }); }); + + it("supports coordinate clicks for existing-session profiles", async () => { + const handler = getActPostHandler(); + const response = createBrowserRouteResponse(); + + await handler?.( + { + params: {}, + query: {}, + body: { kind: "clickCoords", x: 25, y: "32", doubleClick: true, delayMs: 5 }, + }, + response.res, + ); + + expect(response.statusCode).toBe(200); + expect(response.body).toMatchObject({ ok: true, targetId: "7", url: "https://example.com" }); + expect(chromeMcpMocks.clickChromeMcpCoords).toHaveBeenCalledWith({ + profileName: "chrome-live", + userDataDir: undefined, + targetId: "7", + x: 25, + y: 32, + doubleClick: true, + button: undefined, + delayMs: 5, + }); + }); }); diff --git a/extensions/browser/src/browser/server.agent-contract-core.test.ts b/extensions/browser/src/browser/server.agent-contract-core.test.ts index 9a504c34362..96f800f2d9f 100644 --- a/extensions/browser/src/browser/server.agent-contract-core.test.ts +++ b/extensions/browser/src/browser/server.agent-contract-core.test.ts @@ -82,6 +82,23 @@ describe("browser control server", () => { slowTimeoutMs, ); + it( + "returns ACT_INVALID_REQUEST for malformed coordinate clicks", + async () => { + const base = await startServerAndBase(); + const response = await postActAndReadError(base, { + kind: "clickCoords", + x: -1, + y: 20, + }); + + expect(response.status).toBe(400); + expect(response.body.code).toBe("ACT_INVALID_REQUEST"); + expect(response.body.error).toContain("clickCoords requires non-negative x and y"); + }, + slowTimeoutMs, + ); + it( "returns ACT_EXISTING_SESSION_UNSUPPORTED for unsupported existing-session actions", async () => { @@ -297,6 +314,31 @@ describe("browser control server", () => { const [clickSelectorArgs] = pwMocks.clickViaPlaywright.mock.calls[1] ?? 
[]; expect((clickSelectorArgs as { doubleClick?: boolean }).doubleClick).toBeUndefined(); + const clickCoords = await postJson<{ ok: boolean; url?: string }>(`${base}/act`, { + kind: "clickCoords", + x: "42.5", + y: 64, + doubleClick: "true", + button: "left", + delayMs: "10", + }); + expect(clickCoords.ok).toBe(true); + expect(clickCoords.url).toBe("https://example.com"); + expect(pwMocks.clickCoordsViaPlaywright).toHaveBeenCalledWith( + expect.objectContaining({ + cdpUrl: state.cdpBaseUrl, + targetId: "abcd1234", + x: 42.5, + y: 64, + doubleClick: true, + button: "left", + delayMs: 10, + ssrfPolicy: { + dangerouslyAllowPrivateNetwork: true, + }, + }), + ); + const type = await postJson<{ ok: boolean }>(`${base}/act`, { kind: "type", ref: "1", diff --git a/extensions/browser/src/browser/server.control-server.test-harness.ts b/extensions/browser/src/browser/server.control-server.test-harness.ts index be9211eb70c..6dd393e0133 100644 --- a/extensions/browser/src/browser/server.control-server.test-harness.ts +++ b/extensions/browser/src/browser/server.control-server.test-harness.ts @@ -147,6 +147,7 @@ const pwMocks = vi.hoisted(() => ({ armDialogViaPlaywright: vi.fn(async () => {}), armFileUploadViaPlaywright: vi.fn(async () => {}), batchViaPlaywright: vi.fn(async (_opts?: unknown) => ({ results: [] })), + clickCoordsViaPlaywright: vi.fn(async (_opts?: unknown) => {}), clickViaPlaywright: vi.fn(async (_opts?: unknown) => {}), closePageViaPlaywright: vi.fn(async (_opts?: unknown) => {}), closePlaywrightBrowserConnection: vi.fn(async () => {}), @@ -194,6 +195,11 @@ const passThroughActDispatch: Record = { fields: ["ref", "selector", "doubleClick", "button", "modifiers", "delayMs", "timeoutMs"], includeSsrf: true, }, + clickCoords: { + mock: pwMocks.clickCoordsViaPlaywright, + fields: ["x", "y", "doubleClick", "button", "delayMs", "timeoutMs"], + includeSsrf: true, + }, type: { mock: pwMocks.typeViaPlaywright, fields: ["ref", "selector", "text", "submit", "slowly", 
"timeoutMs"], @@ -301,6 +307,7 @@ export function getPwMocks(): Record { } const chromeMcpMocks = vi.hoisted(() => ({ + clickChromeMcpCoords: vi.fn(async () => {}), clickChromeMcpElement: vi.fn(async () => {}), closeChromeMcpSession: vi.fn(async () => true), closeChromeMcpTab: vi.fn(async () => {}), diff --git a/extensions/browser/src/cli/browser-cli-actions-input/register.element.ts b/extensions/browser/src/cli/browser-cli-actions-input/register.element.ts index a71eabc48a7..810df665322 100644 --- a/extensions/browser/src/cli/browser-cli-actions-input/register.element.ts +++ b/extensions/browser/src/cli/browser-cli-actions-input/register.element.ts @@ -75,6 +75,37 @@ export function registerBrowserElementCommands( }); }); + browser + .command("click-coords") + .description("Click viewport coordinates") + .argument("<x>", "Viewport x coordinate") + .argument("<y>", "Viewport y coordinate") + .option("--target-id <id>", "CDP target id (or unique prefix)") + .option("--double", "Double click", false) + .option("--button <left|right|middle>", "Mouse button to use") + .option("--delay-ms <ms>", "Delay between mouse down/up", (v: string) => Number(v)) + .action(async (xRaw: string, yRaw: string, opts, cmd) => { + const x = Number(xRaw); + const y = Number(yRaw); + await runElementAction({ + cmd, + body: { + kind: "clickCoords", + x, + y, + targetId: normalizeOptionalString(opts.targetId), + doubleClick: Boolean(opts.double), + button: normalizeOptionalString(opts.button), + delayMs: Number.isFinite(opts.delayMs) ? opts.delayMs : undefined, + }, + successMessage: (result) => { + const url = (result as { url?: unknown }).url; + const suffix = typeof url === "string" && url ? 
` on ${url}` : ""; + return `clicked ${x},${y}${suffix}`; + }, + }); + }); + browser .command("type") .description("Type into an element by ref from snapshot") diff --git a/extensions/browser/src/cli/browser-cli-examples.ts b/extensions/browser/src/cli/browser-cli-examples.ts index 7e6df7cd6db..de621a80f60 100644 --- a/extensions/browser/src/cli/browser-cli-examples.ts +++ b/extensions/browser/src/cli/browser-cli-examples.ts @@ -19,6 +19,7 @@ export const browserActionExamples = [ "openclaw browser navigate https://example.com", "openclaw browser resize 1280 720", "openclaw browser click 12 --double", + "openclaw browser click-coords 120 340", 'openclaw browser type 23 "hello" --submit', "openclaw browser press Enter", "openclaw browser hover 44",