feat(browser): add coordinate click action

Co-authored-by: Daniel Lutts <daniellutts@10-19-94-204.dynapool.wireless.nyu.edu>
This commit is contained in:
Peter Steinberger
2026-04-25 07:31:18 +01:00
parent 982230f460
commit 209d50b52c
17 changed files with 285 additions and 2 deletions

View File

@@ -24,6 +24,7 @@ Docs: https://docs.openclaw.ai
### Changes
- Gateway/nodes: add disabled-by-default `gateway.nodes.pairing.autoApproveCidrs` for first-time node pairing from explicit trusted CIDRs, while keeping operator/browser pairing and all upgrade flows manual. Fixes #60800. Thanks @sahilsatralkar.
- Browser: add viewport coordinate clicks for managed and existing-session automation, plus `openclaw browser click-coords` for CLI use. (#54452) Thanks @dluttz.
- Browser/config: support per-profile `browser.profiles.<name>.headless` overrides for locally launched browser profiles, so one profile can run headless without forcing all browser profiles headless. Thanks @nakamotoliu.
- Plugins/PDF: move local PDF extraction into a bundled `document-extract` plugin so core no longer owns `pdfjs-dist` or PDF image-rendering dependencies. Thanks @vincentkoc.
- Matrix: require full cross-signing identity trust for self-device verification and add `openclaw matrix verify self` so operators can establish that trust from the CLI. (#70401) Thanks @gumadeiras.

View File

@@ -164,6 +164,7 @@ Navigate/click/type (ref-based UI automation):
```bash
openclaw browser navigate https://example.com
openclaw browser click <ref>
openclaw browser click-coords 120 340
openclaw browser type <ref> "hello"
openclaw browser press Enter
openclaw browser hover <ref>

View File

@@ -165,6 +165,7 @@ openclaw browser responsebody "**/api" --max-chars 5000
openclaw browser navigate https://example.com
openclaw browser resize 1280 720
openclaw browser click 12 --double # or e12 for role refs
openclaw browser click-coords 120 340 # viewport coordinates
openclaw browser type 23 "hello" --submit
openclaw browser press Enter
openclaw browser hover 44
@@ -212,7 +213,7 @@ openclaw browser set device "iPhone 14"
Notes:
- `upload` and `dialog` are **arming** calls; run them before the click/press that triggers the chooser/dialog.
- `click`/`type`/etc require a `ref` from `snapshot` (numeric `12` or role ref `e12`). CSS selectors are intentionally not supported for actions.
- `click`/`type`/etc require a `ref` from `snapshot` (numeric `12` or role ref `e12`). CSS selectors are intentionally not supported for actions. Use `click-coords` when the visible viewport position is the only reliable target.
- Download, trace, and upload paths are constrained to OpenClaw temp roots: `/tmp/openclaw{,/downloads,/uploads}` (fallback: `${os.tmpdir()}/openclaw/...`).
- `upload` can also set file inputs directly via `--input-ref` or `--element`.

View File

@@ -529,7 +529,7 @@ Notes:
Compared to the managed `openclaw` profile, existing-session drivers are more constrained:
- **Screenshots** — page captures and `--ref` element captures work; CSS `--element` selectors do not. `--full-page` cannot combine with `--ref` or `--element`. Playwright is not required for page or ref-based element screenshots.
- **Actions** — `click`, `type`, `hover`, `scrollIntoView`, `drag`, and `select` require snapshot refs (no CSS selectors). `click` is left-button only. `type` does not support `slowly=true`; use `fill` or `press`. `press` does not support `delayMs`. `type`, `hover`, `scrollIntoView`, `drag`, `select`, `fill`, and `evaluate` do not support per-call timeouts. `select` accepts a single value.
- **Actions** — `click`, `type`, `hover`, `scrollIntoView`, `drag`, and `select` require snapshot refs (no CSS selectors). `click-coords` clicks visible viewport coordinates and does not require a snapshot ref. `click` is left-button only. `type` does not support `slowly=true`; use `fill` or `press`. `press` does not support `delayMs`. `type`, `hover`, `scrollIntoView`, `drag`, `select`, `fill`, and `evaluate` do not support per-call timeouts. `select` accepts a single value.
- **Wait / upload / dialog** — `wait --url` supports exact, substring, and glob patterns; `wait --load networkidle` is not supported. Upload hooks require `ref` or `inputRef`, one file at a time, no CSS `element`. Dialog hooks do not support timeout overrides.
- **Managed-only features** — batch actions, PDF export, download interception, and `responsebody` still require the managed browser path.

View File

@@ -3,6 +3,7 @@ import { Type } from "typebox";
const BROWSER_ACT_KINDS = [
"click",
"clickCoords",
"type",
"press",
"hover",
@@ -55,6 +56,8 @@ const BrowserActSchema = Type.Object({
doubleClick: Type.Optional(Type.Boolean()),
button: Type.Optional(Type.String()),
modifiers: Type.Optional(Type.Array(Type.String())),
x: Type.Optional(Type.Number()),
y: Type.Optional(Type.Number()),
// type
text: Type.Optional(Type.String()),
submit: Type.Optional(Type.Boolean()),
@@ -122,6 +125,8 @@ export const BrowserToolSchema = Type.Object({
doubleClick: Type.Optional(Type.Boolean()),
button: Type.Optional(Type.String()),
modifiers: Type.Optional(Type.Array(Type.String())),
x: Type.Optional(Type.Number()),
y: Type.Optional(Type.Number()),
text: Type.Optional(Type.String()),
submit: Type.Optional(Type.Boolean()),
slowly: Type.Optional(Type.Boolean()),

View File

@@ -147,6 +147,8 @@ const LEGACY_BROWSER_ACT_REQUEST_KEYS = [
"doubleClick",
"button",
"modifiers",
"x",
"y",
"text",
"submit",
"slowly",

View File

@@ -824,6 +824,65 @@ export async function clickChromeMcpElement(params: {
);
}
/**
 * Click a viewport coordinate in an existing-session tab by evaluating a page
 * script that dispatches synthetic mouse events at that point.
 *
 * The script targets `document.elementFromPoint(x, y)` (falling back to the
 * body, the root element, then the document) and dispatches, in order:
 * `mousemove`, `mousedown`, an optional pause of `delayMs`, `mouseup`, and the
 * activation event for the chosen button. The left button activates with
 * `click` (and `dblclick` after the second press of a double click). The right
 * and middle buttons activate with `auxclick`, and the right button is also
 * followed by `contextmenu`, so handlers registered for those events run the
 * same way they would for real input.
 *
 * @param params.profileName - Profile name forwarded to `evaluateChromeMcpScript`.
 * @param params.userDataDir - Optional user data dir forwarded unchanged.
 * @param params.targetId - Tab/target the script is evaluated in.
 * @param params.x - Viewport x coordinate, used as `clientX`.
 * @param params.y - Viewport y coordinate, used as `clientY`.
 * @param params.doubleClick - When true, a second press with `detail: 2` follows.
 * @param params.button - Mouse button to simulate; defaults to `"left"`.
 * @param params.delayMs - Pause between the first `mousedown` and `mouseup`;
 *   floored, clamped to >= 0, and treated as 0 when not finite.
 * @throws Error when `x` or `y` is not a finite number.
 */
export async function clickChromeMcpCoords(params: {
  profileName: string;
  userDataDir?: string;
  targetId: string;
  x: number;
  y: number;
  doubleClick?: boolean;
  button?: "left" | "right" | "middle";
  delayMs?: number;
}): Promise<void> {
  // JSON.stringify renders NaN/Infinity as `null`; the page would coerce that
  // to 0 and silently click the top-left corner instead of failing.
  if (!Number.isFinite(params.x) || !Number.isFinite(params.y)) {
    throw new Error("clickCoords requires finite x and y");
  }

  const button = params.button ?? "left";
  // MouseEvent.button: 0 = primary, 1 = auxiliary (middle), 2 = secondary (right).
  const buttonCode = button === "middle" ? 1 : button === "right" ? 2 : 0;
  // MouseEvent.buttons bitmask while held: 1 = primary, 2 = secondary, 4 = auxiliary.
  const pressedButtons = button === "middle" ? 4 : button === "right" ? 2 : 1;

  const requestedDelayMs = params.delayMs ?? 0;
  const delayMs = Number.isFinite(requestedDelayMs)
    ? Math.max(0, Math.floor(requestedDelayMs))
    : 0;

  // Every interpolated value below is a number or boolean literal, so no
  // caller-controlled text ends up inside the evaluated script.
  await evaluateChromeMcpScript({
    profileName: params.profileName,
    userDataDir: params.userDataDir,
    targetId: params.targetId,
    fn: `async () => {
      const x = ${JSON.stringify(params.x)};
      const y = ${JSON.stringify(params.y)};
      const delayMs = ${JSON.stringify(delayMs)};
      const doubleClick = ${params.doubleClick ? "true" : "false"};
      const button = ${buttonCode};
      const pressedButtons = ${pressedButtons};
      const clickType = button === 0 ? "click" : "auxclick";
      const target = document.elementFromPoint(x, y) ?? document.body ?? document.documentElement ?? document;
      const base = {
        bubbles: true,
        cancelable: true,
        view: window,
        clientX: x,
        clientY: y,
        screenX: window.screenX + x,
        screenY: window.screenY + y,
        button,
      };
      const dispatch = (type, buttons, detail) => {
        target.dispatchEvent(new MouseEvent(type, { ...base, buttons, detail }));
      };
      const release = (detail) => {
        dispatch("mouseup", 0, detail);
        dispatch(clickType, 0, detail);
        if (button === 2) {
          dispatch("contextmenu", 0, 0);
        }
      };
      dispatch("mousemove", 0, 0);
      dispatch("mousedown", pressedButtons, 1);
      if (delayMs > 0) {
        await new Promise((resolve) => setTimeout(resolve, delayMs));
      }
      release(1);
      if (doubleClick) {
        dispatch("mousedown", pressedButtons, 2);
        release(2);
        if (button === 0) {
          dispatch("dblclick", 0, 2);
        }
      }
      return true;
    }`,
  });
}
export async function fillChromeMcpElement(params: {
profileName: string;
userDataDir?: string;

View File

@@ -16,6 +16,16 @@ export type BrowserActRequest =
delayMs?: number;
timeoutMs?: number;
}
| {
kind: "clickCoords";
x: number;
y: number;
targetId?: string;
doubleClick?: boolean;
button?: string;
delayMs?: number;
timeoutMs?: number;
}
| {
kind: "type";
ref?: string;

View File

@@ -592,6 +592,35 @@ export async function clickViaPlaywright(opts: {
}
}
/**
 * Click a viewport coordinate on the page resolved for `opts`, using the
 * Playwright mouse, and run the click under the interaction navigation check.
 *
 * The page URL is captured before the click and handed to
 * `assertInteractionNavigationCompletedSafely` as `previousUrl`, together with
 * the CDP URL, target id and SSRF policy.
 *
 * `timeoutMs` is not read directly in this function; the whole `opts` object
 * is forwarded to `getRestoredPageForTarget`.
 *
 * @param opts.x - Viewport x coordinate passed to `page.mouse.click`.
 * @param opts.y - Viewport y coordinate passed to `page.mouse.click`.
 * @param opts.doubleClick - Sends a click count of 2 instead of 1 when truthy.
 * @param opts.button - Mouse button forwarded to Playwright as-is.
 * @param opts.delayMs - Press delay, resolved through `resolveBoundedDelayMs`
 *   against `ACT_MAX_CLICK_DELAY_MS`.
 */
export async function clickCoordsViaPlaywright(opts: {
  cdpUrl: string;
  targetId?: string;
  x: number;
  y: number;
  doubleClick?: boolean;
  button?: "left" | "right" | "middle";
  delayMs?: number;
  timeoutMs?: number;
  ssrfPolicy?: SsrFPolicy;
}): Promise<void> {
  const { cdpUrl, targetId, ssrfPolicy } = opts;
  const page = await getRestoredPageForTarget(opts);
  const urlBeforeClick = page.url();

  // The delay is resolved inside the action so that it runs under the
  // navigation check, immediately before the click itself.
  const performClick = async (): Promise<void> => {
    const clickCount = opts.doubleClick ? 2 : 1;
    const delay = resolveBoundedDelayMs(
      opts.delayMs,
      "clickCoords delayMs",
      ACT_MAX_CLICK_DELAY_MS,
    );
    await page.mouse.click(opts.x, opts.y, { button: opts.button, clickCount, delay });
  };

  await assertInteractionNavigationCompletedSafely({
    action: performClick,
    cdpUrl,
    page,
    previousUrl: urlBeforeClick,
    ssrfPolicy,
    targetId,
  });
}
export async function hoverViaPlaywright(opts: {
cdpUrl: string;
targetId?: string;
@@ -1244,6 +1273,19 @@ async function executeSingleAction(
ssrfPolicy,
});
break;
case "clickCoords":
await clickCoordsViaPlaywright({
cdpUrl,
targetId: effectiveTargetId,
x: action.x,
y: action.y,
doubleClick: action.doubleClick,
button: action.button as "left" | "right" | "middle" | undefined,
delayMs: action.delayMs,
timeoutMs: action.timeoutMs,
ssrfPolicy,
});
break;
case "type":
await typeViaPlaywright({
cdpUrl,

View File

@@ -114,6 +114,36 @@ export function normalizeActRequest(
...(timeoutMs !== undefined ? { timeoutMs } : {}),
};
}
case "clickCoords": {
const x = toNumber(body.x);
const y = toNumber(body.y);
if (x === undefined || y === undefined || x < 0 || y < 0) {
throw new Error("clickCoords requires non-negative x and y");
}
const buttonRaw = toStringOrEmpty(body.button);
const button = buttonRaw ? parseClickButton(buttonRaw) : undefined;
if (buttonRaw && !button) {
throw new Error("clickCoords button must be left|right|middle");
}
const doubleClick = toBoolean(body.doubleClick);
const delayMs = normalizeActBoundedNonNegativeMs(
toNumber(body.delayMs),
"clickCoords delayMs",
ACT_MAX_CLICK_DELAY_MS,
);
const timeoutMs = toNumber(body.timeoutMs);
const targetId = toStringOrEmpty(body.targetId) || undefined;
return {
kind,
x,
y,
...(targetId ? { targetId } : {}),
...(doubleClick !== undefined ? { doubleClick } : {}),
...(button ? { button } : {}),
...(delayMs !== undefined ? { delayMs } : {}),
...(timeoutMs !== undefined ? { timeoutMs } : {}),
};
}
case "type": {
const ref = toStringOrEmpty(body.ref) || undefined;
const selector = toStringOrEmpty(body.selector) || undefined;

View File

@@ -1,6 +1,7 @@
export const ACT_KINDS = [
"batch",
"click",
"clickCoords",
"close",
"drag",
"evaluate",

View File

@@ -1,6 +1,7 @@
import { formatErrorMessage } from "../../infra/errors.js";
import {
clickChromeMcpElement,
clickChromeMcpCoords,
closeChromeMcpTab,
dragChromeMcpElement,
evaluateChromeMcpScript,
@@ -279,6 +280,8 @@ function getExistingSessionUnsupportedMessage(action: BrowserActRequest): string
return EXISTING_SESSION_LIMITS.act.clickButtonOrModifiers;
}
return null;
case "clickCoords":
return null;
case "type":
if (action.selector) {
return EXISTING_SESSION_LIMITS.act.typeSelector;
@@ -425,6 +428,22 @@ export function registerBrowserAgentActRoutes(
guard: existingSessionNavigationGuard,
});
return res.json({ ok: true, targetId: tab.targetId, url: tab.url });
case "clickCoords":
await runExistingSessionActionWithNavigationGuard({
execute: () =>
clickChromeMcpCoords({
profileName,
userDataDir: profileCtx.profile.userDataDir,
targetId: tab.targetId,
x: action.x,
y: action.y,
doubleClick: action.doubleClick ?? false,
button: action.button as "left" | "right" | "middle" | undefined,
delayMs: action.delayMs,
}),
guard: existingSessionNavigationGuard,
});
return res.json({ ok: true, targetId: tab.targetId, url: tab.url });
case "type":
await runExistingSessionActionWithNavigationGuard({
execute: async () => {
@@ -610,6 +629,7 @@ export function registerBrowserAgentActRoutes(
result: result.result,
});
case "click":
case "clickCoords":
case "resize":
return res.json({ ok: true, targetId: tab.targetId, url: tab.url });
default:

View File

@@ -8,6 +8,7 @@ import { createBrowserRouteApp, createBrowserRouteResponse } from "./test-helper
const routeState = existingSessionRouteState;
const chromeMcpMocks = vi.hoisted(() => ({
clickChromeMcpCoords: vi.fn(async () => {}),
clickChromeMcpElement: vi.fn(async () => {}),
evaluateChromeMcpScript: vi.fn(
async (_params: { profileName: string; targetId: string; fn: string }) => true,
@@ -30,6 +31,7 @@ const navigationGuardMocks = vi.hoisted(() => ({
}));
vi.mock("../chrome-mcp.js", () => ({
clickChromeMcpCoords: chromeMcpMocks.clickChromeMcpCoords,
clickChromeMcpElement: chromeMcpMocks.clickChromeMcpElement,
closeChromeMcpTab: vi.fn(async () => {}),
dragChromeMcpElement: vi.fn(async () => {}),
@@ -108,6 +110,7 @@ describe("existing-session browser routes", () => {
beforeEach(() => {
routeState.profileCtx.ensureTabAvailable.mockClear();
routeState.profileCtx.listTabs.mockClear();
chromeMcpMocks.clickChromeMcpCoords.mockClear();
chromeMcpMocks.clickChromeMcpElement.mockClear();
chromeMcpMocks.evaluateChromeMcpScript.mockReset();
chromeMcpMocks.fillChromeMcpElement.mockClear();
@@ -313,4 +316,31 @@ describe("existing-session browser routes", () => {
signal: ctrl.signal,
});
});
// Posts a clickCoords act request and checks it is forwarded to the Chrome MCP
// coordinate click. `y` is sent as the string "32" and expected as the number
// 32, so the route must coerce numeric strings before dispatching.
it("supports coordinate clicks for existing-session profiles", async () => {
  const actHandler = getActPostHandler();
  const routeResponse = createBrowserRouteResponse();

  await actHandler?.(
    {
      params: {},
      query: {},
      body: { kind: "clickCoords", x: 25, y: "32", doubleClick: true, delayMs: 5 },
    },
    routeResponse.res,
  );

  expect(routeResponse.statusCode).toBe(200);
  expect(routeResponse.body).toMatchObject({
    ok: true,
    targetId: "7",
    url: "https://example.com",
  });

  // No button was sent, so it is expected to be passed through as undefined.
  const expectedClickArgs = {
    profileName: "chrome-live",
    userDataDir: undefined,
    targetId: "7",
    x: 25,
    y: 32,
    doubleClick: true,
    button: undefined,
    delayMs: 5,
  };
  expect(chromeMcpMocks.clickChromeMcpCoords).toHaveBeenCalledWith(expectedClickArgs);
});
});

View File

@@ -82,6 +82,23 @@ describe("browser control server", () => {
slowTimeoutMs,
);
// A negative x coordinate must be rejected with HTTP 400 and the
// ACT_INVALID_REQUEST code, carrying the clickCoords validation message.
it(
  "returns ACT_INVALID_REQUEST for malformed coordinate clicks",
  async () => {
    const baseUrl = await startServerAndBase();
    const { status, body } = await postActAndReadError(baseUrl, {
      kind: "clickCoords",
      x: -1,
      y: 20,
    });

    expect(status).toBe(400);
    expect(body.code).toBe("ACT_INVALID_REQUEST");
    expect(body.error).toContain("clickCoords requires non-negative x and y");
  },
  slowTimeoutMs,
);
it(
"returns ACT_EXISTING_SESSION_UNSUPPORTED for unsupported existing-session actions",
async () => {
@@ -297,6 +314,31 @@ describe("browser control server", () => {
const [clickSelectorArgs] = pwMocks.clickViaPlaywright.mock.calls[1] ?? [];
expect((clickSelectorArgs as { doubleClick?: boolean }).doubleClick).toBeUndefined();
const clickCoords = await postJson<{ ok: boolean; url?: string }>(`${base}/act`, {
kind: "clickCoords",
x: "42.5",
y: 64,
doubleClick: "true",
button: "left",
delayMs: "10",
});
expect(clickCoords.ok).toBe(true);
expect(clickCoords.url).toBe("https://example.com");
expect(pwMocks.clickCoordsViaPlaywright).toHaveBeenCalledWith(
expect.objectContaining({
cdpUrl: state.cdpBaseUrl,
targetId: "abcd1234",
x: 42.5,
y: 64,
doubleClick: true,
button: "left",
delayMs: 10,
ssrfPolicy: {
dangerouslyAllowPrivateNetwork: true,
},
}),
);
const type = await postJson<{ ok: boolean }>(`${base}/act`, {
kind: "type",
ref: "1",

View File

@@ -147,6 +147,7 @@ const pwMocks = vi.hoisted(() => ({
armDialogViaPlaywright: vi.fn(async () => {}),
armFileUploadViaPlaywright: vi.fn(async () => {}),
batchViaPlaywright: vi.fn(async (_opts?: unknown) => ({ results: [] })),
clickCoordsViaPlaywright: vi.fn(async (_opts?: unknown) => {}),
clickViaPlaywright: vi.fn(async (_opts?: unknown) => {}),
closePageViaPlaywright: vi.fn(async (_opts?: unknown) => {}),
closePlaywrightBrowserConnection: vi.fn(async () => {}),
@@ -194,6 +195,11 @@ const passThroughActDispatch: Record<string, PassThroughActDispatch> = {
fields: ["ref", "selector", "doubleClick", "button", "modifiers", "delayMs", "timeoutMs"],
includeSsrf: true,
},
clickCoords: {
mock: pwMocks.clickCoordsViaPlaywright,
fields: ["x", "y", "doubleClick", "button", "delayMs", "timeoutMs"],
includeSsrf: true,
},
type: {
mock: pwMocks.typeViaPlaywright,
fields: ["ref", "selector", "text", "submit", "slowly", "timeoutMs"],
@@ -301,6 +307,7 @@ export function getPwMocks(): Record<string, MockFn> {
}
const chromeMcpMocks = vi.hoisted(() => ({
clickChromeMcpCoords: vi.fn(async () => {}),
clickChromeMcpElement: vi.fn(async () => {}),
closeChromeMcpSession: vi.fn(async () => true),
closeChromeMcpTab: vi.fn(async () => {}),

View File

@@ -75,6 +75,37 @@ export function registerBrowserElementCommands(
});
});
// `click-coords <x> <y>`: click a viewport position instead of a snapshot ref.
browser
  .command("click-coords")
  .description("Click viewport coordinates")
  .argument("<x>", "Viewport x coordinate")
  .argument("<y>", "Viewport y coordinate")
  .option("--target-id <id>", "CDP target id (or unique prefix)")
  .option("--double", "Double click", false)
  .option("--button <left|right|middle>", "Mouse button to use")
  .option("--delay-ms <ms>", "Delay between mouse down/up", (raw: string) => Number(raw))
  .action(async (xRaw: string, yRaw: string, opts, cmd) => {
    const [x, y] = [Number(xRaw), Number(yRaw)];
    // An unparsable --delay-ms is omitted from the request rather than sent as NaN.
    const delayMs = Number.isFinite(opts.delayMs) ? opts.delayMs : undefined;

    // Reports the clicked position, plus the page URL when the result has one.
    const describeSuccess = (result: unknown): string => {
      const { url } = result as { url?: unknown };
      const where = typeof url === "string" && url ? ` on ${url}` : "";
      return `clicked ${x},${y}${where}`;
    };

    await runElementAction({
      cmd,
      body: {
        kind: "clickCoords",
        x,
        y,
        targetId: normalizeOptionalString(opts.targetId),
        doubleClick: Boolean(opts.double),
        button: normalizeOptionalString(opts.button),
        delayMs,
      },
      successMessage: describeSuccess,
    });
  });
browser
.command("type")
.description("Type into an element by ref from snapshot")

View File

@@ -19,6 +19,7 @@ export const browserActionExamples = [
"openclaw browser navigate https://example.com",
"openclaw browser resize 1280 720",
"openclaw browser click 12 --double",
"openclaw browser click-coords 120 340",
'openclaw browser type 23 "hello" --submit',
"openclaw browser press Enter",
"openclaw browser hover 44",