diff --git a/CHANGELOG.md b/CHANGELOG.md index 5d996c4ddc1..4d7076d17dc 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -19,6 +19,7 @@ - Browser: ship a built-in `chrome` profile for extension relay and start the relay automatically when running locally. - Browser: default `browser.defaultProfile` to `chrome` (existing Chrome takeover mode). - Browser: add `clawdbot browser extension install/path` and copy extension path to clipboard. +- Browser: add `snapshot refs=aria` (Playwright aria-ref ids) for self-resolving refs across `snapshot` → `act`. - Control UI: show raw any-map entries in config views; move Docs link into the left nav. #### Plugins diff --git a/src/agents/tools/browser-tool.schema.ts b/src/agents/tools/browser-tool.schema.ts index e012ce221b5..5acad1688de 100644 --- a/src/agents/tools/browser-tool.schema.ts +++ b/src/agents/tools/browser-tool.schema.ts @@ -39,6 +39,7 @@ const BROWSER_TARGETS = ["sandbox", "host", "custom"] as const; const BROWSER_SNAPSHOT_FORMATS = ["aria", "ai"] as const; const BROWSER_SNAPSHOT_MODES = ["efficient"] as const; +const BROWSER_SNAPSHOT_REFS = ["role", "aria"] as const; const BROWSER_IMAGE_TYPES = ["png", "jpeg"] as const; @@ -91,6 +92,7 @@ export const BrowserToolSchema = Type.Object({ maxChars: Type.Optional(Type.Number()), mode: optionalStringEnum(BROWSER_SNAPSHOT_MODES), format: optionalStringEnum(BROWSER_SNAPSHOT_FORMATS), + refs: optionalStringEnum(BROWSER_SNAPSHOT_REFS), interactive: Type.Optional(Type.Boolean()), compact: Type.Optional(Type.Boolean()), depth: Type.Optional(Type.Number()), diff --git a/src/agents/tools/browser-tool.test.ts b/src/agents/tools/browser-tool.test.ts index a1e695fde09..97bb3b0261f 100644 --- a/src/agents/tools/browser-tool.test.ts +++ b/src/agents/tools/browser-tool.test.ts @@ -121,6 +121,19 @@ describe("browser tool snapshot maxChars", () => { expect(browserClientMocks.browserProfiles).toHaveBeenCalledWith("http://127.0.0.1:18791"); }); + + it("passes refs mode through to browser snapshot", async () => { + const tool = createBrowserTool(); + await tool.execute?.(null, { action: "snapshot", format: "ai", refs: "aria" }); + + expect(browserClientMocks.browserSnapshot).toHaveBeenCalledWith( + "http://127.0.0.1:18791", + expect.objectContaining({ + format: "ai", + refs: "aria", + }), + ); + }); }); describe("browser tool snapshot labels", () => { diff --git a/src/agents/tools/browser-tool.ts b/src/agents/tools/browser-tool.ts index 0c5ad095f43..0a1c34125e3 100644 --- a/src/agents/tools/browser-tool.ts +++ b/src/agents/tools/browser-tool.ts @@ -128,6 +128,7 @@ export function createBrowserTool(opts?: { 'Profiles: use profile="chrome" for Chrome extension relay takeover (your existing Chrome tabs). Use profile="clawd" for the isolated clawd-managed browser.', "Chrome extension relay needs an attached tab: user must click the Clawdbot Browser Relay toolbar icon on the tab (badge ON). If no tab is connected, ask them to attach it.", "When using refs from snapshot (e.g. e12), keep the same tab: prefer passing targetId from the snapshot response into subsequent actions (act/click/type/etc).", + 'For stable, self-resolving refs across calls, use snapshot with refs="aria" (Playwright aria-ref ids). Default refs="role" are role+name-based.', "Use snapshot+act for UI automation. Avoid act:wait by default; use only in exceptional cases when no reliable UI state exists.", `target selects browser location (sandbox|host|custom). Default: ${targetDefault}.`, "controlUrl implies target=custom (remote control server).", @@ -190,6 +191,7 @@ export function createBrowserTool(opts?: { : "ai"; const mode = params.mode === "efficient" ? "efficient" : undefined; const labels = typeof params.labels === "boolean" ? params.labels : undefined; + const refs = params.refs === "aria" || params.refs === "role" ? params.refs : undefined; const hasMaxChars = Object.hasOwn(params, "maxChars"); const targetId = typeof params.targetId === "string" ? params.targetId.trim() : undefined; const limit = @@ -224,6 +226,7 @@ export function createBrowserTool(opts?: { targetId, limit, ...(typeof resolvedMaxChars === "number" ? { maxChars: resolvedMaxChars } : {}), + refs, interactive, compact, depth, diff --git a/src/browser/client.test.ts b/src/browser/client.test.ts index 54876820d85..7721828f8e3 100644 --- a/src/browser/client.test.ts +++ b/src/browser/client.test.ts @@ -118,6 +118,36 @@ describe("browser client", () => { expect(parsed.searchParams.get("mode")).toBe("efficient"); }); + it("adds refs=aria to snapshots when requested", async () => { + const calls: string[] = []; + vi.stubGlobal( + "fetch", + vi.fn(async (url: string) => { + calls.push(url); + return { + ok: true, + json: async () => ({ + ok: true, + format: "ai", + targetId: "t1", + url: "https://x", + snapshot: "ok", + }), + } as unknown as Response; + }), + ); + + await browserSnapshot("http://127.0.0.1:18791", { + format: "ai", + refs: "aria", + }); + + const snapshotCall = calls.find((url) => url.includes("/snapshot?")); + expect(snapshotCall).toBeTruthy(); + const parsed = new URL(snapshotCall as string); + expect(parsed.searchParams.get("refs")).toBe("aria"); + }); + it("uses the expected endpoints + methods for common calls", async () => { const calls: Array<{ url: string; init?: RequestInit }> = []; diff --git a/src/browser/client.ts b/src/browser/client.ts index 958303a2549..2505f8dc9a9 100644 --- a/src/browser/client.ts +++ b/src/browser/client.ts @@ -270,6 +270,7 @@ export async function browserSnapshot( targetId?: string; limit?: number; maxChars?: number; + refs?: "role" | "aria"; interactive?: boolean; compact?: boolean; depth?: number; @@ -287,6 +288,7 @@ export async function browserSnapshot( if (typeof opts.maxChars === "number" && Number.isFinite(opts.maxChars)) { q.set("maxChars", String(opts.maxChars)); } + if (opts.refs === "aria" || opts.refs === "role") q.set("refs", opts.refs); if (typeof opts.interactive === "boolean") q.set("interactive", String(opts.interactive)); if (typeof opts.compact === "boolean") q.set("compact", String(opts.compact)); if (typeof opts.depth === "number" && Number.isFinite(opts.depth)) diff --git a/src/browser/pw-role-snapshot.test.ts b/src/browser/pw-role-snapshot.test.ts index abf49c5b9ff..3ba9ccfe75d 100644 --- a/src/browser/pw-role-snapshot.test.ts +++ b/src/browser/pw-role-snapshot.test.ts @@ -1,6 +1,7 @@ import { describe, expect, it } from "vitest"; import { + buildRoleSnapshotFromAiSnapshot, buildRoleSnapshotFromAriaSnapshot, getRoleSnapshotStats, parseRoleRef, @@ -67,4 +68,24 @@ describe("pw-role-snapshot", () => { expect(parseRoleRef("12")).toBeNull(); expect(parseRoleRef("")).toBeNull(); }); + + it("preserves Playwright aria-ref ids in ai snapshots", () => { + const ai = [ + '- navigation [ref=e1]:', + ' - link "Home" [ref=e5]', + ' - heading "Title" [ref=e6]', + ' - button "Save" [ref=e7] [cursor=pointer]:', + " - paragraph: hello", + ].join("\n"); + + const res = buildRoleSnapshotFromAiSnapshot(ai, { interactive: true }); + expect(res.snapshot).toContain('[ref=e5]'); + expect(res.snapshot).toContain('- link "Home"'); + expect(res.snapshot).toContain('- button "Save"'); + expect(res.snapshot).not.toContain("navigation"); + expect(res.snapshot).not.toContain("heading"); + expect(Object.keys(res.refs).sort()).toEqual(["e5", "e7"]); + expect(res.refs.e5).toMatchObject({ role: "link", name: "Home" }); + expect(res.refs.e7).toMatchObject({ role: "button", name: "Save" }); + }); }); diff --git a/src/browser/pw-role-snapshot.ts b/src/browser/pw-role-snapshot.ts index 091373ab57e..0f9a800bbde 100644 --- a/src/browser/pw-role-snapshot.ts +++ b/src/browser/pw-role-snapshot.ts @@ -293,3 +293,75 @@ export function buildRoleSnapshotFromAriaSnapshot( refs, }; } + +function parseAiSnapshotRef(suffix: string): string | null { + const match = suffix.match(/\[ref=(e\d+)\]/i); + return match ? match[1] : null; +} + +/** + * Build a role snapshot from Playwright's AI snapshot output while preserving Playwright's own + * aria-ref ids (e.g. ref=e13). This makes the refs self-resolving across calls. + */ +export function buildRoleSnapshotFromAiSnapshot( + aiSnapshot: string, + options: RoleSnapshotOptions = {}, +): { snapshot: string; refs: RoleRefMap } { + const lines = String(aiSnapshot ?? "").split("\n"); + const refs: RoleRefMap = {}; + + if (options.interactive) { + const out: string[] = []; + for (const line of lines) { + const depth = getIndentLevel(line); + if (options.maxDepth !== undefined && depth > options.maxDepth) continue; + const match = line.match(/^(\s*-\s*)(\w+)(?:\s+"([^"]*)")?(.*)$/); + if (!match) continue; + const [, , roleRaw, name, suffix] = match; + if (roleRaw.startsWith("/")) continue; + const role = roleRaw.toLowerCase(); + if (!INTERACTIVE_ROLES.has(role)) continue; + const ref = parseAiSnapshotRef(suffix); + if (!ref) continue; + refs[ref] = { role, ...(name ? { name } : {}) }; + out.push(`- ${roleRaw}${name ? ` "${name}"` : ""}${suffix}`); + } + return { + snapshot: out.join("\n") || "(no interactive elements)", + refs, + }; + } + + const out: string[] = []; + for (const line of lines) { + const depth = getIndentLevel(line); + if (options.maxDepth !== undefined && depth > options.maxDepth) continue; + + const match = line.match(/^(\s*-\s*)(\w+)(?:\s+"([^"]*)")?(.*)$/); + if (!match) { + out.push(line); + continue; + } + const [, , roleRaw, name, suffix] = match; + if (roleRaw.startsWith("/")) { + out.push(line); + continue; + } + + const role = roleRaw.toLowerCase(); + const isStructural = STRUCTURAL_ROLES.has(role); + + if (options.compact && isStructural && !name) continue; + + const ref = parseAiSnapshotRef(suffix); + if (ref) refs[ref] = { role, ...(name ? { name } : {}) }; + + out.push(line); + } + + const tree = out.join("\n") || "(empty)"; + return { + snapshot: options.compact ? compactTree(tree) : tree, + refs, + }; +} diff --git a/src/browser/pw-session.test.ts b/src/browser/pw-session.test.ts index ec5c75135b8..1832120a5f0 100644 --- a/src/browser/pw-session.test.ts +++ b/src/browser/pw-session.test.ts @@ -62,6 +62,16 @@ describe("pw-session refLocator", () => { expect(mocks.getByRole).toHaveBeenCalled(); }); + + it("uses aria-ref locators when refs mode is aria", () => { + const { page, mocks } = fakePage(); + const state = ensurePageState(page); + state.roleRefsMode = "aria"; + + refLocator(page, "e1"); + + expect(mocks.locator).toHaveBeenCalledWith("aria-ref=e1"); + }); }); describe("pw-session role refs cache", () => { diff --git a/src/browser/pw-session.ts b/src/browser/pw-session.ts index 7997fe15124..ef814232379 100644 --- a/src/browser/pw-session.ts +++ b/src/browser/pw-session.ts @@ -64,9 +64,11 @@ type PageState = { armIdDownload: number; /** * Role-based refs from the last role snapshot (e.g. e1/e2). - * These refs are NOT Playwright's `aria-ref` values. + * Mode "role" refs are generated from ariaSnapshot and resolved via getByRole. + * Mode "aria" refs are Playwright aria-ref ids and resolved via `aria-ref=...`. */ roleRefs?: Record; + roleRefsMode?: "role" | "aria"; roleRefsFrameSelector?: string; }; @@ -74,6 +76,7 @@ type RoleRefs = NonNullable; type RoleRefsCacheEntry = { refs: RoleRefs; frameSelector?: string; + mode?: NonNullable; }; type ContextState = { @@ -110,12 +113,14 @@ export function rememberRoleRefsForTarget(opts: { targetId: string; refs: RoleRefs; frameSelector?: string; + mode?: NonNullable; }): void { const targetId = opts.targetId.trim(); if (!targetId) return; roleRefsByTarget.set(roleRefsKey(opts.cdpUrl, targetId), { refs: opts.refs, ...(opts.frameSelector ? { frameSelector: opts.frameSelector } : {}), + ...(opts.mode ? { mode: opts.mode } : {}), }); while (roleRefsByTarget.size > MAX_ROLE_REFS_CACHE) { const first = roleRefsByTarget.keys().next(); @@ -137,6 +142,7 @@ export function restoreRoleRefsForTarget(opts: { if (state.roleRefs) return; state.roleRefs = cached.refs; state.roleRefsFrameSelector = cached.frameSelector; + state.roleRefsMode = cached.mode; } export function ensurePageState(page: Page): PageState { @@ -339,6 +345,12 @@ export function refLocator(page: Page, ref: string) { if (/^e\d+$/.test(normalized)) { const state = pageStates.get(page); + if (state?.roleRefsMode === "aria") { + const scope = state.roleRefsFrameSelector + ? page.frameLocator(state.roleRefsFrameSelector) + : page; + return scope.locator(`aria-ref=${normalized}`); + } const info = state?.roleRefs?.[normalized]; if (!info) { throw new Error( diff --git a/src/browser/pw-tools-core.interactions.ts b/src/browser/pw-tools-core.interactions.ts index 0f652be34dc..ea1ac9514fd 100644 --- a/src/browser/pw-tools-core.interactions.ts +++ b/src/browser/pw-tools-core.interactions.ts @@ -265,6 +265,7 @@ export async function scrollIntoViewViaPlaywright(opts: { }): Promise { const page = await getPageForTargetId(opts); ensurePageState(page); + restoreRoleRefsForTarget({ cdpUrl: opts.cdpUrl, targetId: opts.targetId, page }); const timeout = normalizeTimeoutMs(opts.timeoutMs, 20_000); const ref = requireRef(opts.ref); @@ -340,6 +341,7 @@ export async function takeScreenshotViaPlaywright(opts: { }): Promise<{ buffer: Buffer }> { const page = await getPageForTargetId(opts); ensurePageState(page); + restoreRoleRefsForTarget({ cdpUrl: opts.cdpUrl, targetId: opts.targetId, page }); const type = opts.type ?? "png"; if (opts.ref) { if (opts.fullPage) throw new Error("fullPage is not supported for element screenshots"); @@ -369,6 +371,7 @@ export async function screenshotWithLabelsViaPlaywright(opts: { }): Promise<{ buffer: Buffer; labels: number; skipped: number }> { const page = await getPageForTargetId(opts); ensurePageState(page); + restoreRoleRefsForTarget({ cdpUrl: opts.cdpUrl, targetId: opts.targetId, page }); const type = opts.type ?? "png"; const maxLabels = typeof opts.maxLabels === "number" && Number.isFinite(opts.maxLabels) @@ -495,6 +498,7 @@ export async function setInputFilesViaPlaywright(opts: { }): Promise { const page = await getPageForTargetId(opts); ensurePageState(page); + restoreRoleRefsForTarget({ cdpUrl: opts.cdpUrl, targetId: opts.targetId, page }); if (!opts.paths.length) throw new Error("paths are required"); const inputRef = typeof opts.inputRef === "string" ? opts.inputRef.trim() : ""; const element = typeof opts.element === "string" ? opts.element.trim() : ""; diff --git a/src/browser/pw-tools-core.snapshot.ts b/src/browser/pw-tools-core.snapshot.ts index 92ab149aa79..5cfa77e91bc 100644 --- a/src/browser/pw-tools-core.snapshot.ts +++ b/src/browser/pw-tools-core.snapshot.ts @@ -2,6 +2,7 @@ import type { Page } from "playwright-core"; import { type AriaSnapshotNode, formatAriaSnapshot, type RawAXNode } from "./cdp.js"; import { + buildRoleSnapshotFromAiSnapshot, buildRoleSnapshotFromAriaSnapshot, getRoleSnapshotStats, type RoleSnapshotOptions, @@ -76,6 +77,7 @@ export async function snapshotRoleViaPlaywright(opts: { targetId?: string; selector?: string; frameSelector?: string; + refsMode?: "role" | "aria"; options?: RoleSnapshotOptions; }): Promise<{ snapshot: string; @@ -88,6 +90,37 @@ export async function snapshotRoleViaPlaywright(opts: { }); const state = ensurePageState(page); + if (opts.refsMode === "aria") { + if (opts.selector?.trim() || opts.frameSelector?.trim()) { + throw new Error("refs=aria does not support selector/frame snapshots yet."); + } + const maybe = page as unknown as WithSnapshotForAI; + if (!maybe._snapshotForAI) { + throw new Error("refs=aria requires Playwright _snapshotForAI support."); + } + const result = await maybe._snapshotForAI({ + timeout: 5000, + track: "response", + }); + const built = buildRoleSnapshotFromAiSnapshot(String(result?.full ?? ""), opts.options); + state.roleRefs = built.refs; + state.roleRefsFrameSelector = undefined; + state.roleRefsMode = "aria"; + if (opts.targetId) { + rememberRoleRefsForTarget({ + cdpUrl: opts.cdpUrl, + targetId: opts.targetId, + refs: built.refs, + mode: "aria", + }); + } + return { + snapshot: built.snapshot, + refs: built.refs, + stats: getRoleSnapshotStats(built.snapshot, built.refs), + }; + } + const frameSelector = opts.frameSelector?.trim() || ""; const selector = opts.selector?.trim() || ""; const locator = frameSelector @@ -102,12 +135,14 @@ export async function snapshotRoleViaPlaywright(opts: { const built = buildRoleSnapshotFromAriaSnapshot(String(ariaSnapshot ?? ""), opts.options); state.roleRefs = built.refs; state.roleRefsFrameSelector = frameSelector || undefined; + state.roleRefsMode = "role"; if (opts.targetId) { rememberRoleRefsForTarget({ cdpUrl: opts.cdpUrl, targetId: opts.targetId, refs: built.refs, frameSelector: frameSelector || undefined, + mode: "role", }); } return { diff --git a/src/browser/routes/agent.snapshot.ts b/src/browser/routes/agent.snapshot.ts index 66a39b2a119..fdeb7f69ead 100644 --- a/src/browser/routes/agent.snapshot.ts +++ b/src/browser/routes/agent.snapshot.ts @@ -169,6 +169,8 @@ export function registerBrowserAgentSnapshotRoutes(app: express.Express, ctx: Br const interactiveRaw = toBoolean(req.query.interactive); const compactRaw = toBoolean(req.query.compact); const depthRaw = toNumber(req.query.depth); + const refsModeRaw = toStringOrEmpty(req.query.refs).trim(); + const refsMode = refsModeRaw === "aria" ? "aria" : refsModeRaw === "role" ? "role" : undefined; const interactive = interactiveRaw ?? (mode === "efficient" ? true : undefined); const compact = compactRaw ?? (mode === "efficient" ? true : undefined); const depth = @@ -199,6 +201,7 @@ export function registerBrowserAgentSnapshotRoutes(app: express.Express, ctx: Br targetId: tab.targetId, selector: selector.trim() || undefined, frameSelector: frameSelector.trim() || undefined, + refsMode, options: { interactive: interactive ?? undefined, compact: compact ?? undefined, @@ -219,6 +222,7 @@ export function registerBrowserAgentSnapshotRoutes(app: express.Express, ctx: Br targetId: tab.targetId, selector: selector.trim() || undefined, frameSelector: frameSelector.trim() || undefined, + refsMode, options: { interactive: interactive ?? undefined, compact: compact ?? undefined,