diff --git a/README.md b/README.md index 849c8d052..69833b890 100644 --- a/README.md +++ b/README.md @@ -477,7 +477,7 @@ If you run into any issues, checkout our [troubleshooting guide](./docs/troubles -- **Input automation** (10 tools) +- **Input automation** (11 tools) - [`click`](docs/tool-reference.md#click) - [`drag`](docs/tool-reference.md#drag) - [`fill`](docs/tool-reference.md#fill) @@ -488,6 +488,7 @@ If you run into any issues, checkout our [troubleshooting guide](./docs/troubles - [`type_text`](docs/tool-reference.md#type_text) - [`upload_file`](docs/tool-reference.md#upload_file) - [`click_at`](docs/tool-reference.md#click_at) + - [`get_element_at`](docs/tool-reference.md#get_element_at) - **Navigation automation** (6 tools) - [`close_page`](docs/tool-reference.md#close_page) - [`list_pages`](docs/tool-reference.md#list_pages) diff --git a/docs/tool-reference.md b/docs/tool-reference.md index 238d5981a..6de32d256 100644 --- a/docs/tool-reference.md +++ b/docs/tool-reference.md @@ -2,7 +2,7 @@ # Chrome DevTools MCP Tool Reference -- **[Input automation](#input-automation)** (10 tools) +- **[Input automation](#input-automation)** (11 tools) - [`click`](#click) - [`drag`](#drag) - [`fill`](#fill) @@ -13,6 +13,7 @@ - [`type_text`](#type_text) - [`upload_file`](#upload_file) - [`click_at`](#click_at) + - [`get_element_at`](#get_element_at) - **[Navigation automation](#navigation-automation)** (6 tools) - [`close_page`](#close_page) - [`list_pages`](#list_pages) @@ -175,6 +176,17 @@ --- +### `get_element_at` + +**Description:** Returns the uid of the DOM element at viewport-relative CSS-pixel coordinates (x, y). Pair with [`take_screenshot`](#take_screenshot) + a vision model that emits coordinates; feed the returned uid into uid-based tools such as [`click`](#click), [`hover`](#hover), or [`fill`](#fill). The response also includes the refreshed page snapshot. Pierces open shadow roots and descends same-origin iframes. Cannot reach closed shadow roots or cross-origin / OOPIF iframes. (requires flag: --experimentalVision=true) + +**Parameters:** + +- **x** (number) **(required)**: CSS-pixel X coordinate, viewport-relative. +- **y** (number) **(required)**: CSS-pixel Y coordinate, viewport-relative. + +--- + ## Navigation automation ### `close_page` diff --git a/scripts/eval_scenarios/get_element_at_test.ts b/scripts/eval_scenarios/get_element_at_test.ts new file mode 100644 index 000000000..8f5f99970 --- /dev/null +++ b/scripts/eval_scenarios/get_element_at_test.ts @@ -0,0 +1,89 @@ +/** + * @license + * Copyright 2026 Google LLC + * SPDX-License-Identifier: Apache-2.0 + */ + +import assert from 'node:assert'; + +import type {TestScenario} from '../eval_gemini.ts'; + +export const scenario: TestScenario = { + serverArgs: ['--experimentalVision=true'], + prompt: `Take a screenshot of . There is a single large blue square on the page. Use the get_element_at tool to inspect the DOM element at the center of that blue square (the page is 800x600 and the square spans roughly x=100..300, y=100..300, so a coordinate around 200,200 is appropriate). Then tell me the element's id and class.`, + maxTurns: 4, + htmlRoute: { + path: '/get_element_at_test.html', + htmlContent: ` + + + + + + +
CLICK ME
+ + + `, + }, + expectations: calls => { + const visualCalls = calls.filter( + c => c.name === 'take_screenshot' || c.name === 'take_snapshot', + ); + assert.ok( + visualCalls.length >= 1, + 'Expected at least one take_screenshot or take_snapshot call before inspecting coordinates', + ); + + const elementAtCalls = calls.filter(c => c.name === 'get_element_at'); + assert.ok( + elementAtCalls.length >= 1, + 'Expected at least one get_element_at call', + ); + + let withinTarget = 0; + for (const call of elementAtCalls) { + const x = call.args.x; + const y = call.args.y; + assert.strictEqual( + typeof x, + 'number', + 'get_element_at must receive a numeric x', + ); + assert.strictEqual( + typeof y, + 'number', + 'get_element_at must receive a numeric y', + ); + if ( + typeof x === 'number' && + typeof y === 'number' && + x >= 100 && + x <= 300 && + y >= 100 && + y <= 300 + ) { + withinTarget++; + } + } + assert.ok( + withinTarget >= 1, + 'Expected at least one get_element_at call with x in [100,300] and y in [100,300] (inside the blue square)', + ); + }, +}; diff --git a/src/bin/chrome-devtools-cli-options.ts b/src/bin/chrome-devtools-cli-options.ts index a0e315c42..6690d82c0 100644 --- a/src/bin/chrome-devtools-cli-options.ts +++ b/src/bin/chrome-devtools-cli-options.ts @@ -284,6 +284,25 @@ export const commands: Commands = { }, }, }, + get_element_at: { + description: + 'Returns the uid of the DOM element at viewport-relative CSS-pixel coordinates (x, y). Pair with take_screenshot + a vision model that emits coordinates; feed the returned uid into uid-based tools such as click, hover, or fill. The response also includes the refreshed page snapshot. Pierces open shadow roots and descends same-origin iframes. Cannot reach closed shadow roots or cross-origin / OOPIF iframes. (requires flag: --experimentalVision=true)', + category: 'Input automation', + args: { + x: { + name: 'x', + type: 'number', + description: 'CSS-pixel X coordinate, viewport-relative.', + required: true, + }, + y: { + name: 'y', + type: 'number', + description: 'CSS-pixel Y coordinate, viewport-relative.', + required: true, + }, + }, + }, get_memory_snapshot_details: { description: 'Loads a memory heapsnapshot and returns all available information including statistics, static data, and aggregated node information. Supports pagination for aggregates. (requires flag: --experimentalMemory=true)', diff --git a/src/telemetry/tool_call_metrics.json b/src/telemetry/tool_call_metrics.json index a86c9c45a..ed32b656c 100644 --- a/src/telemetry/tool_call_metrics.json +++ b/src/telemetry/tool_call_metrics.json @@ -631,5 +631,38 @@ { "name": "list3p_developer_tools", "args": [] + }, + { + "name": "get_element_at", + "args": [ + { + "name": "x", + "argType": "number" + }, + { + "name": "y", + "argType": "number" + }, + { + "name": "mode", + "argType": "string", + "isDeprecated": true + }, + { + "name": "css", + "argType": "string", + "isDeprecated": true + }, + { + "name": "pierce_shadow", + "argType": "boolean", + "isDeprecated": true + }, + { + "name": "file_path_length", + "argType": "number", + "isDeprecated": true + } + ] } ] diff --git a/src/tools/input.ts b/src/tools/input.ts index 01d77a142..8b22dc0a1 100644 --- a/src/tools/input.ts +++ b/src/tools/input.ts @@ -6,6 +6,8 @@ import {logger} from '../logger.js'; import type {McpContext} from '../McpContext.js'; +import type {McpPage} from '../McpPage.js'; +import {TextSnapshot} from '../TextSnapshot.js'; import {zod} from '../third_party/index.js'; import type {ElementHandle, KeyInput} from '../third_party/index.js'; import type {TextSnapshotNode} from '../types.js'; @@ -536,3 +538,119 @@ export const pressKey = definePageTool({ } }, }); + +async function hitTestElementHandle( + page: ContextPage, + x: number, + y: number, +): Promise | null> { + const handle = await page.pptrPage.evaluateHandle( + (px, py) => { + let doc: Document | ShadowRoot = document; + let cx = px; + let cy = py; + let hit: Element | null = null; + for (let i = 0; i < 32; i++) { + const candidate: Element | null = doc.elementFromPoint(cx, cy); + if (!candidate) { + break; + } + hit = candidate; + if ( + candidate instanceof HTMLIFrameElement || + candidate instanceof HTMLFrameElement + ) { + const rect = candidate.getBoundingClientRect(); + let inner: Document | null = null; + try { + inner = candidate.contentDocument; + } catch { + inner = null; + } + if (inner) { + doc = inner; + cx -= rect.left; + cy -= rect.top; + continue; + } + break; + } + const shadow: ShadowRoot | null = candidate.shadowRoot; + if (shadow) { + doc = shadow; + continue; + } + break; + } + return hit; + }, + x, + y, + ); + const element = handle.asElement() as ElementHandle | null; + if (!element) { + void handle.dispose(); + return null; + } + return element; +} + +export const getElementAt = definePageTool({ + name: 'get_element_at', + description: `Returns the uid of the DOM element at viewport-relative CSS-pixel coordinates (x, y). Pair with take_screenshot + a vision model that emits coordinates; feed the returned uid into uid-based tools such as click, hover, or fill. The response also includes the refreshed page snapshot. Pierces open shadow roots and descends same-origin iframes. Cannot reach closed shadow roots or cross-origin / OOPIF iframes.`, + annotations: { + category: ToolCategory.INPUT, + readOnlyHint: true, + conditions: ['experimentalVision'], + }, + schema: { + x: zod.number().describe('CSS-pixel X coordinate, viewport-relative.'), + y: zod.number().describe('CSS-pixel Y coordinate, viewport-relative.'), + }, + blockedByDialog: true, + handler: async (request, response) => { + const {x, y} = request.params; + const page = request.page as McpPage; + + const element = await hitTestElementHandle(page, x, y); + if (!element) { + throw new Error( + `No element found at (${x}, ${y}). The coordinate may be outside the viewport, inside a closed shadow root, or inside a cross-origin iframe. Call take_screenshot to verify the page state, or list_pages + select_page to switch into a cross-origin frame.`, + ); + } + + const backendNodeId = await element.backendNodeId(); + if (!backendNodeId) { + void element.dispose(); + throw new Error(`Could not resolve element identity at (${x}, ${y}).`); + } + + if (!page.textSnapshot) { + page.textSnapshot = await TextSnapshot.create(page); + } + + let uid = page.resolveCdpElementId(backendNodeId); + let elementOwned = true; + if (!uid) { + // Hit-tested element is not in the a11y-tree snapshot (e.g. a plain + // div with no accessible role). Inject it via extraHandles so the + // refreshed snapshot exposes it with a uid that uid-based tools + // (click, hover, fill, …) can consume. + const extraHandles = [...page.extraHandles, element]; + page.textSnapshot = await TextSnapshot.create(page, {extraHandles}); + elementOwned = false; + uid = page.resolveCdpElementId(backendNodeId); + } + + if (elementOwned) { + void element.dispose(); + } + + if (!uid) { + throw new Error(`Could not assign a uid to element at (${x}, ${y}).`); + } + + response.appendResponseLine(`Element uid: ${uid}`); + response.includeSnapshot(); + }, +}); diff --git a/tests/e2e/chrome-devtools-commands.test.ts b/tests/e2e/chrome-devtools-commands.test.ts index 4884d4f82..9e44a362a 100644 --- a/tests/e2e/chrome-devtools-commands.test.ts +++ b/tests/e2e/chrome-devtools-commands.test.ts @@ -106,4 +106,21 @@ describe('chrome-devtools', () => { 'restart command suggestion is miss: ' + result.stdout, ); }); + + it('fails to invoke get_element_at when experimentalVision is disabled (default)', async () => { + await runCli(['start'], sessionId); + + const result = await runCli(['get_element_at', '100', '100'], sessionId); + assert.strictEqual(result.status, 0); + assert( + result.stdout.includes( + 'Tool get_element_at requires experimental feature --experimentalVision and is currently disabled', + ), + 'error message is unexpected: ' + result.stdout, + ); + assert( + result.stdout.includes('chrome-devtools start --experimentalVision=true'), + 'restart command suggestion is miss: ' + result.stdout, + ); + }); }); diff --git a/tests/index.test.ts b/tests/index.test.ts index 62f2019ec..96b8ffc5e 100644 --- a/tests/index.test.ts +++ b/tests/index.test.ts @@ -140,6 +140,8 @@ describe('e2e', () => { const {tools} = await client.listTools(); const clickAt = tools.find(t => t.name === 'click_at'); assert.ok(clickAt); + const getElementAt = tools.find(t => t.name === 'get_element_at'); + assert.ok(getElementAt); }, ['--experimental-vision'], ); diff --git a/tests/tools/input.test.ts b/tests/tools/input.test.ts index 491b008fe..f676d9343 100644 --- a/tests/tools/input.test.ts +++ b/tests/tools/input.test.ts @@ -21,6 +21,7 @@ import { uploadFile, pressKey, clickAt, + getElementAt, typeText, } from '../../src/tools/input.js'; import {parseKey} from '../../src/utils/keyboard.js'; @@ -1360,4 +1361,169 @@ describe('input', () => { }); }); }); + + describe('get_element_at', () => { + it('returns a uid for an element at coordinates and refreshes the snapshot', async () => { + await withMcpContext(async (response, context) => { + const page = context.getSelectedPptrPage(); + await page.setContent( + html``, + ); + await getElementAt.handler( + { + params: {x: 50, y: 50}, + page: context.getSelectedMcpPage(), + }, + response, + context, + ); + const output = response.responseLines.join('\n'); + const match = /Element uid: (\S+)/.exec(output); + assert.ok(match, `output should contain a uid line: ${output}`); + assert.notStrictEqual(response.snapshotParams, undefined); + + const mcpPage = context.getSelectedMcpPage(); + const handle = await mcpPage.getElementByUid(match[1]); + const id = await handle.evaluate(el => (el as HTMLElement).id); + assert.strictEqual(id, 'x'); + }); + }); + + it('throws when no element is found at the coordinates', async () => { + await withMcpContext(async (response, context) => { + const page = context.getSelectedPptrPage(); + await page.setContent( + html``, + ); + await assert.rejects( + getElementAt.handler( + { + params: {x: 5000, y: 5000}, + page: context.getSelectedMcpPage(), + }, + response, + context, + ), + /No element found at \(5000, 5000\)/, + ); + }); + }); + + it('resolves a plain non-accessible element via the extraHandles fallback', async () => { + await withMcpContext(async (response, context) => { + const page = context.getSelectedPptrPage(); + await page.setContent( + html`
`, + ); + await getElementAt.handler( + { + params: {x: 50, y: 50}, + page: context.getSelectedMcpPage(), + }, + response, + context, + ); + const output = response.responseLines.join('\n'); + const match = /Element uid: (\S+)/.exec(output); + assert.ok(match, `output should contain a uid line: ${output}`); + + const mcpPage = context.getSelectedMcpPage(); + const handle = await mcpPage.getElementByUid(match[1]); + const id = await handle.evaluate(el => (el as HTMLElement).id); + assert.strictEqual(id, 'plain'); + }); + }); + + it('pierces open shadow roots', async () => { + await withMcpContext(async (response, context) => { + const page = context.getSelectedPptrPage(); + await page.setContent( + html` + `, + ); + await page.waitForFunction( + () => { + const host = document.querySelector('my-host'); + return Boolean(host?.shadowRoot?.querySelector('#inner')); + }, + {timeout: 5000}, + ); + await getElementAt.handler( + { + params: {x: 50, y: 50}, + page: context.getSelectedMcpPage(), + }, + response, + context, + ); + const output = response.responseLines.join('\n'); + const match = /Element uid: (\S+)/.exec(output); + assert.ok(match, `output should contain a uid line: ${output}`); + + const mcpPage = context.getSelectedMcpPage(); + const handle = await mcpPage.getElementByUid(match[1]); + const id = await handle.evaluate(el => (el as HTMLElement).id); + assert.strictEqual(id, 'inner'); + }); + }); + + it('descends into a same-origin iframe', async () => { + await withMcpContext(async (response, context) => { + const page = context.getSelectedPptrPage(); + await page.setContent( + html``, + ); + await page.waitForFunction( + () => { + const frame = document.querySelector('iframe'); + return Boolean(frame?.contentDocument?.querySelector('#inner')); + }, + {timeout: 5000}, + ); + await getElementAt.handler( + { + params: {x: 50, y: 50}, + page: context.getSelectedMcpPage(), + }, + response, + context, + ); + const output = response.responseLines.join('\n'); + const match = /Element uid: (\S+)/.exec(output); + assert.ok(match, `output should contain a uid line: ${output}`); + + const mcpPage = context.getSelectedMcpPage(); + const handle = await mcpPage.getElementByUid(match[1]); + const id = await handle.evaluate(el => (el as HTMLElement).id); + assert.strictEqual(id, 'inner'); + }); + }); + }); });