page.evaluate() — Run JS in Page Context
The most flexible extraction method. Runs JavaScript inside the page and returns serializable data to Node.js/Python.
const { chromium } = require('playwright');
const browser = await chromium.launch();
try {
  const page = await browser.newPage();
  await page.goto('https://news.ycombinator.com', { waitUntil: 'networkidle' });

  // Extract structured data with page.evaluate(). The callback is serialized
  // and executed inside the page, so it must be self-contained: it cannot
  // reference variables or helpers declared out here in Node.js.
  const stories = await page.evaluate(() => {
    // Parse the leading integer of strings such as "123 points"; 0 when the
    // text is missing or does not start with a number.
    const leadingInt = (text) => {
      const value = Number.parseInt(text ?? '', 10);
      return Number.isNaN(value) ? 0 : value;
    };

    return Array.from(document.querySelectorAll('.athing'))
      .map((row, i) => {
        const titleLink = row.querySelector('.titleline > a');
        // Score and comment links live in the row that follows the title row.
        const subRow = row.nextElementSibling;
        const score = subRow?.querySelector('.score')?.textContent;
        // Pick the link by its text. `querySelector('a:last-child')` returns
        // the FIRST anchor that is the last child of its own parent, which is
        // not necessarily the last link in the row.
        const commentsLink = Array.from(subRow?.querySelectorAll('a') ?? [])
          .find((link) => link.textContent.includes('comment'));
        return {
          rank: i + 1,
          title: titleLink?.textContent?.trim(),
          url: titleLink?.href,
          score: leadingInt(score),
          comments: leadingInt(commentsLink?.textContent),
        };
      })
      // Drop rows that had no title link.
      .filter((story) => story.title);
  });

  console.log(stories.slice(0, 5));
} finally {
  // Always release the browser process, even if navigation or extraction threw.
  await browser.close();
}
Playwright Locators (Preferred Modern API)
Locators are Playwright's preferred way to select elements. They auto-wait for elements to appear and are more reliable than $() selectors.
// CSS selector locator: read the text content of the element matching 'h1'
const title = await page.locator('h1').textContent();
// Role-based locator (best for accessibility): the button whose accessible name is 'Submit'
const submitBtn = page.getByRole('button', { name: 'Submit' });
await submitBtn.click();
// Text locator: matches by text; only created here, no action is performed on it
const loginLink = page.getByText('Log in');
// Label locator: matches a form control by its label text; only created here
const emailInput = page.getByLabel('Email address');
// Extract multiple elements: one string per matching element
const prices = await page.locator('.product-price').allTextContents();
// Example result: ['$29.99', '$49.99', '$79.99']
// Extract attributes: evaluateAll passes every matching element to the callback in one call
const imageUrls = await page.locator('img.product-image').evaluateAll(
imgs => imgs.map(img => img.src)
);
$$eval — Batch Extract All Matching Elements
// $$eval passes the complete list of matching elements to a single callback
// that runs inside the page; its return value is sent back to Node.js.
const products = await page.$$eval('.product-card', (cards) =>
  cards.map((card) => {
    // Keep only digits and dots so text like "$29.99" can be parsed as a number.
    const priceText = card.querySelector('.price')?.textContent?.replace(/[^\d.]/g, '');
    const ratingEl = card.querySelector('[data-rating]');
    return {
      name: card.querySelector('.name')?.textContent?.trim(),
      price: parseFloat(priceText),
      rating: ratingEl?.dataset?.rating,
      imageUrl: card.querySelector('img')?.src,
      link: card.querySelector('a')?.href,
    };
  }),
);
// $eval runs the callback on a single matching element
const heroTitle = await page.$eval('.hero h1', (heading) => heading.textContent.trim());
Extracting HTML Tables
/**
 * Extract an HTML table from the page as an array of row objects.
 *
 * Column keys come from the header cells: trimmed, lower-cased, with runs of
 * whitespace replaced by "_". Header cells are read from <thead>; when the
 * table has none, the first body row is treated as the header row and is not
 * returned as data. Cells missing from a short row become ''.
 *
 * @param {object} page - Playwright page (anything exposing `evaluate(fn, arg)`).
 * @param {string} [selector='table'] - CSS selector for the table element.
 * @returns {Promise<Array<Object<string, string>>>} One object per data row;
 *   an empty array when no element matches `selector`.
 */
async function extractTable(page, selector = 'table') {
  // The callback runs inside the page, so every helper must be defined in it.
  return page.evaluate((sel) => {
    const table = document.querySelector(sel);
    if (!table) return [];

    const toKey = (cell) => cell.textContent.trim().toLowerCase().replace(/\s+/g, '_');

    let headerCells = Array.from(table.querySelectorAll('thead th, thead td'));
    const bodyRows = Array.from(table.querySelectorAll('tbody tr'));
    if (headerCells.length === 0 && bodyRows.length > 0) {
      // No <thead>: the header row sits in the body. Without this fallback
      // every row would map to an empty object.
      headerCells = Array.from(bodyRows.shift().querySelectorAll('th, td'));
    }
    const headers = headerCells.map(toKey);

    return bodyRows.map((row) => {
      // Include <th> row headers so cells stay aligned with the header columns.
      const cells = Array.from(row.querySelectorAll('th, td'));
      return Object.fromEntries(
        headers.map((header, i) => [header, cells[i]?.textContent?.trim() ?? '']),
      );
    });
  }, selector);
}
// Example usage: load a Wikipedia page, parse the element matching
// 'table.wikitable', and print the first five row objects.
// NOTE(review): confirm this URL resolves to an article that actually contains
// a table.wikitable; extractTable returns [] when nothing matches.
await page.goto('https://en.wikipedia.org/wiki/List_of_countries_by_GDP');
const gdpTable = await extractTable(page, 'table.wikitable');
console.log(gdpTable.slice(0, 5));
Intercept API Responses (Get Clean JSON)
Modern SPAs load data via XHR/fetch. Intercept those responses directly instead of parsing the DOM — cleaner and more reliable.
const { chromium } = require('playwright');
/**
 * Load a page and collect the JSON payloads of matching API responses.
 *
 * A response is captured when its URL contains `apiUrlPattern` and its status
 * is OK. From each payload the first array found among the payload itself,
 * `data`, `results` and `items` is appended item by item; a payload with no
 * such array is appended as a single item. Bodies that cannot be read or
 * parsed as JSON are skipped.
 *
 * @param {string} url - Page to open.
 * @param {string} apiUrlPattern - Substring that a response URL must contain.
 * @returns {Promise<Array>} All captured items.
 * @throws Propagates launch/navigation errors; the browser is closed first.
 */
async function interceptApiData(url, apiUrlPattern) {
  const browser = await chromium.launch();
  try {
    const page = await browser.newPage();
    const capturedData = [];
    // Body reads are asynchronous; keep their promises so none is still in
    // flight (and lost) when the browser is closed.
    const pendingReads = [];

    const collect = async (response) => {
      try {
        if (!response.url().includes(apiUrlPattern) || !response.ok()) return;
        const json = await response.json();
        if (json == null) return;
        // Only spread real arrays: spreading an object would throw and
        // spreading a string would push single characters.
        const items = Array.isArray(json)
          ? json
          : ([json.data, json.results, json.items].find(Array.isArray) ?? [json]);
        capturedData.push(...items);
      } catch {
        // Deliberately best-effort: the body was not JSON or could no longer
        // be read. One unreadable response must not abort the whole capture.
      }
    };

    // Listen for responses matching the pattern
    page.on('response', (response) => {
      pendingReads.push(collect(response));
    });

    await page.goto(url, { waitUntil: 'networkidle' });

    // Scroll to trigger lazy-loaded batches
    for (let i = 0; i < 3; i++) {
      await page.evaluate(() => window.scrollTo(0, document.body.scrollHeight));
      await page.waitForTimeout(1000);
    }

    // collect() never rejects, so this only waits for outstanding body reads.
    await Promise.all(pendingReads);
    return capturedData;
  } finally {
    // Release the browser on success and on failure alike.
    await browser.close();
  }
}
// Example usage: open the category page and collect the JSON items from every
// response whose URL contains '/api/v2/products', then report how many were captured.
const products = await interceptApiData(
'https://shop.example.com/category/electronics',
'/api/v2/products'
);
console.log(`Captured ${products.length} products`);
Python: Playwright Data Extraction
from playwright.sync_api import sync_playwright
def extract_data(url: str, selector: str) -> list[dict]:
    """Open ``url`` in headless Chromium and describe every element matching ``selector``.

    Args:
        url: Page to load; navigation waits for the 'networkidle' state.
        selector: Selector for the elements to extract.

    Returns:
        One dict per matching element with keys ``text`` (trimmed text
        content), ``href`` and ``src`` (``None`` when the element has no
        such value).
    """
    with sync_playwright() as p:
        browser = p.chromium.launch(headless=True)
        try:
            page = browser.new_page()
            page.goto(url, wait_until='networkidle')
            # The JavaScript below runs inside the page and receives all
            # matching elements at once.
            items = page.eval_on_selector_all(selector, '''
elements => elements.map(el => ({
text: el.textContent.trim(),
href: el.href || null,
src: el.src || null
}))
''')
        finally:
            # Close the browser even when navigation or extraction raises.
            browser.close()
        return items
# Extract all article titles and links
articles = extract_data('https://blog.example.com', 'article h2 a')
# Print the text and link target of the first five matches
for a in articles[:5]:
    print(a['text'], a['href'])
Schema-Based Extraction with SnapAPI
Writing CSS selectors for every site is tedious and breaks when the site redesigns. SnapAPI's /v1/extract uses AI to extract structured data from a schema definition — no selectors needed.
// Define what you want — no selectors needed
const response = await fetch('https://api.snapapi.pics/v1/extract', {
  method: 'POST',
  headers: { 'X-Api-Key': process.env.SNAPAPI_KEY, 'Content-Type': 'application/json' },
  body: JSON.stringify({
    url: 'https://shop.example.com/product/wireless-headphones',
    // Field name -> expected type of the extracted value
    schema: {
      name: 'string',
      price: 'number',
      currency: 'string',
      in_stock: 'boolean',
      description: 'string',
      images: 'string[]',
      rating: 'number',
      review_count: 'number',
      features: 'string[]',
    },
  }),
});
// fetch() rejects only on network failure; an HTTP error status still resolves,
// so check it before trusting the body.
if (!response.ok) {
  throw new Error(`SnapAPI extract request failed: ${response.status} ${response.statusText}`);
}
const { data } = await response.json();
console.log(data);
// { name: 'Sony WH-1000XM6', price: 299.99, currency: 'USD', in_stock: true, ... }
# Python equivalent
import requests, os
resp = requests.post(
    'https://api.snapapi.pics/v1/extract',
    headers={'X-Api-Key': os.environ['SNAPAPI_KEY']},
    json={
        'url': 'https://shop.example.com/product/123',
        # Field name -> expected type of the extracted value
        'schema': {
            'name': 'string',
            'price': 'number',
            'in_stock': 'boolean',
            'images': 'string[]'
        }
    },
    # requests waits indefinitely unless a timeout is given.
    timeout=30,
)
# Fail loudly on an HTTP error status instead of raising KeyError on 'data' below.
resp.raise_for_status()
data = resp.json()['data']
print(data['name'], data['price'])
Tips for Reliable Extraction
- Always use `waitUntil: 'networkidle'` (`wait_until='networkidle'` in Python) for SPAs — otherwise JS may not have rendered the content yet
- Prefer locators over `$` selectors — they auto-wait and retry, making scripts less flaky
- Intercept APIs when possible — you get clean JSON and avoid DOM parsing complexity
- Use `page.waitForSelector()` before extracting if content loads asynchronously
- Handle pagination — check for a next-page link or button and loop
- Rate limit — 1–2 requests/second is polite; save/restore context between pages to reuse auth