Playwright Extract Data: Scraping & Data Extraction Guide (2026)

page.evaluate() — Run JS in Page Context

The most flexible extraction method. Runs JavaScript inside the page and returns serializable data to Node.js/Python.

const { chromium } = require('playwright');

const browser = await chromium.launch();
const page = await browser.newPage();
let stories;

try {
  await page.goto('https://news.ycombinator.com', { waitUntil: 'networkidle' });

  // Extract structured data with page.evaluate(): the callback runs inside the
  // page, so it may only touch the DOM and must return serializable values.
  stories = await page.evaluate(() => {
    return Array.from(document.querySelectorAll('.athing')).map((row, i) => {
      const titleLink = row.querySelector('.titleline > a');
      // The score and comment link are read from the row that follows each
      // '.athing' row; every lookup is optional so a missing row yields defaults.
      const subRow = row.nextElementSibling;
      const score = subRow?.querySelector('.score')?.textContent;
      const comments = subRow?.querySelector('a:last-child')?.textContent;

      return {
        rank: i + 1,
        title: titleLink?.textContent?.trim(),
        url: titleLink?.href,
        // Explicit radix: the leading digits are always read as decimal.
        score: score ? Number.parseInt(score, 10) : 0,
        // Only parse when the last link actually mentions comments; otherwise 0.
        comments: comments?.includes('comment') ? Number.parseInt(comments, 10) : 0,
      };
    }).filter((s) => s.title); // drop rows where no title link was found
  });
} finally {
  // Close the browser even when navigation or extraction throws, so a failed
  // run does not leave a browser process behind.
  await browser.close();
}

console.log(stories.slice(0, 5));

Playwright Locators (Preferred Modern API)

Locators are Playwright's preferred way to select elements. They auto-wait for elements to appear and are more reliable than the older page.$() / page.$$() element-handle selectors.

// CSS selector locator: read the text content of the element matched by 'h1'
const title = await page.locator('h1').textContent();

// Role-based locator (best for accessibility): target the button by role and name, then click it
const submitBtn = page.getByRole('button', { name: 'Submit' });
await submitBtn.click();

// Text locator: select by the text 'Log in' (created here but not used further in this snippet)
const loginLink = page.getByText('Log in');

// Label locator: select by the label text 'Email address' (created here but not used further)
const emailInput = page.getByLabel('Email address');

// Extract multiple elements: collect the text of every '.product-price' match into an array
const prices = await page.locator('.product-price').allTextContents();
// Returns: ['$29.99', '$49.99', '$79.99']

// Extract attributes: evaluateAll hands all matched images to one callback, which maps each to its src
const imageUrls = await page.locator('img.product-image').evaluateAll(
  imgs => imgs.map(img => img.src)
);

$$eval — Batch Extract All Matching Elements

// $$eval passes every element matching the selector to a single callback that
// runs in the page; whatever the callback returns comes back to the script.
const products = await page.$$eval('.product-card', (cards) =>
  cards.map((card) => {
    // Look up each sub-element once; any of them may be absent on a given card.
    const nameText = card.querySelector('.name')?.textContent;
    const priceText = card.querySelector('.price')?.textContent;
    const ratedNode = card.querySelector('[data-rating]');
    const image = card.querySelector('img');
    const anchor = card.querySelector('a');

    return {
      name: nameText?.trim(),
      // Strip everything except digits and dots before parsing the number.
      price: parseFloat(priceText?.replace(/[^\d.]/g, '')),
      rating: ratedNode?.dataset?.rating,
      imageUrl: image?.src,
      link: anchor?.href,
    };
  })
);

// $eval is the single-element counterpart: the callback receives one element
const heroTitle = await page.$eval('.hero h1', (heading) => heading.textContent.trim());

Extracting HTML Tables

/**
 * Extract an HTML table as an array of row objects keyed by column header.
 *
 * Header keys are the header cell text, trimmed, lower-cased, with runs of
 * whitespace replaced by underscores. Header cells come from `<thead>` when it
 * has any; otherwise the table's first row is used as the header row and is
 * excluded from the returned data. Body rows read both `<th>` and `<td>` cells
 * so that a leading row-header cell stays aligned with its column.
 *
 * @param {object} page - Object exposing `evaluate(fn, arg)` (a Playwright Page).
 * @param {string} [selector='table'] - CSS selector; the first match is used.
 * @returns {Promise<Array<Object<string, string>>>} One object per data row;
 *   columns a row lacks map to ''. Empty array when no element matches.
 */
async function extractTable(page, selector = 'table') {
  return page.evaluate((sel) => {
    const table = document.querySelector(sel);
    if (!table) return [];

    const toKey = (cell) =>
      cell.textContent.trim().toLowerCase().replace(/\s+/g, '_');

    let headerCells = Array.from(table.querySelectorAll('thead th, thead td'));
    let dataRows = Array.from(table.querySelectorAll('tbody tr'));

    if (headerCells.length === 0) {
      // No usable <thead>: treat the first row as the header row so the result
      // is not a list of empty objects, and keep it out of the data rows.
      const [headerRow, ...remainingRows] = Array.from(table.querySelectorAll('tr'));
      headerCells = headerRow ? Array.from(headerRow.querySelectorAll('th, td')) : [];
      dataRows = remainingRows;
    }

    const headers = headerCells.map(toKey);

    return dataRows.map((row) => {
      // Include <th> so a row-header first cell does not shift later columns left.
      const cells = Array.from(row.querySelectorAll('th, td'));
      return Object.fromEntries(
        headers.map((header, i) => [header, cells[i]?.textContent?.trim() ?? ''])
      );
    });
  }, selector);
}

// Example usage: load the page, extract the first element matching
// 'table.wikitable', and print the first five extracted rows.
await page.goto('https://en.wikipedia.org/wiki/List_of_countries_by_GDP');
const gdpTable = await extractTable(page, 'table.wikitable');
console.log(gdpTable.slice(0, 5));

Intercept API Responses (Get Clean JSON)

Modern SPAs load data via XHR/fetch. Intercept those responses directly instead of parsing the DOM — cleaner and more reliable.

const { chromium } = require('playwright');

/**
 * Collapse a decoded JSON payload into a flat list of items.
 *
 * A top-level array is returned as-is; otherwise the first of `data`,
 * `results`, `items` that is an array is used; otherwise the payload itself is
 * wrapped as a single item. `null`/`undefined` payloads yield no items.
 *
 * @param {*} json - Decoded response body.
 * @returns {Array} Items to append to the capture list.
 */
function toItemList(json) {
  if (json == null) return [];
  if (Array.isArray(json)) return json;
  const list = [json.data, json.results, json.items].find((value) => Array.isArray(value));
  return list ?? [json];
}

/**
 * Load a page and collect the JSON items from every successful response whose
 * URL contains `apiUrlPattern`.
 *
 * After the initial navigation the page is scrolled to the bottom a few times
 * to trigger lazy-loaded batches. The browser is always closed, including when
 * navigation or scrolling throws.
 *
 * @param {string} url - Page to open.
 * @param {string} apiUrlPattern - Substring that a response URL must contain.
 * @param {object} [options]
 * @param {number} [options.scrollPasses=3] - How many times to scroll to the bottom.
 * @param {number} [options.scrollDelayMs=1000] - Pause after each scroll, in ms.
 * @returns {Promise<Array>} Items captured up to the moment the browser closed.
 */
async function interceptApiData(
  url,
  apiUrlPattern,
  { scrollPasses = 3, scrollDelayMs = 1000 } = {}
) {
  const browser = await chromium.launch();
  const capturedData = [];

  try {
    const page = await browser.newPage();

    // Listen for responses matching the pattern
    page.on('response', async (response) => {
      if (!response.url().includes(apiUrlPattern) || !response.ok()) return;

      let json;
      try {
        json = await response.json();
      } catch {
        // Deliberate best-effort: a matching response whose body is not JSON
        // (or is no longer readable) is skipped rather than aborting the run.
        return;
      }

      for (const item of toItemList(json)) {
        capturedData.push(item);
      }
    });

    await page.goto(url, { waitUntil: 'networkidle' });

    // Scroll to trigger lazy-loaded batches
    for (let pass = 0; pass < scrollPasses; pass += 1) {
      await page.evaluate(() => window.scrollTo(0, document.body.scrollHeight));
      await page.waitForTimeout(scrollDelayMs);
    }
  } finally {
    // Release the browser even if navigation or scrolling failed.
    await browser.close();
  }

  return capturedData;
}

// Example usage: open the category page and collect the items from every
// successful response whose URL contains '/api/v2/products'.
const products = await interceptApiData(
  'https://shop.example.com/category/electronics',
  '/api/v2/products'
);
console.log(`Captured ${products.length} products`);

Python: Playwright Data Extraction

from playwright.sync_api import sync_playwright

def extract_data(url: str, selector: str) -> list[dict]:
    """Return one record per element matching ``selector`` on ``url``.

    The page is opened in headless Chromium and loaded until the network is
    idle. Each record has the keys ``text`` (trimmed text content), ``href``
    and ``src`` (``None`` when the element has no such value).
    """
    # JavaScript run in the page against the full list of matched elements.
    to_records_js = '''
            elements => elements.map(el => ({
                text: el.textContent.trim(),
                href: el.href || null,
                src: el.src || null
            }))
        '''

    with sync_playwright() as playwright:
        chromium_browser = playwright.chromium.launch(headless=True)
        tab = chromium_browser.new_page()
        tab.goto(url, wait_until='networkidle')

        records = tab.eval_on_selector_all(selector, to_records_js)

        chromium_browser.close()

    return records

# Extract all article titles and links: every 'article h2 a' element on the
# blog becomes a record, then the text and href of the first five are printed.
articles = extract_data('https://blog.example.com', 'article h2 a')
for a in articles[:5]:
    print(a['text'], a['href'])

Schema-Based Extraction with SnapAPI

Writing CSS selectors for every site is tedious and breaks when the site redesigns. SnapAPI's /v1/extract uses AI to extract structured data from a schema definition — no selectors needed.

// Define what you want — no selectors needed. The request body carries the
// target URL plus a schema mapping each desired field name to its type.
const response = await fetch('https://api.snapapi.pics/v1/extract', {
  method: 'POST',
  headers: { 'X-Api-Key': process.env.SNAPAPI_KEY, 'Content-Type': 'application/json' },
  body: JSON.stringify({
    url: 'https://shop.example.com/product/wireless-headphones',
    schema: {
      name: 'string',
      price: 'number',
      currency: 'string',
      in_stock: 'boolean',
      description: 'string',
      images: 'string[]',
      rating: 'number',
      review_count: 'number',
      features: 'string[]'
    }
  })
});

// fetch() resolves for HTTP error statuses too, so check explicitly instead of
// destructuring an error body as if it were a successful result.
if (!response.ok) {
  throw new Error(`SnapAPI extract failed: ${response.status} ${response.statusText}`);
}

const { data } = await response.json();
console.log(data);
// { name: 'Sony WH-1000XM6', price: 299.99, currency: 'USD', in_stock: true, ... }

# Python equivalent
import requests, os

# POST the target URL and the desired field schema; the API key is read from
# the SNAPAPI_KEY environment variable (KeyError if it is not set).
resp = requests.post(
    'https://api.snapapi.pics/v1/extract',
    headers={'X-Api-Key': os.environ['SNAPAPI_KEY']},
    json={
        'url': 'https://shop.example.com/product/123',
        'schema': {
            'name': 'string',
            'price': 'number',
            'in_stock': 'boolean',
            'images': 'string[]'
        }
    },
    # requests waits indefinitely unless a timeout is given.
    timeout=60,
)
# Raise on 4xx/5xx instead of indexing into an error body below.
resp.raise_for_status()
data = resp.json()['data']
print(data['name'], data['price'])

Extract vs Playwright: Use Playwright when you need full browser control or want to interact with the page. Use SnapAPI /extract when you just want the data — no selectors to maintain, works despite site redesigns, and handles bot protection automatically.

Tips for Reliable Extraction

For SPAs, always pass waitUntil: 'networkidle' (wait_until='networkidle' in Python) to goto() — otherwise JS may not have rendered the content yet
Prefer locators over $ / $$ selectors — they auto-wait and retry, making scraping scripts less flaky
Intercept APIs when possible — you get clean JSON and avoid DOM parsing complexity
Use page.waitForSelector() before extracting if content loads asynchronously
Handle pagination — check for a next-page link or button and loop
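Rate limit — 1–2 requests/second is polite. Save and restore the browser context's storage state (cookies, local storage) between pages so that authentication is reused instead of logging in again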

Playwright Extract Data: Complete Guide (2026)

page.evaluate() — Run JS in Page Context

Playwright Locators (Preferred Modern API)

$$eval — Batch Extract All Matching Elements

Extracting HTML Tables

Intercept API Responses (Get Clean JSON)

Python: Playwright Data Extraction

Schema-Based Extraction with SnapAPI

Tips for Reliable Extraction