Web Scraping

Web Scraping with JavaScript in 2026

April 2026 · 13 min read · Cheerio · Playwright · SnapAPI

The complete JavaScript web scraping guide for 2026: fetch + Cheerio for static pages, Playwright for SPAs, and SnapAPI for production-grade scraping with anti-bot bypass. Pagination, rate limiting, auth, and storage all covered.

Static Pages: fetch + Cheerio

For server-rendered HTML pages (Wikipedia, most blogs, government sites, e-commerce product pages), you don't need a browser at all. fetch + Cheerio is faster and lighter.

npm install cheerio
const cheerio = require('cheerio');

/**
 * Download a server-rendered page and extract one summary per `article.post`.
 *
 * @param {string} url - Page to fetch.
 * @returns {Promise<Array<{title: string, url: (string|undefined), date: (string|undefined), excerpt: string}>>}
 *   One entry per matching article, in document order. `url` and `date` are the
 *   raw attribute values, so `url` may be a relative href.
 * @throws {Error} `HTTP <status>` when the response status is not 2xx.
 */
async function scrapeStatic(url) {
  const requestOptions = {
    headers: {
      'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36'
    }
  };
  const response = await fetch(url, requestOptions);

  if (!response.ok) {
    throw new Error(`HTTP ${response.status}`);
  }

  const $ = cheerio.load(await response.text());

  // Map each article node to a plain object, then unwrap the Cheerio
  // collection into a regular array with .get().
  return $('article.post')
    .map((_, node) => {
      const post = $(node);
      return {
        title: post.find('h2').text().trim(),
        url: post.find('a').attr('href'),
        date: post.find('time').attr('datetime'),
        excerpt: post.find('p.excerpt').text().trim()
      };
    })
    .get();
}

// Example usage: fetch the blog index and print the extracted posts.
// NOTE: top-level `await` is only valid in an ES module; in a CommonJS script
// (which the `require` call above implies) wrap this in an async function.
const posts = await scrapeStatic('https://blog.example.com');
console.log(posts);

Pagination with Cheerio

/**
 * Walk a `?page=N` paginated listing and collect every `li.item` on every page.
 *
 * Pagination stops when a page has no `a.next-page` link or when `maxPages`
 * pages have been fetched.
 *
 * @param {string} baseUrl - Absolute listing URL; an existing query string is preserved.
 * @param {object} [options]
 * @param {number} [options.delayMs=1000] - Pause between consecutive page requests.
 * @param {number} [options.maxPages=Infinity] - Upper bound on pages fetched, as a
 *   guard against sites whose "next" link never disappears.
 * @returns {Promise<Array<{title: string, price: number}>>} Items in page order;
 *   `price` is NaN when the price text contains no digits.
 * @throws {Error} `HTTP <status>` when any page responds with a non-2xx status.
 */
async function scrapeAllPages(baseUrl, { delayMs = 1000, maxPages = Infinity } = {}) {
  const allItems = [];

  for (let page = 1; page <= maxPages; page++) {
    // Build the URL with searchParams so a baseUrl that already carries a
    // query string (e.g. ?sort=asc) still yields a valid URL.
    const pageUrl = new URL(baseUrl);
    pageUrl.searchParams.set('page', String(page));

    const response = await fetch(pageUrl.href, {
      headers: { 'User-Agent': 'Mozilla/5.0 (compatible; MyScraper/1.0)' }
    });
    // Without this check an error page would be parsed as an empty listing
    // and silently end pagination.
    if (!response.ok) throw new Error(`HTTP ${response.status}`);

    const $ = cheerio.load(await response.text());

    // Extract items on this page
    $('li.item').each((_, el) => {
      const priceText = $(el).find('.price').text();
      allItems.push({
        title: $(el).find('.title').text().trim(),
        // Strip currency symbols AND thousands separators: parseFloat('1,299')
        // would otherwise stop at the comma and return 1.
        price: parseFloat(priceText.replace(/[^0-9.]/g, ''))
      });
    });
    console.log(`Page ${page}: ${allItems.length} total items`);

    // Stop before sleeping when there is no next page link.
    if ($('a.next-page').length === 0) break;

    // Rate limit between page requests (default: 1 request/second)
    await new Promise(r => setTimeout(r, delayMs));
  }

  return allItems;
}

SPAs: Playwright

Apps built with React, Vue, Next.js, or Angular often render their content client-side, so a plain fetch only gets the empty HTML shell. To scrape them you need a real browser that executes the page's JavaScript.

const { chromium } = require('playwright');

/**
 * Render a client-side app in headless Chromium and extract its product cards.
 *
 * @param {string} url - Page to open.
 * @returns {Promise<Array<{name: (string|undefined), price: number, rating: number, url: (string|undefined)}>>}
 *   One entry per `.product-card`. Missing sub-elements yield `undefined`
 *   (name, url) or `NaN` (price, rating) rather than throwing.
 * @throws Propagates any Playwright error (navigation failure, timeout); the
 *   browser is closed before the error reaches the caller.
 */
async function scrapeSPA(url) {
  const browser = await chromium.launch({ headless: true });

  // try/finally guarantees the Chromium process is shut down even when
  // navigation or extraction throws; otherwise each failure leaks a browser.
  try {
    const page = await browser.newPage();

    await page.goto(url, { waitUntil: 'networkidle' });

    // Extract data from the rendered DOM. This callback runs inside the page,
    // so it can only use browser globals, not variables from this scope.
    return await page.evaluate(() => {
      return Array.from(document.querySelectorAll('.product-card')).map(el => ({
        name: el.querySelector('.name')?.textContent?.trim(),
        price: parseFloat(el.querySelector('.price')?.textContent?.replace(/[^0-9.]/g, '')),
        rating: parseFloat(el.querySelector('[data-rating]')?.dataset?.rating),
        url: el.querySelector('a')?.href
      }));
    });
  } finally {
    await browser.close();
  }
}

Playwright: Intercept API Responses (Faster)

Instead of scraping the rendered DOM, intercept the API calls the SPA makes. This gives you clean JSON without HTML parsing.

const { chromium } = require('playwright');

/**
 * Load a page in headless Chromium and collect the JSON bodies of the API
 * responses it triggers, instead of parsing the rendered DOM.
 *
 * @param {string} url - Page to open.
 * @param {string} apiPattern - Substring a response URL must contain to be captured.
 * @returns {Promise<Array<*>>} Captured records. For each matching 200 response:
 *   the elements of `json.data` if it is an array, else the elements of
 *   `json.items` if it is an array, else the parsed body itself as one record.
 *   Order follows the order in which bodies finished downloading.
 * @throws Propagates Playwright errors from navigation/scrolling; the browser
 *   is closed before the error reaches the caller.
 */
async function interceptApiData(url, apiPattern) {
  const browser = await chromium.launch({ headless: true });

  try {
    const page = await browser.newPage();

    const capturedData = [];
    // Body reads are asynchronous; keep their promises so they can be awaited
    // before the browser is closed (otherwise late bodies are silently lost).
    const pendingReads = [];

    const toRecords = (json) => {
      if (Array.isArray(json.data)) return json.data;
      if (Array.isArray(json.items)) return json.items;
      return [json];
    };

    const readBody = async (response) => {
      let json;
      try {
        json = await response.json();
      } catch {
        // Deliberately best-effort: a URL can match the pattern without being
        // JSON (or the body may be unavailable); skip it rather than fail.
        return;
      }
      if (json == null) return;
      capturedData.push(...toRecords(json));
    };

    // Intercept matching XHR/fetch responses
    page.on('response', (response) => {
      if (response.url().includes(apiPattern) && response.status() === 200) {
        pendingReads.push(readBody(response));
      }
    });

    await page.goto(url, { waitUntil: 'networkidle' });

    // Scroll to trigger lazy-loaded content
    await page.evaluate(() => window.scrollTo(0, document.body.scrollHeight));
    await page.waitForTimeout(1000);

    await Promise.allSettled(pendingReads);
    return capturedData;
  } finally {
    await browser.close();
  }
}

// Intercepts calls to /api/products
// NOTE: top-level `await` is only valid in an ES module; in a CommonJS script
// wrap this call in an async function.
const products = await interceptApiData('https://shop.example.com', '/api/products');

Playwright: Handle Infinite Scroll

/**
 * Scrape an infinite-scroll page by repeatedly scrolling to the bottom and
 * collecting the trimmed text of every `.item` element.
 *
 * Items are de-duplicated by their text. Scrolling stops as soon as a scroll
 * produces no new items, or after `maxScrolls` scrolls.
 *
 * @param {string} url - Page to open.
 * @param {object} [options]
 * @param {number} [options.maxScrolls=10] - Maximum number of scroll steps.
 * @param {number} [options.scrollDelayMs=1500] - Wait after each scroll for new content.
 * @returns {Promise<string[]>} Unique item texts in first-seen order.
 * @throws Propagates Playwright errors; the browser is closed before the
 *   error reaches the caller.
 */
async function scrapeInfiniteScroll(url, { maxScrolls = 10, scrollDelayMs = 1500 } = {}) {
  const browser = await chromium.launch({ headless: true });

  try {
    const page = await browser.newPage();
    await page.goto(url, { waitUntil: 'networkidle' });

    const allItems = new Set();

    // Adds the currently rendered items; resolves to true if any were new.
    const collectVisible = async () => {
      const sizeBefore = allItems.size;
      const texts = await page.$$eval('.item', els =>
        els.map(el => el.textContent.trim())
      );
      for (const text of texts) allItems.add(text);
      return allItems.size > sizeBefore;
    };

    await collectVisible();

    for (let i = 0; i < maxScrolls; i++) {
      // Scroll to bottom
      await page.evaluate(() => window.scrollTo(0, document.body.scrollHeight));
      await page.waitForTimeout(scrollDelayMs);  // Wait for new content to load

      // Collect AFTER every scroll so content loaded by the final scroll is
      // kept; stop if no new items appeared.
      if (!(await collectVisible())) break;
    }

    return [...allItems];
  } finally {
    await browser.close();
  }
}

Production Scraping: SnapAPI

For production workloads where you need to get past anti-bot protection, want stealth mode, and don't want to manage browsers yourself, use a dedicated scraping API. SnapAPI's /v1/scrape and /v1/extract endpoints handle the browser layer for you.

// Scrape raw HTML + text + links
const response = await fetch('https://api.snapapi.pics/v1/scrape', {
  method: 'POST',
  headers: { 'X-Api-Key': process.env.SNAPAPI_KEY, 'Content-Type': 'application/json' },
  body: JSON.stringify({
    url: 'https://news.ycombinator.com',
    wait_for: 'networkidle',
    stealth: true
  })
});
// Fail loudly on API errors (bad key, quota, upstream failure) instead of
// destructuring an error payload and parsing `undefined` as HTML below.
if (!response.ok) throw new Error(`HTTP ${response.status}`);
// `text`, `links` and `title` are also returned; only `html` is used below.
const { html, text, links, title } = await response.json();

// Now parse with cheerio — best of both worlds
const cheerio = require('cheerio');
const $ = cheerio.load(html);
const stories = [];
// Each `.athing` element is treated as one story; rank follows document order.
$('.athing').each((i, el) => {
  const titleEl = $(el).find('.titleline > a');
  stories.push({ rank: i + 1, title: titleEl.text(), url: titleEl.attr('href') });
});
console.log(stories.slice(0, 5));
// Extract structured data with schema (no CSS selector knowledge needed)
const extractRes = await fetch('https://api.snapapi.pics/v1/extract', {
  method: 'POST',
  headers: { 'X-Api-Key': process.env.SNAPAPI_KEY, 'Content-Type': 'application/json' },
  body: JSON.stringify({
    url: 'https://shop.example.com/product/123',
    // Field name -> expected type of the value to extract from the page.
    schema: {
      name: 'string',
      price: 'number',
      currency: 'string',
      in_stock: 'boolean',
      images: 'string[]',
      description: 'string',
      rating: 'number',
      review_count: 'number'
    }
  })
});
// Fail loudly on API errors instead of reading `data` off an error payload.
if (!extractRes.ok) throw new Error(`HTTP ${extractRes.status}`);
const { data } = await extractRes.json();
// data.name, data.price, data.in_stock — clean structured output

Rate Limiting and Polite Scraping

/**
 * Minimum-interval rate limiter: releases callers of `wait()` at least
 * `1000 / requestsPerSecond` milliseconds apart. (This is fixed spacing, not a
 * token bucket: unused capacity is not saved up for later bursts.)
 *
 * Each call reserves its own time slot synchronously, so several concurrent
 * `wait()` calls are released one interval apart instead of all together.
 */
class RateLimiter {
  /**
   * @param {number} requestsPerSecond - Maximum sustained rate; must be a
   *   positive, finite number.
   * @throws {RangeError} If `requestsPerSecond` is not positive and finite.
   */
  constructor(requestsPerSecond) {
    if (!Number.isFinite(requestsPerSecond) || requestsPerSecond <= 0) {
      throw new RangeError('requestsPerSecond must be a positive, finite number');
    }
    this.interval = 1000 / requestsPerSecond;
    // Timestamp (ms) of the most recently reserved slot; may lie in the
    // future while callers are queued.
    this.lastRequest = 0;
  }

  /**
   * Resolve once the caller is allowed to send its request.
   * @returns {Promise<void>}
   */
  async wait() {
    const now = Date.now();
    // Reserve the next free slot BEFORE awaiting, so a concurrent caller that
    // arrives while this one sleeps queues behind it rather than beside it.
    const slot = Math.max(now, this.lastRequest + this.interval);
    this.lastRequest = slot;

    const delay = slot - now;
    if (delay > 0) {
      await new Promise(r => setTimeout(r, delay));
    }
  }
}

// Module-level limiter instance that scrapeWithRateLimit awaits before each request.
const limiter = new RateLimiter(2);  // Max 2 requests/second

/**
 * Fetch a list of URLs sequentially, pacing requests through the module-level
 * `limiter`. A failure on one URL is recorded and does not stop the run.
 *
 * @param {Iterable<string>} urls - URLs to fetch, in order.
 * @returns {Promise<Array<{url: string, html: string} | {url: string, error: string}>>}
 *   One entry per input URL, in input order: `html` on success, `error`
 *   (a message string) on a network failure or a non-2xx status.
 */
async function scrapeWithRateLimit(urls) {
  const results = [];
  for (const url of urls) {
    await limiter.wait();
    try {
      const response = await fetch(url, {
        headers: { 'User-Agent': 'Mozilla/5.0 (compatible; ResearchBot/1.0)' }
      });
      // Treat 4xx/5xx as failures; otherwise the body of an error page
      // (404, 429, 503, ...) would be stored as if it were scraped content.
      if (!response.ok) throw new Error(`HTTP ${response.status}`);
      results.push({ url, html: await response.text() });
    } catch (err) {
      results.push({ url, error: err.message });
    }
  }
  return results;
}

Scraping Authenticated Pages

const { chromium } = require('playwright');

/**
 * Log in through a form, persist the session to `session.json`, then return
 * the HTML of a page that requires authentication.
 *
 * @param {string} loginUrl - Page containing the `#email` / `#password` form.
 * @param {{email: string, password: string}} credentials - Values typed into the form.
 * @param {string} targetUrl - Authenticated page to scrape after login.
 * @returns {Promise<string>} `document.body.innerHTML` of `targetUrl`.
 * @throws Propagates Playwright errors (navigation, missing selectors,
 *   timeouts); the browser is closed before the error reaches the caller.
 */
async function scrapeAuthenticated(loginUrl, credentials, targetUrl) {
  const browser = await chromium.launch({ headless: true });

  try {
    const context = await browser.newContext();
    const page = await context.newPage();

    // Log in
    await page.goto(loginUrl);
    await page.fill('#email', credentials.email);
    await page.fill('#password', credentials.password);
    // Start waiting for the navigation BEFORE clicking: if the wait is only
    // registered after the click, a fast redirect can complete first and the
    // wait then hangs until it times out.
    await Promise.all([
      page.waitForNavigation({ waitUntil: 'networkidle' }),
      page.click('[type="submit"]'),
    ]);

    // Save session state for reuse. The file holds the logged-in session
    // (cookies/storage), so treat it as a secret and keep it out of version control.
    await context.storageState({ path: 'session.json' });

    // Scrape authenticated page
    await page.goto(targetUrl, { waitUntil: 'networkidle' });
    return await page.evaluate(() => document.body.innerHTML);
  } finally {
    await browser.close();
  }
}

/**
 * Reuse saved session (skip login): open `url` in a browser context restored
 * from `session.json` and return the page's full HTML.
 *
 * @param {string} url - Authenticated page to load.
 * @returns {Promise<string>} Serialized HTML of the loaded page.
 * @throws Propagates Playwright errors, including a missing or unreadable
 *   `session.json`; the browser is closed before the error reaches the caller.
 */
async function scrapeWithSavedSession(url) {
  const browser = await chromium.launch({ headless: true });

  try {
    // Playwright accepts a file path for storageState and loads the saved
    // cookies/storage itself. Doing this inside the try block means a missing
    // session file no longer leaves the launched browser running.
    const context = await browser.newContext({ storageState: 'session.json' });
    const page = await context.newPage();
    await page.goto(url, { waitUntil: 'networkidle' });
    return await page.content();
  } finally {
    await browser.close();
  }
}

Storing Scraped Data

// SQLite storage with better-sqlite3 (sync, fast for local use)
const Database = require('better-sqlite3');
// Database file path is relative, so it resolves against the process working directory.
const db = new Database('scraped.db');

// Create the `pages` table on first run; IF NOT EXISTS makes re-running this
// setup harmless. `url` is UNIQUE so each page has at most one row, and
// `scraped_at` is filled in by the database when a row is inserted.
// NOTE(review): unixepoch() presumably requires a fairly recent SQLite build —
// confirm against the version bundled with your better-sqlite3.
db.exec(`
  CREATE TABLE IF NOT EXISTS pages (
    id INTEGER PRIMARY KEY,
    url TEXT UNIQUE,
    title TEXT,
    content TEXT,
    scraped_at INTEGER DEFAULT (unixepoch())
  )
`);

// Prepared once and reused for every write. Combined with the UNIQUE
// constraint on `url`, OR REPLACE makes re-scraping a page overwrite its
// existing row instead of failing.
const insert = db.prepare(
  'INSERT OR REPLACE INTO pages (url, title, content) VALUES (?, ?, ?)'
);

/**
 * Store one scraped page through the prepared `insert` statement.
 *
 * @param {string} url - Page URL.
 * @param {string} title - Page title.
 * @param {string} content - Page content to store.
 * @returns {void}
 */
function savePage(url, title, content) {
  // Positional values, in the same order as the statement's placeholders.
  const row = [url, title, content];
  insert.run(...row);
}

// Batch insert many rows efficiently
// db.transaction() wraps the loop in a function, so every row passed in one
// call goes through the prepared `insert` statement as a single unit.
const insertMany = db.transaction((pages) => {
  for (const page of pages) insert.run(page.url, page.title, page.content);
});
// NOTE(review): `pages` is not defined anywhere in this snippet — presumably an
// array of { url, title, content } objects produced by one of the scrapers.
insertMany(pages);
// PostgreSQL with pg for production
const { Pool } = require('pg');
// Connection settings are read from the DATABASE_URL environment variable.
const pool = new Pool({ connectionString: process.env.DATABASE_URL });

/**
 * Upsert the scraped payload for a URL into `scraped_pages`.
 *
 * Inserts a new row, or, when a row with the same `url` already exists,
 * overwrites its `data` and refreshes `scraped_at`.
 *
 * @param {string} url - Page URL (the conflict key).
 * @param {*} data - JSON-serializable payload; stored as a JSON string.
 * @returns {Promise<void>}
 */
async function saveScrapedData(url, data) {
  const upsertSql =
    'INSERT INTO scraped_pages (url, data, scraped_at) VALUES ($1, $2, NOW()) ON CONFLICT (url) DO UPDATE SET data = $2, scraped_at = NOW()';
  const serialized = JSON.stringify(data);

  // Values are passed as bound parameters, never interpolated into the SQL.
  await pool.query(upsertSql, [url, serialized]);
}

Tool Selection Guide

| Scenario | Best tool | Why |
| --- | --- | --- |
| Static HTML site | fetch + Cheerio | Fastest, no browser overhead |
| React/Vue/Next.js SPA | Playwright | networkidle wait handles async rendering |
| Cloudflare-protected site | SnapAPI (stealth) | Maintained evasion, no arms race |
| Structured data extraction | SnapAPI /extract | Schema-based, no CSS selector maintenance |
| Full site crawl (>10K pages) | Crawlee + Apify | Built-in queue, retry, deduplication |
| Authenticated pages | Playwright + storageState | Full session control |
| Serverless / Lambda | SnapAPI | No binary, no layer, 50ms cold start |
SnapAPI scraping: Stealth mode, networkidle wait, structured extraction with schema. Free tier: 200 calls/month. Try free →