Node.js Web Scraping in 2026: Cheerio, Playwright, and Production Patterns

Choose Your Tool First

Site type	Tool	Why
Static HTML (no JS)	fetch + Cheerio	Fast, lightweight, no browser needed
Server-rendered (Next.js, Rails)	fetch + Cheerio	HTML is in the initial response
SPA (React, Vue, Angular)	Playwright	Content rendered after JS execution
Infinite scroll / lazy load	Playwright	Need to trigger scroll events
Protected (Cloudflare, DataDome)	SnapAPI stealth	Handles bot protection automatically
Structured data extraction	SnapAPI /extract	Schema-based, no CSS selector writing

Cheerio: Static HTML Scraping

npm install cheerio  # fetch is built into Node.js 18+, so node-fetch is not needed

import * as cheerio from 'cheerio';

/**
 * Fetch a static HTML page and extract one record per `.product-card` element.
 *
 * @param {string} url - URL of the listing page to download.
 * @returns {Promise<Array<{title: string, price: string, url: (string|undefined), image: (string|undefined)}>>}
 *   One object per product card. `title` and `price` are the trimmed text of
 *   `.product-title` and `.price`; `url` and `image` are the raw `href` / `src`
 *   attribute values of the first `a` / `img` inside the card, exactly as they
 *   appear in the markup (they are not resolved against the page URL).
 * @throws {Error} If the server answers with a non-2xx status.
 */
async function scrapeProducts(url) {
  const res = await fetch(url, {
    headers: {
      'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36',
      'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
      'Accept-Language': 'en-US,en;q=0.5',
    },
  });

  // Without this check an error page (403, 503, ...) would be parsed like a
  // normal listing and the caller would silently get an empty array.
  if (!res.ok) {
    throw new Error(`Request failed for ${url}: ${res.status} ${res.statusText}`);
  }

  const html = await res.text();
  const $ = cheerio.load(html);

  const products = [];
  $('.product-card').each((_, el) => {
    const card = $(el);
    products.push({
      title: card.find('.product-title').text().trim(),
      price: card.find('.price').text().trim(),
      url: card.find('a').attr('href'),
      image: card.find('img').attr('src'),
    });
  });

  return products;
}

Pagination with Cheerio

/**
 * Walk a `?page=N` paginated listing and collect the trimmed text of every
 * `.item` element, following pages for as long as the current page contains
 * an `a[rel="next"]` link.
 *
 * @param {string} baseUrl - Absolute URL of the listing. Existing query
 *   parameters are preserved; the `page` parameter is set on each request.
 * @param {object} [options]
 * @param {number} [options.maxPages=Infinity] - Upper bound on the number of
 *   pages fetched, as a guard against sites that always emit a "next" link.
 * @returns {Promise<string[]>} Item texts in page order.
 * @throws {Error} If any page responds with a non-2xx status.
 */
async function scrapeAllPages(baseUrl, { maxPages = Infinity } = {}) {
  const allItems = [];
  let page = 1;
  let hasNextPage = true;

  while (hasNextPage) {
    // Build the URL with searchParams so a baseUrl that already carries a
    // query string does not end up with a second "?".
    const pageUrl = new URL(baseUrl);
    pageUrl.searchParams.set('page', String(page));

    const res = await fetch(pageUrl.href, { headers: { 'User-Agent': '...' } });
    if (!res.ok) {
      throw new Error(`Request failed for ${pageUrl.href}: ${res.status} ${res.statusText}`);
    }
    const $ = cheerio.load(await res.text());

    // Extract items from current page
    $('.item').each((_, el) => {
      allItems.push($(el).text().trim());
    });

    // Continue only if the page links to a next one and the cap is not reached
    hasNextPage = $('a[rel="next"]').length > 0 && page < maxPages;
    page++;

    // Polite delay (1.0-1.5 s, jittered) between consecutive requests
    if (hasNextPage) {
      await new Promise((resolve) => setTimeout(resolve, 1000 + Math.random() * 500));
    }
  }

  return allItems;
}

Playwright: SPA Scraping

import { chromium } from 'playwright';
import * as cheerio from 'cheerio';

/**
 * Render a JavaScript-driven page in headless Chromium and extract the title
 * and price of every `.product-card` once `.product-list` is present.
 *
 * @param {string} url - Page to open.
 * @returns {Promise<Array<{title: (string|undefined), price: (string|undefined)}>>}
 *   One entry per `.product-card`; a field is `undefined` when its element
 *   (`.title` / `.price`) is missing from the card.
 * @throws {Error} Propagates Playwright errors, e.g. when navigation exceeds
 *   20 s or `.product-list` does not appear within 10 s.
 */
async function scrapeSPA(url) {
  const browser = await chromium.launch({ headless: true });

  // try/finally guarantees the browser process is shut down even when
  // navigation or the selector wait throws.
  try {
    const context = await browser.newContext({
      userAgent: 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7)...',
    });

    // Block images, media, and fonts to speed up load
    await context.route('**/*', (route) => {
      const type = route.request().resourceType();
      if (['image', 'media', 'font'].includes(type)) return route.abort();
      return route.continue();
    });

    const page = await context.newPage();
    await page.goto(url, { waitUntil: 'networkidle', timeout: 20000 });

    // Wait for the specific content to load
    await page.waitForSelector('.product-list', { timeout: 10000 });

    // Option A: Use evaluate for simple extraction
    const products = await page.evaluate(() => {
      return [...document.querySelectorAll('.product-card')].map((el) => ({
        title: el.querySelector('.title')?.textContent?.trim(),
        price: el.querySelector('.price')?.textContent?.trim(),
      }));
    });

    // Option B: Pass rendered HTML to Cheerio
    const html = await page.content();
    const $ = cheerio.load(html);
    // ... use Cheerio as normal

    return products;
  } finally {
    await browser.close();
  }
}

Infinite scroll

/**
 * Collect the distinct trimmed text of elements matching `itemSelector` on an
 * infinite-scroll page, scrolling to the bottom repeatedly until either
 * `maxItems` distinct texts are gathered or a scroll yields nothing new.
 *
 * Items are de-duplicated by their text, so two elements with identical text
 * count once.
 *
 * @param {string} url - Page to open.
 * @param {string} itemSelector - CSS selector of the repeated item element.
 * @param {number} [maxItems=200] - Maximum number of items to return.
 * @returns {Promise<string[]>} At most `maxItems` texts, in first-seen order.
 * @throws {Error} Propagates Playwright errors; the browser is closed first.
 */
async function scrapeInfiniteScroll(url, itemSelector, maxItems = 200) {
  const browser = await chromium.launch();

  // try/finally so a failed navigation or evaluate does not leak the browser.
  try {
    const page = await browser.newPage();
    await page.goto(url, { waitUntil: 'networkidle' });

    const items = new Set();
    let previousCount = 0;

    while (items.size < maxItems) {
      // Collect visible items
      const visibleItems = await page.evaluate(
        (sel) => [...document.querySelectorAll(sel)].map((el) => el.textContent.trim()),
        itemSelector,
      );
      for (const text of visibleItems) items.add(text);

      // No new items after scroll — reached end
      if (items.size === previousCount) break;
      previousCount = items.size;

      // Enough collected: skip the pointless extra scroll and 1.5 s wait
      if (items.size >= maxItems) break;

      // Scroll to bottom to trigger next load
      await page.evaluate(() => window.scrollTo(0, document.body.scrollHeight));
      await page.waitForTimeout(1500); // wait for next batch to load
    }

    // A single batch can overshoot the cap, so trim to the requested maximum.
    return [...items].slice(0, maxItems);
  } finally {
    await browser.close();
  }
}

SnapAPI: Managed Scraping

For sites with bot protection or when you don't want to manage Playwright infrastructure:

// Scrape full rendered HTML (after JS execution)
const res = await fetch('https://api.snapapi.pics/v1/scrape', {
  method: 'POST',
  headers: {
    'X-Api-Key': process.env.SNAPAPI_KEY,
    'Content-Type': 'application/json',
  },
  body: JSON.stringify({
    url: 'https://protected-site.com/products',
    stealth: true,
    waitForSelector: '.product-list',
    waitUntil: 'networkidle',
  }),
});

// Fail loudly on an error status instead of destructuring `html` from an
// error payload and handing `undefined` to Cheerio.
if (!res.ok) {
  throw new Error(`SnapAPI scrape request failed: ${res.status} ${res.statusText}`);
}
const { html } = await res.json();

// Parse with Cheerio as usual
const $ = cheerio.load(html);

// Schema-based extraction — no CSS selectors needed
const res = await fetch('https://api.snapapi.pics/v1/extract', {
  method: 'POST',
  headers: {
    'X-Api-Key': process.env.SNAPAPI_KEY,
    'Content-Type': 'application/json',
  },
  body: JSON.stringify({
    url: 'https://example.com/product',
    schema: {
      title: { type: 'string', description: 'Product name' },
      price: { type: 'string', description: 'Price with currency' },
      rating: { type: 'number', description: 'Star rating 1-5' },
      reviewCount: { type: 'integer', description: 'Number of reviews' },
      inStock: { type: 'boolean', description: 'Whether available' },
      images: { type: 'array', items: { type: 'string' }, description: 'Image URLs' },
    },
  }),
});

// Fail loudly on an error status instead of reading `data` from an error payload.
if (!res.ok) {
  throw new Error(`SnapAPI extract request failed: ${res.status} ${res.statusText}`);
}
const { data } = await res.json();
// data = { title: 'Widget Pro', price: '$29.99', rating: 4.7, ... }

Data Storage Patterns

SQLite (local, fast, zero setup)

import Database from 'better-sqlite3';

const db = new Database('./scrape.db');

// One row per product URL; the UNIQUE constraint on `url` is what the upsert
// below keys on.
db.exec(`
  CREATE TABLE IF NOT EXISTS products (
    id INTEGER PRIMARY KEY AUTOINCREMENT,
    url TEXT UNIQUE,
    title TEXT,
    price TEXT,
    scraped_at DATETIME DEFAULT CURRENT_TIMESTAMP
  )
`);

// Insert a new product, or refresh title, price, and timestamp when the URL
// has been stored before.
const upsert = db.prepare(`
  INSERT INTO products (url, title, price)
  VALUES (@url, @title, @price)
  ON CONFLICT(url) DO UPDATE SET
    title = excluded.title,
    price = excluded.price,
    scraped_at = CURRENT_TIMESTAMP
`);

// Batch insert with transaction (fast)
const insertMany = db.transaction((items) => {
  for (const item of items) upsert.run(item);
});

insertMany(products);

JSONL streaming (append-only log)

import { createWriteStream } from 'fs';
import { once } from 'events';

// Append one JSON document per line; the 'a' flag keeps earlier runs intact.
const stream = createWriteStream('./data.jsonl', { flags: 'a' });

for (const item of products) {
  const line = `${JSON.stringify({ ...item, ts: Date.now() })}\n`;
  // write() returns false once the internal buffer is full; wait for 'drain'
  // so a large result set is not buffered entirely in memory. once() also
  // rejects if the stream emits 'error' while waiting.
  if (!stream.write(line)) {
    await once(stream, 'drain');
  }
}

stream.end();
// Wait until everything is flushed so write errors surface here rather than
// as an unhandled 'error' event.
await once(stream, 'finish');