Node.js Web Scraping in 2026

From Cheerio basics to Playwright SPA scraping, anti-bot evasion, proxy rotation, and production data pipelines. Everything you need to scrape reliably at scale.

Node.js · Cheerio · Playwright · Scraping · SQLite · April 2026

Choose Your Tool First

| Site type | Tool | Why |
| --- | --- | --- |
| Static HTML (no JS) | fetch + Cheerio | Fast, lightweight, no browser needed |
| Server-rendered (Next.js, Rails) | fetch + Cheerio | HTML is in the initial response |
| SPA (React, Vue, Angular) | Playwright | Content rendered after JS execution |
| Infinite scroll / lazy load | Playwright | Need to trigger scroll events |
| Protected (Cloudflare, DataDome) | SnapAPI stealth | Handles bot protection automatically |
| Structured data extraction | SnapAPI /extract | Schema-based, no CSS selector writing |

Cheerio: Static HTML Scraping

npm install cheerio node-fetch  # node-fetch is optional: the examples below use the global fetch built into Node.js 18+
import * as cheerio from 'cheerio';

/**
 * Fetch a static or server-rendered page and extract one record per
 * `.product-card` element.
 *
 * @param {string} url - URL of the listing page to download.
 * @returns {Promise<Array<{title: string, price: string, url: *, image: *}>>}
 *   One entry per card, in document order. `title` and `price` are the trimmed
 *   text of `.product-title` / `.price`; `url` and `image` are the raw `href`
 *   and `src` attribute values exactly as Cheerio returns them.
 * @throws {Error} When the server responds with a non-2xx status.
 */
async function scrapeProducts(url) {
  const res = await fetch(url, {
    headers: {
      'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36',
      'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
      'Accept-Language': 'en-US,en;q=0.5',
    },
  });

  // fetch() resolves for HTTP error statuses too. Without this guard a block
  // page or error page would be parsed and silently produce an empty list.
  if (!res.ok) {
    throw new Error(`Request to ${url} failed: HTTP ${res.status} ${res.statusText}`);
  }

  const $ = cheerio.load(await res.text());

  const products = [];
  $('.product-card').each((_, el) => {
    const card = $(el);
    products.push({
      title: card.find('.product-title').text().trim(),
      price: card.find('.price').text().trim(),
      url: card.find('a').attr('href'),
      image: card.find('img').attr('src'),
    });
  });

  return products;
}

Pagination with Cheerio

/**
 * Walk a `?page=N` paginated listing, collecting the trimmed text of every
 * `.item` element until a page no longer contains an `a[rel="next"]` link.
 *
 * @param {string} baseUrl - Absolute URL of the listing; may already carry
 *   other query parameters, which are preserved.
 * @param {object} [options]
 * @param {number} [options.maxPages=Infinity] - Hard cap on pages fetched, as
 *   a safety net against sites whose "next" link never disappears.
 * @param {number} [options.delayMs=1000] - Base pause between page requests.
 * @param {number} [options.jitterMs=500] - Upper bound of the random extra
 *   pause added to `delayMs`.
 * @returns {Promise<string[]>} Item texts from all visited pages, in order.
 * @throws {Error} When any page responds with a non-2xx status.
 */
async function scrapeAllPages(
  baseUrl,
  { maxPages = Infinity, delayMs = 1000, jitterMs = 500 } = {},
) {
  const allItems = [];

  for (let page = 1; page <= maxPages; page++) {
    // Build the URL with searchParams so an existing query string on baseUrl
    // becomes `?a=1&page=N` instead of the malformed `?a=1?page=N`.
    const pageUrl = new URL(baseUrl);
    pageUrl.searchParams.set('page', String(page));

    const res = await fetch(pageUrl.href, { headers: { 'User-Agent': '...' } });
    if (!res.ok) {
      throw new Error(`Request to ${pageUrl.href} failed: HTTP ${res.status} ${res.statusText}`);
    }
    const $ = cheerio.load(await res.text());

    // Extract items from the current page
    $('.item').each((_, el) => {
      allItems.push($(el).text().trim());
    });

    // Stop once the page no longer advertises a next-page link
    const hasNextPage = $('a[rel="next"]').length > 0;
    if (!hasNextPage) break;

    // Polite delay, skipped when the page cap means no further request follows
    if (page < maxPages) {
      const pauseMs = delayMs + Math.random() * jitterMs;
      await new Promise((resolve) => setTimeout(resolve, pauseMs));
    }
  }

  return allItems;
}

Playwright: SPA Scraping

import { chromium } from 'playwright';
import * as cheerio from 'cheerio';

/**
 * Render a JavaScript-driven page in headless Chromium and extract the title
 * and price of every `.product-card`.
 *
 * @param {string} url - Page to open.
 * @returns {Promise<Array<{title: (string|undefined), price: (string|undefined)}>>}
 *   One entry per card; a field is `undefined` when its element is missing.
 * @throws Propagates navigation and selector-wait failures from Playwright;
 *   the browser is closed before the error reaches the caller.
 */
async function scrapeSPA(url) {
  const browser = await chromium.launch({ headless: true });

  // try/finally guarantees the Chromium process is shut down even when
  // navigation or the selector wait times out.
  try {
    const context = await browser.newContext({
      userAgent: 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7)...',
    });

    // Block images, media and fonts to speed up load
    await context.route('**/*', (route) => {
      const type = route.request().resourceType();
      if (['image', 'media', 'font'].includes(type)) return route.abort();
      return route.continue();
    });

    const page = await context.newPage();
    await page.goto(url, { waitUntil: 'networkidle', timeout: 20000 });

    // Wait for the specific content to load
    await page.waitForSelector('.product-list', { timeout: 10000 });

    // Option A: extract directly inside the page.
    // Option B (alternative): fetch the rendered markup with page.content()
    // and hand it to cheerio.load() to reuse Cheerio selectors instead.
    const products = await page.evaluate(() => {
      return [...document.querySelectorAll('.product-card')].map((el) => ({
        title: el.querySelector('.title')?.textContent?.trim(),
        price: el.querySelector('.price')?.textContent?.trim(),
      }));
    });

    return products;
  } finally {
    await browser.close();
  }
}

Infinite scroll

/**
 * Collect the unique trimmed text of elements matching `itemSelector` from an
 * infinite-scroll page, scrolling to the bottom until either `maxItems`
 * distinct texts were seen or a scroll produces nothing new.
 *
 * @param {string} url - Page to open.
 * @param {string} itemSelector - CSS selector for one list item.
 * @param {number} [maxItems=200] - Maximum number of items to return.
 * @returns {Promise<string[]>} At most `maxItems` distinct item texts, in the
 *   order they were first seen.
 * @throws Propagates Playwright navigation/evaluation failures; the browser is
 *   closed before the error reaches the caller.
 */
async function scrapeInfiniteScroll(url, itemSelector, maxItems = 200) {
  const browser = await chromium.launch();

  // try/finally so a failed navigation or evaluation cannot leak the browser
  try {
    const page = await browser.newPage();
    await page.goto(url, { waitUntil: 'networkidle' });

    const items = new Set();
    let previousCount = 0;

    while (items.size < maxItems) {
      // Collect visible items; the Set de-duplicates entries seen on earlier passes
      const visible = await page.evaluate(
        (sel) => [...document.querySelectorAll(sel)].map((el) => el.textContent.trim()),
        itemSelector,
      );
      for (const text of visible) items.add(text);

      // No new items after scroll: reached the end of the feed
      if (items.size === previousCount) break;
      previousCount = items.size;

      // Enough collected: skip the extra scroll and its 1.5 s wait
      if (items.size >= maxItems) break;

      // Scroll to bottom to trigger next load
      await page.evaluate(() => window.scrollTo(0, document.body.scrollHeight));
      await page.waitForTimeout(1500); // wait for next batch to load
    }

    // A single batch can overshoot the cap, so trim to honour maxItems
    return [...items].slice(0, maxItems);
  } finally {
    await browser.close();
  }
}

SnapAPI: Managed Scraping

For sites with bot protection or when you don't want to manage Playwright infrastructure:

// Scrape full rendered HTML (after JS execution).
// Sends the target URL plus wait conditions to the /v1/scrape endpoint,
// authenticated with the SNAPAPI_KEY environment variable.
const res = await fetch('https://api.snapapi.pics/v1/scrape', {
  method: 'POST',
  headers: { 'X-Api-Key': process.env.SNAPAPI_KEY, 'Content-Type': 'application/json' },
  body: JSON.stringify({
    url: 'https://protected-site.com/products',
    stealth: true,
    waitForSelector: '.product-list',
    waitUntil: 'networkidle',
  }),
});
// fetch() resolves for HTTP error statuses too; fail fast here rather than
// treating an error body as a scrape result.
if (!res.ok) {
  throw new Error(`SnapAPI /v1/scrape failed: HTTP ${res.status} ${res.statusText}`);
}
const { html } = await res.json();
// Parse with Cheerio as usual
const $ = cheerio.load(html);
// Schema-based extraction — no CSS selectors needed.
// Each schema field declares the wanted type plus a plain-language description.
// The response variable is named `extractRes` because `res` is already declared
// by the /v1/scrape request earlier in this scope; re-declaring a `const` would
// be a SyntaxError.
const extractRes = await fetch('https://api.snapapi.pics/v1/extract', {
  method: 'POST',
  headers: { 'X-Api-Key': process.env.SNAPAPI_KEY, 'Content-Type': 'application/json' },
  body: JSON.stringify({
    url: 'https://example.com/product',
    schema: {
      title:       { type: 'string',  description: 'Product name' },
      price:       { type: 'string',  description: 'Price with currency' },
      rating:      { type: 'number',  description: 'Star rating 1-5' },
      reviewCount: { type: 'integer', description: 'Number of reviews' },
      inStock:     { type: 'boolean', description: 'Whether available' },
      images:      { type: 'array',   items: { type: 'string' }, description: 'Image URLs' },
    },
  }),
});
// fetch() resolves for HTTP error statuses too; fail fast here rather than
// destructuring `data` out of an error body.
if (!extractRes.ok) {
  throw new Error(`SnapAPI /v1/extract failed: HTTP ${extractRes.status} ${extractRes.statusText}`);
}
const { data } = await extractRes.json();
// data = { title: 'Widget Pro', price: '$29.99', rating: 4.7, ... }

Data Storage Patterns

SQLite (local, fast, zero setup)

import Database from 'better-sqlite3';

// Open the local SQLite file used to persist scraped products.
const db = new Database('./scrape.db');

// Ensure the target table exists. `url` is UNIQUE so it can serve as the
// conflict target of the upsert below.
db.exec(`
  CREATE TABLE IF NOT EXISTS products (
    id       INTEGER PRIMARY KEY AUTOINCREMENT,
    url      TEXT UNIQUE,
    title    TEXT,
    price    TEXT,
    scraped_at DATETIME DEFAULT CURRENT_TIMESTAMP
  )
`);

// Prepared once and reused for every row: inserts a new product, or refreshes
// title, price and scraped_at when the url is already stored.
const upsert = db.prepare(`
  INSERT INTO products (url, title, price)
  VALUES (@url, @title, @price)
  ON CONFLICT(url) DO UPDATE SET
    title = excluded.title,
    price = excluded.price,
    scraped_at = CURRENT_TIMESTAMP
`);

// Wrap the whole batch in a single transaction (fast).
const insertMany = db.transaction(function writeBatch(rows) {
  for (const row of rows) {
    upsert.run(row);
  }
});

// Persist the scraped records.
insertMany(products);

JSONL streaming (append-only log)

import { createWriteStream } from 'fs';

// Append-only log: one JSON document per line, opened in append mode so
// earlier runs are preserved.
const stream = createWriteStream('./data.jsonl', { flags: 'a' });

for (const item of products) {
  // Stamp each record with the write time (epoch milliseconds)
  const line = `${JSON.stringify({ ...item, ts: Date.now() })}\n`;

  // write() returns false once the stream's internal buffer is full. Pause
  // until 'drain' so a large result set is not queued entirely in memory.
  if (!stream.write(line)) {
    await new Promise((resolve) => stream.once('drain', resolve));
  }
}
stream.end();