Web Scraping with JavaScript and Cheerio in 2026

Cheerio remains the fastest way to parse and extract data from HTML in Node.js. It implements a subset of jQuery's API on a virtual DOM — no browser needed. Combined with Axios or fetch for HTTP requests, you get a lightweight, high-performance scraping stack. This guide covers everything from basic extraction to production-grade scraping with error handling, pagination, and scaling with SnapAPI.

Setup and Installation

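Install Cheerio and Axios. Cheerio parses HTML into a traversable DOM, while Axios handles the HTTP requests, with support for custom headers and timeouts. Axios does not retry failed requests or detect character encodings on its own; both are covered later in this guide: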

npm install cheerio axios

Basic scraping pattern — fetch a page and extract data:

import * as cheerio from 'cheerio';
import axios from 'axios';

/**
 * Fetch a page and pull out its basic metadata.
 *
 * @param {string} url - Address of the page to download.
 * @returns {Promise<{title: string, description: (string|undefined), h1: string}>}
 *   The text of the <title> element, the `content` attribute of the
 *   description meta tag (undefined when the tag is absent), and the trimmed
 *   text of the first <h1>.
 */
async function scrape(url) {
  const requestConfig = {
    headers: {
      'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36',
    },
    timeout: 10000,
  };

  const response = await axios.get(url, requestConfig);
  const $ = cheerio.load(response.data);

  return {
    title: $('title').text(),
    description: $('meta[name="description"]').attr('content'),
    h1: $('h1').first().text().trim(),
  };
}

// Example run: scrape a single page and print the extracted fields.
// Top-level await requires this file to be loaded as an ES module.
const data = await scrape('https://example.com');
console.log(data);

Cheerio Selectors Deep Dive

Cheerio supports CSS selectors, jQuery extensions, and chaining. Here's a comprehensive reference for extracting different types of data:

import * as cheerio from 'cheerio';

// Sample markup used by every selector example below: one product card
// wrapped in a `.products` container.
const html = `
<div class="products">
  <article class="product" data-id="101">
    <h2 class="name">Widget Pro</h2>
    <span class="price">$49.99</span>
    <div class="rating" data-stars="4.5">★★★★½</div>
    <ul class="tags">
      <li>electronics</li>
      <li>gadgets</li>
    </ul>
    <a href="/products/101">View Details</a>
  </article>
</div>`;

// Parse the sample markup; `$` is the query function used by all examples.
const $ = cheerio.load(html);

// The bare expression statements below are for illustration only: their
// results are discarded, and the trailing comment shows what each returns.

// Text content
$('.name').text();                    // "Widget Pro"

// Attribute values
// .attr() returns the raw attribute string, while .data() reads a data-*
// attribute and converts numeric-looking values.
$('.product').attr('data-id');        // "101"
$('.rating').data('stars');           // 4.5 (auto-parsed)
$('a').attr('href');                  // "/products/101"

// Multiple elements → array
// .map() visits each matched <li>; .get() unwraps the result into a plain array.
const tags = $('li').map((i, el) => $(el).text()).get();
// ["electronics", "gadgets"]

// Parse numbers from text
// Only the first '$' is removed before parsing.
const price = parseFloat($('.price').text().replace('$', ''));
// 49.99

// Traversal
$('.product').find('.name').text();   // "Widget Pro"
$('.product').children('h2').text();  // "Widget Pro"
$('.name').parent().attr('class');    // "product"
$('.name').next().text();             // "$49.99"

// Filtering
// Keeps only the <li> elements whose text contains "elec".
$('li').filter((i, el) => $(el).text().includes('elec'));
$('.product').has('.price');          // Products with prices
$('article:first-child');            // First article

Extracting Structured Data

Real-world scraping usually means extracting arrays of structured objects. Here's a pattern for scraping product listings, job boards, or any repeated content:

import * as cheerio from 'cheerio';
import axios from 'axios';

/**
 * Download a listing page and convert every `.product-card` element into a
 * plain product record.
 *
 * @param {string} url - Address of the listing page.
 * @returns {Promise<Array<{name: string, price: number, rating: number,
 *   image: (string|undefined), url: (string|undefined), inStock: boolean,
 *   tags: string[]}>>} One record per card, in document order. `price` is NaN
 *   when the price text contains no digits; `rating` is 0 when the
 *   `data-rating` attribute is missing or empty.
 */
async function scrapeProducts(url) {
  const requestConfig = {
    headers: { 'User-Agent': 'Mozilla/5.0 (compatible; DataBot/1.0)' },
  };
  const response = await axios.get(url, requestConfig);
  const $ = cheerio.load(response.data);

  // Strip everything except digits and dots so "$1,299.00" parses as 1299.
  const toPrice = (rawText) => parseFloat(rawText.replace(/[^0-9.]/g, ''));

  const collected = [];
  $('.product-card').each((position, node) => {
    const card = $(node);

    const name = card.find('.product-title').text().trim();
    const price = toPrice(card.find('.price').text());
    const rating = parseFloat(card.find('.stars').attr('data-rating') || '0');
    const image = card.find('img').attr('src');
    const link = card.find('a.product-link').attr('href');
    // A card counts as in stock unless it contains an `.out-of-stock` marker.
    const inStock = card.find('.out-of-stock').length === 0;
    const tags = card
      .find('.tag')
      .map((tagIndex, tagNode) => $(tagNode).text().trim())
      .get();

    collected.push({ name, price, rating, image, url: link, inStock, tags });
  });

  return collected;
}

/**
 * Extract an HTML table into an array of objects, one per body row.
 *
 * Keys come from the `<thead>` header cells: trimmed, lower-cased, with runs
 * of whitespace replaced by underscores. A cell that has no usable header
 * (the row is wider than the header row, the header cell is empty, or the
 * table has no `<thead>`) is stored under `column_<index>` instead of being
 * written to an `"undefined"` or empty key where it would overwrite other
 * cells.
 *
 * @param {string} url - Address of the page containing the table.
 * @param {string} tableSelector - CSS selector that matches the table element.
 * @returns {Promise<Array<Object<string, string>>>} Row objects in document
 *   order; every value is the trimmed cell text.
 */
async function scrapeTable(url, tableSelector) {
  const { data: html } = await axios.get(url);
  const $ = cheerio.load(html);

  const headers = $(`${tableSelector} thead th`)
    .map((i, el) => $(el).text().trim().toLowerCase().replace(/\s+/g, '_'))
    .get();

  const rows = [];
  $(`${tableSelector} tbody tr`).each((i, row) => {
    const obj = {};
    $(row).find('td').each((j, cell) => {
      // `||` on purpose: both a missing header and an empty one need the fallback.
      const key = headers[j] || `column_${j}`;
      obj[key] = $(cell).text().trim();
    });
    rows.push(obj);
  });

  return rows;
}

Handling Pagination

Most sites paginate results. Here's a pagination handler that follows "next" links until there are no more pages, a page returns no items, or a page limit is reached:

import * as cheerio from 'cheerio';
import axios from 'axios';

/**
 * Walks a paginated listing by following "next" links and accumulates the
 * items extracted from every page.
 */
class PaginatedScraper {
  /**
   * @param {string} baseUrl - Absolute URL of the first page.
   * @param {object} [options]
   * @param {number} [options.maxPages=50] - Upper bound on pages fetched per
   *   `scrapeAll` call.
   * @param {number} [options.delay=1000] - Pause between page requests in
   *   milliseconds; `0` disables the pause.
   */
  constructor(baseUrl, options = {}) {
    this.baseUrl = baseUrl;
    // `??` rather than `||` so that an explicit 0 is honoured.
    this.maxPages = options.maxPages ?? 50;
    this.delay = options.delay ?? 1000;
    this.results = [];
  }

  /**
   * Fetch pages starting at `baseUrl` until there is no next link, a page
   * yields no items, the next link points at a page already visited, or
   * `maxPages` is reached.
   *
   * @param {function(Function): Array} extractFn - Receives the Cheerio query
   *   function for the current page and returns an array of extracted items.
   * @returns {Promise<Array>} `this.results`, which keeps accumulating across
   *   calls on the same instance.
   */
  async scrapeAll(extractFn) {
    // Guards against pagination that links back to itself or to an earlier
    // page (e.g. a "last" link that is the current page).
    const visited = new Set();
    let url = this.baseUrl;
    let page = 1;

    while (url && page <= this.maxPages) {
      visited.add(url);
      console.log(`Scraping page ${page}: ${url}`);

      const { data: html } = await axios.get(url, {
        headers: { 'User-Agent': 'Mozilla/5.0 (compatible; DataBot/1.0)' },
      });

      const $ = cheerio.load(html);

      // Extract items from this page
      const items = extractFn($);
      this.results.push(...items);

      if (items.length === 0) break; // No more results

      // Find next page URL
      const nextLink = $('a.next-page, a[rel="next"], .pagination a:last-child')
        .attr('href');

      let nextUrl = null;
      if (nextLink && nextLink !== '#') {
        // Relative links are relative to the page they appear on, not to the
        // first page.
        const resolved = new URL(nextLink, url);
        // A fragment never changes which document is fetched.
        resolved.hash = '';
        nextUrl = resolved.href;
      }

      url = nextUrl !== null && !visited.has(nextUrl) ? nextUrl : null;
      page++;

      // Only pause when another request will actually follow.
      if (url && page <= this.maxPages) {
        await new Promise((resolve) => setTimeout(resolve, this.delay));
      }
    }

    console.log(`Done! ${this.results.length} items from ${page - 1} pages`);
    return this.results;
  }
}

// Usage
// Crawl at most 10 pages of the product listing, pausing 1.5 s between pages.
const scraper = new PaginatedScraper('https://shop.example.com/products', {
  maxPages: 10,
  delay: 1500,
});

// The extractor receives the page's Cheerio query function and returns one
// { name, price } object per `.product` element; `price` stays a trimmed string.
const products = await scraper.scrapeAll(($) => {
  return $('.product').map((i, el) => ({
    name: $(el).find('h3').text().trim(),
    price: $(el).find('.price').text().trim(),
  })).get();
});

Error Handling and Retries

Production scrapers need to handle network failures, rate limiting, and blocked requests gracefully:

import axios from 'axios';

// Shared Axios instance for the retry helper below: a 15-second timeout plus
// browser-like request headers applied to every call made through it.
const client = axios.create({
  timeout: 15000,
  headers: {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36',
    'Accept': 'text/html,application/xhtml+xml',
    'Accept-Language': 'en-US,en;q=0.9',
  },
});

/**
 * GET a URL through the shared `client`, retrying transient failures.
 *
 * - 4xx responses other than 429 are not retried.
 * - 429 responses wait for the server's `Retry-After` (seconds or HTTP date,
 *   5 s when absent or unparseable) before the next attempt.
 * - Any other failure (network error, timeout, 5xx) backs off exponentially.
 *
 * The function always either returns the response body or throws; it never
 * resolves to `undefined` after exhausting its attempts.
 *
 * @param {string} url - Address to fetch.
 * @param {number} [maxRetries=3] - Total number of attempts, including the first.
 * @param {object} [options]
 * @param {number} [options.baseDelayMs=1000] - Base of the exponential backoff;
 *   attempt `n` waits `baseDelayMs * 2^n`.
 * @param {number} [options.maxDelayMs=30000] - Cap on a single backoff wait.
 * @returns {Promise<*>} The response body (`response.data`).
 * @throws {Error} `Client error <status> for <url>` for non-retryable 4xx, or
 *   `Failed after <n> attempts: <url>` once attempts run out. The underlying
 *   error is attached as `cause` when there is one.
 */
async function fetchWithRetry(
  url,
  maxRetries = 3,
  { baseDelayMs = 1000, maxDelayMs = 30000 } = {},
) {
  const sleep = (ms) => new Promise((resolve) => setTimeout(resolve, ms));

  for (let attempt = 1; attempt <= maxRetries; attempt++) {
    try {
      const response = await client.get(url);
      return response.data;
    } catch (error) {
      const status = error.response?.status;

      // Don't retry client errors (except 429)
      if (status && status >= 400 && status < 500 && status !== 429) {
        throw new Error(`Client error ${status} for ${url}`, { cause: error });
      }

      // Checked before the 429 branch so that a persistently rate-limited URL
      // fails loudly instead of falling out of the loop.
      if (attempt >= maxRetries) {
        throw new Error(`Failed after ${maxRetries} attempts: ${url}`, { cause: error });
      }

      // Rate limited — wait and retry
      if (status === 429) {
        const retryAfter = parseRetryAfterSeconds(error.response.headers?.['retry-after']);
        console.log(`Rate limited, waiting ${retryAfter}s...`);
        await sleep(retryAfter * 1000);
        continue;
      }

      // Exponential backoff
      const delay = Math.min(baseDelayMs * Math.pow(2, attempt), maxDelayMs);
      console.log(`Attempt ${attempt} failed, retrying in ${delay}ms...`);
      await sleep(delay);
    }
  }

  // Only reachable when maxRetries < 1, i.e. no attempt was made at all.
  throw new Error(`Failed after ${maxRetries} attempts: ${url}`);
}

/**
 * Convert a `Retry-After` header value into a non-negative number of seconds.
 * Accepts delta-seconds ("120") or an HTTP date; falls back when the header is
 * missing or unparseable.
 *
 * @param {string|undefined} headerValue - Raw header value.
 * @param {number} [fallbackSeconds=5] - Wait used when no value can be derived.
 * @returns {number} Seconds to wait.
 */
function parseRetryAfterSeconds(headerValue, fallbackSeconds = 5) {
  if (headerValue == null || headerValue === '') return fallbackSeconds;

  const seconds = Number(headerValue);
  if (Number.isFinite(seconds) && seconds >= 0) return seconds;

  const retryAtMs = Date.parse(headerValue);
  if (!Number.isNaN(retryAtMs)) {
    // A date in the past means "retry now".
    return Math.max(0, Math.ceil((retryAtMs - Date.now()) / 1000));
  }

  return fallbackSeconds;
}

/**
 * Fetch many URLs with a bounded number of requests in flight.
 *
 * Failures never reject the returned promise; each URL produces exactly one
 * result entry, placed at the same index as the URL in the input so the
 * output order does not depend on response timing.
 *
 * @param {Iterable<string>} urls - Addresses to fetch.
 * @param {number} [concurrency=5] - Maximum simultaneous requests. Values
 *   below 1 (or NaN) are treated as 1 so that every URL is still processed.
 * @returns {Promise<Array<{url: string, html: (*|null), error: (string|null)}>>}
 *   `html` is the fetched body and `error` is null on success; on failure
 *   `html` is null and `error` holds the failure message.
 */
async function scrapeUrls(urls, concurrency = 5) {
  // Snapshot the input so later mutation by the caller cannot affect the run.
  const pending = [...urls];
  const results = new Array(pending.length);
  let nextIndex = 0;

  async function worker() {
    // Claiming an index is synchronous, so two workers never take the same URL.
    while (nextIndex < pending.length) {
      const index = nextIndex++;
      const url = pending[index];
      try {
        const html = await fetchWithRetry(url);
        results[index] = { url, html, error: null };
      } catch (error) {
        results[index] = { url, html: null, error: error?.message ?? String(error) };
      }
    }
  }

  const workerCount = Math.min(
    pending.length,
    Math.max(1, Math.floor(concurrency) || 1),
  );
  await Promise.all(Array.from({ length: workerCount }, () => worker()));
  return results;
}

Handling JavaScript-Rendered Pages

Cheerio only parses static HTML — it can't execute JavaScript. For SPAs, React apps, or pages that load content dynamically, you have two options: use a headless browser or use an API that handles rendering for you.

// Option 1: Playwright for JS-rendered content
import { chromium } from 'playwright';
import * as cheerio from 'cheerio';

/**
 * Render a JavaScript-driven page in headless Chromium and return the text of
 * its `.dynamic-content` element(s).
 *
 * The browser is closed in a `finally` block, so a navigation failure or
 * parsing error does not leave a Chromium process running.
 *
 * @param {string} url - Address of the page to render.
 * @returns {Promise<string>} Text content of `.dynamic-content` after the
 *   page reached the `networkidle` state.
 */
async function scrapeJSPage(url) {
  const browser = await chromium.launch();
  try {
    const page = await browser.newPage();
    await page.goto(url, { waitUntil: 'networkidle' });

    // Get rendered HTML, then parse with Cheerio
    const html = await page.content();
    const $ = cheerio.load(html);

    return $('.dynamic-content').text();
  } finally {
    await browser.close();
  }
}

// Option 2: SnapAPI — no browser to manage
import SnapAPI from 'snapapi-js';

// Shared SnapAPI client used by the two API-based examples below.
// 'sk_live_your_key' is a placeholder — supply your own key, ideally from an
// environment variable rather than a literal committed to source control.
const snap = new SnapAPI('sk_live_your_key');

// Scrape JS-rendered pages without managing browsers
/**
 * Ask SnapAPI to render a page, then read `.dynamic-content` from the HTML it
 * returns.
 *
 * @param {string} url - Address of the page to render remotely.
 * @returns {Promise<string>} Text content of the `.dynamic-content` element(s).
 */
async function scrapeWithAPI(url) {
  const renderRequest = {
    url,
    render_js: true,
    wait_for: '.dynamic-content',
    block_ads: true,
  };
  const { html } = await snap.scrape(renderRequest);

  // Parse the rendered HTML with Cheerio
  const query = cheerio.load(html);
  return query('.dynamic-content').text();
}

// Option 3: SnapAPI Extract — get structured data directly
/**
 * Request structured product data from SnapAPI's extract call using a
 * declarative schema, so no HTML parsing happens locally.
 *
 * @param {string} url - Address of the page to extract from.
 * @returns {Promise<Array>} The `products` array from the response's `data`.
 */
async function extractWithAPI(url) {
  // Field name → expected type for each product entry.
  const productShape = {
    name: 'string',
    price: 'number',
    rating: 'number',
    reviews: 'number',
  };

  const response = await snap.extract({
    url,
    schema: { products: [productShape] },
  });

  // Structured JSON — no Cheerio needed
  const { products } = response.data;
  return products;
}

Scraping Approach Comparison

Approach	JS Rendering	Speed	Complexity	Cost
Cheerio + Axios	No	Very fast (~50ms)	Low	Free
Playwright	Yes	Slow (~2-5s)	High (browser mgmt)	Server costs
Puppeteer	Yes	Slow (~2-5s)	High	Server costs
SnapAPI /scrape	Yes	Fast (~1-2s)	None (API call)	Free tier: 200/mo
SnapAPI /extract	Yes	Fast (~1-3s)	None (schema only)	Free tier: 200/mo

Production Scraping Architecture

For production workloads, build a scraping pipeline with job queues, caching, and output storage:

import * as cheerio from 'cheerio';
import axios from 'axios';
import { createClient } from 'redis';
import crypto from 'crypto';

/**
 * Scraping pipeline that caches extracted results in Redis and processes URL
 * lists in fixed-size batches.
 */
class ScrapingPipeline {
  /**
   * @param {object} [options]
   * @param {string} [options.redisUrl] - Connection URL passed to `createClient`.
   * @param {number} [options.cacheTTL=3600] - Cache lifetime in seconds. Falsy
   *   values, including 0, fall back to the default.
   */
  constructor(options = {}) {
    this.redis = createClient({ url: options.redisUrl });
    this.cacheTTL = options.cacheTTL || 3600; // 1 hour
    this.results = [];
  }

  /** Open the Redis connection; call once before `scrape` or `scrapeMany`. */
  async init() {
    await this.redis.connect();
  }

  /**
   * Build the Redis key for a URL.
   *
   * @param {string} url
   * @returns {string} `scrape:` followed by the hex MD5 digest of the URL. MD5
   *   is used only to get a short fixed-length key, not for security.
   */
  cacheKey(url) {
    return `scrape:${crypto.createHash('md5').update(url).digest('hex')}`;
  }

  /**
   * Return the extracted data for a URL, from cache when available.
   *
   * @param {string} url - Address to scrape.
   * @param {function(Function, string): *} extractFn - Receives the Cheerio
   *   query function and the URL; its return value must be JSON-serialisable
   *   to be cached.
   * @returns {Promise<*>} The extracted (or cached) data.
   */
  async scrape(url, extractFn) {
    const key = this.cacheKey(url);

    // Check cache first
    const cached = await this.redis.get(key);
    if (cached) {
      console.log(`[CACHE HIT] ${url}`);
      return JSON.parse(cached);
    }

    const { data: html } = await axios.get(url, {
      headers: { 'User-Agent': 'Mozilla/5.0 (compatible; DataBot/1.0)' },
      timeout: 10000,
    });

    const $ = cheerio.load(html);
    const data = extractFn($, url);

    // Cache the result. JSON.stringify yields undefined for values such as
    // `undefined` or functions; those cannot be stored, so skip the write
    // instead of letting it throw.
    const serialized = JSON.stringify(data);
    if (serialized !== undefined) {
      await this.redis.set(key, serialized, { EX: this.cacheTTL });
    }

    console.log(`[SCRAPED] ${url}`);
    return data;
  }

  /**
   * Scrape a list of URLs in batches, pausing one second between batches.
   *
   * @param {string[]} urls - Addresses to scrape.
   * @param {function(Function, string): *} extractFn - See `scrape`.
   * @param {number} [concurrency=3] - Batch size. Values below 1 (or NaN) are
   *   treated as 1; the loop could otherwise never advance.
   * @returns {Promise<Array>} Data for the URLs that succeeded, in input
   *   order. Failed URLs are omitted from the result and reported through
   *   `console.error`.
   */
  async scrapeMany(urls, extractFn, concurrency = 3) {
    const batchSize = Math.max(1, Math.floor(concurrency) || 1);
    const results = [];

    for (let i = 0; i < urls.length; i += batchSize) {
      const batch = urls.slice(i, i + batchSize);
      const batchResults = await Promise.allSettled(
        batch.map((url) => this.scrape(url, extractFn)),
      );

      batchResults.forEach((outcome, offset) => {
        if (outcome.status === 'fulfilled') {
          results.push(outcome.value);
        } else {
          const reason = outcome.reason?.message ?? outcome.reason;
          console.error(`[FAILED] ${batch[offset]}: ${reason}`);
        }
      });

      // Polite delay between batches
      if (i + batchSize < urls.length) {
        await new Promise((resolve) => setTimeout(resolve, 1000));
      }
    }

    return results;
  }

  /** Close the Redis connection. */
  async close() {
    await this.redis.quit();
  }
}

Skip the Infrastructure — Use SnapAPI

Screenshots, scraping, structured extraction, PDF generation, and AI analysis through a single API. Handles JavaScript rendering, stealth mode, and anti-bot bypasses automatically.

Start Free — 200 Captures/Month

Best Practices

Respect robots.txt. Check the site's robots.txt before scraping. Honor crawl-delay directives and don't scrape pages marked as disallowed.
Rate limit your requests. Add delays between requests (1-2 seconds minimum). Use exponential backoff on errors. Never hammer a server with unbounded concurrent requests; keep concurrency low and capped.
Cache aggressively. Store scraped HTML and parsed results in Redis or on disk. Avoid re-scraping the same page within your cache TTL.
Handle encoding correctly. Use axios with responseType: 'arraybuffer' and detect encoding from headers or meta tags for non-UTF-8 pages.
Validate extracted data. Check for empty strings, null values, and unexpected formats. Sites change their HTML structure without warning.
Use SnapAPI for JS-heavy sites. Don't manage Playwright infrastructure for scraping. SnapAPI handles rendering, stealth mode, and anti-bot detection in one API call.