Cheerio remains the fastest way to parse and extract data from HTML in Node.js. It implements a subset of jQuery's API on a virtual DOM — no browser needed. Combined with Axios or fetch for HTTP requests, you get a lightweight, high-performance scraping stack. This guide covers everything from basic extraction to production-grade scraping with error handling, pagination, and scaling with SnapAPI.
Setup and Installation
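Install Cheerio and Axios. Cheerio parses HTML into a traversable DOM, while Axios handles the HTTP requests (headers, timeouts, redirects). Axios does not retry failed requests or detect non-UTF-8 encodings on its own — both are covered later in this guide: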
npm install cheerio axios
Basic scraping pattern — fetch a page and extract data:
import * as cheerio from 'cheerio';
import axios from 'axios';
/**
 * Download a page and pull out its headline metadata.
 *
 * @param {string} url - Address of the page to fetch.
 * @returns {Promise<{title: *, description: *, h1: *}>} The text of `<title>`,
 *   the `content` attribute of the description meta tag, and the trimmed text
 *   of the first `<h1>`.
 */
async function scrape(url) {
  const requestOptions = {
    headers: {
      'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36',
    },
    timeout: 10000,
  };

  const response = await axios.get(url, requestOptions);
  const $ = cheerio.load(response.data);

  return {
    title: $('title').text(),
    description: $('meta[name="description"]').attr('content'),
    h1: $('h1').first().text().trim(),
  };
}
// Example call. `await` is used at the top level here, so this snippet has to
// run as an ES module.
const data = await scrape('https://example.com');
console.log(data);
Cheerio Selectors Deep Dive
Cheerio supports CSS selectors, jQuery extensions, and chaining. Here's a comprehensive reference for extracting different types of data:
import * as cheerio from 'cheerio';
// Inline HTML fixture: one product card that every selector example below
// queries. The expected result of each expression is shown in its trailing comment.
const html = `
<div class="products">
<article class="product" data-id="101">
<h2 class="name">Widget Pro</h2>
<span class="price">$49.99</span>
<div class="rating" data-stars="4.5">★★★★½</div>
<ul class="tags">
<li>electronics</li>
<li>gadgets</li>
</ul>
<a href="/products/101">View Details</a>
</article>
</div>`;
// Parse the fixture; `$` is the query function for the loaded document.
const $ = cheerio.load(html);
// Text content
$('.name').text(); // "Widget Pro"
// Attribute values
$('.product').attr('data-id'); // "101"
$('.rating').data('stars'); // 4.5 (auto-parsed)
$('a').attr('href'); // "/products/101"
// Multiple elements → array
// (map each <li> to its text, then .get() to obtain the array shown below)
const tags = $('li').map((i, el) => $(el).text()).get();
// ["electronics", "gadgets"]
// Parse numbers from text
// (drop the "$" before handing the string to parseFloat)
const price = parseFloat($('.price').text().replace('$', ''));
// 49.99
// Traversal
$('.product').find('.name').text(); // "Widget Pro"
$('.product').children('h2').text(); // "Widget Pro"
$('.name').parent().attr('class'); // "product"
$('.name').next().text(); // "$49.99"
// Filtering
// Keep only the <li> elements whose text contains "elec"
$('li').filter((i, el) => $(el).text().includes('elec'));
$('.product').has('.price'); // Products with prices
$('article:first-child'); // First article
Extracting Structured Data
Real-world scraping usually means extracting arrays of structured objects. Here's a pattern for scraping product listings, job boards, or any repeated content:
import * as cheerio from 'cheerio';
import axios from 'axios';
/**
 * Download a listing page and convert every `.product-card` element into a
 * plain object.
 *
 * @param {string} url - Listing page to fetch.
 * @returns {Promise<Array<Object>>} One record per card with the keys `name`,
 *   `price`, `rating`, `image`, `url`, `inStock` and `tags`.
 */
async function scrapeProducts(url) {
  const response = await axios.get(url, {
    headers: { 'User-Agent': 'Mozilla/5.0 (compatible; DataBot/1.0)' },
  });
  const $ = cheerio.load(response.data);

  // Builds the record for a single card element.
  const toProduct = (element) => {
    const card = $(element);

    const name = card.find('.product-title').text().trim();
    // Strip everything except digits and dots before parsing the price.
    const price = parseFloat(card.find('.price').text().replace(/[^0-9.]/g, ''));
    // A missing data-rating attribute counts as a rating of 0.
    const rating = parseFloat(card.find('.stars').attr('data-rating') || '0');
    const image = card.find('img').attr('src');
    const link = card.find('a.product-link').attr('href');
    // In stock means the card contains no `.out-of-stock` marker.
    const inStock = card.find('.out-of-stock').length === 0;
    const tags = card
      .find('.tag')
      .map((_, tagElement) => $(tagElement).text().trim())
      .get();

    return { name, price, rating, image, url: link, inStock, tags };
  };

  const products = [];
  $('.product-card').each((_, element) => {
    products.push(toProduct(element));
  });
  return products;
}
// Extract a table into an array of objects
/**
 * Fetch a page and convert one HTML table into an array of row objects.
 *
 * Keys come from the `<thead>` cells: trimmed, lower-cased, with runs of
 * whitespace replaced by `_`. A cell that has no usable header (the table has
 * no `<thead>`, the row has more cells than there are headers, or the header
 * text is empty) is stored under `column_<n>` (1-based) instead of all such
 * cells overwriting each other under the key "undefined".
 *
 * @param {string} url - Page that contains the table.
 * @param {string} tableSelector - CSS selector that matches the table element.
 * @returns {Promise<Array<Object<string, string>>>} One object per `<tbody>` row.
 */
async function scrapeTable(url, tableSelector) {
  const { data: html } = await axios.get(url);
  const $ = cheerio.load(html);

  const headers = $(`${tableSelector} thead th`)
    .map((i, el) => $(el).text().trim().toLowerCase().replace(/\s+/g, '_'))
    .get();

  const rows = [];
  $(`${tableSelector} tbody tr`).each((i, row) => {
    const record = {};
    $(row).find('td').each((j, cell) => {
      // Fall back to a positional key when no header names this column.
      const key = headers[j] || `column_${j + 1}`;
      record[key] = $(cell).text().trim();
    });
    rows.push(record);
  });
  return rows;
}
Handling Pagination
Most sites paginate results. Here's a pagination handler that follows "next" links until there are none left, a page comes back empty, or a page limit is reached:
import * as cheerio from 'cheerio';
import axios from 'axios';
/**
 * Crawls a paginated listing by following "next" links, collecting whatever
 * the caller's extractor returns for each page.
 */
class PaginatedScraper {
  /**
   * @param {string} baseUrl - URL of the first page.
   * @param {Object} [options]
   * @param {number} [options.maxPages=50] - Upper bound on pages fetched per
   *   `scrapeAll` call. A falsy value (including 0) falls back to 50.
   * @param {number} [options.delay=1000] - Pause in milliseconds between page
   *   requests. An explicit 0 is honoured; only null/undefined use the default.
   */
  constructor(baseUrl, options = {}) {
    this.baseUrl = baseUrl;
    this.maxPages = options.maxPages || 50;
    // `??` rather than `||` so that callers can disable the pause with 0.
    this.delay = options.delay ?? 1000;
    this.results = [];
  }

  /**
   * Fetch pages one after another until there is no unvisited next link, a
   * page yields no items, or `maxPages` is reached.
   *
   * @param {function(Function): Array} extractFn - Receives the Cheerio query
   *   function for the current page and returns the items found on it.
   * @returns {Promise<Array>} `this.results`, which accumulates the items of
   *   every page (and of earlier `scrapeAll` calls on the same instance).
   */
  async scrapeAll(extractFn) {
    // Resolve `href` against `base` and drop the fragment, which never
    // changes the document that is fetched.
    const resolveWithoutFragment = (href, base) => {
      const parsed = new URL(href, base);
      parsed.hash = '';
      return parsed.href;
    };

    const visited = new Set();
    let url = this.baseUrl;
    let page = 1;

    while (url && page <= this.maxPages) {
      visited.add(resolveWithoutFragment(url));
      console.log(`Scraping page ${page}: ${url}`);

      const { data: html } = await axios.get(url, {
        headers: { 'User-Agent': 'Mozilla/5.0 (compatible; DataBot/1.0)' },
      });
      const $ = cheerio.load(html);

      // Extract items from this page
      const items = extractFn($);
      this.results.push(...items);
      if (items.length === 0) break; // No more results

      // Find next page URL
      const nextLink = $('a.next-page, a[rel="next"], .pagination a:last-child')
        .attr('href');

      let nextUrl = null;
      if (nextLink && nextLink !== '#') {
        // Relative links are relative to the page they appear on, not to the
        // first page of the crawl.
        const candidate = resolveWithoutFragment(nextLink, url);
        // A link back to a page already fetched would repeat the same
        // results, so treat it as the end of the listing.
        if (!visited.has(candidate)) {
          nextUrl = candidate;
        }
      }
      url = nextUrl;
      page++;

      // Only pause when another request is actually going to be made.
      if (url && page <= this.maxPages) {
        await new Promise(r => setTimeout(r, this.delay));
      }
    }

    console.log(`Done! ${this.results.length} items from ${page - 1} pages`);
    return this.results;
  }
}
// Usage: follow up to 10 pages, waiting 1500 ms between page requests.
const scraper = new PaginatedScraper('https://shop.example.com/products', {
maxPages: 10,
delay: 1500,
});
// The extractor receives the query function for the current page and returns
// one { name, price } object per `.product` element.
const products = await scraper.scrapeAll(($) => {
return $('.product').map((i, el) => ({
name: $(el).find('h3').text().trim(),
price: $(el).find('.price').text().trim(),
})).get();
});
Error Handling and Retries
Production scrapers need to handle network failures, rate limiting, and blocked requests gracefully:
import axios from 'axios';
// Shared Axios instance used by fetchWithRetry: every request made through it
// carries a timeout of 15000 and these browser-like default headers.
const client = axios.create({
timeout: 15000,
headers: {
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36',
'Accept': 'text/html,application/xhtml+xml',
'Accept-Language': 'en-US,en;q=0.9',
},
});
/**
 * Convert a `Retry-After` header value into a wait time in milliseconds.
 *
 * @param {string|number|undefined|null} headerValue - Raw header value; either
 *   a number of seconds or an HTTP date.
 * @param {number} [fallbackMs=5000] - Wait used when the header is missing or
 *   cannot be parsed.
 * @returns {number} Non-negative delay in milliseconds.
 */
function parseRetryAfterMs(headerValue, fallbackMs = 5000) {
  if (headerValue == null || String(headerValue).trim() === '') {
    return fallbackMs;
  }
  const seconds = Number(headerValue);
  if (Number.isFinite(seconds)) {
    return Math.max(0, seconds * 1000);
  }
  // Not a number: try the HTTP-date form and wait until that moment.
  const retryAt = Date.parse(headerValue);
  return Number.isNaN(retryAt) ? fallbackMs : Math.max(0, retryAt - Date.now());
}

/**
 * GET a URL through the shared `client`, retrying transient failures.
 *
 * - 4xx responses other than 429 are not retried.
 * - 429 waits for the server's `Retry-After` (5 s when absent) and retries.
 * - Any other failure (network error, 5xx) retries with exponential backoff,
 *   capped at 30 s.
 *
 * Every retry path counts against `maxRetries`; once the attempts are used up
 * the function throws instead of resolving with `undefined`.
 *
 * @param {string} url - Address to fetch.
 * @param {number} [maxRetries=3] - Maximum number of attempts in total.
 * @returns {Promise<*>} The response body (`response.data`).
 * @throws {Error} `Client error <status> for <url>` on a non-retryable 4xx, or
 *   `Failed after <n> attempts: <url>` when attempts are exhausted. The
 *   underlying error is attached as `cause`.
 */
async function fetchWithRetry(url, maxRetries = 3) {
  let lastError;

  for (let attempt = 1; attempt <= maxRetries; attempt++) {
    try {
      const response = await client.get(url);
      return response.data;
    } catch (error) {
      lastError = error;
      const status = error.response?.status;

      // Don't retry client errors (except 429)
      if (status && status >= 400 && status < 500 && status !== 429) {
        throw new Error(`Client error ${status} for ${url}`, { cause: error });
      }

      // Out of attempts: stop before sleeping, whatever the failure was.
      if (attempt === maxRetries) {
        break;
      }

      // Rate limited — wait as long as the server asked, then retry
      if (status === 429) {
        const waitMs = parseRetryAfterMs(error.response.headers?.['retry-after']);
        console.log(`Rate limited, waiting ${Math.ceil(waitMs / 1000)}s...`);
        await new Promise(r => setTimeout(r, waitMs));
        continue;
      }

      // Exponential backoff
      const delay = Math.min(1000 * Math.pow(2, attempt), 30000);
      console.log(`Attempt ${attempt} failed, retrying in ${delay}ms...`);
      await new Promise(r => setTimeout(r, delay));
    }
  }

  throw new Error(`Failed after ${maxRetries} attempts: ${url}`, { cause: lastError });
}
// Concurrent scraping with concurrency limit
/**
 * Fetch many URLs with a bounded number of requests in flight.
 *
 * Failures never reject the returned promise; each one is reported in its
 * result entry instead.
 *
 * @param {string[]} urls - Addresses to fetch.
 * @param {number} [concurrency=5] - Maximum simultaneous fetches. Values below
 *   1 (or non-numeric values) are treated as 1 so that work is never silently
 *   skipped.
 * @returns {Promise<Array<{url: string, html: *, error: (string|null)}>>}
 *   One entry per input URL, in the same order as `urls`.
 */
async function scrapeUrls(urls, concurrency = 5) {
  // Snapshot the input so later mutation by the caller cannot affect the run.
  const pending = [...urls];
  const results = new Array(pending.length);
  let nextIndex = 0;

  async function worker() {
    while (nextIndex < pending.length) {
      // Claim a slot synchronously; each result is written back to the slot
      // of its URL, which keeps the output in input order.
      const index = nextIndex++;
      const url = pending[index];
      try {
        const html = await fetchWithRetry(url);
        results[index] = { url, html, error: null };
      } catch (error) {
        results[index] = { url, html: null, error: error.message };
      }
    }
  }

  const requested = Math.floor(concurrency) || 1;
  const workerCount = Math.max(1, Math.min(requested, pending.length));
  const workers = Array.from({ length: workerCount }, () => worker());
  await Promise.all(workers);
  return results;
}
Handling JavaScript-Rendered Pages
Cheerio only parses static HTML — it can't execute JavaScript. For SPAs, React apps, or pages that load content dynamically, you have two options: use a headless browser or use an API that handles rendering for you.
// Option 1: Playwright for JS-rendered content
import { chromium } from 'playwright';
import * as cheerio from 'cheerio';
/**
 * Render a page in headless Chromium, then read `.dynamic-content` from the
 * rendered HTML with Cheerio.
 *
 * The browser is closed in a `finally` block so that a failed navigation or
 * parse does not leave a Chromium process running.
 *
 * @param {string} url - Page to render.
 * @returns {Promise<string>} Text of the elements matching `.dynamic-content`.
 */
async function scrapeJSPage(url) {
  const browser = await chromium.launch();
  try {
    const page = await browser.newPage();
    await page.goto(url, { waitUntil: 'networkidle' });

    // Get rendered HTML, then parse with Cheerio
    const html = await page.content();
    const $ = cheerio.load(html);
    return $('.dynamic-content').text();
  } finally {
    await browser.close();
  }
}
// Option 2: SnapAPI — no browser to manage
import SnapAPI from 'snapapi-js';
// 'sk_live_your_key' is a placeholder: substitute your own SnapAPI key, and
// prefer loading it from configuration or an environment variable over
// committing it to source control.
const snap = new SnapAPI('sk_live_your_key');
// Scrape JS-rendered pages without managing browsers
/**
 * Ask SnapAPI to render the page, then read `.dynamic-content` from the
 * returned HTML with Cheerio.
 *
 * @param {string} url - Page to render remotely.
 * @returns {Promise<string>} Text of the elements matching `.dynamic-content`.
 */
async function scrapeWithAPI(url) {
  const renderRequest = {
    url,
    render_js: true,
    wait_for: '.dynamic-content',
    block_ads: true,
  };
  const { html } = await snap.scrape(renderRequest);

  // Parse the rendered HTML with Cheerio
  const query = cheerio.load(html);
  return query('.dynamic-content').text();
}
// Option 3: SnapAPI Extract — get structured data directly
/**
 * Request structured product data from SnapAPI's extract call by describing
 * the wanted shape as a schema.
 *
 * @param {string} url - Page to extract from.
 * @returns {Promise<*>} The `data.products` value of the API response.
 */
async function extractWithAPI(url) {
  // Field name → type name for one product entry.
  const productShape = {
    name: 'string',
    price: 'number',
    rating: 'number',
    reviews: 'number',
  };
  const response = await snap.extract({
    url,
    schema: { products: [productShape] },
  });

  // Structured JSON — no Cheerio needed
  return response.data.products;
}
Scraping Approach Comparison
| Approach | JS Rendering | Speed | Complexity | Cost |
|---|---|---|---|---|
| Cheerio + Axios | No | Very fast (~50ms) | Low | Free |
| Playwright | Yes | Slow (~2-5s) | High (browser mgmt) | Server costs |
| Puppeteer | Yes | Slow (~2-5s) | High | Server costs |
| SnapAPI /scrape | Yes | Fast (~1-2s) | None (API call) | Free tier: 200/mo |
| SnapAPI /extract | Yes | Fast (~1-3s) | None (schema only) | Free tier: 200/mo |
Production Scraping Architecture
For production workloads, build a scraping pipeline with job queues, caching, and output storage:
import * as cheerio from 'cheerio';
import axios from 'axios';
import { createClient } from 'redis';
import crypto from 'crypto';
/**
 * Scraper with a Redis-backed result cache and batched fetching.
 *
 * Call `init()` before scraping and `close()` when finished.
 */
class ScrapingPipeline {
  /**
   * @param {Object} [options]
   * @param {string} [options.redisUrl] - Passed to `createClient` as `url`.
   * @param {number} [options.cacheTTL=3600] - Value passed as the `EX` option
   *   when caching a result (3600 = 1 hour).
   * @param {number} [options.batchDelay=1000] - Pause in milliseconds between
   *   batches in `scrapeMany`.
   */
  constructor(options = {}) {
    this.redis = createClient({ url: options.redisUrl });
    this.cacheTTL = options.cacheTTL || 3600; // 1 hour
    this.batchDelay = options.batchDelay ?? 1000;
    this.results = [];
  }

  /** Open the Redis connection. */
  async init() {
    await this.redis.connect();
  }

  /**
   * Build the cache key for a URL.
   *
   * @param {string} url
   * @returns {string} `scrape:` followed by the hex MD5 digest of the URL.
   *   MD5 only shortens the key here; it is not used for security.
   */
  cacheKey(url) {
    return `scrape:${crypto.createHash('md5').update(url).digest('hex')}`;
  }

  /**
   * Return the extracted data for one URL, from cache when available.
   *
   * @param {string} url - Page to scrape.
   * @param {function(Function, string): *} extractFn - Receives the Cheerio
   *   query function and the URL; its return value is the result.
   * @returns {Promise<*>} The cached or freshly extracted data.
   */
  async scrape(url, extractFn) {
    const key = this.cacheKey(url);

    // Check cache first
    const cached = await this.redis.get(key);
    if (cached) {
      console.log(`[CACHE HIT] ${url}`);
      return JSON.parse(cached);
    }

    const { data: html } = await axios.get(url, {
      headers: { 'User-Agent': 'Mozilla/5.0 (compatible; DataBot/1.0)' },
      timeout: 10000,
    });
    const $ = cheerio.load(html);
    const data = extractFn($, url);

    // Cache the result. JSON.stringify returns undefined for values it cannot
    // represent (e.g. an extractor that returns nothing); skip caching then
    // instead of handing Redis a non-string value.
    const serialized = JSON.stringify(data);
    if (serialized !== undefined) {
      await this.redis.set(key, serialized, { EX: this.cacheTTL });
    }

    console.log(`[SCRAPED] ${url}`);
    return data;
  }

  /**
   * Scrape many URLs in batches of `concurrency`, pausing between batches.
   *
   * @param {string[]} urls - Pages to scrape.
   * @param {function(Function, string): *} extractFn - Forwarded to `scrape`.
   * @param {number} [concurrency=3] - Batch size. Values below 1 (or
   *   non-numeric values) are treated as 1; a step of 0 would never advance.
   * @returns {Promise<Array>} Results of the scrapes that succeeded, in input
   *   order. Failed URLs are logged and omitted.
   */
  async scrapeMany(urls, extractFn, concurrency = 3) {
    const batchSize = Math.max(1, Math.floor(concurrency) || 1);
    const results = [];

    for (let i = 0; i < urls.length; i += batchSize) {
      const batch = urls.slice(i, i + batchSize);
      const batchResults = await Promise.allSettled(
        batch.map(url => this.scrape(url, extractFn))
      );

      batchResults.forEach((result, offset) => {
        if (result.status === 'fulfilled') {
          results.push(result.value);
        } else {
          // Report the failure rather than dropping it without a trace.
          const reason = result.reason?.message ?? result.reason;
          console.error(`[FAILED] ${batch[offset]}: ${reason}`);
        }
      });

      // Polite delay between batches
      if (i + batchSize < urls.length) {
        await new Promise(r => setTimeout(r, this.batchDelay));
      }
    }
    return results;
  }

  /** Close the Redis connection. */
  async close() {
    await this.redis.quit();
  }
}
Skip the Infrastructure — Use SnapAPI
Screenshots, scraping, structured extraction, PDF generation, and AI analysis through a single API. Handles JavaScript rendering, stealth mode, and anti-bot bypasses automatically.
Start Free — 200 Captures/Month
Best Practices
- Respect robots.txt. Check the site's robots.txt before scraping. Honor crawl-delay directives and don't scrape pages marked as disallowed.
- Rate limit your requests. Add delays between requests (1-2 seconds minimum). Use exponential backoff on errors. Never hammer a server with concurrent requests.
- Cache aggressively. Store scraped HTML and parsed results in Redis or on disk. Avoid re-scraping the same page within your cache TTL.
- Handle encoding correctly. Use `axios` with `responseType: 'arraybuffer'` and detect encoding from headers or meta tags for non-UTF-8 pages.
- Validate extracted data. Check for empty strings, null values, and unexpected formats. Sites change their HTML structure without warning.
- Use SnapAPI for JS-heavy sites. Don't manage Playwright infrastructure for scraping. SnapAPI handles rendering, stealth mode, and anti-bot detection in one API call.