How to Scrape a Website with Node.js in 2026

Static Pages: axios + Cheerio

npm install axios cheerio

const axios = require('axios');
const cheerio = require('cheerio');

/**
 * Fetch a static HTML page and extract post-like entries from it.
 *
 * Every element matching `article.post`, `.blog-card` or `li.result` becomes
 * one record; records whose title is empty are dropped.
 *
 * @param {string} url - Address of the page to fetch.
 * @returns {Promise<Array<{title: string, url: (string|undefined), description: string, date: string}>>}
 *   Records in document order. `url` is absolute when the link's `href` can be
 *   resolved against `url`, the raw `href` when it cannot, and `undefined`
 *   when the entry has no link.
 * @throws Rejects with whatever error `axios.get` raises for the request.
 */
async function scrape(url) {
  const { data: html } = await axios.get(url, {
    headers: { 'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36' },
    timeout: 15000
  });

  const $ = cheerio.load(html);

  // Listing pages usually link with relative hrefs ("/posts/1"), which are
  // useless once detached from the page, so resolve them against the page URL.
  const toAbsolute = (href) => {
    if (!href) return href;
    try {
      return new URL(href, url).href;
    } catch {
      // Unparseable href: hand back the raw value rather than failing the scrape.
      return href;
    }
  };

  const results = [];

  $('article.post, .blog-card, li.result').each((i, el) => {
    const card = $(el);
    // Look the date element up once; prefer the machine-readable attribute.
    const dateEl = card.find('time, .date').first();

    results.push({
      title: card.find('h2, h3, .title').first().text().trim(),
      url: toAbsolute(card.find('a').first().attr('href')),
      description: card.find('p, .excerpt, .description').first().text().trim(),
      date: dateEl.attr('datetime') || dateEl.text().trim()
    });
  });

  return results.filter((record) => record.title);
}

// Run the scraper and print what it found. A promise chain is used instead of
// a top-level `await` because this file loads modules with `require`
// (CommonJS), where top-level `await` is a syntax error.
scrape('https://blog.example.com')
  .then((posts) => {
    console.log(posts);
  })
  .catch((err) => {
    // Surface the failure and signal it through the exit code.
    console.error(err);
    process.exitCode = 1;
  });

Pagination

/**
 * Build the address of one page of a listing by setting its `page` query
 * parameter, keeping any query string `baseUrl` already carries.
 *
 * @param {string} baseUrl - Listing address, with or without a query string.
 * @param {number} page - 1-based page number.
 * @returns {string} Address of the requested page.
 */
function buildPageUrl(baseUrl, page) {
  try {
    const target = new URL(baseUrl);
    target.searchParams.set('page', String(page));
    return target.href;
  } catch {
    // Not an absolute URL: fall back to concatenation with the right separator.
    const separator = String(baseUrl).includes('?') ? '&' : '?';
    return `${baseUrl}${separator}page=${page}`;
  }
}

/**
 * Walk a paginated listing and collect the title and link of every item.
 *
 * Stops at the first of: `maxPages` reached, a page with no items, a page
 * without a "next" link, or a 404 response.
 *
 * @param {string} baseUrl - Listing address; `page=N` is added as a query parameter.
 * @param {number} [maxPages=10] - Upper bound on the number of pages requested.
 * @returns {Promise<Array<{title: string, href: (string|undefined)}>>}
 *   Items from all visited pages, in visit order.
 * @throws Rejects with the request error for any failure other than a 404.
 */
async function scrapeAllPages(baseUrl, maxPages = 10) {
  const allItems = [];

  for (let page = 1; page <= maxPages; page++) {
    // 1 req/sec: pause between requests only, so no time is wasted sleeping
    // after the final page.
    if (page > 1) {
      await new Promise((resolve) => setTimeout(resolve, 1000));
    }

    let html;
    try {
      ({ data: html } = await axios.get(buildPageUrl(baseUrl, page), {
        headers: { 'User-Agent': 'Mozilla/5.0 (compatible; ResearchBot/1.0)' },
        timeout: 15000
      }));
    } catch (err) {
      // Running past the last page is an expected way for the walk to end.
      if (err.response?.status === 404) break;
      throw err;
    }

    const $ = cheerio.load(html);

    const items = [];
    $('.item, article, li.result').each((_, el) => {
      items.push({
        // first(): without it, nested matches (an h2 wrapping a .title)
        // have their text concatenated into a duplicated title.
        title: $(el).find('h2, .title').first().text().trim(),
        href: $(el).find('a').attr('href')
      });
    });

    if (items.length === 0) break;  // No more results
    allItems.push(...items);

    // Check for next page link
    const hasNext = $('a[rel="next"], .pagination .next, a:contains("Next")').length > 0;
    if (!hasNext) break;
  }

  return allItems;
}

SPAs: Playwright

const { chromium } = require('playwright');
const cheerio = require('cheerio');

/**
 * Render a JavaScript-driven page in headless Chromium and extract products.
 *
 * @param {string} url - Address of the page to render.
 * @returns {Promise<Array<{name: (string|undefined), price: number, sku: (string|undefined), inStock: boolean}>>}
 *   One record per `.product` / `[data-product]` element. `price` is `NaN`
 *   when the element has no parseable price text.
 * @throws Rejects with the Playwright error if launching, navigating or
 *   extracting fails; the browser is closed before the error propagates.
 */
async function scrapeSPA(url) {
  const browser = await chromium.launch({ headless: true });

  // try/finally: without it a failed navigation or extraction would leave the
  // browser process running.
  try {
    const page = await browser.newPage();

    // Block images/fonts for faster loading
    await page.route('**/*.{png,jpg,jpeg,gif,webp,woff,woff2,ttf}', route => route.abort());

    await page.goto(url, { waitUntil: 'networkidle' });

    // Option 1: Extract via page.evaluate (runs in browser context)
    const data = await page.evaluate(() => {
      return Array.from(document.querySelectorAll('.product, [data-product]')).map(el => ({
        name: el.querySelector('.name, h2')?.textContent?.trim(),
        // Strip currency symbols and separators, keeping digits and the dot.
        price: parseFloat(el.querySelector('.price')?.textContent?.replace(/[^\d.]/g, '')),
        sku: el.dataset.sku || el.querySelector('[data-sku]')?.dataset?.sku,
        inStock: !el.querySelector('.out-of-stock')
      }));
    });

    // Option 2: Get full HTML and use Cheerio (more familiar syntax).
    // Shown as an alternative only: altData is built but not returned.
    const html = await page.content();
    const $ = cheerio.load(html);
    const altData = [];
    $('.product').each((_, el) => {
      altData.push({ title: $(el).find('h2').text(), price: $(el).find('.price').text() });
    });

    return data;
  } finally {
    await browser.close();
  }
}

Retry Logic and Error Handling

/**
 * GET a URL with axios, retrying transient failures with increasing delays.
 *
 * A failure is retried when there is no response at all (network error,
 * timeout) or the status is 429, 500, 502, 503 or 504. Other statuses are
 * thrown immediately.
 *
 * @param {string} url - Address to request.
 * @param {object} [options={}] - Extra axios request options. `timeout`
 *   defaults to 15000 ms and `headers` are merged over a default User-Agent.
 * @param {number} [maxRetries=3] - Retries allowed after the first attempt.
 * @returns {Promise<object>} The axios response of the first successful attempt.
 * @throws Rejects with the last axios error once the failure is not retryable
 *   or the retries are exhausted.
 */
async function fetchWithRetry(url, options = {}, maxRetries = 3) {
  const delays = [1000, 3000, 10000];  // Escalating backoff, capped at 10 s

  for (let attempt = 0; attempt <= maxRetries; attempt++) {
    try {
      return await axios.get(url, {
        ...options,
        // Applied after the spread so the default only fills a gap; a
        // caller-supplied timeout is honoured instead of being overwritten.
        timeout: options.timeout ?? 15000,
        headers: {
          'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7)',
          ...options.headers
        }
      });
    } catch (err) {
      // No response object means the request never completed.
      const isRetryable = !err.response || [429, 500, 502, 503, 504].includes(err.response.status);
      if (!isRetryable || attempt === maxRetries) throw err;

      const delay = delays[attempt] ?? 10000;
      console.log(`Retry ${attempt + 1}/${maxRetries} after ${delay}ms: ${url}`);
      await new Promise((resolve) => setTimeout(resolve, delay));
    }
  }
}

Production Scraping: SnapAPI

For sites with Cloudflare, anti-bot protection, or JavaScript rendering, SnapAPI handles the headless browser layer for you.

/**
 * Render a page through SnapAPI's scrape endpoint and extract product cards.
 *
 * Reads the API key from the `SNAPAPI_KEY` environment variable.
 *
 * @param {string} url - Address of the page to render.
 * @returns {Promise<Array<{name: string, price: number}>>} One record per
 *   `.product-card` element; `price` is `NaN` when no number can be parsed.
 * @throws {Error} When the API answers with a non-OK HTTP status.
 */
async function scrapeProduction(url) {
  // Returns full HTML of the rendered page
  const response = await fetch('https://api.snapapi.pics/v1/scrape', {
    method: 'POST',
    headers: { 'X-Api-Key': process.env.SNAPAPI_KEY, 'Content-Type': 'application/json' },
    body: JSON.stringify({
      url,
      wait_for: 'networkidle',
      stealth: true,           // Bypass anti-bot detection
      block_ads: true
    })
  });

  // fetch resolves on HTTP errors too; fail here with a clear message rather
  // than later, when an error payload without `html` reaches the parser.
  if (!response.ok) {
    throw new Error(`SnapAPI /v1/scrape request failed: HTTP ${response.status} ${response.statusText}`);
  }

  // The payload also carries `text`, `links` and `title`; only `html` is used.
  const { html } = await response.json();

  // Parse the rendered HTML with Cheerio
  const $ = cheerio.load(html);
  const products = [];
  $('.product-card').each((_, el) => {
    products.push({
      name: $(el).find('.name').text().trim(),
      // Strip currency symbols and separators, keeping digits and the dot.
      price: parseFloat($(el).find('.price').text().replace(/[^\d.]/g, ''))
    });
  });
  return products;
}

// Or extract structured data with a schema (no CSS selectors needed)
/**
 * Ask SnapAPI's extract endpoint for structured product data from a page.
 *
 * Reads the API key from the `SNAPAPI_KEY` environment variable and sends a
 * fixed schema: name (string), price (number), in_stock (boolean) and
 * images (string[]).
 *
 * @param {string} url - Address of the page to extract from.
 * @returns {Promise<*>} The `data` member of the API's JSON response.
 * @throws {Error} When the API answers with a non-OK HTTP status.
 */
async function extractStructured(url) {
  const response = await fetch('https://api.snapapi.pics/v1/extract', {
    method: 'POST',
    headers: { 'X-Api-Key': process.env.SNAPAPI_KEY, 'Content-Type': 'application/json' },
    body: JSON.stringify({
      url,
      schema: { name: 'string', price: 'number', in_stock: 'boolean', images: 'string[]' }
    })
  });

  // fetch resolves on HTTP errors too; without this check a failed call would
  // quietly return undefined instead of reporting the problem.
  if (!response.ok) {
    throw new Error(`SnapAPI /v1/extract request failed: HTTP ${response.status} ${response.statusText}`);
  }

  const payload = await response.json();
  return payload.data;
}

Storing Scraped Data

// SQLite with better-sqlite3 (sync, zero config)
const Database = require('better-sqlite3');
// Open the database stored in data.db.
const db = new Database('data.db');

// One row per scraped URL; the scraped record itself is stored as JSON text.
db.exec(`CREATE TABLE IF NOT EXISTS items (
  id INTEGER PRIMARY KEY,
  url TEXT UNIQUE,
  data TEXT,
  scraped_at INTEGER DEFAULT (unixepoch())
)`);

// Upsert keyed on url. ON CONFLICT ... DO UPDATE changes the existing row in
// place, whereas INSERT OR REPLACE deletes and re-inserts it, giving the row a
// new id on every re-scrape. scraped_at is refreshed explicitly because the
// column default only applies to newly inserted rows.
const upsert = db.prepare(
  `INSERT INTO items (url, data) VALUES (?, ?)
   ON CONFLICT(url) DO UPDATE SET
     data = excluded.data,
     scraped_at = unixepoch()`
);

// Batch insert inside a single transaction, which is much faster than
// committing each row separately. Each item must carry a `url` property.
const insertBatch = db.transaction((items) => {
  for (const item of items) {
    upsert.run(item.url, JSON.stringify(item));
  }
});

// Usage
const fs = require('fs');

// Wrapped in an async function: this file loads modules with `require`
// (CommonJS), where top-level `await` is a syntax error.
async function main() {
  const pageUrl = 'https://shop.example.com';
  const results = await scrapeSPA(pageUrl);

  // insertBatch keys rows on item.url, but scrapeSPA records only carry
  // name/price/sku/inStock. Derive a per-product key from the page address
  // and the SKU so the UNIQUE url column is populated.
  // NOTE(review): confirm this key scheme suits the target site.
  const records = results.map((item) => ({
    ...item,
    url: item.url ?? (item.sku ? `${pageUrl}#${item.sku}` : pageUrl)
  }));
  insertBatch(records);
  console.log(`Saved ${results.length} items`);

  // Save as JSONL (newline-delimited JSON) — works with jq, BigQuery, Spark
  const stream = fs.createWriteStream('output.jsonl', { flags: 'a' });
  for (const item of results) {
    stream.write(`${JSON.stringify(item)}\n`);
  }
  stream.end();
}

main().catch((err) => {
  // Surface the failure and signal it through the exit code.
  console.error(err);
  process.exitCode = 1;
});

Which Tool for Which Site

Site type	Best tool	Notes
Static HTML (blogs, gov)	axios + Cheerio	Fastest, no browser overhead
React/Vue/Next.js SPA	Playwright	waitUntil: 'networkidle' waits for async rendering to settle
Cloudflare-protected	SnapAPI (stealth)	Maintained evasion layer
Need structured data	SnapAPI /extract	No CSS selector maintenance
Site crawl (>10K pages)	Crawlee + Playwright	Built-in queue + deduplication
Serverless function	SnapAPI	No binary, works on Lambda

Free to try: SnapAPI's /v1/scrape endpoint returns the fully rendered HTML plus clean text for any URL, with 200 free calls per month. Get key →