Node.js HTML Parsing Web Scraping April 5, 2026

How to Parse HTML in Node.js (2026): Cheerio, JSDOM & More

Node.js doesn't ship with a built-in HTML parser — but the ecosystem has you covered. This guide compares every major option: Cheerio for jQuery-style selectors, JSDOM for full DOM emulation, node-html-parser for raw speed, parse5 for spec-compliant ASTs, and SnapAPI when the page requires a real browser. Code examples for each, plus a comparison table so you can choose the right tool for your workload.

Parser Comparison

Library	API style	Speed	JS execution	Best for
Cheerio	jQuery ($)	Fast	No	Scraping static HTML, most common choice
JSDOM	Web DOM API	Slower	Partial	Code that expects a browser DOM environment
node-html-parser	Subset DOM	Very fast	No	High-volume parsing where speed is critical
parse5	AST / DOM	Fast	No	Spec-compliant parsing, HTML tooling
SnapAPI /scrape	REST API	Network-bound	Yes (real browser)	SPAs, bot-protected pages, remote URLs

Cheerio — jQuery for Node.js

Cheerio is the most popular HTML parser in the Node.js ecosystem with 30M+ weekly downloads. It implements a subset of jQuery's API and runs on top of htmlparser2 for speed.

npm install cheerio axios

const axios = require('axios');
const cheerio = require('cheerio');

/**
 * Download a page and extract its headline, meta description, links and
 * product cards using Cheerio's jQuery-style selectors.
 *
 * @param {string} url - Address of the page to fetch.
 * @returns {Promise<{title: string, metaDesc: string, links: Array<{text: string, href: string}>, products: Array<{name: string, price: number, url: (string|undefined)}>}>}
 *   `metaDesc` falls back to an empty string when the meta tag has no
 *   `content`; `price` is whatever `parseFloat` yields for the digits and
 *   dots found in the `.price` text.
 */
async function parseWithCheerio(url) {
  const requestConfig = {
    headers: { 'User-Agent': 'Mozilla/5.0 (compatible; MyScraper/1.0)' }
  };
  const response = await axios.get(url, requestConfig);
  const $ = cheerio.load(response.data);

  // Headline and meta description
  const title = $('h1').first().text().trim();
  const metaDesc = $('meta[name="description"]').attr('content') ?? '';

  // Keep only anchors that have both visible text and a non-empty href
  const links = [];
  for (const anchor of $('a[href]').get()) {
    const $anchor = $(anchor);
    const text = $anchor.text().trim();
    const href = $anchor.attr('href');
    if (text && href) {
      links.push({ text, href });
    }
  }

  // One record per product card
  const products = $('.product-card').get().map((card) => {
    const $card = $(card);
    const name = $card.find('.product-title').text().trim();
    // Strip everything except digits and dots before converting to a number
    const numericPrice = $card.find('.price').text().replace(/[^0-9.]/g, '');
    return {
      name,
      price: parseFloat(numericPrice),
      url: $card.find('a').attr('href')
    };
  });

  return { title, metaDesc, links, products };
}

Cheerio with pagination

/**
 * Scrape every page of a paginated listing, following `?page=N` until a page
 * has no `a[rel="next"]` link or yields no `.item` elements.
 *
 * The page number is set through the URL API, so a `baseUrl` that already
 * carries a query string (e.g. `?cat=1`) stays valid instead of ending up
 * with a second `?`.
 *
 * @param {string} baseUrl - Absolute URL of the listing.
 * @param {object} [options]
 * @param {number} [options.maxPages=Infinity] - Upper bound on pages fetched;
 *   guards against sites whose "next" link never disappears.
 * @param {number} [options.delayMs=1000] - Pause between page requests.
 * @returns {Promise<Array<{title: string, price: string}>>} Items from all
 *   pages, in page order.
 * @throws {TypeError} If `baseUrl` is not a valid absolute URL.
 */
async function scrapeAllPages(baseUrl, { maxPages = Infinity, delayMs = 1000 } = {}) {
  const results = [];

  for (let page = 1; page <= maxPages; page += 1) {
    const pageUrl = new URL(baseUrl);
    pageUrl.searchParams.set('page', String(page));

    const { data: html } = await axios.get(pageUrl.href, {
      headers: { 'User-Agent': 'Mozilla/5.0' }
    });
    const $ = cheerio.load(html);

    const items = $('.item').map((_, el) => ({
      title: $(el).find('h3').text().trim(),
      price: $(el).find('.price').text().trim()
    })).get();

    results.push(...items);

    // Stop if there is no next-page link or the page was empty
    const hasNextLink = $('a[rel="next"]').length > 0;
    if (!hasNextLink || items.length === 0) {
      break;
    }

    // Polite delay, skipped when the page limit ends the loop anyway
    if (page < maxPages) {
      await new Promise((resolve) => setTimeout(resolve, delayMs));
    }
  }

  return results;
}

JSDOM — Full DOM in Node.js

JSDOM creates a full browser-like DOM environment. Use it when your parsing code uses document.querySelector, element.closest(), or any other standard Web API — for example when sharing parsing logic between browser and server.

npm install jsdom axios

const { JSDOM } = require('jsdom');
const axios = require('axios');

/**
 * Fetch a page and read it through JSDOM's standard DOM API.
 *
 * The JSDOM window is closed in a `finally` block so it is released even when
 * extraction throws part-way through.
 *
 * @param {string} url - Page to fetch; also passed to JSDOM as the document
 *   URL so relative links resolve to absolute ones.
 * @returns {Promise<{title: (string|undefined), links: Array<{text: string, href: string}>, section: (string|undefined)}>}
 *   `title` is the trimmed text of the first `<h1>`; `section` is the text of
 *   the `<nav>` enclosing `.nav-item.active`. Both are `undefined` when the
 *   element is missing.
 */
async function parseWithJSDOM(url) {
  const { data: html } = await axios.get(url);
  const dom = new JSDOM(html, { url }); // url needed for relative URL resolution

  try {
    const { document } = dom.window;

    // Standard Web APIs work as-is
    const title = document.querySelector('h1')?.textContent?.trim();
    const links = [...document.querySelectorAll('a[href]')].map(a => ({
      text: a.textContent.trim(),
      href: a.href // absolute URL thanks to the url option
    }));

    // closest(), matches(), querySelectorAll — all work
    const activeNav = document.querySelector('.nav-item.active');
    const section = activeNav?.closest('nav')?.textContent;

    return { title, links, section };
  } finally {
    dom.window.close(); // important: free memory, on success and on failure
  }
}

Memory note: Always call dom.window.close() when done. JSDOM holds event listeners and timers in memory — without cleanup you'll leak in long-running processes.

node-html-parser — Maximum Speed

node-html-parser is a lightweight, zero-dependency parser that's 10–50× faster than JSDOM for simple extractions. It doesn't support the full DOM API but covers the most common selectors.

npm install node-html-parser

const { parse } = require('node-html-parser');
const fs = require('fs');

/**
 * Read an HTML file from disk and extract a few fields with node-html-parser.
 * No network call is involved.
 *
 * @param {string} htmlPath - Path of the HTML file, read as UTF-8.
 * @returns {{title: (string|undefined), description: (string|undefined), headings: Array<{level: string, text: string}>, firstImage: {src: (string|undefined), alt: (string|undefined)}}}
 *   Fields are `undefined` when the matching element is absent.
 */
function parseLocal(htmlPath) {
  const root = parse(fs.readFileSync(htmlPath, 'utf8'));

  // Headline and meta description via CSS selectors
  const headline = root.querySelector('h1');
  const title = headline?.text.trim();
  const metaTag = root.querySelector('meta[name="description"]');
  const description = metaTag?.getAttribute('content');

  // Every h2/h3, with its tag name lower-cased as the level
  const headings = [];
  for (const heading of root.querySelectorAll('h2, h3')) {
    headings.push({
      level: heading.tagName.toLowerCase(),
      text: heading.text.trim()
    });
  }

  // First image's src/alt, if there is an image at all
  const image = root.querySelector('img');
  const firstImage = {
    src: image?.getAttribute('src'),
    alt: image?.getAttribute('alt')
  };

  return { title, description, headings, firstImage };
}

// Benchmark: parse 10,000 HTML files
async function benchmarkParse(dir) {
  const files = require('fs').readdirSync(dir).filter(f => f.endsWith('.html'));
  const start = Date.now();
  for (const file of files) {
    parseLocal(`${dir}/${file}`);
  }
  console.log(`Parsed ${files.length} files in ${Date.now() - start}ms`);
}

parse5 — Spec-Compliant AST

parse5 is the HTML parser used by Angular, jsdom, and many other tools. It produces a spec-compliant AST (Abstract Syntax Tree) — useful when you need to transform or serialise HTML rather than just read data from it. Install it with npm install parse5.

const parse5 = require('parse5');

// Sample markup: a <strong> element nested inside a <p>, inside a <div>.
const html = '<div class="box"><p>Hello <strong>world</strong></p></div>';
// Parse the markup string into a parse5 document tree.
const document = parse5.parse(html);

/**
 * Walk a node tree depth-first (parent before children, children in document
 * order) and collect every node whose `nodeName` equals `tagName`.
 *
 * @param {object} node - Root of the subtree to search; it is tested too.
 * @param {string} tagName - Value to compare against each `nodeName`.
 * @param {object[]} [results=[]] - Array that matches are appended to.
 * @returns {object[]} The `results` array.
 */
function findNodes(node, tagName, results = []) {
  const pending = [node];

  while (pending.length > 0) {
    const current = pending.pop();
    if (current.nodeName === tagName) {
      results.push(current);
    }

    // Push children right-to-left so the leftmost child is visited next
    const children = current.childNodes ?? [];
    for (let i = children.length - 1; i >= 0; i -= 1) {
      pending.push(children[i]);
    }
  }

  return results;
}

/**
 * Concatenate the text held by a node and all of its descendants.
 * Nodes with a string `value` contribute that value; other nodes contribute
 * the text of their `childNodes`, if any.
 *
 * @param {object} node - Node from the parsed tree.
 * @returns {string} Combined text, in document order.
 */
function textContentOf(node) {
  if (typeof node.value === 'string') {
    return node.value;
  }
  return (node.childNodes ?? []).map((child) => textContentOf(child)).join('');
}

const paragraphs = findNodes(document, 'p');
paragraphs.forEach(p => {
  // Recurse into nested elements so "Hello <strong>world</strong>" yields
  // "Hello world" rather than just the direct text child "Hello".
  const text = textContentOf(p).trim();
  console.log('Paragraph:', text);
});

// Serialise back to HTML after modification
const serialised = parse5.serialize(document);

SnapAPI — Parse Remote Pages (with a Real Browser)

Static parsers fail on client-side-rendered React, Vue, and Angular apps — the HTML they fetch is an empty shell until JavaScript runs. SnapAPI renders the page in a real Chromium instance and returns the fully-rendered HTML, which you can then parse with Cheerio or node-html-parser.

const axios   = require('axios');
const cheerio = require('cheerio');

/**
 * Render a page through SnapAPI's /v1/scrape endpoint, then parse the
 * returned HTML with Cheerio.
 *
 * @param {string} url - Page to render.
 * @param {object} [options]
 * @param {boolean} [options.stealth] - Sent as `stealth`; defaults to false.
 * @param {string} [options.waitFor] - Sent as `waitFor`; defaults to
 *   'networkidle'.
 * @returns {Promise<{title: string, metaDesc: (string|undefined), text: string, links: string[]}>}
 *   `text` is the whitespace-collapsed body text cut to 2000 characters.
 */
async function fetchAndParse(url, options = {}) {
  // 1. Ask SnapAPI for the rendered page; the API key is read from SNAPAPI_KEY
  const payload = {
    url,
    stealth: options.stealth ?? false,
    waitFor: options.waitFor ?? 'networkidle', // or a CSS selector
    blockAds: true,
    blockCookieBanners: true
  };
  const requestConfig = { headers: { 'X-Api-Key': process.env.SNAPAPI_KEY } };
  const response = await axios.post(
    'https://api.snapapi.pics/v1/scrape',
    payload,
    requestConfig
  );

  // 2. Run Cheerio over the rendered HTML
  const $ = cheerio.load(response.data.html);

  const title = $('h1').first().text().trim();
  const metaDesc = $('meta[name="description"]').attr('content');
  const bodyText = $('body').text();
  const text = bodyText.replace(/\s+/g, ' ').trim().slice(0, 2000);
  const links = $('a[href]').map((_, el) => $(el).attr('href')).get();

  return { title, metaDesc, text, links };
}

// Usage — works on SPAs and protected pages
fetchAndParse('https://app.example.com/dashboard', { stealth: true })
  .then(data => console.log(data))
  .catch(err => {
    // Report the failure and flag a non-zero exit instead of leaving the
    // rejection unhandled.
    console.error('fetchAndParse failed:', err.message);
    process.exitCode = 1;
  });

Schema extraction (skip parsing entirely)

/**
 * Ask SnapAPI's /v1/extract endpoint for structured data, skipping HTML
 * parsing entirely. The API key is read from the SNAPAPI_KEY environment
 * variable.
 *
 * @param {string} url - Page to extract from.
 * @returns {Promise<object>} The `data` field of the response body (typed
 *   JSON — no HTML parsing needed).
 */
async function extractStructured(url) {
  // Shape of the record requested from the endpoint
  const schema = {
    title:       { type: 'string' },
    price:       { type: 'number' },
    description: { type: 'string' },
    features:    { type: 'array', items: { type: 'string' } }
  };
  const requestConfig = { headers: { 'X-Api-Key': process.env.SNAPAPI_KEY } };

  const response = await axios.post(
    'https://api.snapapi.pics/v1/extract',
    { url, schema },
    requestConfig
  );

  return response.data.data;
}

Which Parser Should You Use?

Cheerio — default choice for 90% of scraping tasks. Fast, familiar, huge community. Use for any static or server-rendered HTML.
JSDOM — when you're sharing DOM-manipulation code between browser and server, or need element.closest() / computed styles.
node-html-parser — bulk processing of thousands of HTML files where speed matters more than selector coverage.
parse5 — HTML tooling, linters, transpilers — any scenario where you need a spec-compliant AST.
SnapAPI /scrape + Cheerio — when the target page is a SPA, requires login, or blocks bots. Render first, parse second.
SnapAPI /extract — when you want typed structured JSON and don't want to write any selector code at all.

Parse any page — static or SPA

SnapAPI renders in a real browser and returns clean HTML ready for Cheerio — or typed JSON if you use /extract. 200 free requests/month.

Try SnapAPI Free →