How to Parse HTML in Node.js (2026): Cheerio, JSDOM & More
Node.js doesn't ship with a built-in HTML parser — but the ecosystem has you covered. This guide compares every major option: Cheerio for jQuery-style selectors, JSDOM for full DOM emulation, node-html-parser for raw speed, and SnapAPI when the page requires a real browser. Code examples for each, plus a benchmark table so you can choose the right tool for your workload.
Parser Comparison
| Library | API style | Speed | JS execution | Best for |
|---|---|---|---|---|
| Cheerio | jQuery ($) | Fast | No | Scraping static HTML, most common choice |
| JSDOM | Web DOM API | Slower | Partial | Code that expects a browser DOM environment |
| node-html-parser | Subset DOM | Very fast | No | High-volume parsing where speed is critical |
| parse5 | AST / DOM | Fast | No | Spec-compliant parsing, HTML tooling |
| SnapAPI /scrape | REST API | Network-bound | Yes (real browser) | SPAs, bot-protected pages, remote URLs |
Cheerio — jQuery for Node.js
Cheerio is the most popular HTML parser in the Node.js ecosystem with 30M+ weekly downloads. It implements a subset of jQuery's API; by default it parses documents with parse5, and it can optionally use htmlparser2 when speed matters more than strict spec compliance.
npm install cheerio axios
const axios = require('axios');
const cheerio = require('cheerio');
/**
 * Download a page and pull out its headline, meta description, links and
 * product cards with Cheerio's jQuery-style selectors.
 *
 * @param {string} url - Address of the page to fetch.
 * @returns {Promise<{title: string, metaDesc: string, links: Array<{text: string, href: string}>, products: Array<{name: string, price: number, url: (string|undefined)}>}>}
 *   `metaDesc` falls back to '' when the meta tag has no content attribute;
 *   `price` is NaN when the price text contains no digits.
 */
async function parseWithCheerio(url) {
  const requestConfig = {
    headers: { 'User-Agent': 'Mozilla/5.0 (compatible; MyScraper/1.0)' }
  };
  const response = await axios.get(url, requestConfig);
  const $ = cheerio.load(response.data);

  // jQuery-style selectors
  const title = $('h1').first().text().trim();
  const metaDesc = $('meta[name="description"]').attr('content') ?? '';

  // Extract all links; entries with empty text or an empty href are dropped.
  const toLink = (_, anchor) => {
    const $anchor = $(anchor);
    return {
      text: $anchor.text().trim(),
      href: $anchor.attr('href')
    };
  };
  const links = $('a[href]')
    .map(toLink)
    .get()
    .filter((link) => link.text && link.href);

  // Scrape a product listing. Everything except digits and dots is stripped
  // from the price text before it is parsed as a float.
  const toProduct = (_, card) => {
    const $card = $(card);
    const name = $card.find('.product-title').text().trim();
    const priceDigits = $card.find('.price').text().replace(/[^0-9.]/g, '');
    return {
      name,
      price: parseFloat(priceDigits),
      url: $card.find('a').attr('href')
    };
  };
  const products = $('.product-card').map(toProduct).get();

  return { title, metaDesc, links, products };
}
Cheerio with pagination
/**
 * Walk a paginated listing (`?page=1`, `?page=2`, …) and collect every
 * `.item` found, stopping when a page has no `a[rel="next"]` link or yields
 * no items.
 *
 * @param {string} baseUrl - Absolute URL of the listing. An existing query
 *   string is preserved; only the `page` parameter is set or overwritten.
 * @param {object} [options]
 * @param {number} [options.maxPages=Infinity] - Upper bound on pages fetched,
 *   so a site that always advertises a "next" link cannot loop forever.
 * @param {number} [options.delayMs=1000] - Pause between page requests.
 * @returns {Promise<Array<{title: string, price: string}>>} Items from all
 *   visited pages, in page order.
 * @throws {TypeError} If `baseUrl` is not a valid absolute URL.
 */
async function scrapeAllPages(baseUrl, { maxPages = Infinity, delayMs = 1000 } = {}) {
  const results = [];
  let page = 1;
  let hasMore = true;

  while (hasMore && page <= maxPages) {
    // Build the URL with searchParams rather than string concatenation:
    // `${baseUrl}?page=N` produces "…?sort=asc?page=N" when baseUrl already
    // carries a query string, and lands inside the fragment when it has one.
    const pageUrl = new URL(baseUrl);
    pageUrl.searchParams.set('page', String(page));

    const { data: html } = await axios.get(pageUrl.href, {
      headers: { 'User-Agent': 'Mozilla/5.0' }
    });
    const $ = cheerio.load(html);

    const items = $('.item').map((_, el) => ({
      title: $(el).find('h3').text().trim(),
      price: $(el).find('.price').text().trim()
    })).get();
    results.push(...items);

    // Stop if there is no next-page link or the page was empty.
    hasMore = $('a[rel="next"]').length > 0 && items.length > 0;
    page += 1;

    // Polite delay, skipped when no further request will be made.
    if (hasMore && page <= maxPages && delayMs > 0) {
      await new Promise((resolve) => setTimeout(resolve, delayMs));
    }
  }

  return results;
}
JSDOM — Full DOM in Node.js
JSDOM creates a full browser-like DOM environment. Use it when your parsing code uses document.querySelector, element.closest(), or any other standard Web API — for example when sharing parsing logic between browser and server.
npm install jsdom axios
const { JSDOM } = require('jsdom');
const axios = require('axios');
/**
 * Fetch a page and read it through JSDOM's standard DOM APIs.
 *
 * @param {string} url - Address of the page; also passed to JSDOM as the
 *   document URL so relative hrefs resolve to absolute ones.
 * @returns {Promise<{title: (string|undefined), links: Array<{text: string, href: string}>, section: (string|undefined)}>}
 *   `title` is the trimmed text of the first <h1>; `section` is the text of
 *   the <nav> enclosing `.nav-item.active`. Both are undefined when the
 *   element is missing.
 */
async function parseWithJSDOM(url) {
  const { data: html } = await axios.get(url);
  const dom = new JSDOM(html, { url }); // url needed for relative URL resolution

  // Close the window in `finally` so it is released even when one of the
  // lookups below throws; otherwise a failed parse leaks the whole window.
  try {
    const { document } = dom.window;

    // Standard Web APIs work as-is
    const title = document.querySelector('h1')?.textContent?.trim();
    const links = [...document.querySelectorAll('a[href]')].map((a) => ({
      text: a.textContent.trim(),
      href: a.href // absolute URL thanks to the url option
    }));

    // closest(), matches(), querySelectorAll — all work
    const activeNav = document.querySelector('.nav-item.active');
    const section = activeNav?.closest('nav')?.textContent;

    return { title, links, section };
  } finally {
    dom.window.close(); // important: free memory
  }
}
Always call dom.window.close() when you're done. JSDOM holds event listeners and timers in memory — without cleanup you'll leak memory in long-running processes.
node-html-parser — Maximum Speed
node-html-parser is a lightweight, zero-dependency parser that's 10–50× faster than JSDOM for simple extractions. It doesn't support the full DOM API but covers the most common selectors.
npm install node-html-parser
const { parse } = require('node-html-parser');
const fs = require('fs');
/**
 * Parse an HTML file from disk — no network call needed — and extract a few
 * common fields.
 *
 * @param {string} htmlPath - Path of the HTML file, read as UTF-8.
 * @returns {{title: (string|undefined), description: (string|undefined), headings: Array<{level: string, text: string}>, firstImage: {src: (string|undefined), alt: (string|undefined)}}}
 *   `firstImage` is always an object; its fields are undefined when the
 *   document has no <img>.
 */
function parseLocal(htmlPath) {
  const root = parse(fs.readFileSync(htmlPath, 'utf8'));

  // CSS selectors supported
  const title = root.querySelector('h1')?.text.trim();
  const description = root.querySelector('meta[name="description"]')?.getAttribute('content');

  // Every h2/h3 in the order querySelectorAll returns them, tag name lower-cased.
  const headings = [];
  for (const heading of root.querySelectorAll('h2, h3')) {
    headings.push({
      level: heading.tagName.toLowerCase(),
      text: heading.text.trim()
    });
  }

  // Attributes of the first image, if the document has one.
  const image = root.querySelector('img');
  const firstImage = {
    src: image?.getAttribute('src'),
    alt: image?.getAttribute('alt')
  };

  return { title, description, headings, firstImage };
}
/**
 * Benchmark: run parseLocal() over every `.html` file directly inside `dir`
 * (e.g. 10,000 HTML files) and report the wall-clock time.
 *
 * @param {string} dir - Directory to scan (not recursive).
 * @returns {Promise<{fileCount: number, elapsedMs: number}>} The figures that
 *   are also printed, so callers can compare runs programmatically.
 */
async function benchmarkParse(dir) {
  // Reuse the module-level `fs` instead of a second inline require('fs').
  const files = fs.readdirSync(dir).filter((name) => name.endsWith('.html'));

  const start = Date.now();
  for (const file of files) {
    parseLocal(`${dir}/${file}`);
  }
  const elapsedMs = Date.now() - start;

  console.log(`Parsed ${files.length} files in ${elapsedMs}ms`);
  return { fileCount: files.length, elapsedMs };
}
parse5 — Spec-Compliant AST
parse5 is the HTML parser used by Angular, jsdom, and many other tools. It produces a spec-compliant AST (Abstract Syntax Tree) — useful when you need to transform or serialise HTML rather than just read data from it.
const parse5 = require('parse5');
// Sample markup: a <div> wrapping a paragraph whose text is split between a
// plain text run and a nested <strong> element.
const html = '<div class="box"><p>Hello <strong>world</strong></p></div>';
// Parse the string into a tree; this root is what findNodes() walks and what
// parse5.serialize() turns back into HTML further down.
const document = parse5.parse(html);
/**
 * Walk the AST depth-first (parent before children, children in document
 * order) and collect every node whose `nodeName` equals `tagName`.
 *
 * @param {object} node - Root of the subtree to search; the root itself is
 *   also tested.
 * @param {string} tagName - Node name to match, compared with `===`.
 * @param {Array<object>} [results=[]] - Accumulator; matches are appended to
 *   it and the same array is returned.
 * @returns {Array<object>} The accumulator with all matches appended.
 */
function findNodes(node, tagName, results = []) {
  // Explicit stack instead of recursion. Children are pushed in reverse so
  // the first child is popped first, which keeps the pre-order visit order.
  const pending = [node];
  while (pending.length > 0) {
    const current = pending.pop();
    if (current.nodeName === tagName) {
      results.push(current);
    }
    const children = current.childNodes ?? [];
    for (let i = children.length - 1; i >= 0; i -= 1) {
      pending.push(children[i]);
    }
  }
  return results;
}
// Concatenate the text of a node and all of its descendants. Reading only the
// direct children's `value` would drop text nested inside inline elements
// such as <strong>, printing "Hello" instead of "Hello world" for the sample.
function collectText(node) {
  if (node.nodeName === '#text') return node.value;
  return (node.childNodes ?? []).map(collectText).join('');
}
const paragraphs = findNodes(document, 'p');
paragraphs.forEach(p => {
  const text = collectText(p).trim();
  console.log('Paragraph:', text);
});
// Serialise back to HTML after modification
const serialised = parse5.serialize(document);
SnapAPI — Parse Remote Pages (with a Real Browser)
Static parsers fail on React, Vue, and Angular apps — the HTML they fetch is an empty shell. SnapAPI renders the page in a real Chromium instance and returns the fully-rendered HTML, which you can then parse with Cheerio or node-html-parser.
const axios = require('axios');
const cheerio = require('cheerio');
/**
 * Render a remote page through SnapAPI's /v1/scrape endpoint, then parse the
 * returned HTML with Cheerio.
 *
 * @param {string} url - Page to render.
 * @param {object} [options]
 * @param {boolean} [options.stealth=false] - Forwarded as `stealth`.
 * @param {string} [options.waitFor='networkidle'] - Forwarded as `waitFor`;
 *   'networkidle' or a CSS selector.
 * @returns {Promise<{title: string, metaDesc: (string|undefined), text: string, links: string[]}>}
 *   `text` is the body text with whitespace collapsed, capped at 2000
 *   characters.
 */
async function fetchAndParse(url, options = {}) {
  // 1. Render with SnapAPI (handles JS, SPAs, bot protection)
  const payload = {
    url,
    stealth: options.stealth ?? false,
    waitFor: options.waitFor ?? 'networkidle', // or a CSS selector
    blockAds: true,
    blockCookieBanners: true
  };
  const requestConfig = { headers: { 'X-Api-Key': process.env.SNAPAPI_KEY } };
  const response = await axios.post('https://api.snapapi.pics/v1/scrape', payload, requestConfig);

  // 2. Parse the rendered HTML with Cheerio
  const $ = cheerio.load(response.data.html);
  const title = $('h1').first().text().trim();
  const metaDesc = $('meta[name="description"]').attr('content');
  const text = $('body').text().replace(/\s+/g, ' ').trim().slice(0, 2000);
  const links = $('a[href]').map((_, anchor) => $(anchor).attr('href')).get();

  return { title, metaDesc, text, links };
}
// Usage — works on SPAs and protected pages
fetchAndParse('https://app.example.com/dashboard', { stealth: true })
  .then(data => console.log(data))
  .catch(err => {
    // Handle the rejection explicitly (network error, non-2xx response)
    // instead of leaving it unhandled; keep a failing exit status for scripts.
    console.error('fetchAndParse failed:', err);
    process.exitCode = 1;
  });
Schema extraction (skip parsing entirely)
/**
 * If you want structured data, skip HTML parsing and use /v1/extract instead:
 * post the URL together with a schema and return the `data` field of the
 * response body.
 *
 * @param {string} url - Page to extract from.
 * @returns {Promise<object>} The response's `data.data` payload.
 */
async function extractStructured(url) {
  const schema = {
    title: { type: 'string' },
    price: { type: 'number' },
    description: { type: 'string' },
    features: { type: 'array', items: { type: 'string' } }
  };
  const requestConfig = { headers: { 'X-Api-Key': process.env.SNAPAPI_KEY } };

  const response = await axios.post(
    'https://api.snapapi.pics/v1/extract',
    { url, schema },
    requestConfig
  );

  // typed JSON — no HTML parsing needed
  return response.data.data;
}
Which Parser Should You Use?
- Cheerio — default choice for 90% of scraping tasks. Fast, familiar, huge community. Use for any static or server-rendered HTML.
- JSDOM — when you're sharing DOM-manipulation code between browser and server, or need element.closest() / computed styles.
- node-html-parser — bulk processing of thousands of HTML files where speed matters more than selector coverage.
- parse5 — HTML tooling, linters, transpilers — any scenario where you need a spec-compliant AST.
- SnapAPI /scrape + Cheerio — when the target page is a SPA, requires login, or blocks bots. Render first, parse second.
- SnapAPI /extract — when you want typed structured JSON and don't want to write any selector code at all.
Parse any page — static or SPA
SnapAPI renders in a real browser and returns clean HTML ready for Cheerio — or typed JSON if you use /extract. 200 free requests/month.
Try SnapAPI Free →