Build a Web Crawler in Node.js (2026): Complete Guide
A web crawler systematically browses websites, following links to discover and index pages. Whether you're building a search index, an SEO auditor, or a data pipeline, this guide covers everything: BFS traversal, robots.txt compliance, rate limiting, sitemap parsing, concurrent crawling, and handling JS-heavy SPAs.
Always respect robots.txt, honour Crawl-delay, and check the site's Terms of Service before crawling at scale.
The Minimal Crawler (BFS)
At its core, a crawler is a breadth-first search over URLs. Start with a seed URL, extract all links, add unvisited ones to the queue, repeat.
npm install axios cheerio
const axios = require('axios');
const cheerio = require('cheerio');
const { URL } = require('url');
// Request headers shared by the crawler's fetches (page fetches and sitemap
// downloads). The User-Agent names the bot and links to an info page; the
// Accept header states a preference for HTML/XHTML responses.
const HEADERS = {
'User-Agent': 'MyBot/1.0 (+https://mysite.com/bot.html)',
'Accept': 'text/html,application/xhtml+xml;q=0.9,*/*;q=0.8',
};
/**
 * Fetch a URL and return its body only when the server reports an HTML document.
 *
 * @param {string} url - Absolute URL to request.
 * @returns {Promise<string|null>} The response body, or `null` when the request
 *   fails or the response is not an HTML/XHTML document.
 */
async function fetchPage(url) {
  try {
    const { data, headers } = await axios.get(url, { headers: HEADERS, timeout: 10000 });
    // Media types are case-insensitive and may carry parameters
    // ("Text/HTML; charset=UTF-8"), so compare the bare, lowercased type.
    const mediaType = String(headers?.['content-type'] ?? '')
      .split(';')[0]
      .trim()
      .toLowerCase();
    // XHTML is accepted too: the Accept header explicitly asks for it.
    const isHtml = mediaType === 'text/html' || mediaType === 'application/xhtml+xml';
    if (!isHtml || typeof data !== 'string') return null;
    return data;
  } catch {
    // Best-effort by design: network errors, timeouts and error statuses are
    // all reported to the caller as "no page".
    return null;
  }
}
/**
 * Collect the unique same-host HTTP(S) links found in an HTML document.
 *
 * Each link is resolved against `baseUrl` and normalised by dropping its
 * fragment and query string, so `/a?x=1#top` and `/a` count as the same page.
 *
 * @param {string} html - Markup to scan for `<a href>` elements.
 * @param {string} baseUrl - URL of the page the markup came from.
 * @returns {string[]} Absolute, de-duplicated URLs in document order.
 * @throws {TypeError} If `baseUrl` is not a valid absolute URL.
 */
function extractLinks(html, baseUrl) {
  const $ = cheerio.load(html);
  const base = new URL(baseUrl);
  const links = new Set();
  $('a[href]').each((_, el) => {
    let target;
    try {
      target = new URL($(el).attr('href'), baseUrl);
    } catch {
      return; // malformed href: skip it and keep scanning
    }
    // Hostname equality alone lets ftp:// or other schemes on the same host
    // through; the crawler can only fetch http(s).
    if (target.protocol !== 'http:' && target.protocol !== 'https:') return;
    if (target.hostname !== base.hostname) return;
    target.hash = ''; // strip fragments
    target.search = ''; // strip query strings (optional)
    links.add(target.href);
  });
  return [...links];
}
/**
 * Breadth-first crawl of a single site, starting from `seedUrl`.
 *
 * @param {string} seedUrl - Page to start from.
 * @param {number} [maxPages=100] - Stop after this many pages have been collected.
 * @returns {Promise<Array<{url: string, title: string, links: number}>>}
 *   One entry per successfully fetched page, in crawl order.
 */
async function crawl(seedUrl, maxPages = 100) {
  // Normalise the seed the same way discovered links are normalised
  // ("https://example.com" -> "https://example.com/"); otherwise the home page
  // is fetched a second time as soon as any page links back to it.
  let start = seedUrl;
  try {
    start = new URL(seedUrl).href;
  } catch {
    // Unparsable seed: leave it as-is, the fetch fails and the crawl ends empty.
  }

  const seen = new Set([start]); // every URL ever queued, so the queue holds no duplicates
  const queue = [start];
  const results = [];

  while (queue.length && results.length < maxPages) {
    const url = queue.shift();
    const html = await fetchPage(url);
    if (!html) continue;

    const $ = cheerio.load(html);
    const links = extractLinks(html, url); // extracted once, reused below
    results.push({ url, title: $('title').text().trim(), links: links.length });

    for (const link of links) {
      if (seen.has(link)) continue;
      seen.add(link);
      queue.push(link);
    }
    console.log(`[${results.length}/${maxPages}] ${url}`);

    // Polite delay, but only when another request is actually going to follow.
    if (queue.length && results.length < maxPages) {
      await new Promise(r => setTimeout(r, 1500));
    }
  }
  return results;
}
// Example run: crawl from the seed URL and report how many pages were collected.
(async () => {
  const pages = await crawl('https://example.com');
  console.log(`Crawled ${pages.length} pages`);
})();
Respecting robots.txt
Parsing robots.txt before crawling is both ethical and practical — many CDNs will rate-limit or block crawlers that ignore it. Use the robots-parser package:
npm install robots-parser
const robotsParser = require('robots-parser');
const axios = require('axios');
const BOT_NAME = 'MyBot';
/**
 * Download the robots.txt at the root of `baseUrl`'s origin and hand it to
 * robots-parser.
 *
 * @param {string} baseUrl - Any URL on the site being crawled.
 * @returns {Promise<object|null>} The parser instance, or `null` when the file
 *   could not be fetched or parsed.
 */
async function loadRobots(baseUrl) {
  let parsed = null;
  try {
    const robotsUrl = new URL('/robots.txt', baseUrl).href;
    const response = await axios.get(robotsUrl, { timeout: 5000 });
    parsed = robotsParser(robotsUrl, response.data);
  } catch {
    // if robots.txt is missing, assume everything is allowed
    parsed = null;
  }
  return parsed;
}
/**
 * Breadth-first crawl that obeys the site's robots.txt rules and Crawl-delay.
 *
 * @param {string} seedUrl - Page to start from; its origin's robots.txt is loaded first.
 * @param {number} [maxPages=200] - Stop after this many pages have been collected.
 * @returns {Promise<Array<{url: string, title: string}>>} Pages in crawl order.
 */
async function crawlWithRobots(seedUrl, maxPages = 200) {
  const robots = await loadRobots(seedUrl);
  // Fall back to a one-second delay when robots.txt sets none (or is missing).
  const crawlDelay = (robots?.getCrawlDelay(BOT_NAME) ?? 1) * 1000;
  const isAllowed = (url) => !robots || robots.isAllowed(url, BOT_NAME);

  // Normalise the seed the same way discovered links are normalised
  // ("https://example.com" -> "https://example.com/") so the home page is not
  // fetched twice.
  let start = seedUrl;
  try {
    start = new URL(seedUrl).href;
  } catch {
    // Unparsable seed: leave it as-is, the fetch fails and the crawl ends empty.
  }

  const seen = new Set([start]); // every URL ever queued, so the queue holds no duplicates
  const queue = [start];
  const results = [];

  while (queue.length && results.length < maxPages) {
    const url = queue.shift();
    if (!isAllowed(url)) continue; // links are filtered when queued; this covers the seed
    const html = await fetchPage(url);
    if (!html) continue;

    const $ = cheerio.load(html);
    results.push({ url, title: $('title').text().trim() });

    for (const link of extractLinks(html, url)) {
      if (seen.has(link) || !isAllowed(link)) continue;
      seen.add(link);
      queue.push(link);
    }

    // Wait only when another request is actually going to follow.
    if (queue.length && results.length < maxPages) {
      await new Promise(r => setTimeout(r, crawlDelay));
    }
  }
  return results;
}
Sitemap Crawling (Faster Discovery)
Instead of following links page by page, parse the sitemap to get all URLs instantly — much faster for indexing large sites.
const axios = require('axios');
const cheerio = require('cheerio');
/**
 * Download a sitemap and return every page URL it lists, following sitemap
 * index files down to their child sitemaps.
 *
 * @param {string} sitemapUrl - URL of a sitemap or sitemap index.
 * @param {Set<string>} [seen] - Sitemap URLs already visited in this traversal.
 *   Callers normally omit it; it is threaded through the recursion so an index
 *   that references itself (directly or via a child) cannot recurse forever.
 * @returns {Promise<string[]>} Page URLs, own entries first, then children in order.
 * @throws If `sitemapUrl` itself cannot be fetched. Failures of child sitemaps
 *   are tolerated and contribute no URLs.
 */
async function parseSitemap(sitemapUrl, seen = new Set()) {
  if (seen.has(sitemapUrl)) return [];
  seen.add(sitemapUrl);

  const { data } = await axios.get(sitemapUrl, { headers: HEADERS, timeout: 15000 });
  const $ = cheerio.load(data, { xmlMode: true });

  // Regular sitemap
  const urls = [];
  $('url > loc').each((_, el) => {
    urls.push($(el).text().trim());
  });

  // Sitemap index: recursively fetch child sitemaps
  const childSitemaps = [];
  $('sitemap > loc').each((_, el) => {
    childSitemaps.push($(el).text().trim());
  });
  const children = await Promise.all(
    childSitemaps.map(child => parseSitemap(child, seen).catch(() => [])),
  );

  // Append one by one: push(...spread) passes every URL as a call argument and
  // can overflow the engine's limit when the sitemaps are very large.
  for (const childUrls of children) {
    for (const pageUrl of childUrls) urls.push(pageUrl);
  }
  return urls;
}
/**
 * Find and parse a site's sitemap(s).
 *
 * Sitemaps declared in robots.txt are tried first (a site may declare several;
 * their URLs are merged and de-duplicated). If none of them yields any URL,
 * the conventional sitemap locations are tried in order.
 *
 * @param {string} baseUrl - Any URL on the target site.
 * @returns {Promise<string[]>} Page URLs, or `[]` when no sitemap was found
 *   (the caller can then fall back to BFS crawling).
 * @throws {TypeError} If `baseUrl` is not a valid absolute URL.
 */
async function discoverSitemap(baseUrl) {
  const fallbacks = [
    new URL('/sitemap.xml', baseUrl).href,
    new URL('/sitemap_index.xml', baseUrl).href,
    new URL('/sitemaps/sitemap.xml', baseUrl).href,
  ];

  // Check robots.txt for Sitemap directives first
  const declared = [];
  try {
    const { data } = await axios.get(new URL('/robots.txt', baseUrl).href, { timeout: 5000 });
    for (const match of String(data).matchAll(/^Sitemap:\s*(.+)$/gim)) {
      declared.push(match[1].trim());
    }
  } catch {
    // robots.txt is optional; continue with the conventional locations
  }

  const found = new Set();
  for (const url of declared) {
    try {
      for (const pageUrl of await parseSitemap(url)) found.add(pageUrl);
    } catch {
      // a declared sitemap that cannot be fetched is skipped
    }
  }
  if (found.size) return [...found];

  for (const url of fallbacks) {
    if (declared.includes(url)) continue; // already tried above
    try {
      const urls = await parseSitemap(url);
      // An empty result usually means a non-sitemap page answered (soft 404);
      // keep looking instead of reporting "no URLs".
      if (urls.length) return urls;
    } catch {
      // candidate does not exist; try the next one
    }
  }
  return []; // no sitemap found, fall back to BFS
}
// Usage: look up the site's sitemap and report how many URLs it lists.
(async () => {
  const urls = await discoverSitemap('https://example.com');
  console.log(`Found ${urls.length} URLs in sitemap`);
})();
Concurrent Crawling with p-queue
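Sequential crawling is slow. p-queue lets you run N requests in parallel while capping how many start per interval. The limit applies to the queue as a whole, so when crawling several domains use one queue per domain to rate-limit each one separately.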
const PQueue = require('p-queue');
const { default: pQueue } = await import('p-queue');
/**
 * Concurrent, rate-limited single-site crawler built on p-queue.
 *
 * At most `concurrency` pages are fetched in parallel, and at most
 * `concurrency` fetches are started per `intervalMs` window.
 */
class Crawler {
  /**
   * @param {object} [options]
   * @param {number} [options.concurrency=5] - Parallel fetches, also used as the per-interval cap.
   * @param {number} [options.intervalMs=1000] - Length of the rate-limit window in milliseconds.
   * @param {number} [options.maxPages=500] - Maximum number of pages to collect.
   */
  constructor({ concurrency = 5, intervalMs = 1000, maxPages = 500 } = {}) {
    this.queue = new PQueue({ concurrency, interval: intervalMs, intervalCap: concurrency });
    this.visited = new Set(); // URLs whose fetch has started
    this.queued = new Set(); // URLs already scheduled, so none is queued twice
    this.results = [];
    this.maxPages = maxPages;
  }

  /**
   * Fetch one page, record its stats and schedule its unseen internal links.
   *
   * @param {string} url - Page to crawl.
   * @returns {Promise<void>}
   */
  async crawlPage(url) {
    if (this.visited.has(url) || this.results.length >= this.maxPages) return;
    this.visited.add(url);

    const html = await fetchPage(url);
    if (!html) return;
    // Re-check after the await: pages fetched in parallel may have used up the
    // budget in the meantime, which would push results past maxPages.
    if (this.results.length >= this.maxPages) return;

    const $ = cheerio.load(html);
    const links = extractLinks(html, url);
    const bodyText = $('body').text().trim();
    this.results.push({
      url,
      title: $('title').text().trim(),
      h1: $('h1').first().text().trim(),
      // ''.split(/\s+/) is [''], so an empty body must be counted as 0 words.
      wordCount: bodyText ? bodyText.split(/\s+/).length : 0,
      internalLinks: links.length,
    });

    if (this.results.length >= this.maxPages) {
      // Budget reached: drop tasks that have not started, otherwise each would
      // still occupy a rate-limited slot before the queue can go idle.
      this.queue.clear();
      return;
    }

    // Queue newly discovered links
    for (const link of links) {
      if (this.visited.has(link) || this.queued.has(link)) continue;
      this.queued.add(link);
      this.queue.add(() => this.crawlPage(link));
    }
  }

  /**
   * Crawl from `seedUrl` until the queue drains or `maxPages` is reached.
   *
   * @param {string} seedUrl - Page to start from.
   * @returns {Promise<object[]>} The collected page records.
   */
  async run(seedUrl) {
    this.queued.add(seedUrl);
    this.queue.add(() => this.crawlPage(seedUrl));
    await this.queue.onIdle();
    return this.results;
  }
}
// Example run: crawl up to 300 pages, three at a time, then print a summary.
const crawler = new Crawler({ concurrency: 3, intervalMs: 1000, maxPages: 300 });
(async () => {
  const pages = await crawler.run('https://example.com');
  const totalWords = pages.reduce((sum, page) => sum + page.wordCount, 0);
  console.log(`Crawled ${pages.length} pages`);
  // `| 0` truncates the average to an integer
  console.log('Avg word count:', totalWords / pages.length | 0);
})();
Storing Crawl Results in SQLite
const Database = require('better-sqlite3');
// Open (or create) the SQLite file that stores crawl results.
const db = new Database('./crawl.db');

// One row per URL; crawled_at defaults to the insertion time.
db.exec(`
  CREATE TABLE IF NOT EXISTS pages (
    id INTEGER PRIMARY KEY AUTOINCREMENT,
    url TEXT UNIQUE NOT NULL,
    title TEXT,
    h1 TEXT,
    word_count INTEGER,
    status_code INTEGER,
    crawled_at TEXT DEFAULT (datetime('now'))
  )
`);

// Insert a page, or refresh the stored row when the URL was crawled before.
// status_code is refreshed as well, so a page that turned from 200 into 404
// does not keep its stale status.
const upsert = db.prepare(`
  INSERT INTO pages (url, title, h1, word_count, status_code)
  VALUES (@url, @title, @h1, @wordCount, @statusCode)
  ON CONFLICT(url) DO UPDATE SET
    title = excluded.title,
    h1 = excluded.h1,
    word_count = excluded.word_count,
    status_code = excluded.status_code,
    crawled_at = datetime('now')
`);

// Batch insert with a transaction (much faster than one-by-one)
const batchInsert = db.transaction(pages => {
  for (const { url, title = null, h1 = null, wordCount = null, statusCode = null } of pages) {
    // Every named parameter must be supplied to the statement; fields a crawler
    // did not collect (e.g. statusCode) are stored as NULL instead of failing.
    upsert.run({ url, title, h1, wordCount, statusCode });
  }
});

// After crawl completes
batchInsert(crawlResults);
console.log('Saved', db.prepare('SELECT COUNT(*) as n FROM pages').get().n, 'pages');
Crawling JS-Heavy SPAs with SnapAPI
When a site is built with React, Vue, or Angular, axios.get() returns an empty shell. Swap in SnapAPI's /v1/scrape to get fully-rendered HTML for each page:
const axios = require('axios');
const cheerio = require('cheerio');
/**
 * Ask SnapAPI's /v1/scrape endpoint to render `url` and return the HTML it
 * reports. The API key is read from the SNAPAPI_KEY environment variable.
 *
 * @param {string} url - Page to render.
 * @returns {Promise<*>} The `html` field of the API response.
 * @throws Whatever `axios.post` rejects with; errors are not caught here.
 */
async function fetchRendered(url) {
  const endpoint = 'https://api.snapapi.pics/v1/scrape';
  const payload = {
    url,
    waitFor: 'networkidle',
    blockAds: true,
    stealth: false, // enable if site has bot protection
  };
  const config = { headers: { 'X-Api-Key': process.env.SNAPAPI_KEY } };

  const response = await axios.post(endpoint, payload, config);
  return response.data.html;
}
// Drop-in replacement for fetchPage() in the BFS crawler above
/**
 * Breadth-first crawl that fetches every page through `fetchRendered()`, for
 * sites whose content only exists after client-side rendering.
 *
 * @param {string} seedUrl - Page to start from.
 * @param {number} [maxPages=50] - Stop after this many pages have been collected.
 * @returns {Promise<Array<{url: string, title: string}>>} Pages in crawl order.
 */
async function crawlSPA(seedUrl, maxPages = 50) {
  // Normalise the seed the same way discovered links are normalised
  // ("https://example.com" -> "https://example.com/") so the home page is not
  // rendered twice.
  let start = seedUrl;
  try {
    start = new URL(seedUrl).href;
  } catch {
    // Unparsable seed: leave it as-is and let the render attempt decide.
  }

  const seen = new Set([start]); // every URL ever queued, so the queue holds no duplicates
  const queue = [start];
  const results = [];

  while (queue.length && results.length < maxPages) {
    const url = queue.shift();

    let html;
    try {
      html = await fetchRendered(url);
    } catch (err) {
      // One failed render must not abort the crawl and discard earlier results.
      console.warn(`Skipping ${url}: ${err?.message ?? err}`);
      continue;
    }
    if (!html) continue; // response carried no markup

    const $ = cheerio.load(html);
    results.push({ url, title: $('title').text().trim() });

    for (const link of extractLinks(html, url)) {
      if (seen.has(link)) continue;
      seen.add(link);
      queue.push(link);
    }

    // Pause between renders, but only when another request is going to follow.
    if (queue.length && results.length < maxPages) {
      await new Promise(r => setTimeout(r, 2000));
    }
  }
  return results;
}
Crawl any website — static or SPA
SnapAPI renders pages in a real Chromium browser. Swap one function call and your crawler handles React, Vue, and Angular sites. 200 free requests/month.
Try SnapAPI Free →