Node.js Web Crawling Automation April 5, 2026

Build a Web Crawler in Node.js (2026): Complete Guide

A web crawler systematically browses websites, following links to discover and index pages. Whether you're building a search index, an SEO auditor, or a data pipeline, this guide covers everything: BFS traversal, robots.txt compliance, rate limiting, sitemap parsing, concurrent crawling, and handling JS-heavy SPAs.

Ethical crawling: Always respect robots.txt, honour Crawl-delay, and check the site's Terms of Service before crawling at scale.

The Minimal Crawler (BFS)

At its core, a crawler is a breadth-first search over URLs. Start with a seed URL, extract all links, add unvisited ones to the queue, repeat.

npm install axios cheerio

const axios   = require('axios');
const cheerio = require('cheerio');
const { URL } = require('url');

// Shared request headers for the crawler's HTTP calls. The User-Agent names
// the bot and points at a page describing it; Accept states a preference for
// HTML/XHTML over any other content type.
const HEADERS = {
  'User-Agent': 'MyBot/1.0 (+https://mysite.com/bot.html)',
  'Accept': 'text/html,application/xhtml+xml;q=0.9,*/*;q=0.8',
};

/**
 * Download a page and return its markup.
 *
 * @param {string} url - Absolute URL to request.
 * @returns {Promise<string|null>} The response body when the server reports an
 *   HTML or XHTML content type and the body is a string; `null` for any other
 *   content type, a non-string body, or any request failure.
 */
async function fetchPage(url) {
  try {
    const { data, headers } = await axios.get(url, { headers: HEADERS, timeout: 10000 });

    // Media types are case-insensitive, and HEADERS advertises XHTML as well
    // as HTML, so accept both spellings and both types.
    const contentType = String(headers['content-type'] ?? '').toLowerCase();
    const isMarkup =
      contentType.includes('text/html') || contentType.includes('application/xhtml+xml');
    if (!isMarkup) return null;

    // Callers hand the body straight to cheerio, which needs a string.
    return typeof data === 'string' ? data : null;
  } catch {
    // Best-effort by design: an unreachable or failing page is simply skipped.
    return null;
  }
}

/**
 * Collect the unique same-host links found in a page.
 *
 * Every `<a href>` is resolved against `baseUrl`. Links are kept only when
 * they use http(s) and share the base URL's hostname. Fragments are always
 * removed; query strings are removed unless `stripQuery` is false.
 *
 * @param {string} html - Page markup to scan.
 * @param {string} baseUrl - URL the page was loaded from; used to resolve
 *   relative hrefs and to decide which links are internal.
 * @param {{ stripQuery?: boolean }} [options] - `stripQuery` (default true)
 *   drops the query string so URL variants collapse into one entry.
 * @returns {string[]} Absolute URLs in first-seen order, without duplicates.
 * @throws {TypeError} If `baseUrl` is not a valid absolute URL.
 */
function extractLinks(html, baseUrl, { stripQuery = true } = {}) {
  const $ = cheerio.load(html);
  const base = new URL(baseUrl);
  const links = new Set();

  $('a[href]').each((_, el) => {
    let target;
    try {
      target = new URL($(el).attr('href'), base);
    } catch {
      return; // malformed href: nothing to follow
    }

    // The crawler fetches over HTTP(S) only, so other schemes on the same
    // host (ftp:, etc.) would just produce failed requests.
    if (target.protocol !== 'http:' && target.protocol !== 'https:') return;
    if (target.hostname !== base.hostname) return;

    target.hash = '';
    if (stripQuery) target.search = '';
    links.add(target.href);
  });

  return [...links];
}

/**
 * Breadth-first crawl starting at `seedUrl`.
 *
 * Pages are fetched one at a time. Pages that cannot be fetched are skipped
 * and do not count towards `maxPages`.
 *
 * @param {string} seedUrl - First URL to visit.
 * @param {number} [maxPages=100] - Stop once this many pages were recorded.
 * @param {number} [delayMs=1500] - Pause between successfully fetched pages.
 * @returns {Promise<Array<{url: string, title: string, links: number}>>}
 *   One record per fetched page, in visiting order.
 */
async function crawl(seedUrl, maxPages = 100, delayMs = 1500) {
  const visited = new Set();
  const queue   = [seedUrl];
  const results = [];

  while (queue.length && results.length < maxPages) {
    const url = queue.shift();
    if (visited.has(url)) continue;
    visited.add(url);

    const html = await fetchPage(url);
    if (!html) continue;

    // Extract links once and reuse them for both the record and the queue.
    const $ = cheerio.load(html);
    const links = extractLinks(html, url);
    results.push({ url, title: $('title').text().trim(), links: links.length });
    console.log(`[${results.length}/${maxPages}] ${url}`);

    for (const link of links) {
      if (!visited.has(link)) queue.push(link);
    }

    // Polite delay, skipped when the loop is about to end anyway.
    if (queue.length && results.length < maxPages) {
      await new Promise(resolve => setTimeout(resolve, delayMs));
    }
  }

  return results;
}

// Run the crawler and report the outcome. The catch handler keeps a failed
// crawl from surfacing as an unhandled promise rejection.
crawl('https://example.com')
  .then(pages => {
    console.log(`Crawled ${pages.length} pages`);
  })
  .catch(err => {
    console.error('Crawl failed:', err);
    process.exitCode = 1;
  });

Respecting robots.txt

Parsing robots.txt before crawling is both ethical and practical — many CDNs will rate-limit or block crawlers that ignore it. Use the robots-parser package:

npm install robots-parser

const robotsParser = require('robots-parser');
const axios = require('axios');

// User-agent token passed to the robots.txt parser when looking up the
// crawl delay and checking whether a URL may be fetched.
const BOT_NAME = 'MyBot';

/**
 * Fetch and parse the robots.txt of the site that `baseUrl` belongs to.
 *
 * @param {string} baseUrl - Any URL on the target site.
 * @returns {Promise<object|null>} The robots-parser instance, or `null` when
 *   robots.txt could not be loaded for any reason (missing file, network
 *   error, invalid `baseUrl`). Callers treat `null` as "everything allowed".
 */
async function loadRobots(baseUrl) {
  try {
    const robotsUrl = new URL('/robots.txt', baseUrl).href;
    // Identify the bot with the same headers used for page requests.
    const { data } = await axios.get(robotsUrl, { headers: HEADERS, timeout: 5000 });
    return robotsParser(robotsUrl, data);
  } catch {
    return null; // if robots.txt is missing, assume everything is allowed
  }
}

/**
 * Breadth-first crawl that consults robots.txt before visiting or queueing
 * a URL and waits between pages.
 *
 * @param {string} seedUrl - First URL to visit; robots.txt is loaded for it.
 * @param {number} [maxPages=200] - Stop once this many pages were recorded.
 * @returns {Promise<Array<{url: string, title: string}>>} One record per
 *   fetched page, in visiting order.
 */
async function crawlWithRobots(seedUrl, maxPages = 200) {
  const robots = await loadRobots(seedUrl);

  // Fall back to 1 when no rules were loaded or no delay is declared; the
  // value is scaled by 1000 to get the setTimeout duration.
  const delayValue = robots?.getCrawlDelay(BOT_NAME) ?? 1;
  const crawlDelay = delayValue * 1000;

  // Without a robots.txt object every URL is considered allowed.
  function isAllowed(url) {
    if (!robots) return true;
    return robots.isAllowed(url, BOT_NAME);
  }

  const visited = new Set();
  const pending = [seedUrl];
  const results = [];

  while (pending.length > 0 && results.length < maxPages) {
    const url = pending.shift();
    if (visited.has(url)) continue;
    if (!isAllowed(url)) continue;
    visited.add(url);

    const html = await fetchPage(url);
    if (!html) continue;

    const title = cheerio.load(html)('title').text().trim();
    results.push({ url, title });

    for (const link of extractLinks(html, url)) {
      if (!visited.has(link) && isAllowed(link)) {
        pending.push(link);
      }
    }

    await new Promise((resolve) => {
      setTimeout(resolve, crawlDelay);
    });
  }

  return results;
}

Sitemap Crawling (Faster Discovery)

Instead of following links page by page, parse the sitemap to get all URLs instantly — much faster for indexing large sites.

const axios   = require('axios');
const cheerio = require('cheerio');

/**
 * Download a sitemap and return every page URL it lists, following sitemap
 * index files recursively.
 *
 * @param {string} sitemapUrl - URL of a sitemap or sitemap index.
 * @param {Set<string>} [seen] - Sitemap URLs already requested during this
 *   traversal. Callers normally omit it; it is threaded through the recursion
 *   so that indexes referencing each other (or themselves) cannot loop.
 * @returns {Promise<string[]>} Page URLs in document order; child sitemaps
 *   that fail to load contribute nothing.
 * @throws If `sitemapUrl` itself cannot be fetched.
 */
async function parseSitemap(sitemapUrl, seen = new Set()) {
  if (seen.has(sitemapUrl)) return [];
  seen.add(sitemapUrl);

  const { data } = await axios.get(sitemapUrl, { headers: HEADERS, timeout: 15000 });
  const $ = cheerio.load(data, { xmlMode: true });

  // Trimmed text of every element matching `selector`.
  const locations = (selector) => {
    const found = [];
    $(selector).each((_, el) => {
      found.push($(el).text().trim());
    });
    return found;
  };

  const urls = locations('url > loc');               // regular sitemap entries
  const childSitemaps = locations('sitemap > loc');  // sitemap index entries
  if (!childSitemaps.length) return urls;

  const children = await Promise.all(
    childSitemaps.map(child => parseSitemap(child, seen).catch(() => [])),
  );

  // concat instead of push(...spread): a large site can list more URLs than
  // a single call may take as arguments.
  return urls.concat(children.flat());
}

/**
 * Find and parse the sitemap(s) of a site.
 *
 * Sitemaps declared in robots.txt take precedence: every `Sitemap:` line is
 * parsed and the results are merged without duplicates. If none is declared
 * (or none yields URLs), a few conventional locations are tried in order and
 * the first one that yields URLs wins.
 *
 * @param {string} baseUrl - Any URL on the target site.
 * @returns {Promise<string[]>} Page URLs, or an empty array when no sitemap
 *   was found (callers can then fall back to BFS crawling).
 * @throws {TypeError} If `baseUrl` is not a valid absolute URL.
 */
async function discoverSitemap(baseUrl) {
  const fallbacks = [
    new URL('/sitemap.xml', baseUrl).href,
    new URL('/sitemap_index.xml', baseUrl).href,
    new URL('/sitemaps/sitemap.xml', baseUrl).href,
  ];

  // robots.txt may carry any number of Sitemap directives.
  const declared = [];
  try {
    const { data } = await axios.get(new URL('/robots.txt', baseUrl).href, {
      headers: HEADERS,
      timeout: 5000,
    });
    for (const [, location] of String(data).matchAll(/^Sitemap:\s*(.+)$/gim)) {
      declared.push(location.trim());
    }
  } catch {
    // robots.txt is optional; continue with the conventional locations.
  }

  if (declared.length) {
    const batches = await Promise.all(
      declared.map(url => parseSitemap(url).catch(() => [])),
    );
    const merged = [...new Set(batches.flat())];
    if (merged.length) return merged;
  }

  for (const url of fallbacks) {
    try {
      const urls = await parseSitemap(url);
      // An empty result usually means an error page answered with 200;
      // keep looking instead of giving up.
      if (urls.length) return urls;
    } catch {
      // Not available at this location; try the next one.
    }
  }
  return []; // no sitemap found, fall back to BFS
}

// Usage. The catch handler keeps an invalid base URL or other failure from
// surfacing as an unhandled promise rejection.
discoverSitemap('https://example.com')
  .then(urls => {
    console.log(`Found ${urls.length} URLs in sitemap`);
  })
  .catch(err => {
    console.error('Sitemap discovery failed:', err);
    process.exitCode = 1;
  });

Concurrent Crawling with p-queue

Sequential crawling is slow. p-queue lets you run N requests in parallel while capping how many start per time interval, so a single-domain crawl stays within a polite request rate.

// p-queue exposes its class as the default export, so destructure `default`.
// This works with p-queue@6 (CommonJS) and with the ESM-only p-queue@7+ on
// Node versions that allow require() of ES modules. On older Node with
// p-queue@7+, use `const { default: PQueue } = await import('p-queue');`
// inside an async function (or an ES module) instead.
const { default: PQueue } = require('p-queue');

/**
 * Concurrent same-site crawler built on a rate-limited p-queue.
 *
 * Each URL is claimed once (in `visited`) when it is scheduled, so it is
 * never queued twice. Once `maxPages` results exist, nothing further is
 * scheduled or recorded and the remaining queue is dropped.
 */
class Crawler {
  /**
   * @param {object} [options]
   * @param {number} [options.concurrency=5] - Parallel requests, also used as
   *   the number of requests allowed to start per interval.
   * @param {number} [options.intervalMs=1000] - Length of the rate-limit interval.
   * @param {number} [options.maxPages=500] - Maximum number of recorded pages.
   */
  constructor({ concurrency = 5, intervalMs = 1000, maxPages = 500 } = {}) {
    this.queue    = new PQueue({ concurrency, interval: intervalMs, intervalCap: concurrency });
    this.visited  = new Set();
    this.results  = [];
    this.maxPages = maxPages;
  }

  /** @returns {boolean} True once `maxPages` results have been recorded. */
  isFull() {
    return this.results.length >= this.maxPages;
  }

  /**
   * Schedule `url` on the queue unless it was already claimed or the page
   * limit has been reached.
   * @param {string} url
   */
  enqueue(url) {
    if (this.visited.has(url) || this.isFull()) return;
    this.visited.add(url);
    this.queue
      .add(() => this.processPage(url))
      // A single failing page must not become an unhandled rejection.
      .catch(err => console.error(`Failed to crawl ${url}:`, err));
  }

  /**
   * Crawl one page immediately (bypassing the queue) unless it was already
   * claimed or the page limit has been reached.
   * @param {string} url
   * @returns {Promise<void>}
   */
  async crawlPage(url) {
    if (this.visited.has(url) || this.isFull()) return;
    this.visited.add(url);
    await this.processPage(url);
  }

  /**
   * Fetch a claimed URL, record its summary and schedule its links.
   * @param {string} url
   * @returns {Promise<void>}
   */
  async processPage(url) {
    if (this.isFull()) return;

    const html = await fetchPage(url);
    // Re-check the limit: other in-flight pages may have filled the quota
    // while this request was pending.
    if (!html || this.isFull()) return;

    const $ = cheerio.load(html);
    const links = extractLinks(html, url);
    const bodyText = $('body').text().trim();

    this.results.push({
      url,
      title:         $('title').text().trim(),
      h1:            $('h1').first().text().trim(),
      // Trim first so an empty body counts as 0 words rather than 1.
      wordCount:     bodyText ? bodyText.split(/\s+/).length : 0,
      internalLinks: links.length,
    });

    if (this.isFull()) {
      // Pending tasks would only burn rate-limit slots doing nothing.
      this.queue.clear();
      return;
    }

    // Queue newly discovered links
    for (const link of links) this.enqueue(link);
  }

  /**
   * Crawl outward from `seedUrl` until the queue drains or the limit is hit.
   * @param {string} seedUrl
   * @returns {Promise<object[]>} The recorded page summaries.
   */
  async run(seedUrl) {
    this.enqueue(seedUrl);
    await this.queue.onIdle();
    return this.results;
  }
}

// Crawl up to 300 pages, three requests at a time, then print a summary.
const crawler = new Crawler({ concurrency: 3, intervalMs: 1000, maxPages: 300 });
crawler.run('https://example.com')
  .then(pages => {
    console.log(`Crawled ${pages.length} pages`);
    const totalWords = pages.reduce((sum, page) => sum + page.wordCount, 0);
    // Whole-number average; report 0 when nothing was crawled.
    const avgWords = pages.length ? Math.trunc(totalWords / pages.length) : 0;
    console.log('Avg word count:', avgWords);
  })
  .catch(err => {
    console.error('Crawl failed:', err);
    process.exitCode = 1;
  });

Concurrency sweet spot: 3–5 concurrent requests per domain is polite and unlikely to trigger rate limiting. Increase to 10+ only for APIs or sites you control.

Storing Crawl Results in SQLite

const Database = require('better-sqlite3');

// Open (or create) the crawl database and make sure the pages table exists.
const db = new Database('./crawl.db');
db.exec(`
  CREATE TABLE IF NOT EXISTS pages (
    id          INTEGER PRIMARY KEY AUTOINCREMENT,
    url         TEXT UNIQUE NOT NULL,
    title       TEXT,
    h1          TEXT,
    word_count  INTEGER,
    status_code INTEGER,
    crawled_at  TEXT DEFAULT (datetime('now'))
  )
`);

// Insert a page, or refresh every stored column when the URL already exists.
const upsert = db.prepare(`
  INSERT INTO pages (url, title, h1, word_count, status_code)
  VALUES (@url, @title, @h1, @wordCount, @statusCode)
  ON CONFLICT(url) DO UPDATE SET
    title       = excluded.title,
    h1          = excluded.h1,
    word_count  = excluded.word_count,
    status_code = excluded.status_code,
    crawled_at  = datetime('now')
`);

// Batch insert with a transaction (much faster than one-by-one)
const batchInsert = db.transaction(pages => {
  // Crawl results do not always carry every column (e.g. no statusCode), and
  // each named parameter must be bound, so default absent fields to NULL and
  // pass only the fields the statement uses.
  for (const { url, title = null, h1 = null, wordCount = null, statusCode = null } of pages) {
    upsert.run({ url, title, h1, wordCount, statusCode });
  }
});

// After crawl completes
batchInsert(crawlResults);
console.log('Saved', db.prepare('SELECT COUNT(*) as n FROM pages').get().n, 'pages');

Crawling JS-Heavy SPAs with SnapAPI

When a site is built with React, Vue, or Angular, axios.get() returns an empty shell. Swap in SnapAPI's /v1/scrape to get fully-rendered HTML for each page:

const axios = require('axios');
const cheerio = require('cheerio');

/**
 * Ask SnapAPI's scrape endpoint for a page and return the `html` field of
 * its response.
 *
 * The API key is read from the SNAPAPI_KEY environment variable.
 *
 * @param {string} url - Page to render.
 * @returns {Promise<*>} The `html` property of the response payload.
 * @throws Whatever the HTTP client throws when the request fails.
 */
async function fetchRendered(url) {
  const endpoint = 'https://api.snapapi.pics/v1/scrape';
  const payload = {
    url,
    waitFor: 'networkidle',
    blockAds: true,
    stealth: false, // enable if site has bot protection
  };
  const requestConfig = {
    headers: { 'X-Api-Key': process.env.SNAPAPI_KEY },
  };

  const response = await axios.post(endpoint, payload, requestConfig);
  return response.data.html;
}

/**
 * Breadth-first crawl for JS-heavy sites: the same loop as the BFS crawler,
 * with fetchRendered() used in place of fetchPage().
 *
 * A page whose render fails or returns no HTML is skipped, so one bad page
 * does not discard the results collected so far.
 *
 * @param {string} seedUrl - First URL to visit.
 * @param {number} [maxPages=50] - Stop once this many pages were recorded.
 * @param {number} [delayMs=2000] - Pause between rendered pages.
 * @returns {Promise<Array<{url: string, title: string}>>} One record per
 *   rendered page, in visiting order.
 */
async function crawlSPA(seedUrl, maxPages = 50, delayMs = 2000) {
  const visited = new Set();
  const queue   = [seedUrl];
  const results = [];

  while (queue.length && results.length < maxPages) {
    const url = queue.shift();
    if (visited.has(url)) continue;
    visited.add(url);

    let html;
    try {
      html = await fetchRendered(url);
    } catch (err) {
      // fetchRendered() throws on failure; skip this page and keep going.
      console.warn(`Skipping ${url}:`, err?.message ?? err);
      continue;
    }
    if (!html) continue;

    const $ = cheerio.load(html);
    results.push({ url, title: $('title').text().trim() });

    for (const link of extractLinks(html, url)) {
      if (!visited.has(link)) queue.push(link);
    }

    // Pause between pages, skipped when the loop is about to end anyway.
    if (queue.length && results.length < maxPages) {
      await new Promise(resolve => setTimeout(resolve, delayMs));
    }
  }
  return results;
}

Crawl any website — static or SPA

SnapAPI renders pages in a real Chromium browser. Swap one function call and your crawler handles React, Vue, and Angular sites. 200 free requests/month.

Try SnapAPI Free →