Static Pages: fetch + Cheerio
For server-rendered HTML pages (Wikipedia, most blogs, government sites, e-commerce product pages), you don't need a browser at all. fetch + Cheerio is faster and lighter.
npm install cheerio
const cheerio = require('cheerio');
/**
 * Download a server-rendered page and build one record per `article.post` element.
 *
 * @param {string} url - Address of the page to fetch.
 * @returns {Promise<Array<{title: string, url: (string|undefined), date: (string|undefined), excerpt: string}>>}
 *   One entry per matched article, in the order Cheerio yields them. `url` and
 *   `date` are `undefined` when the corresponding attribute is absent.
 * @throws {Error} `HTTP <status>` when the response is not OK.
 */
async function scrapeStatic(url) {
  const requestOptions = {
    headers: {
      'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36'
    }
  };
  const response = await fetch(url, requestOptions);
  if (!response.ok) {
    throw new Error(`HTTP ${response.status}`);
  }

  const $ = cheerio.load(await response.text());

  // Convert the selection to a plain array so each article maps straight to a record.
  return $('article.post')
    .toArray()
    .map((article) => {
      const post = $(article);
      return {
        title: post.find('h2').text().trim(),
        url: post.find('a').attr('href'),
        date: post.find('time').attr('datetime'),
        excerpt: post.find('p.excerpt').text().trim()
      };
    });
}
// Example usage: scrape the blog index and print the extracted posts.
// NOTE: top-level `await` only works inside an ES module, while this snippet loads
// cheerio with `require`; in a CommonJS script, wrap these two lines in an async function.
const posts = await scrapeStatic('https://blog.example.com');
console.log(posts);
Pagination with Cheerio
/**
 * Walk a paginated listing (`?page=1`, `?page=2`, ...) and collect every `li.item`.
 *
 * Pagination stops when a page has no `a.next-page` link or when `maxPages`
 * pages have been fetched.
 *
 * @param {string} baseUrl - Absolute URL of the listing; an existing query string is preserved.
 * @param {object} [options]
 * @param {number} [options.delayMs=1000] - Pause between consecutive page requests.
 * @param {number} [options.maxPages=Infinity] - Upper bound on pages fetched.
 * @returns {Promise<Array<{title: string, price: number}>>} Items from all pages;
 *   `price` is `NaN` when the price text contains no number.
 * @throws {Error} `HTTP <status> for <url>` when a page request is not OK.
 */
async function scrapeAllPages(baseUrl, { delayMs = 1000, maxPages = Infinity } = {}) {
  const allItems = [];

  for (let page = 1; page <= maxPages; page++) {
    // Build the URL with the URL API so a base URL that already has a query
    // string yields `...?sort=asc&page=2` rather than `...?sort=asc?page=2`.
    const pageUrl = new URL(baseUrl);
    pageUrl.searchParams.set('page', String(page));

    const response = await fetch(pageUrl.href, {
      headers: { 'User-Agent': 'Mozilla/5.0 (compatible; MyScraper/1.0)' }
    });
    // Without this check an error page would be parsed as an empty listing.
    if (!response.ok) {
      throw new Error(`HTTP ${response.status} for ${pageUrl.href}`);
    }

    const $ = cheerio.load(await response.text());

    // Extract items on this page
    $('li.item').each((_, el) => {
      const item = $(el);
      allItems.push({
        title: item.find('.title').text().trim(),
        // Drop currency symbols and thousands separators before parsing,
        // so "$1,299.50" becomes 1299.5 instead of 1.
        price: parseFloat(item.find('.price').text().replace(/[^0-9.]/g, ''))
      });
    });
    console.log(`Page ${page}: ${allItems.length} total items`);

    // Check if there's a next page link
    if ($('a.next-page').length === 0) break;

    // Rate limit between requests; skipped after the last page.
    await new Promise((resolve) => setTimeout(resolve, delayMs));
  }

  return allItems;
}
SPAs: Playwright
Apps built with React, Vue, or Angular (and Next.js pages that are not server-rendered) render their content client-side, so a plain fetch only gets the empty HTML shell. You need a real browser to execute the JavaScript.
const { chromium } = require('playwright');
/**
 * Render a client-side app in headless Chromium and extract every `.product-card`.
 *
 * @param {string} url - Page to open.
 * @returns {Promise<Array<{name: (string|undefined), price: number, rating: number, url: (string|undefined)}>>}
 *   One record per card. `name`/`url` are `undefined` and `price`/`rating` are
 *   `NaN` when the corresponding element is missing from a card.
 * @throws Propagates any launch, navigation or evaluation error; the browser is
 *   closed before the error reaches the caller.
 */
async function scrapeSPA(url) {
  const browser = await chromium.launch({ headless: true });
  try {
    const page = await browser.newPage();
    await page.goto(url, { waitUntil: 'networkidle' });

    // Extract data from the rendered DOM (this callback runs inside the page).
    return await page.evaluate(() => {
      return Array.from(document.querySelectorAll('.product-card')).map(el => ({
        name: el.querySelector('.name')?.textContent?.trim(),
        price: parseFloat(el.querySelector('.price')?.textContent?.replace(/[^0-9.]/g, '')),
        rating: parseFloat(el.querySelector('[data-rating]')?.dataset?.rating),
        url: el.querySelector('a')?.href
      }));
    });
  } finally {
    // Always release the browser process, even when navigation or extraction fails.
    await browser.close();
  }
}
Playwright: Intercept API Responses (Faster)
Instead of scraping the rendered DOM, intercept the API calls the SPA makes. This gives you clean JSON without HTML parsing.
const { chromium } = require('playwright');
/**
 * Load a page in headless Chromium and collect the JSON bodies of the API
 * responses it triggers, instead of parsing the rendered DOM.
 *
 * For each response whose URL contains `apiPattern` and whose status is 200:
 * an array under `data` (or, failing that, `items`) contributes its elements;
 * a non-array `data`/`items` value contributes that single value; otherwise the
 * whole JSON body is captured as one record.
 *
 * @param {string} url - Page to open.
 * @param {string} apiPattern - Substring that identifies the API calls to capture.
 * @returns {Promise<Array<*>>} Captured records, in the order their bodies finished parsing.
 * @throws Propagates launch/navigation errors; the browser is closed first.
 */
async function interceptApiData(url, apiPattern) {
  // Upper bound on how long to wait for response bodies that are still being
  // read after the scroll pause, so a never-ending response cannot hang the scrape.
  const BODY_GRACE_MS = 5000;

  const browser = await chromium.launch({ headless: true });
  try {
    const page = await browser.newPage();
    const capturedData = [];
    const pending = [];

    const capture = async (response) => {
      try {
        if (!response.url().includes(apiPattern) || response.status() !== 200) return;
        const json = await response.json();
        if (json == null) return;
        const payload = json.data || json.items;
        // Only spread real arrays: spreading an object-valued `data` would throw
        // and the record would be lost.
        const records = Array.isArray(payload) ? payload : [payload || json];
        for (const record of records) capturedData.push(record);
      } catch {
        // Best effort: a matching response without a JSON body is skipped.
      }
    };

    // Intercept matching XHR/fetch responses. Keep each promise so the body
    // reads can be awaited before the browser is closed.
    page.on('response', (response) => {
      pending.push(capture(response));
    });

    await page.goto(url, { waitUntil: 'networkidle' });

    // Scroll to trigger lazy-loaded content
    await page.evaluate(() => window.scrollTo(0, document.body.scrollHeight));
    await page.waitForTimeout(1000);

    // Let in-flight body reads finish (bounded) before tearing the browser down.
    let graceTimer;
    const grace = new Promise((resolve) => {
      graceTimer = setTimeout(resolve, BODY_GRACE_MS);
    });
    try {
      await Promise.race([Promise.all(pending), grace]);
    } finally {
      clearTimeout(graceTimer);
    }

    return capturedData;
  } finally {
    await browser.close();
  }
}
// Intercepts calls to /api/products
// (the second argument is matched as a substring of each response URL).
// NOTE: top-level `await` only works inside an ES module, while this snippet loads
// Playwright with `require`; in a CommonJS script, wrap this call in an async function.
const products = await interceptApiData('https://shop.example.com', '/api/products');
Playwright: Handle Infinite Scroll
/**
 * Scrape an infinite-scroll page by scrolling to the bottom repeatedly and
 * collecting the trimmed text of every `.item` element.
 *
 * Scrolling stops as soon as a scroll reveals no new item text, or after
 * `maxScrolls` scrolls. Items are collected after every scroll, including the
 * last one, so content loaded by the final scroll is not lost.
 *
 * @param {string} url - Page to open.
 * @param {number} [maxScrolls=10] - Maximum number of scroll-to-bottom steps.
 * @returns {Promise<string[]>} Unique item texts in first-seen order.
 * @throws Propagates launch/navigation errors; the browser is closed first.
 */
async function scrapeInfiniteScroll(url, maxScrolls = 10) {
  const browser = await chromium.launch({ headless: true });
  try {
    const page = await browser.newPage();
    await page.goto(url, { waitUntil: 'networkidle' });

    const allItems = new Set();

    // Adds the currently rendered items; resolves to true when any were new.
    const collect = async () => {
      const texts = await page.$$eval('.item', els =>
        els.map(el => el.textContent.trim())
      );
      const sizeBefore = allItems.size;
      for (const text of texts) allItems.add(text);
      return allItems.size > sizeBefore;
    };

    await collect();
    for (let i = 0; i < maxScrolls; i++) {
      // Scroll to bottom
      await page.evaluate(() => window.scrollTo(0, document.body.scrollHeight));
      await page.waitForTimeout(1500); // Wait for new content to load

      // Stop if no new items appeared
      if (!(await collect())) break;
    }

    return [...allItems];
  } finally {
    await browser.close();
  }
}
Production Scraping: SnapAPI
For production workloads where you need to get past anti-bot protection, want stealth mode, and don't want to manage browsers yourself, use a dedicated scraping API. SnapAPI's /v1/scrape and /v1/extract endpoints handle the browser layer for you.
// Scrape raw HTML + text + links
const response = await fetch('https://api.snapapi.pics/v1/scrape', {
  method: 'POST',
  headers: { 'X-Api-Key': process.env.SNAPAPI_KEY, 'Content-Type': 'application/json' },
  body: JSON.stringify({
    url: 'https://news.ycombinator.com',
    wait_for: 'networkidle',
    stealth: true
  })
});
// Fail fast on a non-2xx reply instead of destructuring fields from an error body.
if (!response.ok) throw new Error(`HTTP ${response.status}`);
// Only `html` is used below; `text`, `links` and `title` are destructured to show
// the other fields this example reads from the response.
const { html, text, links, title } = await response.json();

// Now parse with cheerio — best of both worlds
const cheerio = require('cheerio');
const $ = cheerio.load(html);

// Build a ranked list: one entry per `.athing` row, rank = 1-based position.
const stories = [];
$('.athing').each((i, el) => {
  const titleEl = $(el).find('.titleline > a');
  stories.push({ rank: i + 1, title: titleEl.text(), url: titleEl.attr('href') });
});
console.log(stories.slice(0, 5));
// Extract structured data with schema (no CSS selector knowledge needed)
const extractRes = await fetch('https://api.snapapi.pics/v1/extract', {
  method: 'POST',
  headers: { 'X-Api-Key': process.env.SNAPAPI_KEY, 'Content-Type': 'application/json' },
  body: JSON.stringify({
    url: 'https://shop.example.com/product/123',
    // Field name -> expected type for each value to extract.
    schema: {
      name: 'string',
      price: 'number',
      currency: 'string',
      in_stock: 'boolean',
      images: 'string[]',
      description: 'string',
      rating: 'number',
      review_count: 'number'
    }
  })
});
// Fail fast on a non-2xx reply so `data` is never read from an error body.
if (!extractRes.ok) throw new Error(`HTTP ${extractRes.status}`);
const { data } = await extractRes.json();
// data.name, data.price, data.in_stock — clean structured output
Rate Limiting and Polite Scraping
/**
 * Minimum-interval rate limiter: spaces calls so that at most
 * `requestsPerSecond` of them proceed per second (no burst allowance).
 *
 * Each `wait()` call reserves its time slot synchronously, so callers that
 * call `wait()` concurrently are spaced one interval apart instead of all
 * waking up at the same moment.
 */
class RateLimiter {
  /**
   * @param {number} requestsPerSecond - Allowed rate; must be a positive number.
   * @throws {RangeError} When `requestsPerSecond` is not greater than zero.
   */
  constructor(requestsPerSecond) {
    if (!(requestsPerSecond > 0)) {
      throw new RangeError('requestsPerSecond must be a positive number');
    }
    // Minimum spacing between two requests, in milliseconds.
    this.interval = 1000 / requestsPerSecond;
    // Timestamp (ms since epoch) of the most recently reserved slot.
    this.lastRequest = 0;
  }

  /**
   * Resolve once the caller is allowed to issue its next request.
   * @returns {Promise<void>}
   */
  async wait() {
    const now = Date.now();
    const slot = Math.max(now, this.lastRequest + this.interval);
    // Reserve the slot before awaiting so a concurrent caller queues behind it.
    this.lastRequest = slot;
    const delay = slot - now;
    if (delay > 0) {
      await new Promise((resolve) => setTimeout(resolve, delay));
    }
  }
}
// Shared limiter instance; scrapeWithRateLimit awaits limiter.wait() before each request.
const limiter = new RateLimiter(2); // Max 2 requests/second
/**
 * Fetch a list of URLs one at a time, waiting on the shared `limiter` before
 * each request. A failure on one URL never aborts the batch.
 *
 * @param {Iterable<string>} urls - URLs to download, processed in order.
 * @returns {Promise<Array<{url: string, html: string} | {url: string, error: string}>>}
 *   One result per URL, in input order: `html` on success, `error` (a message)
 *   when the request throws or the response status is not OK.
 */
async function scrapeWithRateLimit(urls) {
  const results = [];
  for (const url of urls) {
    await limiter.wait();
    try {
      const response = await fetch(url, {
        headers: { 'User-Agent': 'Mozilla/5.0 (compatible; ResearchBot/1.0)' }
      });
      // Report HTTP failures as errors rather than storing an error page as `html`.
      if (!response.ok) throw new Error(`HTTP ${response.status}`);
      results.push({ url, html: await response.text() });
    } catch (err) {
      results.push({ url, error: err.message });
    }
  }
  return results;
}
Scraping Authenticated Pages
const { chromium } = require('playwright');
async function scrapeAuthenticated(loginUrl, credentials, targetUrl) {
const browser = await chromium.launch({ headless: true });
const context = await browser.newContext();
const page = await context.newPage();
// Log in
await page.goto(loginUrl);
await page.fill('#email', credentials.email);
await page.fill('#password', credentials.password);
await page.click('[type="submit"]');
await page.waitForNavigation({ waitUntil: 'networkidle' });
// Save session state for reuse
const storageState = await context.storageState();
require('fs').writeFileSync('session.json', JSON.stringify(storageState));
// Scrape authenticated page
await page.goto(targetUrl, { waitUntil: 'networkidle' });
const data = await page.evaluate(() => document.body.innerHTML);
await browser.close();
return data;
}
// Reuse saved session (skip login)
async function scrapeWithSavedSession(url) {
const browser = await chromium.launch({ headless: true });
const context = await browser.newContext({
storageState: JSON.parse(require('fs').readFileSync('session.json'))
});
const page = await context.newPage();
await page.goto(url, { waitUntil: 'networkidle' });
const html = await page.content();
await browser.close();
return html;
}
Storing Scraped Data
// SQLite storage with better-sqlite3 (sync, fast for local use)
const Database = require('better-sqlite3');
const db = new Database('scraped.db');

// One row per URL; scraped_at defaults to the insertion time (Unix seconds).
db.exec(`
  CREATE TABLE IF NOT EXISTS pages (
    id INTEGER PRIMARY KEY,
    url TEXT UNIQUE,
    title TEXT,
    content TEXT,
    scraped_at INTEGER DEFAULT (unixepoch())
  )
`);

// Upsert keyed on url. Unlike INSERT OR REPLACE, which deletes the old row and
// inserts a new one (so the row gets a new id), this updates the existing row
// in place and refreshes scraped_at.
const insert = db.prepare(`
  INSERT INTO pages (url, title, content) VALUES (?, ?, ?)
  ON CONFLICT(url) DO UPDATE SET
    title = excluded.title,
    content = excluded.content,
    scraped_at = unixepoch()
`);

/**
 * Insert a scraped page, or update the stored row when the URL already exists.
 * @param {string} url - Page URL (unique key).
 * @param {string} title - Page title.
 * @param {string} content - Page content to store.
 */
function savePage(url, title, content) {
  insert.run(url, title, content);
}

// Batch insert many rows efficiently: all rows are written in one transaction.
// Each element must provide `url`, `title` and `content`.
const insertMany = db.transaction((pages) => {
  for (const page of pages) insert.run(page.url, page.title, page.content);
});

// NOTE(review): `pages` is not defined in this snippet — presumably the array of
// scraped results from an earlier step; confirm before running.
insertMany(pages);
// PostgreSQL with pg for production
const { Pool } = require('pg');
// Connection pool configured from the DATABASE_URL environment variable;
// saveScrapedData sends its query through this pool.
const pool = new Pool({ connectionString: process.env.DATABASE_URL });
/**
 * Upsert one scraped page into `scraped_pages`, keyed on its URL.
 *
 * A new URL is inserted; an existing URL has its `data` replaced and its
 * `scraped_at` timestamp refreshed.
 *
 * @param {string} url - Page URL (conflict key).
 * @param {*} data - JSON-serializable payload; stored as its JSON string.
 * @returns {Promise<void>} Resolves once the query has completed.
 */
async function saveScrapedData(url, data) {
  const upsertSql =
    'INSERT INTO scraped_pages (url, data, scraped_at) VALUES ($1, $2, NOW()) ON CONFLICT (url) DO UPDATE SET data = $2, scraped_at = NOW()';
  const serialized = JSON.stringify(data);
  const params = [url, serialized];

  await pool.query(upsertSql, params);
}
Tool Selection Guide
| Scenario | Best tool | Why |
|---|---|---|
| Static HTML site | fetch + Cheerio | Fastest, no browser overhead |
| React/Vue/Next.js SPA | Playwright | networkidle wait handles async rendering |
| Cloudflare-protected site | SnapAPI (stealth) | Maintained evasion, no arms race |
| Structured data extraction | SnapAPI /extract | Schema-based, no CSS selector maintenance |
| Full site crawl (>10K pages) | Crawlee + Apify | Built-in queue, retry, deduplication |
| Authenticated pages | Playwright + storageState | Full session control |
| Serverless / Lambda | SnapAPI | No binary, no layer, 50ms cold start |