Static Pages: axios + Cheerio
npm install axios cheerio
const axios = require('axios');
const cheerio = require('cheerio');
/**
 * Download a static HTML page and pull a summary record out of every
 * post-like element on it.
 *
 * @param {string} url - Address of the page to fetch.
 * @returns {Promise<Array<{title: string, url: (string|undefined), description: string, date: string}>>}
 *   One record per matched element; records whose title is empty are dropped.
 */
async function scrape(url) {
  const requestConfig = {
    headers: { 'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36' },
    timeout: 15000
  };
  const response = await axios.get(url, requestConfig);
  const $ = cheerio.load(response.data);

  // First descendant of `node` matching `selector`.
  const firstMatch = (node, selector) => $(node).find(selector).first();

  return $('article.post, .blog-card, li.result')
    .toArray()
    .map((node) => {
      const dateNode = firstMatch(node, 'time, .date');
      return {
        title: firstMatch(node, 'h2, h3, .title').text().trim(),
        url: firstMatch(node, 'a').attr('href'),
        description: firstMatch(node, 'p, .excerpt, .description').text().trim(),
        // Prefer the `datetime` attribute; fall back to the element's visible text.
        date: dateNode.attr('datetime') || dateNode.text().trim()
      };
    })
    .filter((entry) => entry.title);
}
// Example usage. Top-level `await` is only valid in ES modules, and this file
// loads its dependencies with require() (CommonJS), so the call is wrapped in
// an async function whose rejection is reported instead of left unhandled.
async function runBlogExample() {
  const posts = await scrape('https://blog.example.com');
  console.log(posts);
}

runBlogExample().catch((err) => {
  console.error(err);
  process.exitCode = 1;
});
Pagination
/**
 * Walk a `?page=N` paginated listing and collect the title/link of every item.
 *
 * Pages are requested one after another, starting at page 1. The walk stops
 * when `maxPages` is reached, a page yields no items, a page has no
 * "next" link, or the server answers 404.
 *
 * @param {string} baseUrl - Absolute URL of the listing; any query string it
 *   already carries is preserved and only the `page` parameter is set.
 * @param {number} [maxPages=10] - Upper bound on the number of pages requested.
 * @param {number} [delayMs=1000] - Pause between consecutive requests.
 * @returns {Promise<Array<{title: string, href: (string|undefined)}>>}
 * @throws {TypeError} If `baseUrl` cannot be parsed as an absolute URL.
 * @throws Any request error other than an HTTP 404 response.
 */
async function scrapeAllPages(baseUrl, maxPages = 10, delayMs = 1000) {
  const allItems = [];

  for (let page = 1; page <= maxPages; page++) {
    // Throttle between requests only, so no time is wasted after the last page.
    if (page > 1) {
      await new Promise(r => setTimeout(r, delayMs));
    }

    // Build the URL with the URL API so a base that already contains a query
    // string (e.g. "?sort=new") stays valid instead of getting a second "?".
    const pageUrl = new URL(baseUrl);
    pageUrl.searchParams.set('page', String(page));

    let response;
    try {
      response = await axios.get(pageUrl.href, {
        headers: { 'User-Agent': 'Mozilla/5.0 (compatible; ResearchBot/1.0)' },
        timeout: 15000
      });
    } catch (err) {
      if (err.response?.status === 404) break; // Ran past the last page
      throw err;
    }

    const $ = cheerio.load(response.data);
    const items = [];
    $('.item, article, li.result').each((_, el) => {
      items.push({
        // .first() keeps the text of several matching headings from being joined.
        title: $(el).find('h2, .title').first().text().trim(),
        href: $(el).find('a').first().attr('href')
      });
    });

    if (items.length === 0) break; // No more results
    allItems.push(...items);

    // Check for next page link
    const hasNext = $('a[rel="next"], .pagination .next, a:contains("Next")').length > 0;
    if (!hasNext) break;
  }

  return allItems;
}
SPAs: Playwright
const { chromium } = require('playwright');
const cheerio = require('cheerio');
/**
 * Render a JavaScript-driven page in headless Chromium and extract its
 * product entries.
 *
 * @param {string} url - Page to open.
 * @param {{useCheerio?: boolean}} [options]
 * @param {boolean} [options.useCheerio=false] - When false (default), extract
 *   inside the page via `page.evaluate` and return
 *   `{ name, price, sku, inStock }` records. When true, take the rendered HTML
 *   and parse it with Cheerio, returning `{ title, price }` records whose
 *   values are the raw element text.
 * @returns {Promise<Array<object>>} The extracted records.
 */
async function scrapeSPA(url, { useCheerio = false } = {}) {
  const browser = await chromium.launch({ headless: true });
  try {
    const page = await browser.newPage();
    // Block images/fonts for faster loading
    await page.route('**/*.{png,jpg,jpeg,gif,webp,woff,woff2,ttf}', route => route.abort());
    await page.goto(url, { waitUntil: 'networkidle' });

    if (useCheerio) {
      // Option 2: Get full HTML and use Cheerio (more familiar syntax)
      const html = await page.content();
      const $ = cheerio.load(html);
      const altData = [];
      $('.product').each((_, el) => {
        altData.push({ title: $(el).find('h2').text(), price: $(el).find('.price').text() });
      });
      return altData;
    }

    // Option 1: Extract via page.evaluate (runs in browser context)
    return await page.evaluate(() => {
      return Array.from(document.querySelectorAll('.product, [data-product]')).map(el => ({
        name: el.querySelector('.name, h2')?.textContent?.trim(),
        // parseFloat yields NaN when the element has no `.price` child.
        price: parseFloat(el.querySelector('.price')?.textContent?.replace(/[^\d.]/g, '')),
        sku: el.dataset.sku || el.querySelector('[data-sku]')?.dataset?.sku,
        inStock: !el.querySelector('.out-of-stock')
      }));
    });
  } finally {
    // Always release the browser, including when navigation or extraction throws.
    await browser.close();
  }
}
Retry Logic and Error Handling
/**
 * GET a URL with axios, retrying failures that are likely to be transient.
 *
 * A failure is retried when there is no HTTP response at all (e.g. a network
 * error or timeout) or when the status is 429, 500, 502, 503 or 504. Any
 * other failure is rethrown immediately.
 *
 * @param {string} url - Address to request.
 * @param {object} [options={}] - Extra axios request options. `timeout`
 *   defaults to 15000 ms and `headers` are merged over a default User-Agent;
 *   values supplied here take precedence over both defaults.
 * @param {number} [maxRetries=3] - Retries allowed after the first attempt.
 * @returns {Promise<object>} The axios response of the first successful attempt.
 * @throws The last error once retries are exhausted, or the first
 *   non-retryable error.
 */
async function fetchWithRetry(url, options = {}, maxRetries = 3) {
  // Wait before retry 1, 2, 3; later retries reuse the final value.
  const delays = [1000, 3000, 10000];
  const retryableStatuses = [429, 500, 502, 503, 504];

  for (let attempt = 0; attempt <= maxRetries; attempt++) {
    try {
      return await axios.get(url, {
        // Default only: it must come before the spread so a caller-supplied
        // options.timeout is honoured instead of being overwritten.
        timeout: 15000,
        ...options,
        headers: {
          'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7)',
          ...options.headers
        }
      });
    } catch (err) {
      const isRetryable = !err.response || retryableStatuses.includes(err.response.status);
      if (!isRetryable || attempt === maxRetries) throw err;
      const delay = delays[attempt] ?? delays[delays.length - 1];
      console.log(`Retry ${attempt + 1}/${maxRetries} after ${delay}ms: ${url}`);
      await new Promise(r => setTimeout(r, delay));
    }
  }
}
Production Scraping: SnapAPI
For sites behind Cloudflare or other anti-bot protection, or pages that need JavaScript rendering, SnapAPI handles the headless-browser layer for you.
/**
 * Fetch a page's rendered HTML through the SnapAPI `/v1/scrape` endpoint and
 * extract its product cards with Cheerio.
 *
 * @param {string} url - Page to scrape.
 * @returns {Promise<Array<{name: string, price: number}>>} One record per
 *   `.product-card` element; `price` is NaN when no number can be parsed.
 * @throws {Error} If the SNAPAPI_KEY environment variable is not set, or the
 *   API answers with a non-2xx status.
 */
async function scrapeProduction(url) {
  const apiKey = process.env.SNAPAPI_KEY;
  if (!apiKey) {
    throw new Error('SNAPAPI_KEY environment variable is not set');
  }

  // Returns full HTML of the rendered page
  const response = await fetch('https://api.snapapi.pics/v1/scrape', {
    method: 'POST',
    headers: { 'X-Api-Key': apiKey, 'Content-Type': 'application/json' },
    body: JSON.stringify({
      url,
      wait_for: 'networkidle',
      stealth: true, // Bypass anti-bot detection
      block_ads: true
    })
  });

  // fetch() resolves on HTTP errors too; fail here with the status rather
  // than later with an unrelated parsing error.
  if (!response.ok) {
    throw new Error(`SnapAPI scrape request failed: ${response.status} ${response.statusText}`);
  }

  // The original snippet also destructured text, links and title from the
  // payload; only html is used here.
  const { html } = await response.json();

  // Parse the rendered HTML with Cheerio
  const $ = cheerio.load(html);
  const products = [];
  $('.product-card').each((_, el) => {
    products.push({
      name: $(el).find('.name').text().trim(),
      price: parseFloat($(el).find('.price').text().replace(/[^\d.]/g, ''))
    });
  });
  return products;
}
/**
 * Or extract structured data with a schema (no CSS selectors needed).
 *
 * Posts the URL and a field schema to the SnapAPI `/v1/extract` endpoint and
 * returns the `data` member of the JSON reply.
 *
 * @param {string} url - Page to extract from.
 * @returns {Promise<*>} The `data` value of the API response.
 * @throws {Error} If the SNAPAPI_KEY environment variable is not set, or the
 *   API answers with a non-2xx status.
 */
async function extractStructured(url) {
  const apiKey = process.env.SNAPAPI_KEY;
  if (!apiKey) {
    throw new Error('SNAPAPI_KEY environment variable is not set');
  }

  const response = await fetch('https://api.snapapi.pics/v1/extract', {
    method: 'POST',
    headers: { 'X-Api-Key': apiKey, 'Content-Type': 'application/json' },
    body: JSON.stringify({
      url,
      schema: { name: 'string', price: 'number', in_stock: 'boolean', images: 'string[]' }
    })
  });

  // fetch() resolves on HTTP errors too; surface them instead of silently
  // returning whatever `data` an error payload happens to contain.
  if (!response.ok) {
    throw new Error(`SnapAPI extract request failed: ${response.status} ${response.statusText}`);
  }

  const payload = await response.json();
  return payload.data;
}
Storing Scraped Data
// SQLite with better-sqlite3 (sync, zero config)
const fs = require('fs');
const Database = require('better-sqlite3');

const db = new Database('data.db');
db.exec(`CREATE TABLE IF NOT EXISTS items (
id INTEGER PRIMARY KEY,
url TEXT UNIQUE,
data TEXT,
scraped_at INTEGER DEFAULT (unixepoch())
)`);

// `url` is UNIQUE, so INSERT OR REPLACE overwrites the row previously stored
// for the same url instead of failing.
const upsert = db.prepare(
  'INSERT OR REPLACE INTO items (url, data) VALUES (?, ?)'
);

// Batch insert with transaction (100x faster than one-by-one)
const insertBatch = db.transaction((items) => {
  for (const item of items) upsert.run(item.url, JSON.stringify(item));
});

// Usage. Top-level `await` is only valid in ES modules, and this snippet uses
// require() (CommonJS), so the async work runs inside a function whose
// rejection is reported instead of left unhandled.
async function runStorageExample() {
  const results = await scrapeSPA('https://shop.example.com');

  // NOTE(review): rows are keyed on item.url, but the records scrapeSPA()
  // builds via page.evaluate have name/price/sku/inStock and no url field —
  // confirm which field should be the unique key.
  insertBatch(results);
  console.log(`Saved ${results.length} items`);

  // Save as JSONL (newline-delimited JSON) — works with jq, BigQuery, Spark
  const stream = fs.createWriteStream('output.jsonl', { flags: 'a' });
  for (const item of results) {
    stream.write(JSON.stringify(item) + '\n');
  }
  stream.end();
}

runStorageExample().catch((err) => {
  console.error(err);
  process.exitCode = 1;
});
Which Tool for Which Site
| Site type | Best tool | Notes |
|---|---|---|
| Static HTML (blogs, gov) | axios + Cheerio | Fastest, no browser overhead |
| React/Vue/Next.js SPA | Playwright | networkidle handles async render |
| Cloudflare-protected | SnapAPI (stealth) | Maintained evasion layer |
| Need structured data | SnapAPI /extract | No CSS selector maintenance |
| Site crawl (>10K pages) | Crawlee + Playwright | Built-in queue + deduplication |
| Serverless function | SnapAPI | No binary, works on Lambda |
Free to try: SnapAPI's /v1/scrape endpoint returns the fully rendered HTML plus clean text for any URL, with 200 free calls per month. Get key →