Choose Your Tool First
| Site type | Tool | Why |
|---|---|---|
| Static HTML (no JS) | fetch + Cheerio | Fast, lightweight, no browser needed |
| Server-rendered (Next.js, Rails) | fetch + Cheerio | HTML is in the initial response |
| SPA (React, Vue, Angular) | Playwright | Content rendered after JS execution |
| Infinite scroll / lazy load | Playwright | Need to trigger scroll events |
| Protected (Cloudflare, DataDome) | SnapAPI stealth | Handles bot protection automatically |
| Structured data extraction | SnapAPI /extract | Schema-based, no CSS selector writing |
Cheerio: Static HTML Scraping
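npm install cheerio node-fetch   # node-fetch is only needed on Node < 18; newer versions ship a global fetch, which the examples below use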
import * as cheerio from 'cheerio';
/**
 * Download a page and extract every product card found in its HTML.
 *
 * @param {string} url - Address of the page to fetch.
 * @returns {Promise<Array<{title: string, price: string, url: (string|undefined), image: (string|undefined)}>>}
 *   One entry per `.product-card` element. `url` and `image` are the raw
 *   `href` / `src` attribute values (undefined when the attribute is absent).
 * @throws {Error} When the server responds with a non-2xx status.
 */
async function scrapeProducts(url) {
  // Send browser-like request headers along with the request.
  const res = await fetch(url, {
    headers: {
      'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36',
      'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
      'Accept-Language': 'en-US,en;q=0.5',
    },
  });

  // fetch() resolves on HTTP errors too; without this check an error page
  // would be parsed and silently produce an empty product list.
  if (!res.ok) {
    throw new Error(`Request to ${url} failed with HTTP ${res.status}`);
  }

  const html = await res.text();
  const $ = cheerio.load(html);

  const products = [];
  $('.product-card').each((_, el) => {
    const card = $(el);
    products.push({
      title: card.find('.product-title').text().trim(),
      price: card.find('.price').text().trim(),
      url: card.find('a').attr('href'),
      image: card.find('img').attr('src'),
    });
  });
  return products;
}
Pagination with Cheerio
/**
 * Walk a paginated listing (`?page=1`, `?page=2`, ...) and collect the trimmed
 * text of every `.item` element until no `a[rel="next"]` link is present.
 *
 * @param {string} baseUrl - Absolute URL of the listing; an existing query
 *   string is preserved and only the `page` parameter is set or replaced.
 * @param {number} [maxPages=Infinity] - Upper bound on the number of pages to
 *   fetch, so a site that always advertises a next link cannot loop forever.
 * @returns {Promise<string[]>} Item texts in page order.
 * @throws {Error} When any page responds with a non-2xx status.
 */
async function scrapeAllPages(baseUrl, maxPages = Infinity) {
  const allItems = [];
  let page = 1;
  let hasNextPage = true;

  while (hasNextPage && page <= maxPages) {
    // Build the URL via searchParams so a baseUrl that already carries a
    // query string yields "...&page=N" instead of a second "?".
    const pageUrl = new URL(baseUrl);
    pageUrl.searchParams.set('page', String(page));

    const res = await fetch(pageUrl.href, { headers: { 'User-Agent': '...' } });
    if (!res.ok) {
      throw new Error(`Request to ${pageUrl.href} failed with HTTP ${res.status}`);
    }
    const $ = cheerio.load(await res.text());

    // Extract items from current page
    $('.item').each((_, el) => {
      allItems.push($(el).text().trim());
    });

    // Check for next page link
    hasNextPage = $('a[rel="next"]').length > 0;
    page += 1;

    // Polite delay (1-1.5 s, jittered) — only when another request will follow.
    if (hasNextPage && page <= maxPages) {
      await new Promise((resolve) => setTimeout(resolve, 1000 + Math.random() * 500));
    }
  }
  return allItems;
}
Playwright: SPA Scraping
import { chromium } from 'playwright';
import * as cheerio from 'cheerio';
/**
 * Render a JavaScript-driven page in headless Chromium and extract the
 * title and price of every `.product-card` element.
 *
 * @param {string} url - Page to open.
 * @returns {Promise<Array<{title: (string|undefined), price: (string|undefined)}>>}
 *   One entry per product card found in the rendered DOM.
 * @throws {Error} Propagates navigation and selector timeouts from Playwright;
 *   the browser is closed before the error reaches the caller.
 */
async function scrapeSPA(url) {
  const browser = await chromium.launch({ headless: true });
  // try/finally guarantees the browser process is shut down even when
  // navigation or waitForSelector times out.
  try {
    const context = await browser.newContext({
      userAgent: 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7)...',
    });

    // Block images and fonts to speed up load
    await context.route('**/*', (route) => {
      const type = route.request().resourceType();
      if (['image', 'media', 'font'].includes(type)) return route.abort();
      return route.continue();
    });

    const page = await context.newPage();
    await page.goto(url, { waitUntil: 'networkidle', timeout: 20000 });

    // Wait for the specific content to load
    await page.waitForSelector('.product-list', { timeout: 10000 });

    // Option A: Use evaluate for simple extraction (runs inside the page)
    const products = await page.evaluate(() => {
      return [...document.querySelectorAll('.product-card')].map((el) => ({
        title: el.querySelector('.title')?.textContent?.trim(),
        price: el.querySelector('.price')?.textContent?.trim(),
      }));
    });

    // Option B: Pass rendered HTML to Cheerio
    // (shown for illustration; the function returns the Option A result)
    const html = await page.content();
    const $ = cheerio.load(html);
    // ... use Cheerio as normal

    return products;
  } finally {
    await browser.close();
  }
}
Infinite scroll
/**
 * Collect the trimmed text of elements on an infinitely scrolling page by
 * repeatedly scrolling to the bottom until enough items are gathered or a
 * scroll produces nothing new.
 *
 * Items are de-duplicated by their text, so two elements with identical text
 * count once.
 *
 * @param {string} url - Page to open.
 * @param {string} itemSelector - CSS selector matching one item element.
 * @param {number} [maxItems=200] - Maximum number of items to return.
 * @returns {Promise<string[]>} At most `maxItems` unique item texts, in the
 *   order they were first seen.
 * @throws {Error} Propagates Playwright errors; the browser is closed first.
 */
async function scrapeInfiniteScroll(url, itemSelector, maxItems = 200) {
  const browser = await chromium.launch();
  // try/finally so a navigation or evaluation failure cannot leak the browser.
  try {
    const page = await browser.newPage();
    await page.goto(url, { waitUntil: 'networkidle' });

    const items = new Set();
    let previousCount = 0;

    while (items.size < maxItems) {
      // Collect visible items
      const newItems = await page.evaluate(
        (sel) => [...document.querySelectorAll(sel)].map((el) => el.textContent.trim()),
        itemSelector,
      );
      for (const text of newItems) items.add(text);

      // No new items after scroll — reached end
      if (items.size === previousCount) break;
      previousCount = items.size;

      // Already have enough: skip the extra scroll and wait.
      if (items.size >= maxItems) break;

      // Scroll to bottom to trigger next load
      await page.evaluate(() => window.scrollTo(0, document.body.scrollHeight));
      await page.waitForTimeout(1500); // wait for next batch to load
    }

    // A whole batch is added at once, so the set may overshoot the cap.
    return [...items].slice(0, maxItems);
  } finally {
    await browser.close();
  }
}
SnapAPI: Managed Scraping
For sites with bot protection or when you don't want to manage Playwright infrastructure:
// Scrape full rendered HTML (after JS execution)
const res = await fetch('https://api.snapapi.pics/v1/scrape', {
  method: 'POST',
  headers: { 'X-Api-Key': process.env.SNAPAPI_KEY, 'Content-Type': 'application/json' },
  body: JSON.stringify({
    url: 'https://protected-site.com/products',
    stealth: true,
    waitForSelector: '.product-list',
    waitUntil: 'networkidle',
  }),
});
// fetch() resolves on HTTP errors too; fail loudly instead of trying to
// destructure `html` out of an error payload.
if (!res.ok) {
  throw new Error(`SnapAPI scrape request failed with HTTP ${res.status}`);
}
const { html } = await res.json();
// Parse with Cheerio as usual
const $ = cheerio.load(html);
// Schema-based extraction — no CSS selectors needed
// (the response is bound to its own name so this snippet can share a module
// with other code that already declares `res`)
const extractRes = await fetch('https://api.snapapi.pics/v1/extract', {
  method: 'POST',
  headers: { 'X-Api-Key': process.env.SNAPAPI_KEY, 'Content-Type': 'application/json' },
  body: JSON.stringify({
    url: 'https://example.com/product',
    // Each key names a field to extract; `type` and `description` describe it.
    schema: {
      title: { type: 'string', description: 'Product name' },
      price: { type: 'string', description: 'Price with currency' },
      rating: { type: 'number', description: 'Star rating 1-5' },
      reviewCount: { type: 'integer', description: 'Number of reviews' },
      inStock: { type: 'boolean', description: 'Whether available' },
      images: { type: 'array', items: { type: 'string' }, description: 'Image URLs' },
    },
  }),
});
// fetch() resolves on HTTP errors too; surface them before reading the body.
if (!extractRes.ok) {
  throw new Error(`SnapAPI extract request failed with HTTP ${extractRes.status}`);
}
const { data } = await extractRes.json();
// data = { title: 'Widget Pro', price: '$29.99', rating: 4.7, ... }
Data Storage Patterns
SQLite (local, fast, zero setup)
import Database from 'better-sqlite3';
// Open (or create) the local database file that holds scraped rows.
const db = new Database('./scrape.db');

// One row per product URL; `scraped_at` defaults to the insertion time.
db.exec(`
CREATE TABLE IF NOT EXISTS products (
id INTEGER PRIMARY KEY AUTOINCREMENT,
url TEXT UNIQUE,
title TEXT,
price TEXT,
scraped_at DATETIME DEFAULT CURRENT_TIMESTAMP
)
`);

// Insert a product, or refresh title/price/timestamp when its URL already exists.
const upsert = db.prepare(`
INSERT INTO products (url, title, price)
VALUES (@url, @title, @price)
ON CONFLICT(url) DO UPDATE SET
title = excluded.title,
price = excluded.price,
scraped_at = CURRENT_TIMESTAMP
`);

// Run the whole batch inside a single transaction (fast).
const insertMany = db.transaction((rows) => {
  for (const row of rows) {
    upsert.run(row);
  }
});

insertMany(products);
JSONL streaming (append-only log)
import { createWriteStream } from 'fs';
// Append every scraped item to ./data.jsonl as one JSON object per line,
// stamped with the time it was written.
const stream = createWriteStream('./data.jsonl', { flags: 'a' });
for (const item of products) {
  const line = `${JSON.stringify({ ...item, ts: Date.now() })}\n`;
  // write() returns false once the stream's internal buffer is full; wait for
  // 'drain' before writing more so a large result set is not held in memory.
  if (!stream.write(line)) {
    await new Promise((resolve) => stream.once('drain', resolve));
  }
}
stream.end();