Choose Your Stack
| Scenario | Stack |
|---|---|
| Static HTML pages | requests + BeautifulSoup |
| Many pages fast | httpx async + BeautifulSoup |
| JavaScript-rendered (SPA) | Playwright (sync or async) |
| Large-scale crawl | Scrapy |
| Bot-protected site | SnapAPI stealth API |
| Structured data extraction | SnapAPI /extract schema |
requests + BeautifulSoup
pip install requests beautifulsoup4 lxml
import requests
from bs4 import BeautifulSoup
# Browser-like default headers for the requests-based helpers.
HEADERS = {
    # Desktop Chrome on macOS; split into two literals only to keep lines short.
    "User-Agent": (
        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) "
        "AppleWebKit/537.36 (KHTML, like Gecko) Chrome/124.0.0.0 Safari/537.36"
    ),
    # Prefer US English, fall back to any English.
    "Accept-Language": "en-US,en;q=0.9",
}
def scrape_products(url: str) -> list[dict]:
    """Fetch a static listing page and extract one record per product card.

    Args:
        url: Address of the listing page.

    Returns:
        One dict per ``.product-card`` element with the keys ``"title"``,
        ``"price"`` and ``"url"``. ``"url"`` is the ``href`` of the first
        link in the card, exactly as written in the HTML. A value is
        ``None`` when the card has no matching ``.title`` / ``.price``
        element, no link, or a link without ``href``.

    Raises:
        requests.HTTPError: If the server answers with a 4xx/5xx status.
        requests.RequestException: On connection failure or timeout.
    """
    r = requests.get(url, headers=HEADERS, timeout=15)
    r.raise_for_status()
    soup = BeautifulSoup(r.text, "lxml")

    def text_of(card, selector):
        # select_one() returns None when nothing matches; one malformed card
        # must not abort extraction for the whole page.
        node = card.select_one(selector)
        return node.get_text(strip=True) if node is not None else None

    products = []
    for card in soup.select(".product-card"):
        link = card.select_one("a")
        products.append({
            "title": text_of(card, ".title"),
            "price": text_of(card, ".price"),
            # .get() tolerates an <a> that carries no href attribute.
            "url": link.get("href") if link is not None else None,
        })
    return products
Pagination loop
import time, random
def scrape_all_pages(base_url: str) -> list[str]:
    """Walk a ``page=N`` paginated listing and collect the text of every item.

    Pages are requested starting at 1. Crawling stops when a page has no
    ``.item`` elements, when it has no ``a[rel='next']`` link, or when a
    page after the first answers 404.

    Args:
        base_url: Listing URL; the ``page`` query parameter is added to it.

    Returns:
        The stripped text of every ``.item`` element, in page order.

    Raises:
        requests.HTTPError: On a 4xx/5xx response other than a 404 past
            the first page.
        requests.RequestException: On connection failure or timeout.
    """
    all_items = []
    page = 1
    while True:
        # Let requests build the query string so the page number is merged
        # correctly even when base_url already has query parameters.
        r = requests.get(
            base_url, params={"page": page}, headers=HEADERS, timeout=15
        )
        if r.status_code == 404 and page > 1:
            # Running off the end of the listing is a normal stop condition.
            break
        # Without this, an error page would simply contain no items and the
        # crawl would end silently with partial data.
        r.raise_for_status()
        soup = BeautifulSoup(r.text, "lxml")
        items = [el.get_text(strip=True) for el in soup.select(".item")]
        if not items:
            break
        all_items.extend(items)
        if not soup.select_one("a[rel='next']"):
            break
        page += 1
        time.sleep(random.uniform(0.5, 1.5))  # polite delay
    return all_items
Async httpx: Parallel Scraping
pip install httpx beautifulsoup4 lxml
import httpx
import asyncio
from bs4 import BeautifulSoup
async def fetch(client: httpx.AsyncClient, url: str) -> dict:
    """Download one page and report its ``<title>`` and first ``<h1>``.

    Args:
        client: Shared async HTTP client used to issue the GET request.
        url: Page to download.

    Returns:
        A dict with ``"url"``, ``"title"`` and ``"h1"``; the two text
        fields are empty strings when the page has no such element.
    """
    response = await client.get(url, timeout=20)
    document = BeautifulSoup(response.text, "lxml")

    title_tag = document.find("title")
    heading_tag = document.find("h1")

    # The title keeps its surrounding whitespace; only the h1 is stripped.
    title_text = title_tag.get_text() if title_tag else ""
    heading_text = heading_tag.get_text(strip=True) if heading_tag else ""

    return {"url": url, "title": title_text, "h1": heading_text}
async def scrape_batch(urls: list[str]) -> list[dict]:
    """Fetch many pages concurrently over one shared HTTP client.

    Args:
        urls: Pages to download.

    Returns:
        The ``fetch`` result for every URL that succeeded, in input order.
        URLs whose fetch raised are left out and reported through a
        ``logging`` warning instead of being dropped silently.
    """
    import logging

    log = logging.getLogger(__name__)
    max_parallel = 10
    limits = httpx.Limits(max_connections=max_parallel, max_keepalive_connections=5)
    headers = {"User-Agent": "Mozilla/5.0 ..."}

    # Admit only as many fetches as the pool has connections. Otherwise every
    # task starts at once and the surplus ones wait on the pool, where the
    # per-request timeout can expire before they ever get a connection.
    gate = asyncio.Semaphore(max_parallel)

    async def bounded_fetch(client, url):
        async with gate:
            return await fetch(client, url)

    async with httpx.AsyncClient(limits=limits, headers=headers) as client:
        outcomes = await asyncio.gather(
            *(bounded_fetch(client, url) for url in urls),
            return_exceptions=True,
        )

    pages = []
    for url, outcome in zip(urls, outcomes):
        # BaseException also covers cancellation objects that gather() may
        # hand back when return_exceptions=True.
        if isinstance(outcome, BaseException):
            log.warning("Skipping %s: %r", url, outcome)
        else:
            pages.append(outcome)
    return pages
# Example run: scrape three pages concurrently and keep the parsed records.
results = asyncio.run(
    scrape_batch(
        [
            "https://a.com",
            "https://b.com",
            "https://c.com",
        ]
    )
)
Playwright: JavaScript-Rendered Sites
pip install playwright
python -m playwright install chromium
from playwright.sync_api import sync_playwright
from bs4 import BeautifulSoup
def scrape_spa(url: str) -> list[dict]:
    """Render a JavaScript-driven page in headless Chromium and extract titles.

    The page is loaded with image, media and font requests aborted, then
    its rendered HTML is parsed once ``.product-list`` has appeared.

    Args:
        url: Page to render.

    Returns:
        One ``{"title": ...}`` dict per ``.product-card`` element. The
        title is ``None`` for a card that has no ``.title`` element.

    Raises:
        Whatever Playwright raises when navigation (20 s) or the wait for
        ``.product-list`` (10 s) exceeds its timeout.
    """
    def block_heavy_assets(route):
        # Block images/fonts to speed up loading
        if route.request.resource_type in ["image", "media", "font"]:
            route.abort()
        else:
            route.continue_()

    with sync_playwright() as pw:
        browser = pw.chromium.launch(headless=True)
        try:
            context = browser.new_context(
                user_agent="Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7)...",
                locale="en-US",
            )
            page = context.new_page()
            page.route("**/*", block_heavy_assets)
            page.goto(url, wait_until="networkidle", timeout=20000)
            page.wait_for_selector(".product-list", timeout=10000)
            html = page.content()
        finally:
            # Close the browser even when navigation or the selector wait
            # times out.
            browser.close()

    soup = BeautifulSoup(html, "lxml")
    products = []
    for el in soup.select(".product-card"):
        # select_one() returns None for a card without a title element.
        title = el.select_one(".title")
        products.append(
            {"title": title.get_text(strip=True) if title is not None else None}
        )
    return products
Async Playwright for parallel scraping
import asyncio
from playwright.async_api import async_playwright
async def scrape(url: str) -> str:
    """Return the rendered HTML of *url* from a throwaway headless Chromium.

    A new browser is launched for each call and closed again once the
    page content has been read.

    Args:
        url: Page to render.

    Returns:
        The page's HTML after navigation reached the ``networkidle`` state.
    """
    async with async_playwright() as playwright:
        chromium = await playwright.chromium.launch(headless=True)
        tab = await chromium.new_page()
        await tab.goto(url, wait_until="networkidle", timeout=20000)
        rendered_html = await tab.content()
        await chromium.close()
    return rendered_html
async def main() -> list[str]:
    """Render the three demo URLs concurrently with ``scrape``.

    Returns:
        The rendered HTML of each URL, in the same order as the URL list.

    Raises:
        The first exception raised by any ``scrape`` call.
    """
    urls = ["https://a.com", "https://b.com", "https://c.com"]
    # Return the pages so callers can use them; previously they were bound
    # to an unused local and discarded.
    return await asyncio.gather(*[scrape(url) for url in urls])
# Entry point: run main() to completion on a new event loop.
asyncio.run(main())
Retry with Exponential Backoff
import time, random
def fetch_with_retry(url: str, max_retries: int = 3) -> requests.Response:
    """GET *url*, retrying transient failures with increasing, jittered delays.

    Retried: transport errors (connection failure, timeout), HTTP 429 and
    HTTP 5xx. Other 4xx responses are not retried because repeating the
    same request cannot change the outcome.

    Args:
        url: Address to fetch.
        max_retries: Number of retries after the first attempt, so at most
            ``max_retries + 1`` requests are sent.

    Returns:
        The first response with a non-error status.

    Raises:
        requests.HTTPError: For a non-retryable 4xx response, or when the
            last attempt still answers 429/5xx.
        requests.RequestException: When the last attempt fails at the
            transport level.
        RuntimeError: If ``max_retries`` is negative, so no attempt is made.
    """
    delays = [1, 3, 10]
    for attempt in range(max_retries + 1):
        is_last = attempt == max_retries
        try:
            r = requests.get(url, headers=HEADERS, timeout=15)
        except requests.RequestException:
            if is_last:
                raise
        else:
            retryable = r.status_code == 429 or r.status_code >= 500
            if not retryable or is_last:
                # Raises for any 4xx/5xx, including a final 429/5xx, so the
                # caller gets an HTTPError that carries the response.
                r.raise_for_status()
                return r
        # Only reached when another attempt follows. Attempts beyond the
        # table reuse the longest delay; jitter avoids synchronised retries.
        time.sleep(delays[min(attempt, len(delays) - 1)] + random.random())
    raise RuntimeError(f"Failed after {max_retries} retries")
SnapAPI: Managed Python Scraping
import requests, os
# The API key comes from the environment; a missing variable raises KeyError
# here, before any request is sent.
SNAPAPI_KEY = os.environ["SNAPAPI_KEY"]

# Full rendered HTML (JavaScript executed, anti-bot bypassed)
r = requests.post(
    "https://api.snapapi.pics/v1/scrape",
    headers={"X-Api-Key": SNAPAPI_KEY},
    json={
        "url": "https://protected-site.com",
        "stealth": True,
        "waitForSelector": ".product-list",
        "waitUntil": "networkidle",
    },
    timeout=60,
)
# Fail on a 4xx/5xx response here instead of hitting a confusing
# KeyError or JSON decode error on the next line.
r.raise_for_status()
html = r.json()["html"]
soup = BeautifulSoup(html, "lxml")  # parse as usual
# Schema-based extraction — LLM extracts structured data
r = requests.post(
    "https://api.snapapi.pics/v1/extract",
    headers={"X-Api-Key": SNAPAPI_KEY},
    json={
        "url": "https://example.com/product",
        # Each schema entry names an output field, its type, and a hint
        # describing what to extract.
        "schema": {
            "title": {"type": "string", "description": "Product name"},
            "price": {"type": "string", "description": "Price with currency"},
            "rating": {"type": "number", "description": "Star rating 1-5"},
            "in_stock": {"type": "boolean", "description": "Whether available"},
        },
    },
    timeout=60,
)
# Fail on a 4xx/5xx response here instead of hitting a confusing
# KeyError or JSON decode error on the next line.
r.raise_for_status()
data = r.json()["data"]
print(data)  # {"title": "Widget Pro", "price": "$29.99", "rating": 4.7, "in_stock": True}
Storage: PostgreSQL with psycopg3
import psycopg
# Upsert every scraped product in one batch, keyed on url: a new url inserts
# a row, a known url has its title, price and scraped_at refreshed.
with psycopg.connect(os.environ["DATABASE_URL"]) as conn:
    with conn.cursor() as cur:
        upsert_sql = """
INSERT INTO products (url, title, price, scraped_at)
VALUES (%s, %s, %s, NOW())
ON CONFLICT (url) DO UPDATE SET
title = EXCLUDED.title,
price = EXCLUDED.price,
scraped_at = NOW()
"""
        # One parameter tuple per product, in the placeholder order above.
        rows = [(p["url"], p["title"], p["price"]) for p in products]
        cur.executemany(upsert_sql, rows)
    conn.commit()