Web Scraping with Python in 2026: BeautifulSoup, Playwright & SnapAPI

Choose Your Stack

Scenario	Stack
Static HTML pages	requests + BeautifulSoup
Many pages fast	httpx async + BeautifulSoup
JavaScript-rendered (SPA)	Playwright (sync or async)
Large-scale crawl	Scrapy
Bot-protected site	SnapAPI stealth API
Structured data extraction	SnapAPI /extract schema

requests + BeautifulSoup

pip install requests beautifulsoup4 lxml

import requests
from bs4 import BeautifulSoup

# Browser-like request headers passed to the requests calls in this file.
HEADERS = {
    "User-Agent": (
        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 "
        "(KHTML, like Gecko) Chrome/124.0.0.0 Safari/537.36"
    ),
    "Accept-Language": "en-US,en;q=0.9",
}

def scrape_products(url: str) -> list[dict]:
    """Fetch *url* and return one dict per ``.product-card`` element.

    Each dict has the keys ``title``, ``price`` and ``url``, taken from the
    card's ``.title`` text, ``.price`` text and the ``href`` of its first
    link that has one. Cards missing any of the three are skipped, so a
    single malformed card does not abort the whole page.

    Raises:
        requests.HTTPError: If the response has a 4xx/5xx status.
        requests.RequestException: On connection errors or timeouts.
    """
    r = requests.get(url, headers=HEADERS, timeout=15)
    r.raise_for_status()
    soup = BeautifulSoup(r.text, "lxml")

    products = []
    for card in soup.select(".product-card"):
        title = card.select_one(".title")
        price = card.select_one(".price")
        link = card.select_one("a[href]")
        # select_one() returns None when nothing matches; calling
        # .get_text() or indexing ["href"] on it would raise.
        if title is None or price is None or link is None:
            continue
        products.append({
            "title": title.get_text(strip=True),
            "price": price.get_text(strip=True),
            "url":   link["href"],
        })
    return products

Pagination loop

import time, random

def scrape_all_pages(base_url: str) -> list[str]:
    """Collect the text of every ``.item`` element across paginated results.

    Pages are requested as ``page=1, 2, ...`` on *base_url*. The loop stops
    when a page contains no ``.item`` elements or has no ``a[rel='next']``
    link. A random 0.5-1.5 s pause is taken between page requests.

    Raises:
        requests.HTTPError: If any page responds with a 4xx/5xx status.
        requests.RequestException: On connection errors or timeouts.
    """
    all_items = []
    page = 1
    while True:
        # Pass the page number via ``params`` so it is merged correctly even
        # when base_url already carries a query string.
        r = requests.get(
            base_url, params={"page": page}, headers=HEADERS, timeout=15
        )
        # Without this an error page (no .item elements) would look like the
        # end of the listing and silently truncate the results.
        r.raise_for_status()
        soup = BeautifulSoup(r.text, "lxml")
        items = [el.get_text(strip=True) for el in soup.select(".item")]
        if not items:
            break
        all_items.extend(items)
        if not soup.select_one("a[rel='next']"):
            break
        page += 1
        time.sleep(random.uniform(0.5, 1.5))   # polite delay
    return all_items

Async httpx: Parallel Scraping

pip install httpx beautifulsoup4 lxml

import httpx
import asyncio
from bs4 import BeautifulSoup

async def fetch(client: httpx.AsyncClient, url: str) -> dict:
    """Download *url* with *client* and return its ``<title>`` and first ``<h1>``.

    Returns a dict with the keys ``url`` (the URL as requested), ``title``
    and ``h1``; a missing element yields an empty string.

    Raises:
        httpx.HTTPStatusError: If the final response has an error status.
        httpx.HTTPError: On transport errors or timeouts.
    """
    # httpx does not follow redirects unless asked to; without this a 301/302
    # response would be parsed as if it were the page itself.
    r = await client.get(url, timeout=20, follow_redirects=True)
    r.raise_for_status()
    soup = BeautifulSoup(r.text, "lxml")
    title = soup.find("title")
    h1 = soup.find("h1")
    return {
        "url": url,
        "title": title.get_text() if title is not None else "",
        "h1": h1.get_text(strip=True) if h1 is not None else "",
    }

async def scrape_batch(urls: list[str]) -> list[dict]:
    """Fetch all *urls* concurrently and return the successful results.

    One shared ``httpx.AsyncClient`` is used, capped at 10 connections
    (5 kept alive). URLs whose fetch raised are logged at WARNING level and
    left out of the returned list; the remaining results keep the order of
    *urls*.
    """
    import logging

    log = logging.getLogger(__name__)
    urls = list(urls)  # allow any iterable; it is walked twice below
    limits = httpx.Limits(max_connections=10, max_keepalive_connections=5)
    headers = {"User-Agent": "Mozilla/5.0 ..."}
    async with httpx.AsyncClient(limits=limits, headers=headers) as client:
        results = await asyncio.gather(
            *(fetch(client, url) for url in urls), return_exceptions=True
        )

    pages = []
    # gather() returns results in argument order, so zip pairs each outcome
    # with the URL that produced it.
    for url, result in zip(urls, results):
        if isinstance(result, BaseException):
            log.warning("Skipping %s: %r", url, result)
            continue
        pages.append(result)
    return pages

# Run: asyncio.run() drives the coroutine to completion and returns its result.
results = asyncio.run(
    scrape_batch(
        [
            "https://a.com",
            "https://b.com",
            "https://c.com",
        ]
    )
)

Playwright: JavaScript-Rendered Sites

pip install playwright
python -m playwright install chromium

from playwright.sync_api import sync_playwright
from bs4 import BeautifulSoup


def scrape_spa(url: str) -> list[dict]:
    """Render *url* in headless Chromium and return its product titles.

    The page is loaded until the network is idle (20 s timeout), then the
    function waits up to 10 s for ``.product-list`` to appear. The rendered
    HTML is parsed with BeautifulSoup, and one ``{"title": ...}`` dict is
    returned per ``.product-card`` that contains a ``.title`` element.
    """
    with sync_playwright() as pw:
        browser = pw.chromium.launch(headless=True)
        try:
            context = browser.new_context(
                user_agent="Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7)...",
                locale="en-US",
            )
            page = context.new_page()

            # Block images/fonts to speed up loading
            page.route(
                "**/*",
                lambda route: route.abort()
                if route.request.resource_type in ["image", "media", "font"]
                else route.continue_(),
            )

            page.goto(url, wait_until="networkidle", timeout=20000)
            page.wait_for_selector(".product-list", timeout=10000)
            soup = BeautifulSoup(page.content(), "lxml")
        finally:
            # Close the browser even when navigation or the selector wait
            # raises.
            browser.close()

    products = []
    for el in soup.select(".product-card"):
        title = el.select_one(".title")
        # select_one() returns None for a card without a .title; skip it
        # rather than fail on .get_text().
        if title is not None:
            products.append({"title": title.get_text(strip=True)})
    return products

Async Playwright for parallel scraping

import asyncio

from playwright.async_api import async_playwright


async def scrape(url: str) -> str:
    """Return the rendered HTML of *url* using a fresh headless Chromium.

    Each call launches and closes its own browser. Navigation waits for
    network idle, with a 20 s timeout.
    """
    async with async_playwright() as pw:
        browser = await pw.chromium.launch(headless=True)
        try:
            page = await browser.new_page()
            await page.goto(url, wait_until="networkidle", timeout=20000)
            return await page.content()
        finally:
            # Close the browser even when navigation raises.
            await browser.close()


async def main() -> list[str]:
    """Scrape the example URLs concurrently and return their HTML, in order."""
    urls = ["https://a.com", "https://b.com", "https://c.com"]
    results = await asyncio.gather(*[scrape(url) for url in urls])
    # Return the pages so callers can use them; they were previously
    # gathered and then discarded.
    return results


asyncio.run(main())

Retry with Exponential Backoff

import random
import time


def fetch_with_retry(url: str, max_retries: int = 3) -> requests.Response:
    """GET *url*, retrying on HTTP 429 and on request errors.

    Up to ``max_retries + 1`` attempts are made. The wait before retry
    number 1, 2, 3+ is 1 s, 3 s and 10 s respectively; after a 429 response
    up to one extra second of random jitter is added.

    Returns:
        The first response whose status passes ``raise_for_status()``.

    Raises:
        requests.RequestException: The error from the final attempt,
            including ``requests.HTTPError`` when that attempt returned 429
            or another 4xx/5xx status.
        Exception: If ``max_retries`` is negative, so no attempt is made.
    """
    delays = [1, 3, 10]
    for attempt in range(max_retries + 1):
        # Reuse the last delay once the attempts outnumber the schedule.
        delay = delays[min(attempt, len(delays) - 1)]
        try:
            r = requests.get(url, headers=HEADERS, timeout=15)
            # Only back off on 429 when another attempt remains; on the last
            # attempt fall through so raise_for_status() reports the 429
            # instead of sleeping and then raising a generic error.
            if r.status_code == 429 and attempt < max_retries:
                time.sleep(delay + random.random())
                continue
            r.raise_for_status()
            return r
        except requests.RequestException:
            if attempt == max_retries:
                raise
            time.sleep(delay)
    raise Exception(f"Failed after {max_retries} retries")

SnapAPI: Managed Python Scraping

import os

import requests

SNAPAPI_KEY = os.environ["SNAPAPI_KEY"]

# Full rendered HTML (JavaScript executed, anti-bot bypassed)
r = requests.post(
    "https://api.snapapi.pics/v1/scrape",
    headers={"X-Api-Key": SNAPAPI_KEY},
    json={
        "url": "https://protected-site.com",
        "stealth": True,
        "waitForSelector": ".product-list",
        "waitUntil": "networkidle",
    },
    timeout=60,
)
# Surface an API error as an HTTPError instead of a KeyError on "html".
r.raise_for_status()
html = r.json()["html"]
soup = BeautifulSoup(html, "lxml")  # parse as usual

# Schema-based extraction — LLM extracts structured data
r = requests.post(
    "https://api.snapapi.pics/v1/extract",
    headers={"X-Api-Key": SNAPAPI_KEY},
    json={
        "url": "https://example.com/product",
        "schema": {
            "title": {"type": "string", "description": "Product name"},
            "price": {"type": "string", "description": "Price with currency"},
            "rating": {"type": "number", "description": "Star rating 1-5"},
            "in_stock": {"type": "boolean", "description": "Whether available"},
        },
    },
    timeout=60,
)
# Surface an API error as an HTTPError instead of a KeyError on "data".
r.raise_for_status()
data = r.json()["data"]
print(data)
# {"title": "Widget Pro", "price": "$29.99", "rating": 4.7, "in_stock": True}

Storage: PostgreSQL with psycopg3

import os

import psycopg

# Upsert the scraped rows: insert each product, or refresh title, price and
# scraped_at when a row with the same url already exists.
with psycopg.connect(os.environ["DATABASE_URL"]) as conn:
    with conn.cursor() as cur:
        cur.executemany(
            """
            INSERT INTO products (url, title, price, scraped_at)
            VALUES (%s, %s, %s, NOW())
            ON CONFLICT (url) DO UPDATE
            SET title = EXCLUDED.title,
                price = EXCLUDED.price,
                scraped_at = NOW()
            """,
            [(p["url"], p["title"], p["price"]) for p in products],
        )
    conn.commit()