Web Scraping with Python in 2026

From BeautifulSoup basics to async Playwright SPA scraping, anti-bot evasion, retry logic, and production data pipelines with PostgreSQL and S3.

Python · BeautifulSoup · Playwright · httpx · Scrapy · April 2026

Choose Your Stack

| Scenario | Stack |
| --- | --- |
| Static HTML pages | requests + BeautifulSoup |
| Many pages fast | httpx async + BeautifulSoup |
| JavaScript-rendered (SPA) | Playwright (sync or async) |
| Large-scale crawl | Scrapy |
| Bot-protected site | SnapAPI stealth API |
| Structured data extraction | SnapAPI /extract schema |

requests + BeautifulSoup

pip install requests beautifulsoup4 lxml
import requests
from bs4 import BeautifulSoup

# Browser-like request headers sent with every requests-based call that
# passes headers=HEADERS.
HEADERS = {
    "User-Agent": (
        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 "
        "(KHTML, like Gecko) Chrome/124.0.0.0 Safari/537.36"
    ),
    "Accept-Language": "en-US,en;q=0.9",
}

def scrape_products(url: str) -> list[dict]:
    """Download a static listing page and extract one record per product card.

    Args:
        url: Address of the page to fetch.

    Returns:
        One dict per ``.product-card`` element with the keys ``"title"``,
        ``"price"`` and ``"url"``. A field whose element (or ``href``
        attribute) is absent from a card is returned as ``""`` instead of
        aborting the whole scrape.

    Raises:
        requests.HTTPError: If the response status is 4xx/5xx.
        requests.RequestException: On connection failures or timeouts.
    """
    r = requests.get(url, headers=HEADERS, timeout=15)
    r.raise_for_status()
    soup = BeautifulSoup(r.text, "lxml")

    def text_of(card, selector: str) -> str:
        # select_one() returns None when nothing matches, so guard before
        # calling get_text() to keep one incomplete card from crashing the run.
        node = card.select_one(selector)
        return node.get_text(strip=True) if node is not None else ""

    products = []
    for card in soup.select(".product-card"):
        link = card.select_one("a")
        products.append({
            "title": text_of(card, ".title"),
            "price": text_of(card, ".price"),
            # An <a> without an href attribute also maps to "".
            "url":   link.get("href", "") if link is not None else "",
        })
    return products

Pagination loop

import time, random

def scrape_all_pages(base_url: str) -> list[str]:
    """Collect the text of every ``.item`` element across numbered pages.

    Pages are requested as ``{base_url}?page=N`` starting at ``N = 1``.
    Crawling stops when a page contains no ``.item`` elements or has no
    ``a[rel='next']`` link. A random 0.5-1.5 s pause separates requests.

    Args:
        base_url: Listing URL to which ``?page=N`` is appended.

    Returns:
        The stripped text of all matched items, in page order.

    Raises:
        requests.HTTPError: If any page answers with a 4xx/5xx status.
        requests.RequestException: On connection failures or timeouts.
    """
    all_items = []
    page = 1
    while True:
        r = requests.get(f"{base_url}?page={page}", headers=HEADERS, timeout=15)
        # Without this check an error page simply contains no ".item"
        # elements and would end pagination silently with partial data.
        r.raise_for_status()
        soup = BeautifulSoup(r.text, "lxml")
        items = [el.get_text(strip=True) for el in soup.select(".item")]
        if not items:
            break
        all_items.extend(items)
        if not soup.select_one("a[rel='next']"):
            break
        page += 1
        time.sleep(random.uniform(0.5, 1.5))   # polite delay
    return all_items

Async httpx: Parallel Scraping

pip install httpx[asyncio] beautifulsoup4
import httpx
import asyncio
from bs4 import BeautifulSoup

async def fetch(client: httpx.AsyncClient, url: str) -> dict:
    """Fetch one page and return its URL, ``<title>`` text and first ``<h1>``.

    Args:
        client: Shared async client used to issue the request.
        url: Page to download.

    Returns:
        A dict with the keys ``"url"``, ``"title"`` and ``"h1"``. The title
        or heading is ``""`` when the page has no such element.

    Raises:
        httpx.HTTPStatusError: If the final response is not a success status.
        httpx.HTTPError: On transport failures or timeouts.
    """
    # httpx does not follow redirects unless asked to; without this a
    # redirecting URL yields a body-less response and empty fields.
    r = await client.get(url, timeout=20, follow_redirects=True)
    # Do not parse error pages as if they were real content.
    r.raise_for_status()
    soup = BeautifulSoup(r.text, "lxml")
    # Look each element up once instead of once for the test and once for use.
    title = soup.find("title")
    h1 = soup.find("h1")
    return {
        "url": url,
        "title": title.get_text() if title is not None else "",
        "h1": h1.get_text(strip=True) if h1 is not None else "",
    }

async def scrape_batch(urls: list[str]) -> list[dict]:
    """Fetch all *urls* concurrently through one shared client.

    The client is capped at 10 connections (5 kept alive). Fetches that
    raised an ``Exception`` are left out of the returned list.

    Args:
        urls: Pages to download.

    Returns:
        The results of the fetches that did not fail, in input order.
    """
    pool_limits = httpx.Limits(max_connections=10, max_keepalive_connections=5)
    async with httpx.AsyncClient(
        limits=pool_limits,
        headers={"User-Agent": "Mozilla/5.0 ..."},
    ) as client:
        pending = [fetch(client, url) for url in urls]
        # return_exceptions=True hands failures back as values rather than
        # raising out of gather() on the first error.
        outcomes = await asyncio.gather(*pending, return_exceptions=True)

    pages = []
    for outcome in outcomes:
        if isinstance(outcome, Exception):
            continue
        pages.append(outcome)
    return pages

# Run: asyncio.run() starts an event loop, drives scrape_batch() to
# completion and returns its list of per-page results.
results = asyncio.run(scrape_batch(["https://a.com", "https://b.com", "https://c.com"]))

Playwright: JavaScript-Rendered Sites

pip install playwright
python -m playwright install chromium
from playwright.sync_api import sync_playwright
from bs4 import BeautifulSoup

def scrape_spa(url: str) -> list[dict]:
    """Render a JavaScript-driven page in headless Chromium and extract titles.

    Image, media and font requests are aborted to speed up loading. The page
    is considered ready once the network is idle and ``.product-list`` is
    present; its HTML is then parsed with BeautifulSoup.

    Args:
        url: Page to render.

    Returns:
        One ``{"title": ...}`` dict per ``.product-card`` element. A card
        without a ``.title`` element gets ``""`` as its title.

    Raises:
        playwright.sync_api.Error: If navigation fails or times out (20 s),
            or ``.product-list`` does not appear within 10 s.
    """
    blocked_types = {"image", "media", "font"}

    def handle_route(route) -> None:
        # Block images/fonts/media to speed up loading
        if route.request.resource_type in blocked_types:
            route.abort()
        else:
            route.continue_()

    with sync_playwright() as pw:
        browser = pw.chromium.launch(headless=True)
        try:
            context = browser.new_context(
                user_agent="Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7)...",
                locale="en-US",
            )
            page = context.new_page()
            page.route("**/*", handle_route)

            page.goto(url, wait_until="networkidle", timeout=20000)
            page.wait_for_selector(".product-list", timeout=10000)

            soup = BeautifulSoup(page.content(), "lxml")
        finally:
            # Close the browser even when navigation or the selector wait
            # raises, so a timeout does not skip the cleanup.
            browser.close()

    products = []
    for el in soup.select(".product-card"):
        # select_one() returns None for a card without a title element.
        title = el.select_one(".title")
        products.append(
            {"title": title.get_text(strip=True) if title is not None else ""}
        )
    return products

Async Playwright for parallel scraping

import asyncio
from playwright.async_api import async_playwright

async def scrape(url: str) -> str:
    """Render *url* in its own headless Chromium instance and return the HTML.

    Args:
        url: Page to load; navigation waits for network idle (20 s timeout).

    Returns:
        The page's HTML content after rendering.

    Raises:
        playwright.async_api.Error: If navigation fails or times out.
    """
    async with async_playwright() as pw:
        browser = await pw.chromium.launch(headless=True)
        try:
            page = await browser.new_page()
            await page.goto(url, wait_until="networkidle", timeout=20000)
            return await page.content()
        finally:
            # Runs on success and on failure, so a navigation timeout does
            # not skip closing the browser.
            await browser.close()

async def main() -> list[str]:
    """Scrape the three example sites concurrently and return their HTML.

    Returns:
        The rendered HTML of each URL, in the same order as the URL list.
        Any exception raised by a single scrape propagates to the caller.
    """
    urls = ["https://a.com", "https://b.com", "https://c.com"]
    # Return the gathered pages instead of dropping them in an unused local.
    return await asyncio.gather(*[scrape(url) for url in urls])

# Entry point: start an event loop and run main() to completion.
asyncio.run(main())

Retry with Exponential Backoff

import time, random

def fetch_with_retry(url: str, max_retries: int = 3) -> requests.Response:
    """GET *url*, retrying transient failures with increasing delays.

    Retried: network-level errors, HTTP 429 and HTTP 5xx. Other 4xx
    responses are raised immediately because repeating the request cannot
    fix them. Waits follow the schedule 1 s, 3 s, 10 s (the last value is
    reused for further attempts) plus up to 1 s of random jitter. No wait
    happens after the final attempt.

    Args:
        url: Address to fetch.
        max_retries: Number of retries after the first attempt.

    Returns:
        The first response with a non-error status.

    Raises:
        requests.HTTPError: For a non-retryable 4xx response, or when the
            last attempt still returns 429/5xx.
        requests.RequestException: When the last attempt fails at the
            network level.
        RuntimeError: If ``max_retries`` is negative, so no attempt is made.
    """
    delays = [1, 3, 10]
    for attempt in range(max_retries + 1):
        is_last = attempt == max_retries
        try:
            r = requests.get(url, headers=HEADERS, timeout=15)
        except requests.RequestException:
            if is_last:
                raise
        else:
            retryable = r.status_code == 429 or r.status_code >= 500
            if not retryable or is_last:
                # Raises HTTPError for 4xx/5xx; otherwise the response is good.
                r.raise_for_status()
                return r
        # Jitter keeps parallel clients from retrying in lock-step.
        time.sleep(delays[min(attempt, len(delays) - 1)] + random.random())
    raise RuntimeError(f"Failed after {max_retries} retries")

SnapAPI: Managed Python Scraping

import requests, os

# The API key is read from the environment; a missing variable raises KeyError.
SNAPAPI_KEY = os.environ["SNAPAPI_KEY"]

# Full rendered HTML (JavaScript executed, anti-bot bypassed)
r = requests.post(
    "https://api.snapapi.pics/v1/scrape",
    headers={"X-Api-Key": SNAPAPI_KEY},
    json={
        "url": "https://protected-site.com",
        "stealth": True,
        "waitForSelector": ".product-list",
        "waitUntil": "networkidle",
    },
    timeout=60,
)
# Fail with a clear HTTPError on a 4xx/5xx reply rather than an opaque
# JSON/KeyError when the error body has no "html" field.
r.raise_for_status()
html = r.json()["html"]
soup = BeautifulSoup(html, "lxml")  # parse as usual
# Schema-based extraction — LLM extracts structured data
# Each schema entry names a field to extract, with its type and a description.
r = requests.post(
    "https://api.snapapi.pics/v1/extract",
    headers={"X-Api-Key": SNAPAPI_KEY},
    json={
        "url": "https://example.com/product",
        "schema": {
            "title":    {"type": "string",  "description": "Product name"},
            "price":    {"type": "string",  "description": "Price with currency"},
            "rating":   {"type": "number",  "description": "Star rating 1-5"},
            "in_stock": {"type": "boolean", "description": "Whether available"},
        },
    },
    timeout=60,
)
# Fail with a clear HTTPError on a 4xx/5xx reply rather than an opaque
# JSON/KeyError when the error body has no "data" field.
r.raise_for_status()
data = r.json()["data"]
print(data)  # {"title": "Widget Pro", "price": "$29.99", "rating": 4.7, "in_stock": True}

Storage: PostgreSQL with psycopg3

import psycopg

# Upsert the scraped products: insert new rows, and for a URL that already
# exists refresh its title, price and scraped_at timestamp.
with psycopg.connect(os.environ["DATABASE_URL"]) as conn:
    with conn.cursor() as cur:
        # One (url, title, price) tuple per product, matching the three
        # %s placeholders in the statement below.
        rows = []
        for product in products:
            rows.append((product["url"], product["title"], product["price"]))
        cur.executemany(
            """
            INSERT INTO products (url, title, price, scraped_at)
            VALUES (%s, %s, %s, NOW())
            ON CONFLICT (url) DO UPDATE SET
              title = EXCLUDED.title,
              price = EXCLUDED.price,
              scraped_at = NOW()
            """,
            rows,
        )
    # Persist the batch once the cursor has been closed.
    conn.commit()