Web Scraping with Python Requests: Complete Guide (2026)

Basic Scraping with Requests

The requests library handles HTTP calls; BeautifulSoup parses HTML. Together they handle most static page scraping.

import requests
from bs4 import BeautifulSoup

def scrape_page(url):
    """Download ``url`` and extract its heading, paragraph text, and links.

    Args:
        url: Address of the page to fetch.

    Returns:
        A dict with three keys: ``'title'`` (text of the first ``<h1>``, or
        ``None`` when the page has none), ``'paragraphs'`` (list of the
        stripped text of every ``<p>``), and ``'links'`` (list of
        ``{'text', 'href'}`` dicts, one per ``<a>`` that has an ``href``).

    Raises:
        Whatever ``requests.get`` raises on connection problems or timeout,
        and whatever ``raise_for_status()`` raises for an error status.
    """
    # A browser-like User-Agent; the 10 second timeout stops a slow server
    # from hanging the call indefinitely.
    request_headers = {
        'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/125.0.0.0 Safari/537.36'
    }
    resp = requests.get(url, headers=request_headers, timeout=10)
    resp.raise_for_status()

    document = BeautifulSoup(resp.text, 'html.parser')

    # Title: first <h1> if the page has one.
    heading = document.find('h1')
    title = None if heading is None else heading.get_text(strip=True)

    # Paragraph text, in document order.
    paragraphs = []
    for node in document.find_all('p'):
        paragraphs.append(node.get_text(strip=True))

    # Only anchors that actually carry an href attribute.
    links = []
    for anchor in document.find_all('a', href=True):
        links.append({
            'text': anchor.get_text(strip=True),
            'href': anchor['href'],
        })

    return {'title': title, 'paragraphs': paragraphs, 'links': links}

Always set a User-Agent header — many sites block the default Python requests UA. Set timeout to avoid hanging on slow servers.

Sessions and Cookies

Use requests.Session() to persist cookies, headers, and connection pooling across requests. Essential for login-protected content and multi-page scraping.

# One Session object carries cookies, default headers, and pooled
# connections across every request made through it.
session = requests.Session()
session.headers.update({
    'User-Agent': 'Mozilla/5.0...',
    'Accept-Language': 'en-US,en;q=0.9'
})

# Login — fail fast on an HTTP error instead of silently scraping a
# logged-out page. Some sites answer a rejected login with 200 plus an
# error page, so also inspect the response body/URL if your target does that.
login = session.post('https://example.com/login', data={
    'username': 'user', 'password': 'pass'
}, timeout=10)
login.raise_for_status()

# Now scrape authenticated pages — cookies persist
profile = session.get('https://example.com/dashboard', timeout=10)
profile.raise_for_status()
soup = BeautifulSoup(profile.text, 'html.parser')

Handling Pagination

def scrape_all_pages(base_url):
    """Walk a ``?page=N`` paginated listing and collect every product card.

    Pages are requested starting at 1. The walk stops at the first page that
    has no ``.product-card`` elements or no ``a.next-page`` link.

    Args:
        base_url: Listing URL; the page number is sent as the ``page`` query
            parameter, so the URL may already carry its own query string.

    Returns:
        A list of ``{'name', 'price'}`` dicts, one per product card, in page
        order. A value is ``None`` when the card lacks the matching
        ``.title`` / ``.price`` element.

    Raises:
        Whatever ``requests.get`` raises on connection problems or timeout,
        and whatever ``raise_for_status()`` raises for an error status.
    """
    # Local import: the snippet's import lines do not bring in ``time``.
    import time

    # Defined here so the function does not depend on a global ``headers``.
    headers = {
        'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/125.0.0.0 Safari/537.36'
    }

    all_items = []
    page = 1

    while True:
        # ``params`` lets requests build the query string, which stays
        # correct even when base_url already contains a "?".
        response = requests.get(
            base_url, params={'page': page}, headers=headers, timeout=10
        )
        # Stop on an error status rather than parsing an error page as data.
        response.raise_for_status()
        soup = BeautifulSoup(response.text, 'html.parser')

        items = soup.select('.product-card')
        if not items:
            break

        for item in items:
            # A card may be missing a field; record None instead of crashing.
            name_node = item.select_one('.title')
            price_node = item.select_one('.price')
            all_items.append({
                'name': name_node.get_text(strip=True) if name_node else None,
                'price': price_node.get_text(strip=True) if price_node else None,
            })

        # Check for next page
        if not soup.select_one('a.next-page'):
            break

        page += 1
        time.sleep(1)  # Be polite

    return all_items

Async Scraping with httpx

httpx is a modern HTTP client with a requests-like API that supports both synchronous and asynchronous use. With its AsyncClient you can scrape multiple pages concurrently without threads.

import httpx
import asyncio
from bs4 import BeautifulSoup

async def scrape_urls(urls, max_concurrent=5):
    """Fetch many pages concurrently and return a summary for each success.

    Args:
        urls: Iterable of page URLs to fetch.
        max_concurrent: Maximum number of requests in flight at once.

    Returns:
        A list of ``{'url', 'title', 'status'}`` dicts in the same relative
        order as ``urls``. ``'title'`` is the text of the first ``<h1>``, or
        the URL itself when the page has none. URLs whose fetch raised are
        omitted from the result (best effort); non-2xx responses are kept,
        with their code in ``'status'``.
    """
    # The semaphore caps how many fetches run at the same time.
    semaphore = asyncio.Semaphore(max_concurrent)

    async def fetch(client, url):
        async with semaphore:
            resp = await client.get(url, timeout=15)
            soup = BeautifulSoup(resp.text, 'html.parser')
            # Look the heading up once and reuse the result.
            heading = soup.find('h1')
            title = heading.get_text(strip=True) if heading else url
            return {'url': url, 'title': title, 'status': resp.status_code}

    async with httpx.AsyncClient(headers={'User-Agent': 'Mozilla/5.0...'}) as client:
        tasks = [fetch(client, url) for url in urls]
        # return_exceptions=True: one failing URL does not abort the rest.
        results = await asyncio.gather(*tasks, return_exceptions=True)

    # BaseException, not Exception: a cancelled fetch is reported as a
    # CancelledError instance, which does not derive from Exception.
    return [r for r in results if not isinstance(r, BaseException)]

# Scrape the pages concurrently, at most 5 in flight at a time (the default
# max_concurrent). `url_list` is not defined in this snippet — supply your
# own list of URLs (e.g. 50 of them) before running.
data = asyncio.run(scrape_urls(url_list))

JS-Rendered Pages: The Limitation

Requests fetches raw HTML — it doesn't execute JavaScript. If the content you need is loaded via React, Vue, or AJAX calls, the HTML you receive will be missing that content (often it is just an empty application shell). For JS-rendered content, you need either a headless browser (Playwright, Selenium) or an API that renders pages for you.

SnapAPI: Scrape JS Pages Without a Browser

SnapAPI renders pages in a real browser and returns the result via REST. No Selenium or Playwright setup needed — just an HTTP call.

import httpx

# Scrape a JS-rendered page — returns fully rendered HTML
# (The key below is a placeholder; keep a real key out of source code.)
resp = httpx.post('https://api.snapapi.pics/v1/scrape',
    headers={'X-Api-Key': 'sk_live_your_key'},
    json={'url': 'https://react-app.com/products', 'stealth': True},
    timeout=30
)
# Surface HTTP errors (bad key, quota, upstream failure) here instead of as
# a confusing KeyError / JSON decoding error on the next line.
resp.raise_for_status()
html = resp.json()['html']
soup = BeautifulSoup(html, 'html.parser')
# Now parse as usual — all JS content is rendered

# Extract structured data directly — no parsing needed
# (The key below is a placeholder; keep a real key out of source code.)
resp = httpx.post('https://api.snapapi.pics/v1/extract',
    headers={'X-Api-Key': 'sk_live_your_key'},
    json={
        'url': 'https://react-app.com/products',
        # Describes the shape of the data to pull out of the page.
        'schema': {
            'products': [{'name': 'string', 'price': 'number', 'inStock': 'boolean'}]
        }
    },
    timeout=30
)
# Surface HTTP errors here instead of as a KeyError on the line below.
resp.raise_for_status()
products = resp.json()['data']['products']

SnapAPI handles JavaScript rendering, stealth mode, ad blocking, and cookie banner removal. The /extract endpoint returns structured JSON directly — no BeautifulSoup needed. 200 free requests/month, Python SDK available.