Basic Scraping with Requests
The requests library handles HTTP calls; BeautifulSoup parses HTML. Together they handle most static page scraping.
import requests
from bs4 import BeautifulSoup
def scrape_page(url):
    """Fetch ``url`` and extract its title, paragraph text, and links.

    Returns a dict with three keys:

    * ``'title'`` - text of the first ``<h1>``, or ``None`` if there is none.
    * ``'paragraphs'`` - stripped text of every ``<p>`` element.
    * ``'links'`` - one ``{'text': ..., 'href': ...}`` dict per ``<a>``
      element that has an ``href`` attribute.

    An HTTP error status (4xx/5xx) is raised via ``raise_for_status()``.
    """
    # Browser-like User-Agent: sent instead of the library default.
    browser_headers = {
        'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/125.0.0.0 Safari/537.36'
    }
    page = requests.get(url, headers=browser_headers, timeout=10)
    page.raise_for_status()
    document = BeautifulSoup(page.text, 'html.parser')

    # Title: look the heading up once and fall back to None when absent.
    heading = document.find('h1')
    if heading is not None:
        title = heading.get_text(strip=True)
    else:
        title = None

    paragraphs = []
    for paragraph in document.find_all('p'):
        paragraphs.append(paragraph.get_text(strip=True))

    # href=True restricts the search to anchors that actually carry a target.
    links = []
    for anchor in document.find_all('a', href=True):
        links.append({
            'text': anchor.get_text(strip=True),
            'href': anchor['href'],
        })

    return {'title': title, 'paragraphs': paragraphs, 'links': links}
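Always set a User-Agent header — many sites block the default python-requests User-Agent. Always pass a timeout as well: requests has no default timeout, so without one a call can hang indefinitely on a slow or unresponsive server.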
Sessions and Cookies
Use requests.Session() to persist cookies and default headers across requests and to reuse pooled connections. It is essential for login-protected content and multi-page scraping.
# One Session carries cookies and default headers across every request made
# through it, so the login below stays in effect for later calls.
session = requests.Session()
session.headers.update({
    'User-Agent': 'Mozilla/5.0...',
    'Accept-Language': 'en-US,en;q=0.9',
})

# Login. Fail fast on an HTTP error status instead of going on to scrape an
# error page; the timeout keeps a stalled server from hanging the script.
# NOTE: a 2xx status alone does not prove the credentials were accepted —
# check the response body or cookies if the site needs it.
session.post(
    'https://example.com/login',
    data={'username': 'user', 'password': 'pass'},
    timeout=10,
).raise_for_status()

# Now scrape authenticated pages — cookies persist
profile = session.get('https://example.com/dashboard', timeout=10)
profile.raise_for_status()
soup = BeautifulSoup(profile.text, 'html.parser')
Handling Pagination
def _text_or_none(card, selector):
    """Return the stripped text of the first ``selector`` match in ``card``, or None."""
    node = card.select_one(selector)
    return node.get_text(strip=True) if node is not None else None


def scrape_all_pages(base_url, *, headers=None, delay=1):
    """Collect every product from a ``?page=N`` paginated listing.

    Pages are requested starting at 1. Scraping stops at the first page that
    has no ``.product-card`` elements or no ``a.next-page`` link.

    Args:
        base_url: Listing URL; the ``page`` query parameter is added to it.
        headers: Optional request headers. Defaults to a browser-like
            User-Agent.
        delay: Seconds to sleep between page requests (0 disables the pause).

    Returns:
        A list of ``{'name': ..., 'price': ...}`` dicts. A field is ``None``
        when a card lacks the ``.title`` / ``.price`` element.

    An HTTP error status (4xx/5xx) is raised via ``raise_for_status()``.
    """
    import time  # local import keeps this snippet self-contained

    if headers is None:
        headers = {
            'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/125.0.0.0 Safari/537.36'
        }

    all_items = []
    page = 1
    while True:
        # params= lets requests build the query string, so a base_url that
        # already contains "?key=value" still yields a valid URL.
        response = requests.get(
            base_url, params={'page': page}, headers=headers, timeout=10
        )
        response.raise_for_status()
        soup = BeautifulSoup(response.text, 'html.parser')

        items = soup.select('.product-card')
        if not items:
            break
        for item in items:
            all_items.append({
                'name': _text_or_none(item, '.title'),
                'price': _text_or_none(item, '.price'),
            })

        # Check for next page
        if soup.select_one('a.next-page') is None:
            break
        page += 1
        if delay:
            time.sleep(delay)  # Be polite
    return all_items
Async Scraping with httpx
httpx is a modern alternative to requests that offers both a sync and an async client. With the async client you can scrape multiple pages concurrently without threads.
import httpx
import asyncio
from bs4 import BeautifulSoup
async def scrape_urls(urls, max_concurrent=5):
    """Fetch every URL in ``urls`` concurrently and summarize each page.

    Args:
        urls: Iterable of URLs to fetch.
        max_concurrent: Upper bound on requests in flight at the same time.

    Returns:
        A list of ``{'url': ..., 'title': ..., 'status': ...}`` dicts, one per
        URL that was fetched without raising. ``title`` is the text of the
        first ``<h1>``, or the URL itself when the page has none. URLs whose
        fetch failed are omitted (best-effort).
    """
    semaphore = asyncio.Semaphore(max_concurrent)

    async def fetch(client, url):
        # The semaphore caps how many requests run at once.
        async with semaphore:
            resp = await client.get(url, timeout=15)
            soup = BeautifulSoup(resp.text, 'html.parser')
            heading = soup.find('h1')
            title = heading.get_text(strip=True) if heading is not None else url
            return {'url': url, 'title': title, 'status': resp.status_code}

    async with httpx.AsyncClient(headers={'User-Agent': 'Mozilla/5.0...'}) as client:
        tasks = [fetch(client, url) for url in urls]
        # return_exceptions=True: one failing URL must not abort the others.
        results = await asyncio.gather(*tasks, return_exceptions=True)

    # Filter on BaseException, not Exception: a cancelled fetch comes back as
    # a CancelledError instance, which is not an Exception subclass.
    return [r for r in results if not isinstance(r, BaseException)]
# Scrape every URL in url_list concurrently; scrape_urls keeps at most 5
# requests in flight (its max_concurrent default).
# NOTE: url_list is assumed to be defined earlier, e.g. a list of 50 page URLs.
data = asyncio.run(scrape_urls(url_list))
JS-Rendered Pages: The Limitation
requests fetches raw HTML — it doesn't execute JavaScript. If the content you need is rendered client-side (React, Vue) or loaded via AJAX calls, the response will contain only the page shell, without that content. For JS-rendered content, you need either a headless browser (Playwright, Selenium) or an API that renders pages for you.
SnapAPI: Scrape JS Pages Without a Browser
SnapAPI renders pages in a real browser and returns the result via REST. No Selenium or Playwright setup needed — just an HTTP call.
import httpx
# Scrape a JS-rendered page — returns fully rendered HTML
resp = httpx.post(
    'https://api.snapapi.pics/v1/scrape',
    headers={'X-Api-Key': 'sk_live_your_key'},
    json={'url': 'https://react-app.com/products', 'stealth': True},
    timeout=30,
)
# Raise on a 4xx/5xx status here, instead of failing below on an error
# response that presumably has no 'html' field.
resp.raise_for_status()
html = resp.json()['html']
soup = BeautifulSoup(html, 'html.parser')
# Now parse as usual — all JS content is rendered
# Extract structured data directly — no parsing needed
resp = httpx.post(
    'https://api.snapapi.pics/v1/extract',
    headers={'X-Api-Key': 'sk_live_your_key'},
    json={
        'url': 'https://react-app.com/products',
        # Desired output shape: a list of products with typed fields.
        'schema': {
            'products': [{'name': 'string', 'price': 'number', 'inStock': 'boolean'}]
        },
    },
    timeout=30,
)
# Raise on a 4xx/5xx status here, instead of failing below on an error
# response that presumably has no 'data' field.
resp.raise_for_status()
products = resp.json()['data']['products']
SnapAPI handles JavaScript rendering, stealth mode, ad blocking, and cookie banner removal. The /extract endpoint returns structured JSON directly — no BeautifulSoup needed. 200 free requests/month, Python SDK available.