Screenshot API for Data Pipelines — Airflow, Prefect & dbt

Screenshots as First-Class Pipeline Artifacts

Data pipelines extract, transform, and load structured data — but they often operate on sources that are only partially machine-readable. Web dashboards, PDF reports, and JavaScript-rendered analytics pages require a browser to render before data can be extracted. SnapAPI gives your pipeline a browser without managing one.

Use SnapAPI to capture screenshots at critical pipeline stages: screenshot the source dashboard before extraction as visual provenance, capture the output report after transformation for stakeholder review, and monitor data source pages between runs to detect layout changes that would break your scrapers.

Apache Airflow DAG with Screenshot Operators

from airflow import DAG
from airflow.operators.python import PythonOperator
from datetime import datetime, timedelta
import requests, os

# SnapAPI access key. os.environ[...] raises KeyError when the module is
# loaded if SNAPAPI_KEY is not set.
SNAP_KEY = os.environ['SNAPAPI_KEY']

def capture_source_screenshot(url, output_path, **context):
    """Capture a PNG screenshot of ``url`` via SnapAPI and save it to disk.

    Args:
        url: Address of the page to render.
        output_path: Destination file for the PNG bytes. Missing parent
            directories are created before writing.
        **context: Extra keyword arguments supplied by the caller; accepted
            and ignored.

    Returns:
        ``output_path``, unchanged.

    Raises:
        requests.HTTPError: If the API answers with an error status.
    """
    response = requests.get('https://api.snapapi.pics/screenshot', params={
        'access_key': SNAP_KEY,
        'url': url,
        'width': 1440,
        'height': 900,
        'full_page': 'false',
        'format': 'png',
        'delay': 2000,  # presumably milliseconds — confirm against the SnapAPI docs
    }, timeout=60)
    response.raise_for_status()

    # Make sure the artifact directory exists so the first run on a fresh
    # worker does not fail in open(). A bare filename has no parent to create.
    parent_dir = os.path.dirname(output_path)
    if parent_dir:
        os.makedirs(parent_dir, exist_ok=True)

    with open(output_path, 'wb') as f:
        f.write(response.content)
    return output_path

# Daily DAG with three tasks in sequence: screenshot the source dashboard,
# run the extraction, then screenshot the output report. Both screenshot
# tasks call capture_source_screenshot with different op_kwargs.
# NOTE(review): newer Airflow 2.x releases prefer `schedule` over
# `schedule_interval` — confirm against the Airflow version you deploy.
with DAG(
    'data_pipeline_with_screenshots',
    default_args={'owner': 'data-team', 'retries': 2, 'retry_delay': timedelta(minutes=5)},
    schedule_interval='@daily',
    start_date=datetime(2026, 1, 1),
    catchup=False,
) as dag:

    # Captures the source dashboard before extraction.
    # Assumes Airflow renders the `{{ ds }}` placeholder in op_kwargs before
    # calling the function — TODO confirm for your Airflow version.
    screenshot_source = PythonOperator(
        task_id='screenshot_data_source',
        python_callable=capture_source_screenshot,
        op_kwargs={
            'url': 'https://data-source.example.com/dashboard',
            'output_path': '/data/artifacts/source_{{ ds }}.png',
        },
    )

    # NOTE: `run_extraction` is not defined in the visible snippet; it must be
    # defined or imported before this file is parsed.
    extract_data = PythonOperator(
        task_id='extract_data',
        python_callable=run_extraction,
    )

    # Captures the output report after extraction has finished.
    screenshot_output = PythonOperator(
        task_id='screenshot_output_report',
        python_callable=capture_source_screenshot,
        op_kwargs={
            'url': 'https://reports.internal.example.com/daily',
            'output_path': '/data/artifacts/output_{{ ds }}.png',
        },
    )

    # Strict ordering: source screenshot -> extraction -> output screenshot.
    screenshot_source >> extract_data >> screenshot_output

Prefect Flow Integration

from prefect import flow, task
import requests, os
from pathlib import Path

# SnapAPI access key shared by the tasks below. os.environ[...] raises
# KeyError when the module is loaded if SNAPAPI_KEY is not set.
SNAP_KEY = os.environ['SNAPAPI_KEY']

@task(retries=3, retry_delay_seconds=30)
def screenshot_dashboard(url: str, name: str, run_date: str) -> Path:
    """Save a 1920x1080 PNG screenshot of ``url`` under ``artifacts/``.

    Args:
        url: Page to capture.
        name: Prefix used in the output filename.
        run_date: Suffix used in the output filename.

    Returns:
        Path of the written file, ``artifacts/{name}_{run_date}.png``.

    Raises:
        requests.HTTPError: If the API answers with an error status.
    """
    query = {
        'access_key': SNAP_KEY,
        'url': url,
        'width': 1920,
        'height': 1080,
        'full_page': 'false',
        'format': 'png',
        'delay': 3000,
    }
    response = requests.get(
        'https://api.snapapi.pics/screenshot',
        params=query,
        timeout=60,
    )
    response.raise_for_status()

    target = Path(f'artifacts/{name}_{run_date}.png')
    # Only the immediate parent is created (no parents=True).
    target.parent.mkdir(exist_ok=True)
    target.write_bytes(response.content)
    return target

@task
def extract_page_data(url: str) -> dict:
    """Request structured data for ``url`` from the SnapAPI extract endpoint.

    Sends the access key, the URL, and a fixed schema (revenue, orders,
    conversion_rate) and returns the decoded JSON response.

    Raises:
        requests.HTTPError: If the API answers with an error status.
    """
    # NOTE(review): this sends a JSON body on a GET request, which is unusual
    # and may be dropped by intermediaries — confirm whether the endpoint
    # expects POST.
    payload = {
        'access_key': SNAP_KEY,
        'url': url,
        'schema': {'revenue': 'number', 'orders': 'integer', 'conversion_rate': 'number'},
    }
    response = requests.get(
        'https://api.snapapi.pics/extract',
        json=payload,
        timeout=60,
    )
    response.raise_for_status()
    return response.json()

@flow(name='daily-data-pipeline')
def daily_pipeline(run_date: str):
    """Screenshot the daily analytics page, then extract its data.

    Args:
        run_date: Date string placed in the page's ``date`` query parameter
            and in the screenshot filename.

    Returns:
        Dict with ``screenshot`` (path of the saved image, as a string) and
        ``data`` (the result of the extraction task).
    """
    target_url = f'https://analytics.example.com/daily?date={run_date}'

    # The screenshot is taken before the extraction call.
    shot_path = screenshot_dashboard(target_url, 'analytics', run_date)
    extracted = extract_page_data(target_url)

    summary = {'screenshot': str(shot_path), 'data': extracted}
    return summary

if __name__ == '__main__':
    # Script entry point: run the flow once for today's date in ISO format.
    from datetime import date

    today_iso = date.today().isoformat()
    daily_pipeline(run_date=today_iso)

dbt Post-Run Report Screenshots

After your dbt models run, automatically screenshot the resulting dashboards in Metabase, Looker, or Mode Analytics to create a timestamped visual record of your data state:

#!/usr/bin/env python3
# scripts/post_dbt_screenshots.py
# Add to dbt project: on-run-end hook or call from CI after dbt run

import requests, os, sys
from datetime import date
from pathlib import Path

# SnapAPI access key; raises KeyError at start-up if SNAPAPI_KEY is not set.
SNAP_KEY = os.environ['SNAPAPI_KEY']
# Session value sent as the `metabase.SESSION` cookie. Falls back to an empty
# string when METABASE_SESSION is unset, so the script still runs.
METABASE_SESSION = os.environ.get('METABASE_SESSION', '')

# Dashboards to capture; `name` becomes the output filename stem.
DASHBOARDS = [
    {'name': 'revenue', 'url': 'https://metabase.company.com/dashboard/1'},
    {'name': 'funnel',  'url': 'https://metabase.company.com/dashboard/2'},
    {'name': 'cohorts', 'url': 'https://metabase.company.com/dashboard/3'},
]

# One output directory per calendar day, created up front.
run_date = date.today().isoformat()
out_dir = Path(f'dbt_screenshots/{run_date}')
out_dir.mkdir(parents=True, exist_ok=True)

# Capture every dashboard best-effort: a failed capture is reported on stderr
# and the loop moves on to the next dashboard.
for db in DASHBOARDS:
    try:
        r = requests.get('https://api.snapapi.pics/screenshot', params={
            'access_key': SNAP_KEY,
            'url': db['url'],
            'width': 1920,
            'height': 1080,
            'full_page': 'true',
            'delay': 5000,
            'cookies': f'metabase.SESSION={METABASE_SESSION}',
        }, timeout=90)
    except requests.RequestException as exc:
        # Timeouts and connection errors are treated like a non-200 reply so
        # one unreachable dashboard does not skip the remaining ones.
        print(f"FAIL: {db['name']} {exc}", file=sys.stderr)
        continue

    if r.status_code == 200:
        # Double-quoted f-string: reusing the single quote inside the
        # replacement field is a SyntaxError before Python 3.12.
        path = out_dir / f"{db['name']}.png"
        path.write_bytes(r.content)
        print(f"OK: {db['name']} -> {path}")
    else:
        print(f"FAIL: {db['name']} HTTP {r.status_code}", file=sys.stderr)

Data Source Monitoring

Web scrapers break when source sites change their layout. SnapAPI lets you monitor data source pages visually, catching changes before they corrupt your pipeline. Run a nightly screenshot comparison and alert your team if the page structure changed significantly.

import requests, os, hashlib
from PIL import Image, ImageChops
import numpy as np
from pathlib import Path

def _load_rgb(path, size=None):
    """Return the image at ``path`` as a float RGB array, optionally resized to ``size`` (width, height)."""
    # The context manager closes the file handle once the pixels have been
    # copied into the array, instead of leaving it to garbage collection.
    with Image.open(path) as img:
        if size is not None:
            img = img.resize(size, Image.LANCZOS)
        return np.array(img.convert('RGB'), dtype=float)


def monitor_source(url: str, threshold: float = 0.02) -> bool:
    """Screenshot ``url`` and compare it with the stored baseline image.

    The first call for a URL stores the capture as its baseline. Later calls
    count the pixels where any RGB channel differs from the baseline by more
    than 15 and compare that fraction with ``threshold``.

    Args:
        url: Page to monitor.
        threshold: Maximum fraction of changed pixels (0-1) still considered
            unchanged.

    Returns:
        True when a baseline was just created or the change is within
        ``threshold``; False when the page changed by more than ``threshold``.

    Raises:
        KeyError: If the SNAPAPI_KEY environment variable is not set.
        requests.HTTPError: If the API answers with an error status.
    """
    snap_key = os.environ['SNAPAPI_KEY']
    # Short, filesystem-safe identifier derived from the URL (not used for security).
    slug = hashlib.md5(url.encode()).hexdigest()[:8]
    baseline_path = Path(f'baselines/{slug}.png')
    current_path = Path(f'current/{slug}.png')

    r = requests.get('https://api.snapapi.pics/screenshot', params={
        'access_key': snap_key, 'url': url,
        'width': 1440, 'height': 900, 'format': 'png',
    }, timeout=60)
    r.raise_for_status()
    current_path.parent.mkdir(exist_ok=True)
    current_path.write_bytes(r.content)

    if not baseline_path.exists():
        baseline_path.parent.mkdir(exist_ok=True)
        baseline_path.write_bytes(r.content)
        print(f'Baseline created for {url}')
        return True

    img_a = _load_rgb(baseline_path)
    img_b = _load_rgb(current_path)
    if img_a.shape != img_b.shape:
        # Bring the new capture to the baseline's size so the arrays can be
        # compared element-wise. Array shape is (height, width, channels).
        img_b = _load_rgb(current_path, size=(img_a.shape[1], img_a.shape[0]))

    # A pixel counts as changed when any channel moved by more than 15 levels.
    changed = np.sum(np.any(np.abs(img_a - img_b) > 15, axis=2))
    ratio = changed / (img_a.shape[0] * img_a.shape[1])
    if ratio > threshold:
        print(f'ALERT: {url} changed by {ratio:.1%} — scraper may break')
        return False
    return True

Pricing for Data Teams

Data pipelines run on schedules — often daily or hourly. SnapAPI pricing is built for this. The $19/month plan covers 5,000 screenshots, enough to capture 10 sources on each of 16 pipeline runs per day (10 × 16 × 30 = 4,800 captures per month). The $79/month plan covers 50,000 screenshots for larger data operations. Enterprise plans are available for high-frequency monitoring workloads.

Start with the free tier (200/month) to build and test your pipeline integration. Sign up at snapapi.pics/dashboard.

Building a Data Provenance System with Screenshots

Regulatory compliance and data governance frameworks increasingly require provenance records — evidence of where data came from and what the source looked like at the time of extraction. Screenshots are the most human-readable form of provenance. SnapAPI lets you attach a visual timestamp to every extraction event in your pipeline.

Store screenshots in S3 with a key structure like provenance/{source_id}/{extraction_date}/{run_id}.png. Reference the S3 URL in your pipeline metadata. Auditors and data stewards can click the URL and see exactly what the data source looked like when your pipeline ran — layout, values, and all.

import boto3, requests, os
from datetime import date

# Module-level S3 client for the eu-central-1 region.
s3 = boto3.client('s3', region_name='eu-central-1')
# Both settings are required: os.environ[...] raises KeyError when the module
# is loaded if either variable is unset.
SNAP_KEY = os.environ['SNAPAPI_KEY']
BUCKET = os.environ['PROVENANCE_BUCKET']

def capture_provenance(url: str, source_id: str, run_id: str) -> str:
    """Screenshot ``url`` and store the PNG in the provenance bucket.

    The object key is ``provenance/{source_id}/{today}/{run_id}.png`` and the
    object carries the source URL, run id, and capture date as metadata.

    Args:
        url: Page to capture.
        source_id: Identifier used as the second key segment.
        run_id: Identifier used as the object's filename stem.

    Returns:
        HTTPS URL of the uploaded object.

    Raises:
        requests.HTTPError: If the API answers with an error status.
    """
    query = {
        'access_key': SNAP_KEY,
        'url': url,
        'width': 1440,
        'height': 900,
        'full_page': 'false',
        'format': 'png',
    }
    response = requests.get(
        'https://api.snapapi.pics/screenshot',
        params=query,
        timeout=60,
    )
    response.raise_for_status()

    # One ISO date string is used for both the key and the metadata.
    captured_on = date.today().isoformat()
    object_key = f'provenance/{source_id}/{captured_on}/{run_id}.png'
    object_metadata = {'source_url': url, 'run_id': run_id, 'captured_at': captured_on}

    s3.put_object(
        Bucket=BUCKET,
        Key=object_key,
        Body=response.content,
        ContentType='image/png',
        Metadata=object_metadata,
    )
    return f'https://{BUCKET}.s3.amazonaws.com/{object_key}'

Integrating Screenshots into Great Expectations Checkpoints

Great Expectations is the leading data quality framework for Python data pipelines. You can extend GE checkpoints to capture screenshots of data source pages as part of your validation suite — providing visual evidence alongside your data quality assertions.

from great_expectations.checkpoint import SimpleCheckpoint
import requests, os
from pathlib import Path

# SnapAPI access key. os.environ[...] raises KeyError when the module is
# loaded if SNAPAPI_KEY is not set.
SNAP_KEY = os.environ['SNAPAPI_KEY']

def run_checkpoint_with_screenshot(context, checkpoint_name: str, source_url: str):
    """Save a screenshot of ``source_url`` as evidence, then run the checkpoint.

    The capture is best-effort: if it fails (network error or non-200 reply)
    a warning is written to stderr and the checkpoint still runs. On success
    the image is stored at
    ``ge_evidence/{checkpoint_name}/source_before_validation.png``.

    Args:
        context: Object exposing ``run_checkpoint(checkpoint_name=...)``
            (presumably a Great Expectations data context — confirm against
            the caller).
        checkpoint_name: Checkpoint to run; also names the evidence folder.
        source_url: Page to capture before validation.

    Returns:
        Whatever ``context.run_checkpoint`` returns.
    """
    import sys  # local import: this snippet's file-level imports do not include sys

    # Capture screenshot before validation
    failure = None
    try:
        r = requests.get('https://api.snapapi.pics/screenshot', params={
            'access_key': SNAP_KEY, 'url': source_url,
            'width': 1440, 'height': 900, 'format': 'png',
        }, timeout=60)
    except requests.RequestException as exc:
        # A screenshot outage must not prevent the validation from running.
        failure = str(exc)
    else:
        if r.status_code == 200:
            shot_dir = Path('ge_evidence') / checkpoint_name
            shot_dir.mkdir(parents=True, exist_ok=True)
            (shot_dir / 'source_before_validation.png').write_bytes(r.content)
        else:
            failure = f'HTTP {r.status_code}'

    if failure is not None:
        # Report the missing evidence instead of skipping it silently.
        print(
            f'WARN: no screenshot evidence for checkpoint {checkpoint_name}: {failure}',
            file=sys.stderr,
        )

    # Run the checkpoint
    return context.run_checkpoint(checkpoint_name=checkpoint_name)

Cost Analysis: Screenshot API vs Self-Hosted Browser

Running Playwright or Selenium in your data pipeline infrastructure costs real money. A dedicated EC2 t3.medium instance to host a browser pool runs $30-60/month. That instance needs monitoring, patching, and occasional restarts when the browser crashes. Engineering time to maintain it: 2-4 hours/month conservatively.

SnapAPI at $19/month for 5,000 screenshots eliminates that infrastructure entirely. For most data pipelines running daily jobs on 10-50 URLs, 5,000 captures per month provides significant headroom. The $79/month plan covers 50,000 screenshots — more than enough for large-scale monitoring operations.

Beyond cost, the operational simplicity is the real win. No browser process to restart, no Xvfb configuration, no Chromium version compatibility issues. Your pipeline just makes an HTTP call and moves on.

Get Started

Add SnapAPI to your next pipeline in under 10 minutes. Sign up at snapapi.pics/dashboard for a free API key with 200 screenshots/month. The Python examples in this guide are production-tested and ready to drop into Airflow, Prefect, or any Python data pipeline.

Visual Data Quality Reports

After your pipeline runs and your data quality checks pass, stakeholders still need to see the data. Screenshot-based reports bridge the gap between raw data validation results and human-readable evidence. Instead of sending a CSV of test results, send a PDF containing screenshots of every dashboard your pipeline feeds, timestamped at the moment validation passed.

from reportlab.platypus import SimpleDocTemplate, Image as RLImage, Paragraph, Spacer
from reportlab.lib.pagesizes import A4
from reportlab.lib.styles import getSampleStyleSheet
import requests, os, io
from datetime import datetime

# SnapAPI access key. os.environ[...] raises KeyError when the module is
# loaded if SNAPAPI_KEY is not set.
SNAP_KEY = os.environ['SNAPAPI_KEY']

def build_pipeline_report(dashboards: list, output_path: str):
    """Build an A4 PDF containing one screenshot per dashboard.

    Each dashboard gets a heading followed by its screenshot. If a capture
    fails, a short note with the HTTP status is written in its place so the
    report shows that the dashboard was attempted.

    Args:
        dashboards: Items with ``'name'`` (heading text) and ``'url'`` (page
            to capture) keys.
        output_path: Where the PDF is written.
    """
    from datetime import timezone  # local import: the file-level import only brings in `datetime`

    doc = SimpleDocTemplate(output_path, pagesize=A4)
    styles = getSampleStyleSheet()

    # Timezone-aware replacement for the deprecated datetime.utcnow(); the
    # rendered text keeps the same "...Z" form.
    generated_at = datetime.now(timezone.utc).isoformat().replace('+00:00', 'Z')
    story = [Paragraph(f'Pipeline Run Report — {generated_at}', styles['Title']), Spacer(1, 20)]

    for db in dashboards:
        r = requests.get('https://api.snapapi.pics/screenshot', params={
            'access_key': SNAP_KEY,
            'url': db['url'], 'width': 1440, 'height': 900,
            'format': 'jpeg', 'quality': 85, 'delay': 3000,
        }, timeout=90)

        story.append(Paragraph(db['name'], styles['Heading2']))
        if r.status_code == 200:
            # 'proportional' scales the image to fit the 480x300 box while
            # keeping its aspect ratio. A 1440x900 capture fills the box
            # exactly; the previous fixed 480x270 size stretched it to 16:9.
            story.append(RLImage(io.BytesIO(r.content), width=480, height=300,
                                 kind='proportional'))
        else:
            # Record the failure in the report rather than dropping the
            # dashboard without a trace.
            story.append(Paragraph(f'Screenshot unavailable (HTTP {r.status_code})',
                                   styles['Normal']))
        story.append(Spacer(1, 16))

    doc.build(story)
    print(f'Report: {output_path}')

Handling Authentication in Pipeline Screenshots

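Most production dashboards require authentication. SnapAPI supports passing cookies and authorization headers, so you can capture authenticated dashboard screenshots without hard-coding credentials in your pipeline code: the example below reads the session cookie from an environment variable populated by your secrets manager.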

import requests, os

# Both values are required: os.environ[...] raises KeyError when the module
# is loaded if either variable is unset.
SNAP_KEY = os.environ['SNAPAPI_KEY']
SESSION_COOKIE = os.environ['DASHBOARD_SESSION']  # from secrets manager

def capture_authenticated_dashboard(url: str) -> bytes:
    """Return PNG bytes of ``url`` captured with the session cookie attached.

    The cookie is sent to the API as ``session=<DASHBOARD_SESSION value>``.

    Raises:
        requests.HTTPError: If the API answers with an error status.
    """
    query = {
        'access_key': SNAP_KEY,
        'url': url,
        'width': 1920,
        'height': 1080,
        'full_page': 'false',
        'format': 'png',
        'delay': 4000,
        'cookies': f'session={SESSION_COOKIE}',
    }
    response = requests.get(
        'https://api.snapapi.pics/screenshot',
        params=query,
        timeout=90,
    )
    response.raise_for_status()
    return response.content

Summary and Next Steps

Screenshot APIs belong in your data pipeline toolkit alongside Great Expectations, dbt, Airflow, and Prefect. They provide visual provenance, human-readable evidence of pipeline runs, and early warning of data source layout changes — all with a single HTTPS call per capture.

SnapAPI starts free at 200 screenshots/month. Paid plans from $19/month cover daily pipeline workloads. Sign up at snapapi.pics/dashboard and add your first screenshot operator to a pipeline today.

Screenshot API for Data Pipelines & ETL Workflows