Scraping Node.js Python April 5, 2026

Extract Emails from a Website with Node.js and Python (2026)

Finding contact emails on web pages is useful for sales outreach, lead generation, and competitive research. This guide covers the right way to do it — regex patterns, mailto link parsing, handling obfuscated emails, and using SnapAPI for JavaScript-rendered pages. We also cover the ethical and legal boundaries you need to respect.

Legal note: Only extract publicly visible emails. Respect GDPR, CAN-SPAM, and site Terms of Service. Never use extracted emails for unsolicited bulk messaging without consent.

Basic Regex Email Extraction (Node.js)

const axios   = require('axios');
const cheerio = require('cheerio');

// RFC 5322 simplified — local part, "@", domain labels, and a TLD of 2+ letters.
// NOTE: the `g` flag makes this regex stateful: `.test()` / `.exec()` advance
// `lastIndex` between calls. Use it with `String.prototype.match` (which resets
// `lastIndex`) rather than calling `.test()` repeatedly on different strings.
const EMAIL_REGEX = /[a-zA-Z0-9._%+\-]+@[a-zA-Z0-9.\-]+\.[a-zA-Z]{2,}/g;

/**
 * Fetch a page and collect the email addresses found in it.
 *
 * Three sources are combined: a regex scan of the body text, the targets of
 * `mailto:` links, and `email` properties inside JSON-LD `<script>` blocks.
 *
 * @param {string} url - Page to fetch.
 * @returns {Promise<{url: string, emails: string[], sources: {regex: number, mailto: number, schema: number}}>}
 *   `emails` is lower-cased and deduplicated; `sources` holds the raw hit
 *   count per extraction method (before deduplication and filtering).
 * @throws Rejects with the axios error when the request fails or times out.
 */
async function extractEmails(url) {
  const { data: html } = await axios.get(url, {
    headers: { 'User-Agent': 'Mozilla/5.0 (compatible; EmailBot/1.0)' },
    timeout: 15000,
  });

  const $ = cheerio.load(html);

  // Method 1: Regex on full page text
  const regexEmails = $('body').text().match(EMAIL_REGEX) ?? [];

  // Method 2: Parse mailto: links (more reliable).
  // A single href may list several comma-separated recipients and may be
  // percent-encoded, so decode the recipient part and pull every address out.
  const mailtoEmails = $('a[href^="mailto:"]')
    .map((_, el) => $(el).attr('href'))
    .get()
    .flatMap((href) => {
      const recipients = href.slice('mailto:'.length).split('?')[0];
      let decodedRecipients = recipients;
      try {
        decodedRecipients = decodeURIComponent(recipients);
      } catch {
        // Malformed percent-escape: fall back to the raw text.
      }
      return decodedRecipients.match(EMAIL_REGEX) ?? [];
    });

  // Method 3: Check common meta/structured data.
  // JSON-LD can be a single object, an array, or nest entities (`@graph`,
  // `contactPoint`, ...), so walk the whole tree looking for `email` values.
  const schemaEmails = [];
  const collectSchemaEmails = (node) => {
    if (Array.isArray(node)) {
      for (const item of node) collectSchemaEmails(item);
      return;
    }
    if (node === null || typeof node !== 'object') return;
    for (const [key, value] of Object.entries(node)) {
      if (key === 'email') {
        const values = Array.isArray(value) ? value : [value];
        for (const candidate of values) {
          // Non-string values would crash the normalisation step below.
          if (typeof candidate === 'string') schemaEmails.push(candidate);
        }
      } else {
        collectSchemaEmails(value);
      }
    }
  };
  $('script[type="application/ld+json"]').each((_, el) => {
    try {
      collectSchemaEmails(JSON.parse($(el).html()));
    } catch {
      // Invalid JSON-LD is common in the wild; this source is best-effort.
    }
  });

  // Normalise first, then dedupe, so "Info@X.com" and "info@x.com" collapse.
  // `String.prototype.match` resets `lastIndex` on the shared /g regex, unlike
  // `.test()`, which would carry state between candidates and skip valid ones.
  // Extracting (rather than testing) also strips junk such as a "mailto:" prefix.
  const imageSuffixes = ['.png', '.jpg', '.jpeg', '.gif', '.webp', '.svg'];
  const cleaned = [...regexEmails, ...mailtoEmails, ...schemaEmails]
    .flatMap((candidate) => candidate.toLowerCase().match(EMAIL_REGEX) ?? [])
    // Filter false positives such as "logo@2x.png".
    .filter((email) => !imageSuffixes.some((suffix) => email.endsWith(suffix)));
  const emails = [...new Set(cleaned)];

  return {
    url,
    emails,
    sources: {
      regex: regexEmails.length,
      mailto: mailtoEmails.length,
      schema: schemaEmails.length,
    },
  };
}

// Demo invocation: print the result, and report a failed request instead of
// leaving the promise rejection unhandled.
extractEmails('https://example.com/contact')
  .then(console.log)
  .catch((err) => {
    console.error(err);
    process.exitCode = 1;
  });

Handling Obfuscated Emails

Many sites obfuscate emails to deter scrapers. Below are the most common patterns and how to decode each one:

/**
 * Recover email addresses hidden behind common anti-scraping tricks.
 *
 * Handles four patterns: spelled-out separators ("user [at] domain [dot] com"),
 * numeric HTML entities, text reversed for `direction: rtl` display, and
 * `data-email` / `data-mail` attributes.
 *
 * @param {string} html - Raw HTML of the page.
 * @returns {string[]} Lower-cased, deduplicated candidate addresses. Values
 *   taken from data attributes are returned as found and are not validated.
 */
function deobfuscateEmails(html) {
  const $ = cheerio.load(html);
  const decoded = [];

  // Pattern 1: "user [at] domain [dot] com"
  // A separator must be bracketed or stand alone between whitespace, so the
  // letters "at"/"dot" inside ordinary words ("database dotcom") don't match.
  // The address is rebuilt from capture groups: a global replace of "at"/"dot"
  // would also rewrite those letters inside the name itself ("kate" -> "k@e").
  const spelledOut =
    /([a-zA-Z0-9._%+\-]+)(?:\s*\[\s*(?:at|AT)\s*\]\s*|\s+(?:at|AT)\s+)([a-zA-Z0-9.\-]+)(?:\s*\[\s*(?:dot|DOT)\s*\]\s*|\s+(?:dot|DOT)\s+)([a-zA-Z]{2,})/g;
  for (const [, user, domain, tld] of $('body').text().matchAll(spelledOut)) {
    decoded.push(`${user}@${domain}.${tld}`);
  }

  // Pattern 2: HTML entity encoding (&#64; or &#x40; = @)
  const rawHtml = $('body').html() ?? '';
  const entityDecoded = rawHtml.replace(
    /&#(?:[xX]([0-9a-fA-F]+)|(\d+));/g,
    (entity, hex, dec) => {
      const codePoint =
        hex !== undefined ? Number.parseInt(hex, 16) : Number.parseInt(dec, 10);
      // fromCodePoint throws above U+10FFFF; leave such entities untouched.
      return codePoint <= 0x10ffff ? String.fromCodePoint(codePoint) : entity;
    },
  );
  decoded.push(...(entityDecoded.match(EMAIL_REGEX) ?? []));

  // Pattern 3: CSS direction:rtl reversal
  $('[style*="direction:rtl"], [style*="direction: rtl"]').each((_, el) => {
    // Spread iterates by code point, so surrogate pairs survive the reversal.
    const text = [...$(el).text()].reverse().join('');
    decoded.push(...(text.match(EMAIL_REGEX) ?? []));
  });

  // Pattern 4: data-email attribute
  $('[data-email], [data-mail]').each((_, el) => {
    // `||` rather than `??`: an empty data-email should fall back to data-mail.
    const email = $(el).attr('data-email') || $(el).attr('data-mail');
    if (email) decoded.push(email);
  });

  // Normalise before deduplicating so case variants collapse into one entry.
  const normalized = decoded
    .map((email) => email.trim().toLowerCase())
    .filter((email) => email !== '');
  return [...new Set(normalized)];
}

Python Email Extraction

import re, httpx, asyncio
from bs4 import BeautifulSoup

# Simplified email pattern: local part, "@", domain labels, and a TLD of 2+
# letters. It is a pragmatic matcher, not a full RFC 5322 validator.
EMAIL_RE = re.compile(r'[a-zA-Z0-9._%+\-]+@[a-zA-Z0-9.\-]+\.[a-zA-Z]{2,}')

async def extract_emails(url: str) -> dict:
    """Fetch ``url`` and return the email addresses found on the page.

    Addresses come from a regex scan of the page text and from ``mailto:``
    links. They are lower-cased, deduplicated, and filtered for image-name
    false positives such as ``logo@2x.png``.

    Args:
        url: Page to fetch (redirects are followed).

    Returns:
        ``{'url': url, 'emails': sorted list of addresses, 'count': int}``.

    Raises:
        httpx.HTTPStatusError: if the response status is 4xx/5xx.
    """
    # Stdlib; imported locally so the function carries its own dependency.
    from urllib.parse import unquote

    async with httpx.AsyncClient(follow_redirects=True, timeout=15) as client:
        r = await client.get(url, headers={
            'User-Agent': 'Mozilla/5.0 (compatible; EmailBot/1.0)'
        })
        r.raise_for_status()

    soup = BeautifulSoup(r.text, 'html.parser')

    # Regex on page text. Lower-case up front so the suffix filter below is
    # case-insensitive and case variants of one address collapse.
    regex_emails = {e.lower() for e in EMAIL_RE.findall(soup.get_text())}

    # mailto: links. The scheme is case-insensitive, and one href may carry
    # several comma-separated, percent-encoded recipients.
    mailto_emails = set()
    for a in soup.find_all('a', href=re.compile(r'^mailto:', re.IGNORECASE)):
        recipients = unquote(a['href'][len('mailto:'):].split('?')[0])
        # findall extracts clean addresses and drops surrounding junk, which
        # a start-anchored `match` check would have let through.
        mailto_emails.update(e.lower() for e in EMAIL_RE.findall(recipients))

    all_emails = {e for e in regex_emails | mailto_emails
                  if not e.endswith(('.png', '.jpg', '.gif'))}

    return {'url': url, 'emails': sorted(all_emails), 'count': len(all_emails)}

# Demo: run the coroutine to completion with asyncio.run and print the result dict.
result = asyncio.run(extract_emails('https://example.com/about'))
print(result)

AI-Powered Email Extraction with SnapAPI

For pages that load emails via JavaScript or use heavy obfuscation, SnapAPI's /v1/extract endpoint handles both rendering and extraction:

const axios = require('axios');

/**
 * Ask SnapAPI's extract endpoint for the contact details on a page.
 *
 * Sends the page URL together with a schema describing the wanted fields and
 * authenticates with the key in the SNAPAPI_KEY environment variable.
 *
 * @param {string} url - Page to analyse.
 * @returns {Promise<object>} The `data` property of the response body; per the
 *   requested schema it is expected to look like
 *   `{ emails: [...], contactName: '...', department: '...' }`.
 */
async function extractEmailsWithAPI(url) {
  const endpoint = 'https://api.snapapi.pics/v1/extract';

  // Fields we want back, described for the extractor.
  const schema = {
    emails: {
      type: 'array',
      items: { type: 'string' },
      description: 'All email addresses visible on this page, including obfuscated ones',
    },
    contactName: { type: 'string', description: 'Contact person name if visible' },
    department: { type: 'string', description: 'Department or role if visible' },
  };

  const payload = { url, schema, stealth: true };
  const requestConfig = {
    headers: { 'X-Api-Key': process.env.SNAPAPI_KEY },
  };

  const response = await axios.post(endpoint, payload, requestConfig);

  // The extracted fields sit under the body's own `data` key.
  return response.data.data;
}

Advantage: SnapAPI's AI extraction handles emails hidden by JavaScript rendering, CSS tricks, and obfuscation patterns without writing custom deobfuscation code.

Extract contact info from any page

SnapAPI renders the page and extracts structured data — emails, names, phone numbers — as typed JSON. 200 free requests/month.

Try SnapAPI Free →