Node.js Python LLM / RAG April 5, 2026

Convert Website to Markdown in Node.js and Python (2026)

Converting web pages to Markdown is increasingly important for feeding clean text into LLMs, building RAG (Retrieval-Augmented Generation) pipelines, archiving documentation, and migrating content between platforms. This guide covers every approach — from lightweight Turndown.js to SnapAPI's AI-powered extraction.

Why Convert to Markdown?

LLM context windows — Raw HTML wastes tokens on tags and attributes. Clean Markdown gives you 3–5× more content per prompt.
RAG pipelines — Markdown chunks embed better and are easier to split at semantic boundaries (headings, paragraphs).
Documentation archiving — Preserve external docs in a format that's readable without a browser.
Content migration — Move blog posts or articles from one CMS or platform to another.
Diff-friendly storage — Track content changes over time with git.

Turndown.js (Node.js)

Turndown converts HTML strings to Markdown. Pair it with axios or SnapAPI to fetch and convert any page.

npm install turndown axios cheerio

const axios    = require('axios');
const cheerio  = require('cheerio');
const TurndownService = require('turndown');

// Output conventions shared by every conversion done with this instance.
const turndownOptions = {
  bulletListMarker: '-',
  headingStyle: 'atx',        // headings are written as "# H1", "## H2", ...
  codeBlockStyle: 'fenced',   // code blocks are wrapped in fence lines
  fence: '```',               // the fence delimiter itself
};

const turndown = new TurndownService(turndownOptions);

/**
 * Remove nav, footer, scripts, ads and other page chrome, then return the
 * inner HTML of the main content area.
 *
 * @param {string} html - Raw HTML of the page.
 * @returns {string|null} Inner HTML of the first content container that
 *   exists (`article`, `main`, `.content`, `.post-body`, in that order), or
 *   of `<body>` when none of them is present.
 */
function cleanHTML(html) {
  const $ = cheerio.load(html);
  $('nav, footer, header, aside, script, style, .ad, .sidebar, [aria-hidden="true"]').remove();

  // Probe the containers one at a time, most specific first. A single combined
  // selector that also lists `body` matches in document order, so `.first()`
  // would always pick <body> and the narrower containers would never be used.
  const contentSelectors = ['article', 'main', '.content', '.post-body'];
  for (const selector of contentSelectors) {
    const container = $(selector).first();
    if (container.length > 0) return container.html();
  }

  return $('body').html();
}

/**
 * Download a page, strip its chrome with `cleanHTML` and convert what is left
 * to Markdown.
 *
 * @param {string} url - Address of the page to fetch.
 * @returns {Promise<string>} Trimmed Markdown in which runs of three or more
 *   newlines have been reduced to two.
 */
async function websiteToMarkdown(url) {
  const requestOptions = {
    headers: { 'User-Agent': 'Mozilla/5.0 (compatible; MarkdownBot/1.0)' }
  };
  const response = await axios.get(url, requestOptions);

  const rawMarkdown = turndown.turndown(cleanHTML(response.data));

  // Conversion tends to leave stacks of empty lines behind; keep at most one.
  const collapsed = rawMarkdown.replace(/\n{3,}/g, '\n\n');
  return collapsed.trim();
}

// Example run: print the converted page. The rejection handler reports a
// failed fetch or conversion instead of leaving the promise rejection unhandled.
websiteToMarkdown('https://example.com/blog/post')
  .then(md => {
    console.log(md);
  })
  .catch(err => {
    console.error('Conversion failed:', err.message);
    process.exitCode = 1;
  });

Custom Turndown rules

// Keep <pre><code> blocks as fenced code and carry over the language hint
// from the `language-xxx` class on the <code> element, when there is one.
turndown.addRule('fencedCodeBlock', {
  filter: node => node.nodeName === 'PRE' && node.firstChild?.nodeName === 'CODE',
  replacement: (content, node) => {
    // The filter above guarantees that the first child is the <code> element.
    const code = node.firstChild;
    // Pick out only the `language-xxx` token. The class attribute often holds
    // other classes as well (e.g. "hljs language-js"); copying the whole
    // attribute onto the fence line would yield "hljs js" as the language.
    const lang = /(?:^|\s)language-(\S+)/.exec(code.className ?? '')?.[1] ?? '';
    // Fall back to an empty body so a missing textContent never prints "undefined".
    return `\n\`\`\`${lang}\n${code.textContent?.trim() ?? ''}\n\`\`\`\n`;
  }
});

// Convert tables to GFM tables
// NOTE: `turndown-plugin-gfm` is a separate package that the `npm install`
// command earlier in this guide does not include — install it before running this.
const { tables } = require('turndown-plugin-gfm');
turndown.use(tables);

// Drop <img> elements from the output. When an image carries alt text, an
// italic "[Image: ...]" placeholder is left in its place; otherwise nothing is.
turndown.addRule('images', {
  filter: 'img',
  replacement(_content, imgNode) {
    const altText = imgNode.getAttribute('alt');
    if (!altText) return '';
    return `*[Image: ${altText}]*`;
  }
});

Batch convert + save to files

const fs   = require('fs/promises');
const path = require('path');

/**
 * Derive a file-name stem from a URL.
 *
 * The scheme and host are dropped, every run of characters other than
 * letters, digits, `.`, `_` and `-` becomes a single `-`, and leading or
 * trailing `-` are removed. Query strings, `:` and backslashes therefore
 * never end up in the file name.
 *
 * @param {string} url - Page URL.
 * @returns {string} Slug for the URL, or `'index'` when nothing is left.
 */
function urlToSlug(url) {
  const slug = url
    .replace(/https?:\/\/[^/]+/, '')
    .replace(/[^\p{L}\p{N}._-]+/gu, '-')
    .replace(/^-+|-+$/g, '');
  return slug || 'index';
}

/**
 * Convert several pages concurrently and write each one to `<slug>.md` inside
 * `outputDir`, prefixed with a `# Source: <url>` line. One log line is printed
 * per URL; a failed URL does not stop the others.
 *
 * Because the host is not part of the slug, URLs with the same path on
 * different hosts are written to the same file.
 *
 * @param {string[]} urls - Pages to convert.
 * @param {string} [outputDir='./markdown'] - Directory for the output files;
 *   created when missing.
 * @returns {Promise<PromiseSettledResult<string>[]>} One settled result per
 *   URL, in input order; fulfilled results hold the path that was written.
 */
async function batchConvert(urls, outputDir = './markdown') {
  await fs.mkdir(outputDir, { recursive: true });

  const results = await Promise.allSettled(
    urls.map(async url => {
      const md       = await websiteToMarkdown(url);
      const filename = path.join(outputDir, `${urlToSlug(url)}.md`);
      await fs.writeFile(filename, `# Source: ${url}\n\n${md}`);
      return filename;
    })
  );

  results.forEach((r, i) => {
    if (r.status === 'fulfilled') console.log('✓', r.value);
    // A rejection reason is not guaranteed to be an Error object.
    else console.error('✗', urls[i], r.reason?.message ?? r.reason);
  });

  return results;
}

html2text (Python)

html2text is the Python equivalent of Turndown — a battle-tested HTML-to-Markdown converter used by tools like Scrapy.

pip install html2text httpx beautifulsoup4

import html2text
import httpx
from bs4 import BeautifulSoup

# Shared converter instance; all options are set once, at import time.
h = html2text.HTML2Text()
h.body_width      = 0      # 0 disables line wrapping
h.unicode_snob    = True   # prefer Unicode characters over HTML entities
h.ignore_emphasis = False  # emphasis is not ignored
h.ignore_links    = False  # links are kept as [text](url)
h.protect_links   = True   # NOTE(review): presumably shields link URLs from wrapping — confirm in the html2text docs
h.ignore_images   = True   # image tags are skipped

def clean_html(html: str) -> str:
    """Strip page chrome from HTML and return the markup of the main content.

    Elements matching ``nav, footer, header, aside, script, style, .ad`` are
    removed. The first element matching ``article, main, .content, .post-body``
    is returned; without one, ``<body>`` is used, and without a ``<body>`` the
    whole remaining document.

    Args:
        html: Raw HTML of the page (a full document or a fragment).

    Returns:
        The selected element serialised back to an HTML string.
    """
    soup = BeautifulSoup(html, 'html.parser')
    for tag in soup.select('nav, footer, header, aside, script, style, .ad'):
        tag.decompose()
    # 'html.parser' does not add a missing <body>, so soup.body is None for a
    # bare fragment; fall back to the soup itself instead of returning the
    # literal string 'None'.
    main = soup.select_one('article, main, .content, .post-body') or soup.body or soup
    return str(main)

async def url_to_markdown(url: str) -> str:
    """Fetch a page and return its main content as Markdown.

    The page is downloaded with a 15 second timeout and redirects followed,
    cleaned with ``clean_html`` and converted with the shared ``h`` converter.

    Args:
        url: Address of the page to fetch.

    Returns:
        Stripped Markdown in which runs of three or more newlines have been
        reduced to two, so paragraphs stay separated by one blank line.

    Raises:
        Whatever ``r.raise_for_status()`` raises for an error response, plus
        any error raised by the HTTP client while fetching.
    """
    import re  # local import: only needed for the blank-line clean-up below

    async with httpx.AsyncClient(follow_redirects=True, timeout=15) as client:
        r = await client.get(url, headers={
            'User-Agent': 'Mozilla/5.0 (compatible; MarkdownBot/1.0)'
        })
        r.raise_for_status()
    cleaned = clean_html(r.text)
    md = h.handle(cleaned)
    # Collapse stacks of blank lines but keep one between blocks: dropping
    # every blank line would merge paragraphs, lists and headings together.
    return re.sub(r'\n{3,}', '\n\n', md).strip()

# Example: run the coroutine to completion and print the first 1000
# characters of the resulting Markdown.
import asyncio
md = asyncio.run(url_to_markdown('https://example.com/article'))
print(md[:1000])

Converting SPAs to Markdown with SnapAPI

For React/Vue/Angular apps, axios and httpx only receive the initial HTML — usually a near-empty shell, because the content is rendered client-side by JavaScript. Use SnapAPI to get the rendered HTML, then convert with Turndown or html2text:

const axios   = require('axios');
const cheerio = require('cheerio');
const TurndownService = require('turndown');

// Converter used by spaToMarkdown below: "#"-style headings and fenced code blocks.
const td = new TurndownService({ headingStyle: 'atx', codeBlockStyle: 'fenced' });

/**
 * Fetch a page's rendered HTML from the SnapAPI scrape endpoint and convert
 * its main content to Markdown.
 *
 * @param {string} url - Page to render and convert.
 * @returns {Promise<string>} Trimmed Markdown in which runs of three or more
 *   newlines have been reduced to two.
 */
async function spaToMarkdown(url) {
  // Step 1: request the rendered HTML; the API key is read from SNAPAPI_KEY.
  const scrapeRequest = {
    url,
    waitFor:  'networkidle',
    blockAds: true,
    blockCookieBanners: true
  };
  const requestConfig = { headers: { 'X-Api-Key': process.env.SNAPAPI_KEY } };
  const response = await axios.post(
    'https://api.snapapi.pics/v1/scrape',
    scrapeRequest,
    requestConfig
  );

  // Step 2: drop navigation/chrome and pick the content container,
  // falling back to <body> when no container matches.
  const $ = cheerio.load(response.data.html);
  $('nav, footer, header, aside, script, style').remove();
  let contentHtml = $('article, main, [role="main"]').first().html();
  if (contentHtml === null || contentHtml === undefined) {
    contentHtml = $('body').html();
  }

  // Step 3: convert and tidy up blank lines.
  const markdown = td.turndown(contentHtml);
  return markdown.replace(/\n{3,}/g, '\n\n').trim();
}

// Example run: save the Markdown to ./output.md. The rejection handler reports
// a failed request, conversion or write instead of leaving the promise
// rejection unhandled.
spaToMarkdown('https://app.example.com/docs/getting-started')
  .then(md => {
    require('fs').writeFileSync('./output.md', md);
    console.log(`Saved ${md.length} chars of Markdown`);
  })
  .catch(err => {
    console.error('Conversion failed:', err.message);
    process.exitCode = 1;
  });

AI-powered extraction for cleaner output

For maximum quality, SnapAPI's /v1/analyze can directly return the article content as clean Markdown — the AI extracts just the relevant text, discarding nav, ads, and boilerplate:

/**
 * Send a URL and an extraction prompt to the SnapAPI analyze endpoint and
 * return the `result` field of the response.
 *
 * @param {string} url - Page to analyze.
 * @returns {Promise<*>} `data.result` from the API response (AI-generated Markdown).
 */
async function aiToMarkdown(url) {
  const extractionPrompt = 'Extract the main article content and return it as clean Markdown. Include the title as a # heading, preserve all headings, lists, and code blocks. Exclude navigation, footers, ads, and sidebars.';
  const payload = { url, prompt: extractionPrompt, stealth: false };
  // The API key is read from the SNAPAPI_KEY environment variable.
  const config = { headers: { 'X-Api-Key': process.env.SNAPAPI_KEY } };

  const response = await axios.post('https://api.snapapi.pics/v1/analyze', payload, config);
  return response.data.result;
}

LLM tip: AI-extracted Markdown is typically 40–60% smaller than Turndown output from the same page, because it strips boilerplate that mechanical conversion keeps. Smaller context = cheaper LLM calls.

Plugging Into a RAG Pipeline

A common workflow: scrape documentation sites → convert to Markdown → chunk → embed → store in a vector DB.

const { OpenAI } = require('openai');

// Client shared by embedChunks below; constructed without explicit options.
// NOTE(review): presumably the SDK reads the API key from the environment
// (OPENAI_API_KEY) — confirm against the SDK documentation.
const openai = new OpenAI();

/**
 * Break a piece of text that is longer than `maxSize` into pieces of at most
 * `maxSize` characters: first at blank lines (paragraph boundaries), then, for
 * a paragraph that is still too long, at fixed offsets.
 *
 * @param {string} text - Text to break up.
 * @param {number} maxSize - Upper bound for the length of each piece.
 * @returns {string[]} Pieces that concatenate back to `text`.
 */
function splitOversized(text, maxSize) {
  // A limit below 1 (or NaN) cannot be honoured; leave the text untouched.
  if (!(maxSize >= 1) || text.length <= maxSize) return [text];

  const step = Math.floor(maxSize);
  const pieces = [];
  // The lookbehind keeps each blank-line separator attached to its paragraph.
  for (const paragraph of text.split(/(?<=\n\n)/)) {
    let start = 0;
    while (start < paragraph.length) {
      let end = Math.min(start + step, paragraph.length);
      // Do not cut between the two halves of a surrogate pair.
      const lastUnit = paragraph.charCodeAt(end - 1);
      const endsOnHighSurrogate = lastUnit >= 0xd800 && lastUnit <= 0xdbff;
      if (end < paragraph.length && end - start > 1 && endsOnHighSurrogate) end -= 1;
      pieces.push(paragraph.slice(start, end));
      start = end;
    }
  }
  return pieces;
}

/**
 * Chunk Markdown at heading boundaries.
 *
 * The text is split in front of every line starting with `#`, `##` or `###`
 * followed by a space, and consecutive sections are packed together while
 * they fit into `maxChunkSize` characters. A section that is too long on its
 * own is broken up further (see `splitOversized`), so no chunk exceeds a
 * positive `maxChunkSize`. Whitespace-only chunks are never emitted.
 *
 * Note: heading detection is line-based, so a line such as `# comment`
 * inside a fenced code block also counts as a possible split point.
 *
 * @param {string} md - Markdown source.
 * @param {number} [maxChunkSize=1000] - Maximum chunk length in characters.
 * @returns {string[]} Trimmed, non-empty chunks in document order.
 */
function chunkMarkdown(md, maxChunkSize = 1000) {
  const chunks = [];
  let current = '';

  const flush = () => {
    const trimmed = current.trim();
    if (trimmed) chunks.push(trimmed);
    current = '';
  };

  for (const section of md.split(/(?=^#{1,3} )/m)) {
    for (const piece of splitOversized(section, maxChunkSize)) {
      // Start a new chunk when adding this piece would overflow the current one.
      if (current && (current + piece).length > maxChunkSize) flush();
      current += piece;
    }
  }
  flush();
  return chunks;
}

/**
 * Embed every chunk with the `text-embedding-3-small` model in one request.
 *
 * @param {string[]} chunks - Texts to embed.
 * @returns {Promise<{text: string, embedding: number[]}[]>} One entry per
 *   returned embedding, each paired with the chunk it belongs to, ordered by
 *   chunk position.
 */
async function embedChunks(chunks) {
  // Nothing to embed: skip the request instead of sending an empty input list.
  if (chunks.length === 0) return [];

  const { data } = await openai.embeddings.create({
    model: 'text-embedding-3-small',
    input: chunks
  });

  // Pair each embedding with its text through the `index` reported alongside
  // it, when present, rather than trusting the order of the response array.
  // Entries without an index keep their position (the sort is stable).
  const ordered = [...data].sort((a, b) => (a.index ?? 0) - (b.index ?? 0));
  return ordered.map((d, i) => ({ text: chunks[d.index ?? i], embedding: d.embedding }));
}

/**
 * Full pipeline for one page: convert it to Markdown with `spaToMarkdown`,
 * chunk it with `chunkMarkdown`, embed the chunks with `embedChunks`, and log
 * a one-line summary.
 *
 * @param {string} url - Page to ingest.
 * @returns {Promise<Array>} Whatever `embedChunks` returned for the chunks.
 */
async function ingestUrl(url) {
  const markdown = await spaToMarkdown(url);
  const pieces = chunkMarkdown(markdown);
  const embedded = await embedChunks(pieces);

  const summary = `${url}: ${pieces.length} chunks, ${embedded.length} embeddings`;
  console.log(summary);

  return embedded;
}

Get clean Markdown from any URL

SnapAPI renders the page and returns clean text — or lets the AI extract just the article content as Markdown. 200 free requests/month.

Try SnapAPI Free →