OCR Node.js Python April 4, 2026

Screenshot to Text: OCR in Node.js and Python (2026)

Whether you're extracting text from a UI screenshot, digitising a scanned document, or reading data from an image, OCR (Optical Character Recognition) is the solution. This guide covers the full toolkit — from open-source Tesseract to cloud Vision APIs to AI-powered extraction — with working code in both Node.js and Python.

When You Need OCR

Not every "text from image" task needs OCR. If you control the source page, use a headless browser and read the DOM. OCR is for when the text is baked into pixels:

Screenshots of legacy or desktop software
Scanned PDFs and invoices
Images of price tags, signage, or whiteboards
Screenshots captured by automation where DOM access isn't possible
Thumbnail or OG-image text you want to index

OCR Methods at a Glance

Method	Language	Accuracy	Cost	Best for
Tesseract.js	Node.js	Good	Free	Clean printed text, Latin scripts
pytesseract	Python	Good	Free	Batch processing, local pipelines
Google Cloud Vision	Both	Excellent	~$1.50/1K	Handwriting, complex layouts
AWS Textract	Both	Excellent	~$1.50/1K	Forms, tables in documents
SnapAPI /analyze	Both	Excellent	Usage-based	Live web pages, no image upload needed

Tesseract.js (Node.js)

Tesseract.js runs the Tesseract OCR engine as a WebAssembly build inside Node.js — there is no native Tesseract binary or system package to install. Install it with npm and you're reading images in minutes.

npm install tesseract.js sharp

const Tesseract = require('tesseract.js');

/**
 * Basic OCR — read English text from a local image or URL.
 * Prints one dot to stdout for every logger message Tesseract emits.
 *
 * @param {string} imagePath - Image location handed to Tesseract.recognize.
 * @returns {Promise<string>} The recognised text with surrounding whitespace trimmed.
 */
async function imageToText(imagePath) {
  const printDot = () => process.stdout.write('.');
  const outcome = await Tesseract.recognize(imagePath, 'eng', { logger: printDot });
  const rawText = outcome.data.text;
  return rawText.trim();
}

// Demo run. The chain ends in .catch() so an OCR failure is reported and
// reflected in the exit code instead of becoming an unhandled rejection.
imageToText('./screenshot.png')
  .then(text => {
    console.log('Extracted text:\n', text);
  })
  .catch(err => {
    console.error('OCR failed:', err);
    process.exitCode = 1;
  });

Image preprocessing with Sharp (improves accuracy dramatically)

const sharp = require('sharp');
const Tesseract = require('tesseract.js');
const path = require('path');

/**
 * Preprocess an image for better OCR accuracy and write the result as a PNG
 * next to the input:
 * - Convert to greyscale
 * - Stretch contrast to the full range (normalize)
 * - Sharpen edges
 * - Resize to 2400 px wide (smaller images are enlarged)
 *
 * @param {string} inputPath - Path of the image to preprocess.
 * @returns {Promise<string>} Path of the written file: the input path with its
 *   extension replaced by `-processed.png`.
 */
async function preprocessForOCR(inputPath) {
  // Paths without an extension get the suffix appended, so the output can
  // never be the input file itself (sharp rejects identical input and output).
  const ext = path.extname(inputPath);
  const stem = ext ? inputPath.slice(0, -ext.length) : inputPath;
  const outputPath = `${stem}-processed.png`;

  await sharp(inputPath)
    .greyscale()
    .normalize()                    // stretch contrast to full range
    .sharpen({ sigma: 1.5 })       // edge enhancement
    .resize({ width: 2400, withoutEnlargement: false })  // higher res → better OCR
    .png({ compressionLevel: 0 })  // PNG is lossless; level 0 skips zlib compression
    .toFile(outputPath);
  return outputPath;
}

/**
 * Preprocess an image, then OCR it as a single uniform block of text (PSM 6).
 *
 * The page segmentation mode is an engine parameter, so it is applied with
 * `worker.setParameters()`. The options argument of `Tesseract.recognize()` is
 * handed to worker creation and does not set engine parameters.
 *
 * @param {string} imagePath - Path of the source image.
 * @returns {Promise<{text: string, confidence: number, words: Array<{text: string, confidence: number}>}>}
 *   Trimmed text, overall confidence, and per-word text/confidence pairs.
 */
async function ocrWithPreprocessing(imagePath) {
  const processed = await preprocessForOCR(imagePath);
  const worker = await Tesseract.createWorker('eng');
  try {
    await worker.setParameters({
      tessedit_pageseg_mode: '6'  // PSM 6: assume uniform block of text
    });
    const { data } = await worker.recognize(processed);
    return {
      text: data.text.trim(),
      confidence: data.confidence,
      // NOTE(review): word-level output may be absent depending on the
      // tesseract.js version/output options — fall back to an empty list.
      words: (data.words ?? []).map(w => ({ text: w.text, confidence: w.confidence }))
    };
  } finally {
    // Release the worker whether or not recognition succeeded
    await worker.terminate();
  }
}

Accuracy tip: Tesseract performs best on high-contrast, high-resolution images. A 2× upscale before OCR often raises accuracy by 20–30% on screenshots captured at 96 DPI.

Reusable worker pool (batch OCR without re-initialising)

const { createWorker } = require('tesseract.js');

// Use one worker per CPU core, keeping one core free (always at least one worker)
const os = require('os');
const CONCURRENCY = Math.max(1, os.cpus().length - 1);

/**
 * OCR a list of images with a small pool of reusable Tesseract workers.
 *
 * Each worker repeatedly claims the next unprocessed index, so results are
 * returned in the same order as `imagePaths`. Every worker is terminated even
 * when recognition fails; after a failure no further images are started and
 * the first error is rethrown once all workers have shut down.
 *
 * @param {string[]} imagePaths - Images to recognise.
 * @returns {Promise<Array<{path: string, text: string}>>} One entry per input.
 */
async function batchOCR(imagePaths) {
  const poolSize = Math.min(CONCURRENCY, imagePaths.length);
  const workers = await Promise.all(
    Array.from({ length: poolSize }, () => createWorker('eng'))
  );

  const results = [];
  let idx = 0;
  let failed = false;

  const outcomes = await Promise.allSettled(workers.map(async worker => {
    try {
      while (!failed && idx < imagePaths.length) {
        // Claim the index before awaiting so no two workers take the same image
        const i = idx++;
        const { data } = await worker.recognize(imagePaths[i]);
        results[i] = { path: imagePaths[i], text: data.text.trim() };
      }
    } catch (err) {
      failed = true; // tell the other workers to stop picking up new images
      throw err;
    } finally {
      await worker.terminate();
    }
  }));

  const rejected = outcomes.find(o => o.status === 'rejected');
  if (rejected) throw rejected.reason;

  return results;
}

pytesseract (Python)

The Python wrapper around Tesseract requires the Tesseract binary installed on the system. On Ubuntu: apt install tesseract-ocr. On macOS: brew install tesseract.

pip install pytesseract pillow opencv-python-headless

import pytesseract
from PIL import Image, ImageFilter, ImageEnhance
import cv2
import numpy as np

def preprocess_image(path: str) -> Image.Image:
    """Binarise and denoise an image so Tesseract reads it more reliably.

    Steps: greyscale conversion, adaptive Gaussian threshold, then a median
    blur with aperture size 3.

    Args:
        path: Path of the image file to load.

    Returns:
        A PIL image built from the thresholded, denoised array.

    Raises:
        ValueError: If OpenCV cannot read or decode the file at ``path``.
    """
    img = cv2.imread(path)
    if img is None:
        # cv2.imread reports failure by returning None instead of raising
        raise ValueError(f"Could not read image: {path!r}")
    gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
    # Adaptive threshold — handles uneven lighting
    thresh = cv2.adaptiveThreshold(
        gray, 255,
        cv2.ADAPTIVE_THRESH_GAUSSIAN_C,
        cv2.THRESH_BINARY,
        blockSize=11, C=2
    )
    # Mild median blur to remove noise
    denoised = cv2.medianBlur(thresh, 3)
    return Image.fromarray(denoised)

def image_to_text(path: str, lang: str = 'eng') -> dict:
    """OCR a preprocessed image and report the text with a mean confidence.

    Args:
        path: Path of the image file; it is run through ``preprocess_image`` first.
        lang: Tesseract language code.

    Returns:
        ``{'text': <stripped text>, 'confidence': <mean of the positive
        confidence values, rounded to one decimal>}``. The confidence is 0
        when no positive value is reported.
    """
    prepared = preprocess_image(path)
    ocr_kwargs = {'lang': lang, 'config': r'--oem 3 --psm 6'}
    raw_text = pytesseract.image_to_string(prepared, **ocr_kwargs)
    boxes = pytesseract.image_to_data(
        prepared, output_type=pytesseract.Output.DICT, **ocr_kwargs
    )
    # Only entries with a confidence above zero count towards the mean
    positive = [score for score in boxes['conf'] if score > 0]
    mean_conf = sum(positive) / max(1, len(positive))
    return {'text': raw_text.strip(), 'confidence': round(mean_conf, 1)}

# Usage — guarded so the demo OCR run happens only when this file is executed
# as a script, not when the module is imported (for example by process-pool
# workers started with the "spawn" method, which re-import the main module).
if __name__ == '__main__':
    result = image_to_text('./invoice-screenshot.png')
    print(f"Confidence: {result['confidence']}%")
    print(result['text'])

Batch processing with asyncio

import asyncio
from concurrent.futures import ProcessPoolExecutor

def _ocr_worker(path):
    """Process-pool task: OCR one image and return a ``(path, text)`` pair."""
    extracted = image_to_text(path)
    return path, extracted['text']

async def batch_ocr(paths: list[str], workers: int = 4) -> list[dict]:
    """OCR many images in parallel worker processes.

    Args:
        paths: Image file paths to process.
        workers: Maximum number of worker processes in the pool.

    Returns:
        One ``{'path': ..., 'text': ...}`` dict per input, in the order of
        ``paths``. An empty input returns an empty list without creating a pool.
    """
    if not paths:
        return []
    # get_running_loop() is the recommended way to get the loop inside a coroutine
    loop = asyncio.get_running_loop()
    with ProcessPoolExecutor(max_workers=workers) as pool:
        tasks = [loop.run_in_executor(pool, _ocr_worker, p) for p in paths]
        results = await asyncio.gather(*tasks)
    return [{'path': p, 'text': t} for p, t in results]

if __name__ == '__main__':
    # Script entry point: OCR every PNG under ./screenshots and print an
    # 80-character preview of each result.
    from glob import glob

    screenshot_paths = glob('./screenshots/*.png')
    for entry in asyncio.run(batch_ocr(screenshot_paths)):
        preview = entry['text'][:80]
        print(entry['path'], '→', preview)

Google Cloud Vision API

Cloud Vision handles handwriting, complex multi-column layouts, and non-Latin scripts far better than Tesseract. It also offers a DOCUMENT_TEXT_DETECTION mode, which returns the text organised into pages, blocks, and paragraphs so the document structure is preserved.

npm install @google-cloud/vision

const vision = require('@google-cloud/vision');
const fs = require('fs');

const client = new vision.ImageAnnotatorClient();

/**
 * OCR a local image with Cloud Vision document text detection.
 *
 * @param {string} imagePath - Path of the image file to send.
 * @returns {Promise<{text: string, pages: string[], confidence?: number}>}
 *   `text` is the full text from the annotation. `pages` holds one string per
 *   page, built from its TEXT blocks: words joined by spaces, paragraphs by
 *   newlines, blocks by blank lines. `confidence` is the first page's
 *   confidence and is absent when the response has no text annotation.
 */
async function visionOCR(imagePath) {
  // Read asynchronously so a large file does not block the event loop,
  // then send the bytes base64-encoded
  const imageBytes = (await fs.promises.readFile(imagePath)).toString('base64');

  const [result] = await client.documentTextDetection({
    image: { content: imageBytes }
  });

  const fullText = result.fullTextAnnotation;
  if (!fullText) return { text: '', pages: [] };

  // Extract structured paragraphs
  const pages = fullText.pages.map(page =>
    page.blocks
      .filter(b => b.blockType === 'TEXT')
      .map(block =>
        block.paragraphs
          .map(para =>
            para.words.map(w => w.symbols.map(s => s.text).join('')).join(' ')
          ).join('\n')
      ).join('\n\n')
  );

  return { text: fullText.text, pages, confidence: fullText.pages[0]?.confidence };
}

/**
 * Or pass a URL directly (no upload needed).
 *
 * @param {string} imageUrl - Image URL sent to Cloud Vision as `imageUri`.
 * @returns {Promise<string>} The annotation's full text, or '' when there is none.
 */
async function visionOCRFromUrl(imageUrl) {
  const request = { image: { source: { imageUri: imageUrl } } };
  const responses = await client.documentTextDetection(request);
  const annotation = responses[0].fullTextAnnotation;
  return annotation?.text ?? '';
}

Pricing: Cloud Vision charges ~$1.50 per 1,000 images. For high-volume workloads, Tesseract at $0 often makes more sense — especially after preprocessing.

AI Page Analysis with SnapAPI

SnapAPI's /v1/analyze endpoint takes a URL (not an image file) and returns AI-extracted text, data, or answers to specific questions. Perfect for reading text from live web pages without any image handling.

const axios = require('axios');

const SNAPAPI_ANALYZE_URL = 'https://api.snapapi.pics/v1/analyze';

/**
 * POST a URL and prompt to the analyze endpoint and return the `result` field
 * of the response. Shared by both helpers below so the request shape, API-key
 * check, and timeout live in one place.
 *
 * @param {string} url - Page to analyse.
 * @param {string} prompt - Instruction or question for the analysis.
 * @returns {Promise<*>} `res.data.result` from the response.
 * @throws {Error} When the SNAPAPI_KEY environment variable is not set.
 */
async function requestAnalysis(url, prompt) {
  const apiKey = process.env.SNAPAPI_KEY;
  if (!apiKey) {
    throw new Error('SNAPAPI_KEY environment variable is not set');
  }
  const res = await axios.post(SNAPAPI_ANALYZE_URL, {
    url,
    prompt,
    stealth: true
  }, {
    headers: { 'X-Api-Key': apiKey },
    timeout: 60000 // ms — same 60 s limit as the Python client in this guide
  });

  return res.data.result;
}

// Extract all visible text from a live webpage
async function analyzePageText(url) {
  return requestAnalysis(
    url,
    'Extract all visible text content on this page, preserving headings and paragraph structure.'
  ); // AI-formatted text output
}

// Ask specific questions about page content
async function extractSpecificData(url, question) {
  return requestAnalysis(url, question);
}

// Examples — each chain ends in .catch() so a failed request is logged
// instead of surfacing as an unhandled promise rejection
analyzePageText('https://competitor.com/pricing')
  .then(console.log)
  .catch(console.error);
extractSpecificData('https://shop.com/product/xyz', 'What is the current price and stock status?')
  .then(console.log)
  .catch(console.error);

import httpx, asyncio, os

async def analyze_page(url: str, prompt: str) -> str:
    """Send a URL and prompt to the analyze endpoint and return its result.

    Args:
        url: Page to analyse.
        prompt: Instruction or question for the analysis.

    Returns:
        The ``result`` field of the JSON response.

    Raises:
        KeyError: If ``SNAPAPI_KEY`` is missing from the environment.
        Whatever ``raise_for_status()`` raises for an error response.
    """
    async with httpx.AsyncClient() as client:
        payload = {'url': url, 'prompt': prompt, 'stealth': True}
        auth_headers = {'X-Api-Key': os.environ['SNAPAPI_KEY']}
        response = await client.post(
            'https://api.snapapi.pics/v1/analyze',
            json=payload,
            headers=auth_headers,
            timeout=60,
        )
        response.raise_for_status()
        body = response.json()
        return body['result']

# Extract form labels and field names from a signup page
signup_url = 'https://app.example.com/register'
label_prompt = 'List all form field labels and placeholder text visible on this page.'
text = asyncio.run(analyze_page(signup_url, label_prompt))
print(text)

Use case: SnapAPI /analyze is ideal when the "image" is actually a live URL — no need to capture, store, and then OCR a screenshot. The API renders the page in a real browser and runs AI analysis in one step.

Building a Production OCR Pipeline

A practical pipeline for processing large volumes of screenshots:

const { createWorker } = require('tesseract.js');
const sharp = require('sharp');
const fs = require('fs/promises');
const path = require('path');

/**
 * OCR pipeline backed by a fixed pool of reusable Tesseract workers.
 *
 * Usage: construct, `await init()`, call `process()` as often as needed
 * (calls may overlap), then `await terminate()`.
 */
class OCRPipeline {
  /**
   * @param {object} [options]
   * @param {number} [options.concurrency=4] - Number of workers created by init().
   * @param {string} [options.lang='eng'] - Language passed to createWorker.
   * @param {string} [options.outputDir='./ocr-output'] - Directory for temporary preprocessed images.
   */
  constructor({ concurrency = 4, lang = 'eng', outputDir = './ocr-output' } = {}) {
    this.concurrency = concurrency;
    this.lang = lang;
    this.outputDir = outputDir;
    this.workers = [];
    this.queue = [];      // idle workers
    this.waiters = [];    // resolve callbacks of process() calls waiting for a worker
    this.tmpCounter = 0;  // makes temp file names unique within this pipeline
  }

  /** Create the output directory and start the worker pool. */
  async init() {
    await fs.mkdir(this.outputDir, { recursive: true });
    this.workers = await Promise.all(
      Array.from({ length: this.concurrency }, () => createWorker(this.lang))
    );
    this.queue = [...this.workers];
  }

  /**
   * Take an idle worker, or wait until one is released (no polling).
   * @private
   * @returns {Promise<object>} A worker owned by the caller until released.
   */
  acquireWorker() {
    if (this.queue.length) return Promise.resolve(this.queue.pop());
    return new Promise(resolve => this.waiters.push(resolve));
  }

  /**
   * Hand a worker to the longest-waiting caller, or put it back in the pool.
   * @private
   */
  releaseWorker(worker) {
    const next = this.waiters.shift();
    if (next) {
      next(worker);
    } else {
      this.queue.push(worker);
    }
  }

  /**
   * Preprocess one image and OCR it with a pooled worker.
   *
   * @param {string} imagePath - Path of the image to process.
   * @returns {Promise<{path: string, text: string, confidence: number}>}
   * @throws {Error} When called before init() (or after terminate()).
   */
  async process(imagePath) {
    if (!this.workers.length) {
      throw new Error('OCRPipeline.init() must be called before process()');
    }

    // The counter keeps names distinct for inputs that share a basename; the
    // .png suffix makes sharp write the temp file as PNG whatever the input is
    const tmpName = `_tmp_${this.tmpCounter++}_${path.basename(imagePath)}.png`;
    const tmpPath = path.join(this.outputDir, tmpName);

    try {
      // Preprocess
      await sharp(imagePath).greyscale().normalize().resize({ width: 2400, withoutEnlargement: false }).toFile(tmpPath);

      // Acquire worker from pool
      const worker = await this.acquireWorker();
      try {
        const { data } = await worker.recognize(tmpPath);
        return { path: imagePath, text: data.text.trim(), confidence: data.confidence };
      } finally {
        this.releaseWorker(worker); // return to pool
      }
    } finally {
      // Best-effort cleanup on success and failure; a missing temp file is not an error
      await fs.unlink(tmpPath).catch(() => {});
    }
  }

  /** Terminate every worker and empty the pool. */
  async terminate() {
    const active = this.workers;
    this.workers = [];
    this.queue = [];
    await Promise.all(active.map(w => w.terminate()));
  }
}

OCR Best Practices

Always preprocess — greyscale + normalize + upscale improves Tesseract accuracy by 20–40%.
Choose PSM mode carefully — PSM 6 for uniform blocks, PSM 11 for sparse text, PSM 3 (default) for mixed layouts.
Filter by confidence — discard words with confidence < 60 to reduce noise in output.
Use Cloud Vision for handwriting or complex layouts — Tesseract struggles with cursive and multi-column newspaper-style pages.
Cache results — OCR is CPU/API-expensive. Hash the image bytes and cache the text output in Redis or SQLite.
For live URLs, skip OCR entirely — use SnapAPI /analyze or a headless browser to read the DOM directly.

Need text from a live webpage? Skip the OCR

SnapAPI renders the page in a real browser and extracts exactly what you need — text, data, or answers to specific questions. 200 free requests/month.

Try SnapAPI Free →