Screenshot to Text: OCR in Node.js and Python (2026)
Whether you're extracting text from a UI screenshot, digitising a scanned document, or reading data from an image, OCR (Optical Character Recognition) is the solution. This guide covers the full toolkit — from open-source Tesseract to cloud Vision APIs to AI-powered extraction — with working code in both Node.js and Python.
When You Need OCR
Not every "text from image" task needs OCR. If you control the source page, use a headless browser and read the DOM. OCR is for when the text is baked into pixels:
- Screenshots of legacy or desktop software
- Scanned PDFs and invoices
- Images of price tags, signage, or whiteboards
- Screenshots captured by automation where DOM access isn't possible
- Thumbnail or OG-image text you want to index
OCR Methods at a Glance
| Method | Language | Accuracy | Cost | Best for |
|---|---|---|---|---|
| Tesseract.js | Node.js | Good | Free | Clean printed text, Latin scripts |
| pytesseract | Python | Good | Free | Batch processing, local pipelines |
| Google Cloud Vision | Both | Excellent | ~$1.50/1K | Handwriting, complex layouts |
| AWS Textract | Both | Excellent | ~$1.50/1K | Forms, tables in documents |
| SnapAPI /analyze | Both | Excellent | Usage-based | Live web pages, no image upload needed |
Tesseract.js (Node.js)
Tesseract.js runs the Tesseract OCR engine in JavaScript via a WebAssembly build — no native binaries or system packages to install. Install it with npm and you're reading images in minutes.
npm install tesseract.js sharp
const Tesseract = require('tesseract.js');
/**
 * Basic OCR: recognise English text in a local image file or image URL.
 *
 * A dot is written to stdout for every progress message emitted by
 * Tesseract so long-running recognitions show signs of life.
 *
 * @param {string} imagePath - Path or URL of the image to read.
 * @returns {Promise<string>} The recognised text with surrounding whitespace removed.
 */
async function imageToText(imagePath) {
  const printProgressDot = () => process.stdout.write('.');
  const recognition = await Tesseract.recognize(imagePath, 'eng', {
    logger: printProgressDot,
  });
  const rawText = recognition.data.text;
  return rawText.trim();
}
// Example: OCR a local screenshot and print the result. The chain ends in a
// rejection handler so a missing file or OCR failure is reported instead of
// surfacing as an unhandled promise rejection.
imageToText('./screenshot.png')
  .then((text) => {
    console.log('Extracted text:\n', text);
  })
  .catch((err) => {
    console.error('OCR failed:', err);
    process.exitCode = 1;
  });
Image preprocessing with Sharp (improves accuracy dramatically)
const sharp = require('sharp');
const Tesseract = require('tesseract.js');
const path = require('path');
/**
 * Preprocess an image for better OCR accuracy and write the result next to
 * the input as `<name>-processed.png`:
 * - convert to greyscale
 * - stretch contrast to the full range (normalize)
 * - sharpen edges
 * - resize to 2400 px wide (smaller images are enlarged)
 * - encode as uncompressed PNG
 *
 * @param {string} inputPath - Path of the source image.
 * @returns {Promise<string>} Path of the processed PNG that was written.
 */
async function preprocessForOCR(inputPath) {
  // Build the output name from parsed path components. A regex swap of the
  // extension leaves extension-less inputs unchanged, which would make the
  // output path identical to the input path.
  const { dir, name } = path.parse(inputPath);
  const outputPath = path.format({ dir, name: `${name}-processed`, ext: '.png' });

  await sharp(inputPath)
    .greyscale()
    .normalize() // stretch contrast to full range
    .sharpen({ sigma: 1.5 }) // edge enhancement
    .resize({ width: 2400, withoutEnlargement: false }) // higher res → better OCR
    .png({ compressionLevel: 0 }) // lossless for OCR
    .toFile(outputPath);

  return outputPath;
}
/**
 * Preprocess an image with preprocessForOCR, then recognise English text in
 * it using page segmentation mode 6 (assume a single uniform block of text).
 *
 * @param {string} imagePath - Path of the source image.
 * @returns {Promise<{text: string, confidence: number, words: Array<{text: string, confidence: number}>}>}
 *   Trimmed text, overall confidence, and per-word text/confidence pairs.
 */
async function ocrWithPreprocessing(imagePath) {
  const processed = await preprocessForOCR(imagePath);

  // The third argument of Tesseract.recognize() carries worker options such
  // as `logger`; engine parameters like the page segmentation mode have to be
  // applied through worker.setParameters(), so use an explicit worker here.
  const worker = await Tesseract.createWorker('eng');
  try {
    await worker.setParameters({
      tessedit_pageseg_mode: '6', // PSM 6: assume uniform block of text
    });
    const { data } = await worker.recognize(processed);
    // NOTE(review): some tesseract.js releases may not populate `data.words`
    // by default — confirm against the installed version.
    const words = data.words ?? [];
    return {
      text: data.text.trim(),
      confidence: data.confidence,
      words: words.map((w) => ({ text: w.text, confidence: w.confidence })),
    };
  } finally {
    // Always release the worker, including when recognition throws.
    await worker.terminate();
  }
}
Reusable worker pool (batch OCR without re-initialising)
const { createWorker } = require('tesseract.js');
// Pool size: the number of CPU cores minus one, but never fewer than one worker
const os = require('os');
const CONCURRENCY = Math.max(1, os.cpus().length - 1);
/**
 * OCR many images with a pool of reusable Tesseract workers.
 *
 * At most CONCURRENCY workers are created (fewer when there are fewer
 * images). Each worker repeatedly claims the next unprocessed index until
 * the list is exhausted.
 *
 * @param {string[]} imagePaths - Images to recognise.
 * @returns {Promise<Array<{path: string, text: string}>>} One entry per input,
 *   in input order.
 * @throws Rejects with the first recognition error. Every worker is still
 *   terminated, and no further images are started after a failure.
 */
async function batchOCR(imagePaths) {
  const poolSize = Math.min(CONCURRENCY, imagePaths.length);
  const workers = await Promise.all(
    Array.from({ length: poolSize }, () => createWorker('eng')),
  );

  const results = new Array(imagePaths.length);
  let nextIndex = 0;
  let failed = false;

  await Promise.all(
    workers.map(async (worker) => {
      try {
        while (!failed && nextIndex < imagePaths.length) {
          // Claiming the index is synchronous, so no two workers get the same image.
          const i = nextIndex++;
          const { data } = await worker.recognize(imagePaths[i]);
          results[i] = { path: imagePaths[i], text: data.text.trim() };
        }
      } catch (err) {
        failed = true; // tell the other workers to stop claiming new images
        throw err;
      } finally {
        // Release the worker on failure too, so it is not leaked.
        await worker.terminate();
      }
    }),
  );

  return results;
}
pytesseract (Python)
The Python wrapper around Tesseract requires the Tesseract binary installed on the system. On Ubuntu: apt install tesseract-ocr. On macOS: brew install tesseract.
pip install pytesseract pillow opencv-python-headless
import pytesseract
from PIL import Image, ImageFilter, ImageEnhance
import cv2
import numpy as np
def preprocess_image(path: str) -> Image.Image:
    """Greyscale, adaptive threshold, and denoise an image for OCR.

    Args:
        path: Location of the image file to load.

    Returns:
        A PIL image built from the thresholded, median-blurred pixel array.

    Raises:
        ValueError: If OpenCV cannot read the file (missing, unreadable, or
            not a supported image format).
    """
    img = cv2.imread(path)
    # cv2.imread signals failure by returning None instead of raising;
    # fail here with a clear message rather than inside cvtColor.
    if img is None:
        raise ValueError(f"Could not read image: {path!r}")

    gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
    # Adaptive threshold — handles uneven lighting
    thresh = cv2.adaptiveThreshold(
        gray, 255,
        cv2.ADAPTIVE_THRESH_GAUSSIAN_C,
        cv2.THRESH_BINARY,
        blockSize=11, C=2
    )
    # Mild median blur to remove noise
    denoised = cv2.medianBlur(thresh, 3)
    return Image.fromarray(denoised)
def image_to_text(path: str, lang: str = 'eng') -> dict:
    """OCR a preprocessed image and report the text with a mean confidence.

    Args:
        path: Location of the image file.
        lang: Tesseract language code passed to pytesseract.

    Returns:
        A dict with ``text`` (stripped OCR output) and ``confidence`` (mean of
        the per-word confidences rounded to one decimal, 0.0 when no words
        were recognised).
    """
    img = preprocess_image(path)
    config = r'--oem 3 --psm 6'
    text = pytesseract.image_to_string(img, lang=lang, config=config)
    data = pytesseract.image_to_data(img, lang=lang, config=config,
                                     output_type=pytesseract.Output.DICT)

    # Rows that are not words carry a confidence of -1; real words range from
    # 0 upward, so keep zeros (dropping them would inflate the mean). float()
    # also tolerates values delivered as strings.
    # NOTE(review): confirm the -1 sentinel against the installed Tesseract.
    confidences = [float(c) for c in data['conf']]
    word_confidences = [c for c in confidences if c >= 0]
    avg_conf = (sum(word_confidences) / len(word_confidences)
                if word_confidences else 0.0)
    return {'text': text.strip(), 'confidence': round(avg_conf, 1)}
# Usage: OCR a sample invoice screenshot, then print the averaged confidence
# followed by the extracted text.
result = image_to_text('./invoice-screenshot.png')
print(f"Confidence: {result['confidence']}%")
print(result['text'])
Batch processing with asyncio
import asyncio
from concurrent.futures import ProcessPoolExecutor
def _ocr_worker(path):
    """Process-pool task: OCR one image and return ``(path, text)``."""
    extracted = image_to_text(path)
    return path, extracted['text']
async def batch_ocr(paths: list[str], workers: int = 4) -> list[dict]:
    """OCR several images in parallel using a process pool.

    Args:
        paths: Image files to process.
        workers: Maximum number of worker processes.

    Returns:
        One ``{'path': ..., 'text': ...}`` dict per input, in input order.
    """
    # Inside a coroutine the running loop is the one to use;
    # get_running_loop() never creates a new loop implicitly.
    loop = asyncio.get_running_loop()
    with ProcessPoolExecutor(max_workers=workers) as pool:
        futures = [loop.run_in_executor(pool, _ocr_worker, p) for p in paths]
        # gather() returns results in the order the futures were submitted.
        results = await asyncio.gather(*futures)
    return [{'path': p, 'text': t} for p, t in results]
# Script entry point: OCR every PNG under ./screenshots and print each path
# with the first 80 characters of its recognised text.
if __name__ == '__main__':
    from glob import glob

    screenshot_paths = glob('./screenshots/*.png')
    for entry in asyncio.run(batch_ocr(screenshot_paths)):
        print(entry['path'], '→', entry['text'][:80])
Google Cloud Vision API
Cloud Vision handles handwriting, complex multi-column layouts, and non-Latin scripts far better than Tesseract. It also supports DOCUMENT_TEXT_DETECTION which preserves paragraph structure.
npm install @google-cloud/vision
const vision = require('@google-cloud/vision');
const fs = require('fs');
const client = new vision.ImageAnnotatorClient();
/**
 * Run Cloud Vision document text detection on a local image file.
 *
 * @param {string} imagePath - Path of the image to upload.
 * @returns {Promise<{text: string, pages: string[], confidence?: number}>}
 *   The full text, one reconstructed string per page (paragraphs separated
 *   by newlines, text blocks by blank lines), and the first page's
 *   confidence. Returns `{ text: '', pages: [] }` when nothing is detected.
 */
async function visionOCR(imagePath) {
  // Read asynchronously so the event loop is not blocked, and send the raw
  // bytes as a Buffer rather than converting to base64 first.
  const imageBytes = await fs.promises.readFile(imagePath);
  const [result] = await client.documentTextDetection({
    image: { content: imageBytes },
  });

  const fullText = result.fullTextAnnotation;
  if (!fullText) return { text: '', pages: [] };

  // Rebuild text bottom-up: symbols → word, words → paragraph, paragraphs → block.
  const wordText = (word) => word.symbols.map((s) => s.text).join('');
  const paragraphText = (para) => para.words.map(wordText).join(' ');
  const blockText = (block) => block.paragraphs.map(paragraphText).join('\n');

  const pages = fullText.pages.map((page) =>
    page.blocks
      .filter((block) => block.blockType === 'TEXT') // skip non-text blocks
      .map(blockText)
      .join('\n\n'),
  );

  return { text: fullText.text, pages, confidence: fullText.pages[0]?.confidence };
}
/**
 * Run document text detection on an image referenced by URL, so no local
 * upload is needed.
 *
 * @param {string} imageUrl - URL of the image, sent as `imageUri`.
 * @returns {Promise<string>} The detected text, or '' when there is none.
 */
async function visionOCRFromUrl(imageUrl) {
  const request = { image: { source: { imageUri: imageUrl } } };
  const [firstResponse] = await client.documentTextDetection(request);
  const annotation = firstResponse.fullTextAnnotation;
  return annotation?.text ?? '';
}
AI Page Analysis with SnapAPI
SnapAPI's /v1/analyze endpoint takes a URL (not an image file) and returns AI-extracted text, data, or answers to specific questions. Perfect for reading text from live web pages without any image handling.
const axios = require('axios');
const SNAPAPI_ANALYZE_URL = 'https://api.snapapi.pics/v1/analyze';
const SNAPAPI_TIMEOUT_MS = 60000; // same 60 s budget as the Python client

/**
 * Shared request helper: POST a URL and prompt to the analyze endpoint and
 * return the `result` field of the response body.
 *
 * @param {string} url - Page to analyse.
 * @param {string} prompt - Instruction or question for the analysis.
 * @returns {Promise<*>} `res.data.result` from the API response.
 * @throws {Error} If SNAPAPI_KEY is not set, or if the HTTP request fails.
 */
async function requestPageAnalysis(url, prompt) {
  const apiKey = process.env.SNAPAPI_KEY;
  // Fail fast with a clear message instead of sending an unauthenticated request.
  if (!apiKey) {
    throw new Error('SNAPAPI_KEY environment variable is not set');
  }
  const res = await axios.post(
    SNAPAPI_ANALYZE_URL,
    { url, prompt, stealth: true },
    // Without an explicit timeout a stalled request would never settle.
    { headers: { 'X-Api-Key': apiKey }, timeout: SNAPAPI_TIMEOUT_MS },
  );
  return res.data.result;
}

/**
 * Extract all visible text from a live webpage.
 *
 * @param {string} url - Page to read.
 * @returns {Promise<*>} AI-formatted text output from the API.
 */
async function analyzePageText(url) {
  return requestPageAnalysis(
    url,
    'Extract all visible text content on this page, preserving headings and paragraph structure.',
  );
}

/**
 * Ask a specific question about a page's content.
 *
 * @param {string} url - Page to analyse.
 * @param {string} question - Question sent as the prompt.
 * @returns {Promise<*>} The API's answer.
 */
async function extractSpecificData(url, question) {
  return requestPageAnalysis(url, question);
}
// Examples — each chain ends in a rejection handler so a failed request is
// logged instead of becoming an unhandled promise rejection.
analyzePageText('https://competitor.com/pricing')
  .then(console.log)
  .catch((err) => console.error('Page analysis failed:', err));
extractSpecificData('https://shop.com/product/xyz', 'What is the current price and stock status?')
  .then(console.log)
  .catch((err) => console.error('Data extraction failed:', err));
import httpx, asyncio, os
async def analyze_page(url: str, prompt: str) -> str:
    """Send a page URL and prompt to the analyze endpoint and return its result.

    Args:
        url: Page to analyse.
        prompt: Instruction or question for the analysis.

    Returns:
        The ``result`` field of the JSON response.

    Raises:
        KeyError: If the SNAPAPI_KEY environment variable is not set.
        httpx.HTTPStatusError: If the response has an error status.
    """
    endpoint = 'https://api.snapapi.pics/v1/analyze'
    async with httpx.AsyncClient() as client:
        response = await client.post(
            endpoint,
            json={'url': url, 'prompt': prompt, 'stealth': True},
            headers={'X-Api-Key': os.environ['SNAPAPI_KEY']},
            timeout=60,
        )
        response.raise_for_status()
        body = response.json()
        return body['result']
# Example: ask the analyze endpoint for the form field labels and placeholder
# text on a signup page, then print the returned answer.
text = asyncio.run(analyze_page(
'https://app.example.com/register',
'List all form field labels and placeholder text visible on this page.'
))
print(text)
Building a Production OCR Pipeline
A practical pipeline for processing large volumes of screenshots:
const { createWorker } = require('tesseract.js');
const sharp = require('sharp');
const fs = require('fs/promises');
const path = require('path');
/**
 * OCR pipeline backed by a fixed pool of Tesseract workers.
 *
 * Usage: construct, `await init()`, call `process()` for each image (calls
 * may overlap), then `await terminate()`.
 */
class OCRPipeline {
  /**
   * @param {object} [options]
   * @param {number} [options.concurrency=4] - Number of workers to create.
   * @param {string} [options.lang='eng'] - Language passed to createWorker.
   * @param {string} [options.outputDir='./ocr-output'] - Directory for temporary preprocessed images.
   */
  constructor({ concurrency = 4, lang = 'eng', outputDir = './ocr-output' } = {}) {
    this.concurrency = concurrency;
    this.lang = lang;
    this.outputDir = outputDir;
    this.workers = [];
    this.queue = []; // idle workers
    this.waiters = []; // resolvers of process() calls waiting for a worker
    this.tmpCounter = 0; // keeps temp names unique within this pipeline
    this.ready = false;
  }

  /** Create the output directory and start all workers. */
  async init() {
    await fs.mkdir(this.outputDir, { recursive: true });
    this.workers = await Promise.all(
      Array.from({ length: this.concurrency }, () => createWorker(this.lang)),
    );
    this.queue = [...this.workers];
    this.ready = true;
  }

  /**
   * Take an idle worker, or wait until one is released.
   * @returns {Promise<object>} A worker owned by the caller until released.
   */
  acquireWorker() {
    const idle = this.queue.pop();
    if (idle) return Promise.resolve(idle);
    // Adapts a callback hand-off: releaseWorker() resolves this promise directly.
    return new Promise((resolve) => {
      this.waiters.push(resolve);
    });
  }

  /** Hand a worker to the longest-waiting caller, or return it to the idle list. */
  releaseWorker(worker) {
    const nextWaiter = this.waiters.shift();
    if (nextWaiter) {
      nextWaiter(worker);
    } else {
      this.queue.push(worker);
    }
  }

  /**
   * Preprocess one image (greyscale, normalize, resize to 2400 px wide) and OCR it.
   *
   * @param {string} imagePath - Image to process.
   * @returns {Promise<{path: string, text: string, confidence: number}>}
   * @throws {Error} If init() has not completed, or if preprocessing or recognition fails.
   */
  async process(imagePath) {
    if (!this.ready) {
      throw new Error('OCRPipeline.init() must be called before process()');
    }

    // pid + counter keep the temp name unique, so two inputs sharing a
    // basename (a/shot.png, b/shot.png) cannot overwrite each other.
    const tmpName = `_tmp_${process.pid}_${this.tmpCounter++}_${path.basename(imagePath)}`;
    const tmpPath = path.join(this.outputDir, tmpName);

    try {
      await sharp(imagePath)
        .greyscale()
        .normalize()
        .resize({ width: 2400, withoutEnlargement: false })
        .toFile(tmpPath);

      const worker = await this.acquireWorker();
      try {
        const { data } = await worker.recognize(tmpPath);
        return { path: imagePath, text: data.text.trim(), confidence: data.confidence };
      } finally {
        this.releaseWorker(worker); // return to pool even when recognition fails
      }
    } finally {
      // Best-effort cleanup on success and failure alike; a missing temp file
      // (e.g. preprocessing failed before writing) is deliberately ignored.
      await fs.unlink(tmpPath).catch(() => {});
    }
  }

  /** Terminate every worker; init() must be called again before further use. */
  async terminate() {
    this.ready = false;
    await Promise.all(this.workers.map((w) => w.terminate()));
    this.workers = [];
    this.queue = [];
  }
}
OCR Best Practices
- Always preprocess — greyscale + normalize + upscale improves Tesseract accuracy by 20–40%.
- Choose PSM mode carefully — PSM 6 for uniform blocks, PSM 11 for sparse text, PSM 3 (default) for mixed layouts.
- Filter by confidence — discard words with `confidence < 60` to reduce noise in output.
- Use Cloud Vision for handwriting or complex layouts — Tesseract struggles with cursive and multi-column newspaper-style pages.
- Cache results — OCR is CPU/API-expensive. Hash the image bytes and cache the text output in Redis or SQLite.
- For live URLs, skip OCR entirely — use SnapAPI /analyze or a headless browser to read the DOM directly.
Need text from a live webpage? Skip the OCR
SnapAPI renders the page in a real browser and extracts exactly what you need — text, data, or answers to specific questions. 200 free requests/month.
Try SnapAPI Free →