
Node.js Worker Threads — True Parallelism Without the Cluster Mess

Posted on: December 9, 2024 at 10:00 AM

Node.js is single-threaded. You’ve heard this a thousand times. It’s also not entirely true — and hasn’t been since Node.js 12.

Worker Threads (experimental in Node 10.5, stable since Node 12) give you real OS threads sharing the same process. Not child processes (cluster module), not separate Node.js instances — actual threads with shared memory access, running JavaScript in parallel.

This post explains when and how to use them, with benchmarks that show both when they help and when they make things worse.


The Problem: CPU-Bound Work

Node.js’s event loop is excellent at I/O concurrency — handling thousands of simultaneous connections, waiting on databases, calling external APIs. The one thing it cannot do: parallelize CPU work.

// This blocks the event loop for its entire duration
import crypto from 'crypto';

function computeHash(data, rounds) {
  let hash = data;
  for (let i = 0; i < rounds; i++) {
    hash = crypto.createHash('sha256').update(hash).digest('hex');
  }
  return hash;
}

// HTTP server
app.get('/hash', (req, res) => {
  const result = computeHash(req.query.input, 100_000); // ~500ms
  res.json({ hash: result });
});

During that 500ms computation, NO other requests can be processed. The entire event loop is frozen. 100 concurrent users → 50 seconds of stacked latency for the last user.
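The freeze is easy to observe: a zero-delay timer cannot fire while synchronous work holds the thread. A minimal standalone demo (not the hash code above):

```javascript
// block-demo.js: a 0ms timer is delayed until synchronous work finishes
const start = Date.now();

setTimeout(() => {
  // Fires only after the busy loop below releases the event loop
  console.log(`timer fired after ${Date.now() - start}ms`);
}, 0);

// CPU-bound busy loop (roughly a second on a typical machine)
let acc = 0;
for (let i = 0; i < 1e9; i++) acc += i;
console.log(`synchronous work done at ${Date.now() - start}ms`);
```

Despite the 0ms delay, the timer's logged latency matches the duration of the busy loop.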

Before Worker Threads, solutions were:

- Spawning child processes (child_process or the cluster module): heavyweight, with no shared memory
- Offloading to an external service or job queue: extra infrastructure and latency
- Writing native addons that manage their own threads: high maintenance cost

Worker Threads are the right tool for CPU-bound JavaScript.

Creating Your First Worker

// worker.js — runs in the thread
import { workerData, parentPort } from 'worker_threads';
import crypto from 'crypto';

function computeHash(data, rounds) {
  let hash = data;
  for (let i = 0; i < rounds; i++) {
    hash = crypto.createHash('sha256').update(hash).digest('hex');
  }
  return hash;
}

const result = computeHash(workerData.input, workerData.rounds);
parentPort.postMessage(result); // Send result to main thread
// main.js
import { Worker } from 'worker_threads';
import { fileURLToPath } from 'url';
import path from 'path';

const __dirname = path.dirname(fileURLToPath(import.meta.url));

function runHashInWorker(input, rounds) {
  return new Promise((resolve, reject) => {
    const worker = new Worker(path.join(__dirname, 'worker.js'), {
      workerData: { input, rounds },
    });
    worker.on('message', resolve);
    worker.on('error', reject);
    worker.on('exit', code => {
      if (code !== 0) reject(new Error(`Worker stopped with exit code ${code}`));
    });
  });
}

// Now the event loop is NOT blocked
app.get('/hash', async (req, res) => {
  const hash = await runHashInWorker(req.query.input, 100_000);
  res.json({ hash });
});

The worker runs in a separate thread. The await suspends the async function (yields to the event loop), so other requests can be processed during the 500ms computation.
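A common variant keeps the worker and main-thread logic in one file, branching on isMainThread. A sketch of the same hash example (assuming an ESM module, since it uses import.meta.url):

```javascript
// hash.js: one file that acts as both main thread and worker
import { Worker, isMainThread, parentPort, workerData } from 'worker_threads';
import crypto from 'crypto';
import { fileURLToPath } from 'url';

function computeHash(data, rounds) {
  let hash = data;
  for (let i = 0; i < rounds; i++) {
    hash = crypto.createHash('sha256').update(hash).digest('hex');
  }
  return hash;
}

const __filename = fileURLToPath(import.meta.url);

if (isMainThread) {
  // Main thread: spawn this same file as a worker
  const worker = new Worker(__filename, {
    workerData: { input: 'hello', rounds: 1000 },
  });
  worker.on('message', hash => console.log('hash:', hash));
  worker.on('error', err => { throw err; });
} else {
  // Worker thread: do the CPU-bound work and report back
  parentPort.postMessage(computeHash(workerData.input, workerData.rounds));
}
```

The trade-off is readability: with nontrivial worker logic, a separate file is usually clearer.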

The Thread Pool Pattern

Creating a new Worker for every request is expensive (~10ms startup). For production, maintain a pool of reusable workers:

import { Worker } from 'worker_threads';
import { EventEmitter } from 'events';
import os from 'os';

class WorkerPool extends EventEmitter {
  constructor(workerScript, numWorkers = os.cpus().length || 4) {
    super();
    this.workerScript = workerScript;
    this.workers = []; // every worker, busy or idle
    this.freeWorkers = []; // tracks which workers are actually idle
    this.queue = [];
    this.taskMap = new Map();
    for (let i = 0; i < numWorkers; i++) {
      this._createWorker();
    }
  }

  _createWorker() {
    const worker = new Worker(this.workerScript);
    worker.on('message', ({ id, result, error }) => {
      const task = this.taskMap.get(id);
      if (!task) return;
      this.taskMap.delete(id);
      if (error) task.reject(new Error(error));
      else task.resolve(result);
      // Worker is now free — either process queued work or mark idle
      if (this.queue.length > 0) {
        const next = this.queue.shift();
        this._dispatch(worker, next);
      } else {
        this.freeWorkers.push(worker);
      }
    });
    worker.on('error', err => console.error('Worker error:', err));
    this.workers.push(worker);
    this.freeWorkers.push(worker);
  }

  run(data) {
    return new Promise((resolve, reject) => {
      const id = Math.random().toString(36).slice(2);
      const task = { id, data, resolve, reject };
      const worker = this.freeWorkers.pop();
      if (worker) {
        this._dispatch(worker, task);
      } else {
        this.queue.push(task); // all workers busy — queue it
      }
    });
  }

  _dispatch(worker, task) {
    this.taskMap.set(task.id, task);
    worker.postMessage({ id: task.id, data: task.data });
  }

  async destroy() {
    // Terminate every worker, not just the idle ones
    await Promise.all(this.workers.map(w => w.terminate()));
  }
}

// Usage
const pool = new WorkerPool('./worker.js', 4); // 4 threads
app.get('/hash', async (req, res) => {
  const hash = await pool.run({ input: req.query.input, rounds: 100_000 });
  res.json({ hash });
});

With a pool of 4 workers on a 4-core machine, you can process 4 CPU-bound tasks simultaneously, with additional tasks queued efficiently.
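Note that the pool posts `{ id, data }` messages and expects `{ id, result, error }` replies, so its worker script must stay alive and listen for messages rather than exit after one job like the one-shot worker.js earlier. A minimal pool-compatible worker, reusing the hash example (the filename pool-worker.js is illustrative):

```javascript
// pool-worker.js: long-lived worker script for use with WorkerPool
import { parentPort } from 'worker_threads';
import crypto from 'crypto';

function computeHash(data, rounds) {
  let hash = data;
  for (let i = 0; i < rounds; i++) {
    hash = crypto.createHash('sha256').update(hash).digest('hex');
  }
  return hash;
}

// Stay alive: handle one task per message, echoing the task id back.
// Optional chaining keeps the file inert if loaded outside a worker.
parentPort?.on('message', ({ id, data }) => {
  try {
    const result = computeHash(data.input, data.rounds);
    parentPort.postMessage({ id, result });
  } catch (err) {
    parentPort.postMessage({ id, error: err.message });
  }
});
```

Because errors are caught and returned in the reply envelope, a bad input fails one task instead of killing the worker.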

SharedArrayBuffer: Zero-Copy Data Sharing

Workers communicate via message passing. By default, this copies data between threads (serialization → transfer). For large arrays, this is expensive.

SharedArrayBuffer allocates memory shared between all threads — no copying:

// main.js
const sharedBuffer = new SharedArrayBuffer(1_000_000 * 4); // 4MB, shared
const sharedArray = new Float32Array(sharedBuffer);

// Fill with data in main thread
for (let i = 0; i < 1_000_000; i++) {
  sharedArray[i] = Math.random();
}

// Worker receives the SharedArrayBuffer reference — no copy!
const worker = new Worker('./worker.js', {
  workerData: { sharedBuffer, length: 1_000_000 },
});
// worker.js
import { workerData, parentPort } from 'worker_threads';

const array = new Float32Array(workerData.sharedBuffer);

// Compute sum on the shared data (no transfer overhead)
let sum = 0;
for (let i = 0; i < workerData.length; i++) {
  sum += array[i];
}
parentPort.postMessage(sum);

Warning: SharedArrayBuffer requires careful synchronization. Multiple threads writing to the same memory simultaneously is a data race. Use Atomics for safe concurrent access:

// Safe atomic operations (on an Int32Array backed by a SharedArrayBuffer)
Atomics.add(int32Array, index, value); // Atomic read-modify-write increment
Atomics.compareExchange(int32Array, index, expected, replacement);
Atomics.wait(int32Array, index, expectedValue); // Block while value === expectedValue, until notified
Atomics.notify(int32Array, index, count); // Wake up to `count` waiting threads
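A concrete illustration: several workers incrementing a shared counter. With a plain `counter[0] += 1` the read-modify-write can race and increments get lost; `Atomics.add` makes each increment indivisible. A self-contained sketch using inline eval workers:

```javascript
import { Worker } from 'worker_threads';

const sab = new SharedArrayBuffer(4);
const counter = new Int32Array(sab);

// Each worker performs 100,000 atomic increments on the shared counter
const workerCode = `
  const { workerData, parentPort } = require('worker_threads');
  const counter = new Int32Array(workerData.sab);
  for (let i = 0; i < 100_000; i++) Atomics.add(counter, 0, 1);
  parentPort.postMessage('done');
`;

const workers = Array.from({ length: 4 }, () =>
  new Worker(workerCode, { eval: true, workerData: { sab } })
);

let done = 0;
for (const w of workers) {
  w.on('message', () => {
    if (++done === 4) {
      // 4 workers x 100,000 increments, none lost
      console.log('counter =', Atomics.load(counter, 0)); // 400000
    }
  });
}
```

Passing the SharedArrayBuffer through workerData shares the underlying memory; only the reference crosses the thread boundary.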

Benchmarks: When Workers Help (and When They Don’t)

Test 1: CPU-Intensive Hash Computation (Workers WIN)

Task: compute SHA-256 hash 100,000 times on a random string.

| Concurrency | Single thread | 4 Workers | Speedup |
|---|---|---|---|
| 1 request | 480ms | 490ms | 1x (overhead) |
| 4 concurrent | 1,920ms | 500ms | 3.8x |
| 16 concurrent | 7,680ms | 1,200ms | 6.4x |
| 64 concurrent | 30,720ms | 4,800ms | 6.4x |

At 4 concurrent requests matching 4 workers: near-linear speedup. Beyond that, queueing overhead appears, but throughput is still dramatically better than with a single thread.

Test 2: I/O-Bound Database Queries (Workers DON’T help)

Task: fetch a single row from PostgreSQL.

| Concurrency | Single thread | 4 Workers |
|---|---|---|
| 1 | 5ms | 15ms (overhead!) |
| 10 | 50ms | 55ms |
| 100 | 350ms | 380ms |

Workers are slower for I/O. The event loop handles I/O concurrency natively. Adding threads adds overhead without benefit. Never use Workers for I/O-bound work.

Test 3: Image Resizing (Workers WIN significantly)

Task: resize a 4K JPEG to 800x600 using sharp.

| Concurrency | Single thread | 4 Workers | Speedup |
|---|---|---|---|
| 1 | 210ms | 225ms | 1x |
| 4 | 840ms | 230ms | 3.6x |
| 20 | 4,200ms | 1,150ms | 3.6x |

Sharp uses libvips (C/C++ via native addon), and the CPU-intensive image processing already runs off the main thread. The benefit of Workers here is isolation and controlled concurrency — distributing work across multiple V8 isolates prevents one batch of resizes from starving other request handlers.

Worker Threads vs cluster

| Feature | Worker Threads | cluster |
|---|---|---|
| Memory sharing | Yes (SharedArrayBuffer) | No |
| Startup cost | ~10ms | ~200ms (full Node process) |
| Isolation | Separate V8 isolates (own heap), shared memory via SharedArrayBuffer | Separate heaps, separate processes |
| Communication | Fast message passing + SharedArrayBuffer | IPC (slower) |
| Best for | CPU-bound work, shared data | HTTP server scaling |
| Crash isolation | Worker crash emits ‘error’ event — handled gracefully if caught | Child crash is isolated from siblings |

For an HTTP server: cluster is still often better. Worker threads share one process, so an uncaught crash there takes down every in-flight request. With cluster, a crashed child doesn’t affect the others.

For CPU-bound work within a single server instance: Workers are the right tool.

The hybrid approach used in many production systems: cluster for HTTP request isolation + Worker pool for CPU-bound tasks within each cluster worker.
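A self-contained sketch of that hybrid. To keep it runnable in one file, the per-child thread is an inline eval worker with a simple promise wrapper; a real app would use the WorkerPool class and a separate pool worker script from the pool section:

```javascript
import cluster from 'cluster';
import http from 'http';
import os from 'os';
import { Worker } from 'worker_threads';

// Tiny inline worker body for the sketch (CommonJS, since eval workers are CJS)
const workerCode = `
  const { parentPort } = require('worker_threads');
  const crypto = require('crypto');
  parentPort.on('message', ({ id, input, rounds }) => {
    let hash = input;
    for (let i = 0; i < rounds; i++)
      hash = crypto.createHash('sha256').update(hash).digest('hex');
    parentPort.postMessage({ id, hash });
  });
`;

if (cluster.isPrimary) {
  // One HTTP process per core: a crash in one child leaves the others up
  for (let i = 0; i < os.cpus().length; i++) cluster.fork();
  cluster.on('exit', () => cluster.fork()); // replace crashed children
} else {
  // Inside each child: a thread for CPU-bound work
  const worker = new Worker(workerCode, { eval: true });
  const pending = new Map();
  let nextId = 0;
  worker.on('message', ({ id, hash }) => pending.get(id)?.(hash));
  const runHash = (input, rounds) =>
    new Promise(resolve => {
      const id = nextId++;
      pending.set(id, hash => { pending.delete(id); resolve(hash); });
      worker.postMessage({ id, input, rounds });
    });

  // cluster children share the listening socket automatically
  http.createServer(async (req, res) => {
    const hash = await runHash('data', 100_000);
    res.end(JSON.stringify({ hash }));
  }).listen(3000);
}
```

The primary never serves requests or touches worker threads; each child owns its own thread(s), so a thread failure is contained to that child.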

Practical Example: Parallel CSV Processing

// processor-worker.js
import { workerData, parentPort } from 'worker_threads';
import { parse } from 'csv-parse/sync';

const { csvChunk, chunkIndex } = workerData;
const records = parse(csvChunk, { columns: true, skip_empty_lines: true });

// Process records (CPU-intensive validation, transformation).
// categorize() and computeRecordHash() are app-specific helpers defined elsewhere.
const processed = records.map(record => ({
  ...record,
  normalized_amount: parseFloat(record.amount) * 1.05,
  category: categorize(record.merchant_type),
  hash: computeRecordHash(record),
}));

parentPort.postMessage({ chunkIndex, records: processed });
// main.js — split file into chunks, process in parallel
import fs from 'fs';
import { Worker } from 'worker_threads';

async function processLargeCSV(filepath, numWorkers = 4) {
  const fileContent = fs.readFileSync(filepath, 'utf-8');
  // Note: splitting on '\n' assumes no quoted fields contain newlines
  const lines = fileContent.split('\n');
  const header = lines[0];
  const dataLines = lines.slice(1);

  // Split into equal chunks, each prefixed with the header row
  const chunkSize = Math.ceil(dataLines.length / numWorkers);
  const chunks = Array.from({ length: numWorkers }, (_, i) =>
    [header, ...dataLines.slice(i * chunkSize, (i + 1) * chunkSize)].join('\n')
  );

  // Process all chunks in parallel
  const results = await Promise.all(
    chunks.map((chunk, i) =>
      new Promise((resolve, reject) => {
        const worker = new Worker('./processor-worker.js', {
          workerData: { csvChunk: chunk, chunkIndex: i },
        });
        worker.on('message', resolve);
        worker.on('error', reject);
      })
    )
  );

  // Merge results in order
  return results
    .sort((a, b) => a.chunkIndex - b.chunkIndex)
    .flatMap(r => r.records);
}

Processing a 1M row CSV: ~45 seconds single-threaded → ~12 seconds with 4 Workers (parsing + transformation is CPU-bound).

Worker Threads unlock Node.js’s potential on multi-core hardware for CPU-heavy workloads. Used correctly — with a pool, avoiding them for I/O, and careful synchronization when sharing memory — they’re the difference between a Node.js server that saturates one core and one that saturates all of them.