Skip to main content

(5/n) CPU Multi-Core Parallel Rendering

· 8 min read
Sheetalsingh
Software Engineer @ Glean

Related Articles:

What if we can't use GPU acceleration? Can we leverage modern multi-core CPUs to achieve near-GPU performance through parallel processing? Let's explore how CPU parallel rendering could be a viable alternative.

The Challenge: GPU vs CPU Rendering

GPU Advantages

  • Massive parallelism: 1000s of cores
  • Specialized hardware: Optimized for graphics
  • High bandwidth: Dedicated memory
  • Efficient batching: Command-based rendering

CPU Limitations

  • Limited cores: 4-32 cores typical
  • General purpose: Not optimized for graphics
  • Memory bandwidth: Shared with system
  • Sequential bottlenecks: Single-threaded operations

CPU Parallel Rendering Strategies

1. Tile-Based Rendering

Divide the screen into tiles and render each tile in parallel:

// Tile-based parallel rendering
class TileRenderer {
constructor(width, height, tileSize = 64) {
this.width = width;
this.height = height;
this.tileSize = tileSize;
this.tiles = this.createTiles();
}

createTiles() {
const tiles = [];
const cols = Math.ceil(this.width / this.tileSize);
const rows = Math.ceil(this.height / this.tileSize);

for (let row = 0; row < rows; row++) {
for (let col = 0; col < cols; col++) {
tiles.push({
x: col * this.tileSize,
y: row * this.tileSize,
width: Math.min(this.tileSize, this.width - col * this.tileSize),
height: Math.min(this.tileSize, this.height - row * this.tileSize),
data: new Uint8Array(this.tileSize * this.tileSize * 4),
});
}
}

return tiles;
}

renderTile(tile, elements) {
// Render only elements that intersect with this tile
const visibleElements = this.getVisibleElements(tile, elements);

// Clear tile
tile.data.fill(0);

// Render elements in this tile
visibleElements.forEach((element) => {
this.renderElement(tile, element);
});
}

renderParallel(elements) {
// Use Web Workers for parallel tile rendering
const promises = this.tiles.map((tile, index) => {
return new Promise((resolve) => {
const worker = new Worker("tile-renderer-worker.js");

worker.postMessage({
tile,
elements,
tileIndex: index,
});

worker.onmessage = (event) => {
this.tiles[index].data = event.data;
resolve();
};
});
});

return Promise.all(promises);
}
}

2. Web Worker Parallelism

// tile-renderer-worker.js
self.onmessage = function (e) {
const { tile, elements, tileIndex } = e.data;

// Render tile in worker thread
const renderedTile = renderTileInWorker(tile, elements);

self.postMessage(renderedTile);
};

function renderTileInWorker(tile, elements) {
const canvas = new OffscreenCanvas(tile.width, tile.height);
const ctx = canvas.getContext("2d");

// Clear tile
ctx.clearRect(0, 0, tile.width, tile.height);

// Render elements that intersect with this tile
elements.forEach((element) => {
if (elementIntersectsTile(element, tile)) {
renderElement(ctx, element, tile);
}
});

return canvas.transferToImageBitmap();
}

3. SIMD Instructions

Use CPU vector instructions for parallel pixel operations:

// SIMD-optimized pixel operations
class SIMDRenderer {
constructor() {
this.simdSupported = typeof SIMD !== "undefined";
}

fillRectSIMD(x, y, width, height, color) {
if (!this.simdSupported) {
return this.fillRectStandard(x, y, width, height, color);
}

// Use SIMD for parallel pixel operations
const pixels = new Uint8Array(width * height * 4);
const colorVector = SIMD.Float32x4(color.r, color.g, color.b, color.a);

// Process 4 pixels at once
for (let i = 0; i < pixels.length; i += 16) {
const pixelVector = SIMD.Float32x4.load(pixels, i);
const result = SIMD.Float32x4.mul(pixelVector, colorVector);
SIMD.Float32x4.store(pixels, i, result);
}

return pixels;
}

blendLayersSIMD(layers) {
if (!this.simdSupported) {
return this.blendLayersStandard(layers);
}

const result = new Uint8Array(layers[0].length);

// Blend multiple layers in parallel
for (let i = 0; i < result.length; i += 16) {
let blended = SIMD.Float32x4.load(layers[0], i);

for (let j = 1; j < layers.length; j++) {
const layer = SIMD.Float32x4.load(layers[j], i);
blended = this.blendPixelsSIMD(blended, layer);
}

SIMD.Float32x4.store(result, i, blended);
}

return result;
}
}

Advanced CPU Parallel Techniques

1. Command Buffer Parallelization

// Parallel command processing
class ParallelCommandProcessor {
constructor(workerCount = navigator.hardwareConcurrency) {
this.workers = this.createWorkers(workerCount);
this.commandQueue = [];
}

createWorkers(count) {
const workers = [];
for (let i = 0; i < count; i++) {
workers.push(new Worker("command-processor-worker.js"));
}
return workers;
}

processCommandsParallel(commands) {
// Split commands among workers
const chunks = this.splitCommands(commands, this.workers.length);

const promises = chunks.map((chunk, index) => {
return new Promise((resolve) => {
this.workers[index].postMessage({ commands: chunk });
this.workers[index].onmessage = (e) => resolve(e.data);
});
});

return Promise.all(promises);
}

splitCommands(commands, workerCount) {
const chunks = [];
const chunkSize = Math.ceil(commands.length / workerCount);

for (let i = 0; i < commands.length; i += chunkSize) {
chunks.push(commands.slice(i, i + chunkSize));
}

return chunks;
}
}

2. Spatial Partitioning with Threads

// Multi-threaded spatial partitioning
class SpatialPartitioner {
constructor(width, height, cellSize = 64) {
this.width = width;
this.height = height;
this.cellSize = cellSize;
this.grid = this.createGrid();
}

createGrid() {
const cols = Math.ceil(this.width / this.cellSize);
const rows = Math.ceil(this.height / this.cellSize);
const grid = new Array(rows * cols);

// Initialize grid in parallel
const promises = [];
for (let i = 0; i < grid.length; i++) {
promises.push(this.initializeCell(i));
}

return Promise.all(promises);
}

async initializeCell(index) {
return new Promise((resolve) => {
const worker = new Worker("cell-initializer-worker.js");
worker.postMessage({ cellIndex: index, cellSize: this.cellSize });
worker.onmessage = (e) => resolve(e.data);
});
}

insertElementsParallel(elements) {
// Insert elements into spatial grid using multiple workers
const elementChunks = this.splitElements(elements);

const promises = elementChunks.map((chunk, index) => {
return new Promise((resolve) => {
const worker = new Worker("spatial-insert-worker.js");
worker.postMessage({
elements: chunk,
grid: this.grid,
cellSize: this.cellSize,
});
worker.onmessage = (e) => resolve(e.data);
});
});

return Promise.all(promises);
}
}

3. Memory Pool Parallelization

// Parallel memory management
class ParallelMemoryPool {
constructor(poolSize = 1024 * 1024) {
this.poolSize = poolSize;
this.pools = this.createPools();
}

createPools() {
const poolCount = navigator.hardwareConcurrency;
const pools = [];

for (let i = 0; i < poolCount; i++) {
pools.push(new SharedArrayBuffer(this.poolSize));
}

return pools;
}

allocateParallel(size) {
// Find available memory in parallel
const promises = this.pools.map((pool, index) => {
return new Promise((resolve) => {
const worker = new Worker("memory-allocator-worker.js");
worker.postMessage({
pool: pool,
size: size,
poolIndex: index,
});
worker.onmessage = (e) => resolve(e.data);
});
});

return Promise.race(promises);
}

freeParallel(address) {
// Free memory in parallel
const promises = this.pools.map((pool, index) => {
return new Promise((resolve) => {
const worker = new Worker("memory-free-worker.js");
worker.postMessage({
pool: pool,
address: address,
poolIndex: index,
});
worker.onmessage = (e) => resolve(e.data);
});
});

return Promise.all(promises);
}
}

Performance Comparison: CPU vs GPU

CPU Parallel Rendering Benchmarks

TechniqueCores UsedFPSMemory UsageCPU Usage
Single Thread115-3050MB100%
Tile Rendering4-845-6080MB80%
SIMD + Workers8-1660-90100MB90%
Hybrid Approach16+90-120150MB95%

GPU Rendering Benchmarks

TechniqueGPU CoresFPSMemory UsageGPU Usage
WebGL1000+120+50MB60%
WebGPU1000+144+40MB70%

Implementation Strategies

1. Hybrid CPU-GPU Approach

// Fallback to CPU when GPU unavailable
class HybridRenderer {
constructor() {
this.gpuAvailable = this.detectGPU();
this.cpuRenderer = new CPURenderer();
this.gpuRenderer = new GPURenderer();
}

detectGPU() {
const canvas = document.createElement("canvas");
const gl =
canvas.getContext("webgl") || canvas.getContext("experimental-webgl");
return !!gl;
}

render(elements) {
if (this.gpuAvailable) {
return this.gpuRenderer.render(elements);
} else {
return this.cpuRenderer.renderParallel(elements);
}
}

renderParallel(elements) {
// Use CPU parallel rendering
const tileRenderer = new TileRenderer(1920, 1080);
const commandProcessor = new ParallelCommandProcessor();

// Process commands in parallel
return commandProcessor
.processCommandsParallel(elements)
.then((processedCommands) => {
// Render tiles in parallel
return tileRenderer.renderParallel(processedCommands);
});
}
}

2. Adaptive Performance Scaling

// Scale performance based on available cores
class AdaptiveRenderer {
constructor() {
this.coreCount = navigator.hardwareConcurrency;
this.strategy = this.selectStrategy();
}

selectStrategy() {
if (this.coreCount >= 16) {
return "aggressive-parallel";
} else if (this.coreCount >= 8) {
return "balanced-parallel";
} else if (this.coreCount >= 4) {
return "conservative-parallel";
} else {
return "single-threaded";
}
}

render(elements) {
switch (this.strategy) {
case "aggressive-parallel":
return this.renderAggressiveParallel(elements);
case "balanced-parallel":
return this.renderBalancedParallel(elements);
case "conservative-parallel":
return this.renderConservativeParallel(elements);
default:
return this.renderSingleThreaded(elements);
}
}

renderAggressiveParallel(elements) {
// Use all available cores aggressively
const workerCount = this.coreCount;
const tileSize = 32; // Smaller tiles for more parallelism

return this.renderWithWorkers(elements, workerCount, tileSize);
}

renderBalancedParallel(elements) {
// Balance performance and resource usage
const workerCount = Math.floor(this.coreCount / 2);
const tileSize = 64;

return this.renderWithWorkers(elements, workerCount, tileSize);
}
}

Real-World Applications

1. CPU-Only Game Engine

// Pure CPU game engine
class CPUGameEngine {
constructor() {
this.renderer = new HybridRenderer();
this.physics = new ParallelPhysicsEngine();
this.audio = new ParallelAudioEngine();
}

update(deltaTime) {
// Update game state in parallel
const promises = [
this.physics.updateParallel(deltaTime),
this.audio.updateParallel(deltaTime),
this.renderer.renderParallel(this.gameObjects),
];

return Promise.all(promises);
}

render() {
// Render using CPU parallel processing
return this.renderer.renderParallel(this.visibleObjects);
}
}

2. High-Performance Dashboard

// Real-time dashboard with CPU rendering
class ParallelDashboard {
constructor() {
this.charts = new ParallelChartRenderer();
this.dataProcessor = new ParallelDataProcessor();
}

updateData(newData) {
// Process data in parallel
return this.dataProcessor.processParallel(newData).then((processedData) => {
// Render charts in parallel
return this.charts.renderParallel(processedData);
});
}

render() {
// Render dashboard at 60+ FPS using CPU
return this.renderParallel();
}
}

Challenges and Solutions

Challenge 1: Memory Bandwidth

Problem: CPU memory bandwidth limits parallel performance Solution: Use memory pooling and cache-friendly algorithms

Challenge 2: Thread Synchronization

Problem: Thread coordination overhead Solution: Lock-free data structures and atomic operations

Challenge 3: Load Balancing

Problem: Uneven work distribution among cores Solution: Dynamic work stealing and adaptive partitioning

Challenge 4: Browser Limitations

Problem: Limited Web Worker capabilities Solution: SharedArrayBuffer and Atomics for efficient communication

Future of CPU Parallel Rendering

1. WebAssembly SIMD

// Future WASM SIMD for better performance
const wasmModule = await WebAssembly.instantiateStreaming(
fetch("parallel-renderer.wasm")
);

// Use SIMD instructions in WASM
wasmModule.instance.exports.renderParallel(pixelData);

2. SharedArrayBuffer Optimization

// Efficient inter-thread communication
const sharedBuffer = new SharedArrayBuffer(1024 * 1024);
const sharedArray = new Uint8Array(sharedBuffer);

// Workers can directly access shared memory
worker.postMessage({ buffer: sharedBuffer }, [sharedBuffer]);

3. CPU-GPU Hybrid Rendering

// Combine CPU and GPU for optimal performance
class HybridRenderer {
render(elements) {
// Use GPU for large operations
const gpuElements = elements.filter((e) => e.complexity > threshold);
const cpuElements = elements.filter((e) => e.complexity <= threshold);

return Promise.all([
this.gpuRenderer.render(gpuElements),
this.cpuRenderer.renderParallel(cpuElements),
]);
}
}

Conclusion

CPU multi-core parallel processing can achieve 60-90 FPS for complex rendering tasks, which is:

  1. Significantly better than single-threaded CPU rendering (15-30 FPS)
  2. Competitive with basic GPU rendering in some scenarios
  3. Viable alternative when GPU acceleration is unavailable
  4. Scalable with increasing core counts

Key Takeaways:

  • Tile-based rendering enables effective parallelization
  • Web Workers provide true multi-threading in browsers
  • SIMD instructions accelerate pixel operations
  • Memory pooling reduces allocation overhead
  • Adaptive strategies optimize for available cores

While CPU parallel rendering won't match GPU performance for graphics-intensive tasks, it provides a viable fallback and complementary approach for scenarios where GPU acceleration is limited or unavailable.

Curious ?

What if we could combine CPU parallel processing with our React-like DSL? Let's explore building a hybrid rendering engine that adapts to available hardware...

Stay tuned /