Backpropagation Explained
Backpropagation: How Neural Networks Actually Learn
Backpropagation is the algorithm that makes deep learning work. It's just calculus — specifically the chain rule — applied to compute gradients efficiently in a neural network. Understanding it deeply separates people who use neural networks from people who understand them.
The Core Problem: Assigning Credit
After a neural network makes a wrong prediction, you need to know: which weights contributed to this error, and by how much? That's the credit assignment problem, and backpropagation solves it.
Forward pass: Input → [Hidden Layers] → Output → Loss
Backward pass: Loss → [Hidden Layers] → Gradient for each weight
Each gradient tells you, for one weight: "nudge this weight up slightly → the loss increases (positive gradient) or decreases (negative gradient) at this rate."
The Chain Rule: The Math Behind Backprop
If Loss depends on output, which depends on hidden, which depends on weight:
∂Loss/∂weight = ∂Loss/∂output × ∂output/∂hidden × ∂hidden/∂weight
Each × connects one layer's gradient to the next
This chain of multiplications propagates the error backward
That's the entire algorithm. Everything in backprop is an application of this rule.
A Complete Manual Example
import numpy as np
class SimpleNeuralNet:
    """A tiny 2-layer network to demonstrate backprop manually.

    Architecture: input -> sigmoid hidden layer -> sigmoid output layer.
    The loss is binary cross-entropy and training is full-batch gradient
    descent. ``forward`` caches the intermediate activations on the instance
    so that ``backward`` can reuse them.
    """

    def __init__(self, input_size, hidden_size, output_size, lr=0.01):
        """Create randomly initialised weights and zero biases.

        Args:
            input_size: Number of input features.
            hidden_size: Number of hidden units.
            output_size: Number of output units.
            lr: Learning rate used by ``backward`` for the weight update.
        """
        # He-style scaling: std = sqrt(2 / fan_in). (Xavier/Glorot scaling
        # would instead use sqrt(1 / fan_in) or sqrt(2 / (fan_in + fan_out)).)
        self.W1 = np.random.randn(input_size, hidden_size) * np.sqrt(2 / input_size)
        self.b1 = np.zeros(hidden_size)
        self.W2 = np.random.randn(hidden_size, output_size) * np.sqrt(2 / hidden_size)
        self.b2 = np.zeros(output_size)
        self.lr = lr

    def sigmoid(self, z):
        """Return the element-wise logistic function of ``z``."""
        return 1 / (1 + np.exp(-z))

    def sigmoid_derivative(self, a):
        """Return the sigmoid derivative given the sigmoid *output* ``a``.

        If a = sigmoid(z), then da/dz = a(1-a).
        """
        return a * (1 - a)

    def forward(self, X):
        """Run the forward pass and cache z1, a1, z2, a2 on the instance.

        Args:
            X: Array-like of shape (n, input_size).

        Returns:
            The output activations ``a2`` of shape (n, output_size).
        """
        X = np.asarray(X)
        # Layer 1
        self.z1 = X @ self.W1 + self.b1          # (n, hidden)
        self.a1 = self.sigmoid(self.z1)          # (n, hidden)
        # Layer 2 (output)
        self.z2 = self.a1 @ self.W2 + self.b2    # (n, output)
        self.a2 = self.sigmoid(self.z2)          # (n, output)
        return self.a2

    def compute_loss(self, y_pred, y_true):
        """Return the mean binary cross-entropy between predictions and labels.

        Args:
            y_pred: Predicted probabilities, e.g. the (n, output) array
                returned by ``forward``.
            y_true: Labels with the same number of elements as ``y_pred``;
                a flat (n,) vector is accepted for a single-output network.

        Returns:
            The scalar loss.

        Raises:
            ValueError: If ``y_true`` cannot be reshaped to ``y_pred``'s shape.
        """
        y_pred = np.asarray(y_pred, dtype=float)
        # Align the labels with the predictions. Without this, a (n,) label
        # vector broadcasts against (n, 1) predictions into an (n, n) matrix
        # and the mean is taken over every (prediction, label) pair.
        y_true = np.asarray(y_true, dtype=float).reshape(y_pred.shape)
        eps = 1e-8  # keeps log() finite when a prediction is exactly 0 or 1
        per_element = (y_true * np.log(y_pred + eps)
                       + (1 - y_true) * np.log(1 - y_pred + eps))
        return -np.mean(per_element)

    def backward(self, X, y_true):
        """Backpropagate the loss and apply one gradient-descent update.

        Must be called after ``forward`` on the same ``X``, because it reads
        the cached activations ``a1`` and ``a2``.

        Args:
            X: Array-like of shape (n, input_size) used in the forward pass.
            y_true: Labels with as many elements as the cached output ``a2``.
        """
        X = np.asarray(X)
        m = self.a2.shape[0]  # number of samples in the cached forward pass

        # ===== BACKWARD PASS =====
        # Output layer error
        # dL/da2 = -(y/a2 - (1-y)/(1-a2)) for cross-entropy
        # × da2/dz2 = a2(1-a2) for sigmoid
        # Combined: dL/dz2 = a2 - y (elegant simplification!)
        dz2 = self.a2 - np.asarray(y_true).reshape(self.a2.shape)  # (n, output)

        # Gradient for W2 and b2
        dW2 = (self.a1.T @ dz2) / m      # (hidden, output)
        db2 = dz2.mean(axis=0)           # (output,)

        # Propagate through layer 2 weights back to layer 1.
        # This must use W2 *before* it is updated below.
        da1 = dz2 @ self.W2.T                             # (n, hidden)
        dz1 = da1 * self.sigmoid_derivative(self.a1)      # (n, hidden)

        # Gradient for W1 and b1
        dW1 = (X.T @ dz1) / m            # (input, hidden)
        db1 = dz1.mean(axis=0)           # (hidden,)

        # ===== UPDATE WEIGHTS =====
        # Gradient descent: move opposite to gradient
        self.W2 -= self.lr * dW2
        self.b2 -= self.lr * db2
        self.W1 -= self.lr * dW1
        self.b1 -= self.lr * db1

    def train(self, X, y, epochs=1000):
        """Train with full-batch gradient descent.

        Args:
            X: Array-like of shape (n, input_size).
            y: Labels for the n samples.
            epochs: Number of forward/backward iterations.

        Returns:
            List with the loss measured at the start of every epoch.
        """
        losses = []
        for epoch in range(epochs):
            # Forward pass
            y_pred = self.forward(X)
            # Compute loss
            loss = self.compute_loss(y_pred, y)
            losses.append(loss)
            # Backward pass — compute gradients and update weights
            self.backward(X, y)
            if epoch % 100 == 0:
                print(f"Epoch {epoch}: Loss = {loss:.4f}")
        return losses
# Train on XOR (non-linearly separable)
# Enumerate the four binary input pairs; the label is the XOR of the two bits.
X = np.array([[a, b] for a in (0, 1) for b in (0, 1)])
y = X[:, 0] ^ X[:, 1]  # XOR

net = SimpleNeuralNet(2, 4, 1, lr=1.0)
losses = net.train(X, y, epochs=1000)

# Evaluate one sample at a time; forward() expects a 2-D (n, input) batch,
# so each row gets a leading batch axis.
print("\nFinal predictions:")
for sample, target in zip(X, y):
    prediction = net.forward(sample[np.newaxis, :])[0, 0]
    print(f" Input {sample} → Pred: {prediction:.4f}, True: {target}")
Vanishing Gradients: The Deep Network Problem
When you multiply many small numbers together (chain rule through many layers), gradients shrink exponentially.
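Gradient reaching layer 1 (after passing back through 20 sigmoid layers) = grad × 0.25 × 0.25 × ... × 0.25   (20 factors; 0.25 is the largest value the sigmoid derivative can take)
= grad × 0.25^20
≈ grad × 0.000000000001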
Weights in early layers receive nearly zero gradient → they don't learn → deep networks fail.
import torch
import torch.nn as nn
import matplotlib.pyplot as plt
# Demonstrate vanishing gradients with tanh
def check_gradients(n_layers):
    """Measure per-layer gradient magnitude in a stack of Linear+Tanh layers.

    Builds ``n_layers`` pairs of ``nn.Linear(10, 10)`` followed by
    ``nn.Tanh()``, backpropagates the summed output for one random input,
    and returns the mean absolute weight gradient of every Linear layer,
    ordered from the first layer to the last.
    """
    modules = []
    for _ in range(n_layers):
        modules.append(nn.Linear(10, 10))
        modules.append(nn.Tanh())
    model = nn.Sequential(*modules)

    x = torch.randn(1, 10, requires_grad=True)
    model(x).sum().backward()

    # Only weight tensors are reported; biases are skipped.
    return [
        param.grad.abs().mean().item()
        for name, param in model.named_parameters()
        if 'weight' in name and param.grad is not None
    ]
grads_5 = check_gradients(5)
grads_20 = check_gradients(20)

# Shallow network: report every layer.
print("5-layer network, gradient norms by layer:")
for layer_no, value in enumerate(grads_5, start=1):
    print(f" Layer {layer_no}: {value:.6f}")

# Deep network: report the first five layers, then the last one for contrast.
print("\n20-layer network, gradient norms by layer:")
for layer_no, value in enumerate(grads_20[:5], start=1):
    print(f" Layer {layer_no}: {value:.10f}")
print(f" ... (last layer): {grads_20[-1]:.6f}")
Solutions to Vanishing Gradients
# Solution 1: ReLU activation (doesn't saturate like sigmoid/tanh)
class ModernNet(nn.Module):
    """ReLU multilayer perceptron mapping 10 input features to 1 output.

    The stack is Linear(10, 64) + ReLU, then ``n_layers - 2`` repetitions of
    Linear(64, 64) + ReLU, then a final Linear(64, 1). When ``n_layers`` is
    below 2, no hidden repetitions are added.
    """

    def __init__(self, n_layers):
        super().__init__()
        # Modules are created in network order: stem, hidden, head.
        stem = [nn.Linear(10, 64), nn.ReLU()]
        hidden = [
            module
            for _ in range(n_layers - 2)
            for module in (nn.Linear(64, 64), nn.ReLU())
        ]
        head = [nn.Linear(64, 1)]
        self.net = nn.Sequential(*stem, *hidden, *head)

    def forward(self, x):
        """Apply the sequential stack to ``x``."""
        return self.net(x)
# Solution 2: Residual connections (skip connections)
class ResidualBlock(nn.Module):
    """Two Linear+BatchNorm1d layers wrapped by an identity shortcut.

    The block computes ``relu(x + block(x))``, so input and output both have
    ``dim`` features.
    """

    def __init__(self, dim):
        super().__init__()
        first_linear = nn.Linear(dim, dim)
        first_norm = nn.BatchNorm1d(dim)
        inner_relu = nn.ReLU()
        second_linear = nn.Linear(dim, dim)
        second_norm = nn.BatchNorm1d(dim)
        self.block = nn.Sequential(
            first_linear, first_norm, inner_relu, second_linear, second_norm
        )
        self.relu = nn.ReLU()

    def forward(self, x):
        """Return the activated sum of the input and the transformed input."""
        transformed = self.block(x)
        # Identity shortcut: the gradient has a direct path through the
        # skip connection.
        return self.relu(x + transformed)
# Solution 3: Batch Normalization — normalizes activations
class BatchNormNet(nn.Module):
    """MLP (10 -> 64 -> 64 -> 1) with BatchNorm1d before each ReLU."""

    def __init__(self):
        super().__init__()
        self.net = nn.Sequential(
            nn.Linear(10, 64),
            nn.BatchNorm1d(64),  # Normalize before activation
            nn.ReLU(),
            nn.Linear(64, 64),
            nn.BatchNorm1d(64),
            nn.ReLU(),
            nn.Linear(64, 1),
        )

    def forward(self, x):
        """Apply the sequential stack to ``x`` (10 input features).

        Without this method the module cannot be called: ``nn.Module``'s
        default ``forward`` raises ``NotImplementedError``.
        """
        return self.net(x)
Gradient Clipping: Fighting Exploding Gradients
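Exploding gradients are the opposite problem: gradients become huge and training diverges. Clipping rescales the gradients whenever their overall norm exceeds a chosen threshold.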
import torch.optim as optim
model = ModernNet(n_layers=5)
optimizer = optim.Adam(params=model.parameters(), lr=1e-3)

# One optimisation step per batch: reset grads, forward, backward, clip, update.
for batch_X, batch_y in dataloader:
    optimizer.zero_grad()
    loss = criterion(model(batch_X), batch_y)
    loss.backward()
    # Clip gradients before update
    nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
    optimizer.step()
Gradient clipping is standard practice in RNN and transformer training.
Learning Rates: How Big Should Steps Be?
# Learning rate too high: overshoots minimum, diverges
# Learning rate too low: learns too slowly, gets stuck

# Learning rate schedulers
optimizer = optim.Adam(model.parameters(), lr=0.001)

# Use exactly ONE scheduler per optimizer. Binding all three to the same
# name in sequence would leave only the last one (OneCycleLR) active, while
# the loop below steps it with ReduceLROnPlateau's `step(metric)` call.
schedule_name = "plateau"  # one of: "plateau", "cosine", "one_cycle"

if schedule_name == "plateau":
    # Reduce on plateau
    scheduler = optim.lr_scheduler.ReduceLROnPlateau(
        optimizer, mode='min', factor=0.5, patience=10
    )
elif schedule_name == "cosine":
    # Cosine annealing (widely used in practice)
    scheduler = optim.lr_scheduler.CosineAnnealingLR(
        optimizer, T_max=100, eta_min=1e-6
    )
elif schedule_name == "one_cycle":
    # One-cycle schedule (fast.ai popularized this)
    scheduler = optim.lr_scheduler.OneCycleLR(
        optimizer, max_lr=0.01,
        steps_per_epoch=len(train_loader), epochs=30
    )
else:
    raise ValueError(f"Unknown schedule_name: {schedule_name!r}")

for epoch in range(epochs):
    train_loss = train_epoch(model, train_loader, optimizer)
    val_loss = validate(model, val_loader)
    if schedule_name == "plateau":
        # ReduceLROnPlateau needs the monitored metric.
        scheduler.step(val_loss)
    elif schedule_name == "cosine":
        # Epoch-based schedulers are stepped without a metric.
        scheduler.step()
    # OneCycleLR is meant to be stepped once per *batch* (its length is
    # steps_per_epoch * epochs), so step it inside the batch loop rather than
    # here. NOTE(review): train_epoch would need access to the scheduler.
Understanding backpropagation makes you a better deep learning practitioner — you know why networks fail, what the training curves mean, and how to fix training problems systematically.
Next lesson: Deep Learning with PyTorch — building and training real neural networks.
Get this course's notes on Telegram!
Free cheat sheets, summaries & practice exercises