Convolutional Neural Networks (CNNs)
Convolutional Neural Networks: The Architecture That Changed Computer Vision
Before CNNs, image classification required hand-crafted features. After CNNs, you feed raw pixels and the network learns what to look for. This breakthrough — learning visual features directly from data — transformed computer vision and ignited the deep learning revolution.
The Problem with Fully Connected Networks for Images
A 224×224 RGB image has 224 × 224 × 3 = 150,528 input values (50,176 pixels × 3 color channels). A single fully connected layer with 1000 neurons would need about 150 million parameters — just for the first layer. That is computationally wasteful and makes the network prone to massive overfitting.
CNNs solve this with two key ideas:
- Local connectivity — each neuron connects to a small region of the input
- Weight sharing — the same filter is applied across the entire image
The Building Blocks
Convolutional Layer
A filter (kernel) slides across the input, computing dot products at each position.
Input image (5×5):    Filter (3×3):    Output (3×3):
1 1 1 0 0             1 0 1
0 1 1 1 0             0 1 0       →    4 3 4
0 0 1 1 1             1 0 1            2 4 3
0 0 1 1 0                              2 3 4
0 1 1 0 0
The filter learns to detect features like edges, corners, textures. Deep layers combine these to detect complex patterns: faces, cars, text.
import torch
import torch.nn as nn
# A single convolutional layer mapping 3 input channels (RGB) to 32 feature
# maps, one per learned 3×3 filter. With stride 1 the filter advances one pixel
# at a time, and 1 pixel of padding keeps height and width unchanged.
conv = nn.Conv2d(3, 32, kernel_size=3, stride=1, padding=1)

# Conv2d expects input laid out as (batch, channels, height, width).
x = torch.randn(8, 3, 224, 224)  # a batch of 8 RGB images, 224×224 each
output = conv(x)

print(f"Input: {x.shape}")  # torch.Size([8, 3, 224, 224])
print(f"Output: {output.shape}")  # torch.Size([8, 32, 224, 224])
Pooling Layer
Reduces spatial dimensions — keeps the most important features, ignores exact positions.
# Three pooling variants, none of which has learnable parameters:
#   maxpool — keeps the maximum value in each 2×2 window, halving H and W
#   avgpool — same windows, but averages the values instead
#   gap     — global average pooling: reduces every feature map to 1×1
#             (replaces flatten in modern nets)
maxpool = nn.MaxPool2d(2, stride=2)
avgpool = nn.AvgPool2d(2, stride=2)
gap = nn.AdaptiveAvgPool2d(1)

x = torch.randn(8, 32, 224, 224)

output = maxpool(x)
print(f"After maxpool: {output.shape}")  # torch.Size([8, 32, 112, 112])

output = gap(x)
print(f"After GAP: {output.shape}")  # torch.Size([8, 32, 1, 1])
Building a CNN from Scratch
class SimpleCNN(nn.Module):
    """Small VGG-style CNN: two convolutional blocks plus an MLP classifier.

    Each block is (Conv 3×3 → BatchNorm → ReLU) × 2 → MaxPool(2) → Dropout2d.
    For a 32×32 input the feature maps shrink 32 → 16 → 8, so the classifier
    receives 64 × 8 × 8 = 4096 features. Inputs of other resolutions are
    brought to the same 8×8 grid by adaptive average pooling, which is an
    exact identity for 32×32 inputs.

    Args:
        num_classes: Number of output class scores.
        in_channels: Channels of the input image (3 for RGB, 1 for grayscale).

    Shape:
        Input ``(N, in_channels, H, W)`` → output ``(N, num_classes)`` of raw
        (un-normalised) class scores.

    Note:
        In training mode ``BatchNorm1d`` needs more than one sample per batch;
        call ``eval()`` before running single-image inference.
    """

    def __init__(self, num_classes=10, in_channels=3):
        super().__init__()

        # Feature extraction
        self.features = nn.Sequential(
            # Block 1: in_channels×32×32 → 32×32×32
            nn.Conv2d(in_channels, 32, kernel_size=3, padding=1),
            nn.BatchNorm2d(32),
            nn.ReLU(inplace=True),
            nn.Conv2d(32, 32, kernel_size=3, padding=1),
            nn.BatchNorm2d(32),
            nn.ReLU(inplace=True),
            nn.MaxPool2d(2, 2),  # → 32×16×16
            nn.Dropout2d(0.25),
            # Block 2: 32×16×16 → 64×16×16
            nn.Conv2d(32, 64, kernel_size=3, padding=1),
            nn.BatchNorm2d(64),
            nn.ReLU(inplace=True),
            nn.Conv2d(64, 64, kernel_size=3, padding=1),
            nn.BatchNorm2d(64),
            nn.ReLU(inplace=True),
            nn.MaxPool2d(2, 2),  # → 64×8×8
            nn.Dropout2d(0.25),
        )

        # Fix the spatial size fed to the classifier at 8×8 regardless of the
        # input resolution. Kept outside `features`/`classifier` (and it has no
        # parameters) so existing state_dict keys stay valid.
        self.pool = nn.AdaptiveAvgPool2d((8, 8))

        # Classifier
        self.classifier = nn.Sequential(
            nn.Flatten(),
            nn.Linear(64 * 8 * 8, 512),
            nn.BatchNorm1d(512),
            nn.ReLU(inplace=True),
            nn.Dropout(0.5),
            nn.Linear(512, num_classes),
        )

    def forward(self, x):
        """Return class scores of shape ``(N, num_classes)`` for a batch ``x``."""
        x = self.features(x)
        x = self.pool(x)
        x = self.classifier(x)
        return x
model = SimpleCNN(num_classes=10)
print(model)

# Switch to inference mode before pushing a single image through the network:
# in training mode BatchNorm1d cannot compute batch statistics from one sample
# and raises "Expected more than 1 value per channel when training".
model.eval()

x = torch.randn(1, 3, 32, 32)  # Single CIFAR-10-sized image
with torch.no_grad():  # Gradients are not needed for a shape check
    output = model(x)
print(f"Output shape: {output.shape}")  # (1, 10) — 10 class scores
Training on CIFAR-10
import torchvision
import torchvision.transforms as transforms
from torch.utils.data import DataLoader
# Per-channel mean and standard deviation used to normalize the images.
CIFAR10_MEAN = (0.4914, 0.4822, 0.4465)
CIFAR10_STD = (0.2023, 0.1994, 0.2010)

# Training pipeline: random flip, padded random crop and color jitter as data
# augmentation, followed by tensor conversion and normalization.
train_transforms = transforms.Compose([
    transforms.RandomHorizontalFlip(),
    transforms.RandomCrop(32, padding=4),
    transforms.ColorJitter(brightness=0.2, contrast=0.2),
    transforms.ToTensor(),
    transforms.Normalize(CIFAR10_MEAN, CIFAR10_STD),
])

# Test pipeline: no augmentation — only tensor conversion and normalization.
test_transforms = transforms.Compose([
    transforms.ToTensor(),
    transforms.Normalize(CIFAR10_MEAN, CIFAR10_STD),
])

# CIFAR-10 train/test splits, stored under ./data (download=True is requested).
train_set = torchvision.datasets.CIFAR10(
    root='./data', train=True, download=True, transform=train_transforms
)
test_set = torchvision.datasets.CIFAR10(
    root='./data', train=False, download=True, transform=test_transforms
)

# Only the training data is shuffled.
train_loader = DataLoader(
    train_set, batch_size=128, shuffle=True, num_workers=4
)
test_loader = DataLoader(
    test_set, batch_size=256, shuffle=False, num_workers=4
)

# Use the GPU when one is available, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = SimpleCNN(num_classes=10).to(device)

# Loss, optimizer and a cosine learning-rate schedule spanning 100 steps.
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.AdamW(
    model.parameters(), lr=0.001, weight_decay=5e-4
)
scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(optimizer, T_max=100)
def train_one_epoch(model, loader, optimizer, criterion, device=None):
    """Train ``model`` for one pass over ``loader``.

    Args:
        model: Network to optimize; it is switched to training mode.
        loader: Iterable yielding ``(images, labels)`` batches. It does not
            need to support ``len()``.
        optimizer: Optimizer that updates ``model``'s parameters.
        criterion: Loss, called as ``criterion(outputs, labels)``.
        device: Device each batch is moved to. Defaults to the device of the
            model's first parameter; if the model has no parameters, batches
            are left where they are.

    Returns:
        ``(mean_batch_loss, accuracy)``: the loss averaged over batches and
        the fraction of samples whose arg-max prediction equals the label.
        Returns ``(0.0, 0.0)`` when ``loader`` yields no batches.
    """
    if device is None:
        # Follow the model instead of relying on a module-level `device`.
        first_param = next(model.parameters(), None)
        if first_param is not None:
            device = first_param.device

    model.train()
    total_loss, correct, total, num_batches = 0.0, 0, 0, 0

    for images, labels in loader:
        if device is not None:
            images, labels = images.to(device), labels.to(device)

        optimizer.zero_grad()
        outputs = model(images)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()

        total_loss += loss.item()
        correct += (outputs.argmax(1) == labels).sum().item()
        total += len(labels)
        num_batches += 1

    # An empty loader would otherwise divide by zero.
    if num_batches == 0:
        return 0.0, 0.0

    accuracy = correct / total if total else 0.0
    return total_loss / num_batches, accuracy
# Training loop (abbreviated): one training pass and one scheduler step per
# epoch, with a progress line printed after every 10th epoch.
for epoch in range(100):
    train_loss, train_acc = train_one_epoch(
        model, train_loader, optimizer, criterion
    )
    scheduler.step()

    if (epoch + 1) % 10 != 0:
        continue
    print(f"Epoch {epoch+1}: Loss={train_loss:.4f}, Acc={train_acc:.3f}")
Transfer Learning: Standing on Giant Shoulders
Training from scratch is expensive. Transfer learning uses models pre-trained on ImageNet (1.2M images, 1000 classes) as a starting point.
import torchvision.models as models
# Number of classes in the target dataset — set this to match your own data.
num_classes = 10

# Option 1: Feature extraction — freeze all layers except the head
# (`weights=` replaces the `pretrained=True` flag, deprecated since
# torchvision 0.13.)
resnet18 = models.resnet18(weights=models.ResNet18_Weights.DEFAULT)

# Freeze all pretrained parameters
for param in resnet18.parameters():
    param.requires_grad = False

# Replace the final classification layer. Newly created layers have
# requires_grad=True, so only the fc parameters will be updated.
num_features = resnet18.fc.in_features
resnet18.fc = nn.Sequential(
    nn.Linear(num_features, 256),
    nn.ReLU(),
    nn.Dropout(0.5),
    nn.Linear(256, num_classes),
)
# NOTE: frozen BatchNorm layers still update their running statistics while
# the model is in train() mode.

# Option 2: Fine-tuning — unfreeze all layers, use small learning rate
efficientnet = models.efficientnet_b0(
    weights=models.EfficientNet_B0_Weights.DEFAULT
)
efficientnet.classifier[1] = nn.Linear(
    efficientnet.classifier[1].in_features, num_classes
)

# Use different learning rates for backbone vs head
optimizer = torch.optim.AdamW([
    {'params': efficientnet.features.parameters(), 'lr': 1e-5},  # Very small for pretrained
    {'params': efficientnet.classifier.parameters(), 'lr': 1e-3},  # Larger for new head
])
Transfer learning can often reach 90%+ accuracy on custom datasets with only a few hundred images per class — a result that is very hard to achieve when training from scratch on so little data.
Popular CNN Architectures
# Load pre-trained models (all pretrained on ImageNet).
# `weights=` replaces the `pretrained=True` flag, deprecated since
# torchvision 0.13; `.DEFAULT` selects the best available weights for each
# architecture.
vgg16 = models.vgg16(weights=models.VGG16_Weights.DEFAULT)  # Classic, simple, large
resnet50 = models.resnet50(weights=models.ResNet50_Weights.DEFAULT)  # Residual connections, workhorse
efficientnet_b0 = models.efficientnet_b0(
    weights=models.EfficientNet_B0_Weights.DEFAULT
)  # Most efficient
mobilenet_v3 = models.mobilenet_v3_small(
    weights=models.MobileNet_V3_Small_Weights.DEFAULT
)  # Mobile deployment
vit_b_16 = models.vit_b_16(weights=models.ViT_B_16_Weights.DEFAULT)  # Vision Transformer, state-of-the-art
| Architecture | Year | Params | Top-1 Acc | Use When |
|---|---|---|---|---|
| VGG-16 | 2014 | 138M | 74% | Simple baseline, easy to understand |
| ResNet-50 | 2015 | 25M | 80% | General purpose, reliable |
| EfficientNet-B0 | 2019 | 5M | 77% | Limited compute/memory |
| MobileNet-V3-Small | 2019 | 2.5M | 68% | Mobile/edge deployment |
| ViT-B/16 | 2020 | 86M | 85%+ | State-of-the-art, large datasets |
Visualizing CNN Features
import matplotlib.pyplot as plt
def visualize_filters(model_layer, n_filters=32):
    """Visualize learned convolutional filters as a grid of grayscale images.

    Each filter is averaged over its input channels and min-max scaled to
    [0, 1] before being drawn. The grid is 8 columns wide with as many rows
    as needed for the filters shown.

    Args:
        model_layer: Layer with a 4-D ``weight`` tensor indexed as
            ``(filter, input_channel, height, width)``, e.g. ``nn.Conv2d``.
        n_filters: Maximum number of filters to draw; fewer are drawn if the
            layer has fewer filters.
    """
    filters = model_layer.weight.detach().cpu().numpy()
    n_shown = max(0, min(n_filters, len(filters)))

    n_cols = 8
    # Ceiling division; keep at least one row so the figure is always valid.
    n_rows = max(1, -(-n_shown // n_cols))
    # squeeze=False always returns a 2-D array of axes, even for one row.
    fig, axes = plt.subplots(
        n_rows, n_cols, figsize=(16, 2 * n_rows), squeeze=False
    )

    for i, ax in enumerate(axes.flat):
        ax.axis('off')
        if i >= n_shown:
            continue
        # Take the mean across input channels for RGB
        filter_img = filters[i].mean(axis=0)
        filter_img = filter_img - filter_img.min()
        value_range = filter_img.max()
        # A constant filter has zero range; skip the division to avoid NaNs.
        if value_range > 0:
            filter_img = filter_img / value_range
        ax.imshow(filter_img, cmap='gray')

    fig.suptitle('Learned Convolutional Filters')
    plt.tight_layout()
    plt.show()
# Show the filters of the trained model's first convolutional layer.
first_conv = model.features[0]
visualize_filters(first_conv, n_filters=32)
CNNs remain foundational to computer vision even as Vision Transformers rise — most production vision systems still use CNN-based architectures or CNN-transformer hybrids.
Next lesson: Accuracy, Precision, Recall, and F1 — choosing the right metric for your problem.
Get this course's notes on Telegram!
Free cheat sheets, summaries & practice exercises