Convolutional Neural Networks (CNNs) | Machine Learning Fundamentals | AiTechWorlds

Convolutional Neural Networks: The Architecture That Changed Computer Vision

Before CNNs, image classification required hand-crafted features. After CNNs, you feed raw pixels and the network learns what to look for. This breakthrough — learning visual features directly from data — transformed computer vision and ignited the deep learning revolution.

The Problem with Fully Connected Networks for Images

A 224×224 RGB image has 224 × 224 × 3 = 150,528 input values (50,176 pixels × 3 color channels). A single fully connected layer with 1000 neurons would need about 150 million weights — just for the first layer. That is computationally expensive, and a layer with that many parameters overfits easily.

CNNs solve this with two key ideas:

Local connectivity — each neuron connects to a small region of the input
Weight sharing — the same filter is applied across the entire image

The Building Blocks

Convolutional Layer

A filter (kernel) slides across the input, computing dot products at each position.

Input image (5×5):        Filter (3×3):       Output (3×3):
1 1 1 0 0                 1 0 1               4 3 4
0 1 1 1 0                 0 1 0        →      2 4 3
0 0 1 1 1                 1 0 1               2 3 4
0 0 1 1 0
0 1 1 0 0

The filter learns to detect features like edges, corners, textures. Deep layers combine these to detect complex patterns: faces, cars, text.

import torch
import torch.nn as nn

# One convolutional layer: 3 input channels (RGB) are mapped to 32 feature
# maps by 32 learned 3×3 filters. With stride 1 and one pixel of padding on
# every side, a 3×3 kernel leaves height and width unchanged.
conv = nn.Conv2d(3, 32, kernel_size=3, stride=1, padding=1)

# Image batches are laid out as (batch, channels, height, width).
x = torch.randn(8, 3, 224, 224)  # 8 random stand-ins for 224×224 RGB images
output = conv(x)

# Only the channel dimension changes: 3 → 32.
print(f"Input: {x.shape}")       # torch.Size([8, 3, 224, 224])
print(f"Output: {output.shape}") # torch.Size([8, 32, 224, 224])

Pooling Layer

Reduces spatial dimensions — keeps the most important features, ignores exact positions.

# Three parameter-free pooling layers.
# Max pooling: keep the largest value of each 2×2 window → halves H and W.
maxpool = nn.MaxPool2d(2, 2)
# Average pooling: keep the mean of each 2×2 window instead.
avgpool = nn.AvgPool2d(2, 2)
# Global average pooling: average every feature map down to a single 1×1
# value, whatever the input size (modern nets use this in place of large
# fully connected layers on the flattened feature map).
gap = nn.AdaptiveAvgPool2d(1)

x = torch.randn(8, 32, 224, 224)

output = maxpool(x)
print(f"After maxpool: {output.shape}")  # torch.Size([8, 32, 112, 112])

output = gap(x)
print(f"After GAP: {output.shape}")  # torch.Size([8, 32, 1, 1])

Building a CNN from Scratch

class SimpleCNN(nn.Module):
    """Small two-stage CNN classifier for 3×32×32 images (e.g. CIFAR-10).

    ``features`` applies two conv blocks, each halving the spatial size
    (3×32×32 → 32×16×16 → 64×8×8). ``classifier`` flattens the result and
    maps it through a 512-unit hidden layer to ``num_classes`` scores.
    The first linear layer is sized for a 64×8×8 feature map, so inputs
    must be 32×32.

    Args:
        num_classes: Number of output scores (logits) per image.
    """

    def __init__(self, num_classes=10):
        super().__init__()

        # Feature extraction. The two blocks are unpacked into one flat
        # Sequential, so layers are addressable as features[0] … features[15].
        self.features = nn.Sequential(
            *self._conv_block(3, 32),    # 3×32×32  → 32×16×16
            *self._conv_block(32, 64),   # 32×16×16 → 64×8×8
        )

        # Classifier head on the flattened 64×8×8 feature map.
        flat_dim = 64 * 8 * 8
        hidden_dim = 512
        self.classifier = nn.Sequential(
            nn.Flatten(),
            nn.Linear(flat_dim, hidden_dim),
            nn.BatchNorm1d(hidden_dim),
            nn.ReLU(inplace=True),
            nn.Dropout(0.5),
            nn.Linear(hidden_dim, num_classes),
        )

    @staticmethod
    def _conv_block(in_channels, out_channels):
        """Return the layers of one block: two conv-BN-ReLU units, 2× max-pool, dropout."""
        return [
            nn.Conv2d(in_channels, out_channels, kernel_size=3, padding=1),
            nn.BatchNorm2d(out_channels),
            nn.ReLU(inplace=True),
            nn.Conv2d(out_channels, out_channels, kernel_size=3, padding=1),
            nn.BatchNorm2d(out_channels),
            nn.ReLU(inplace=True),
            nn.MaxPool2d(2, 2),      # halves height and width
            nn.Dropout2d(0.25),
        ]

    def forward(self, x):
        """Return class scores of shape (batch, num_classes) for image batch ``x``."""
        return self.classifier(self.features(x))

model = SimpleCNN(num_classes=10)
print(model)

# A freshly constructed module is in training mode. In that mode the
# classifier's BatchNorm1d cannot compute batch statistics from a single
# sample and raises "Expected more than 1 value per channel when training".
# Switch to eval mode (running statistics, dropout disabled) for this
# one-image shape check.
model.eval()
x = torch.randn(1, 3, 32, 32)  # Single CIFAR-10 image
with torch.no_grad():  # shape check only — no gradients needed
    output = model(x)
print(f"Output shape: {output.shape}")  # (1, 10) — 10 class scores

Training on CIFAR-10

import torchvision
import torchvision.transforms as transforms
from torch.utils.data import DataLoader

# Per-channel mean / std handed to Normalize in both pipelines below.
CIFAR10_MEAN = (0.4914, 0.4822, 0.4465)
CIFAR10_STD = (0.2023, 0.1994, 0.2010)

# Training pipeline: random augmentations first, then tensor conversion
# and normalization.
train_transforms = transforms.Compose(
    [
        transforms.RandomHorizontalFlip(),
        transforms.RandomCrop(32, padding=4),
        transforms.ColorJitter(brightness=0.2, contrast=0.2),
        transforms.ToTensor(),
        transforms.Normalize(CIFAR10_MEAN, CIFAR10_STD),
    ]
)

# Test pipeline: deterministic — tensor conversion and normalization only.
test_transforms = transforms.Compose(
    [
        transforms.ToTensor(),
        transforms.Normalize(CIFAR10_MEAN, CIFAR10_STD),
    ]
)

# Both splits are downloaded into ./data on first use.
train_set = torchvision.datasets.CIFAR10(
    root='./data', train=True, download=True, transform=train_transforms
)
test_set = torchvision.datasets.CIFAR10(
    root='./data', train=False, download=True, transform=test_transforms
)

# Shuffle only the training data; evaluation order does not matter.
train_loader = DataLoader(train_set, batch_size=128, shuffle=True, num_workers=4)
test_loader = DataLoader(test_set, batch_size=256, shuffle=False, num_workers=4)

# Use the GPU when one is available, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')
model = SimpleCNN(num_classes=10)
model = model.to(device)

# Loss, optimizer, and a cosine learning-rate schedule spanning 100 steps.
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.AdamW(model.parameters(), lr=0.001, weight_decay=5e-4)
scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(optimizer, T_max=100)

def train_one_epoch(model, loader, optimizer, criterion, device=None):
    """Train ``model`` for one pass over ``loader`` and report loss and accuracy.

    Args:
        model: Module called as ``model(images)``; it is put into training mode.
        loader: Iterable yielding ``(images, labels)`` batches. It does not
            need to support ``len()``.
        optimizer: Optimizer whose ``zero_grad()`` / ``step()`` run once per batch.
        criterion: Callable ``criterion(outputs, labels)`` returning a scalar
            loss tensor.
        device: Device each batch is moved to. When omitted, the device of the
            model's first parameter is used (CPU if the model has no
            parameters), so the function does not depend on a module-level
            ``device`` variable.

    Returns:
        Tuple ``(mean_loss, accuracy)``: the mean of the per-batch losses and
        the fraction of samples whose arg-max prediction equals the label.

    Raises:
        ZeroDivisionError: If ``loader`` yields no batches.
    """
    if device is None:
        first_param = next(model.parameters(), None)
        device = first_param.device if first_param is not None else torch.device('cpu')

    model.train()
    total_loss, correct, total, num_batches = 0.0, 0, 0, 0

    for images, labels in loader:
        images, labels = images.to(device), labels.to(device)

        optimizer.zero_grad()
        outputs = model(images)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()

        total_loss += loss.item()
        correct += (outputs.argmax(1) == labels).sum().item()
        total += len(labels)
        # Count batches here rather than calling len(loader), so plain
        # iterators and generators work as loaders too.
        num_batches += 1

    return total_loss / num_batches, correct / total

# Training loop (abbreviated): one pass over the training data per epoch,
# followed by one learning-rate scheduler step.
NUM_EPOCHS = 100
LOG_EVERY = 10

for epoch in range(NUM_EPOCHS):
    train_loss, train_acc = train_one_epoch(model, train_loader, optimizer, criterion)
    scheduler.step()

    # Report only every LOG_EVERY-th epoch; epochs are printed 1-based.
    if (epoch + 1) % LOG_EVERY != 0:
        continue
    print(f"Epoch {epoch+1}: Loss={train_loss:.4f}, Acc={train_acc:.3f}")

Transfer Learning: Standing on Giant Shoulders

Training from scratch is expensive. Transfer learning uses models pre-trained on ImageNet (1.2M images, 1000 classes) as a starting point.

import torchvision.models as models

# Number of target classes for the new classification heads below.
# Set this to match your own dataset.
num_classes = 10

# NOTE: torchvision >= 0.13 deprecates `pretrained=True` in favor of the
# `weights=` argument (e.g. `weights=models.ResNet18_Weights.DEFAULT`).
# `pretrained=True` is kept here for compatibility with older versions.

# Option 1: Feature extraction — freeze all layers except the head
resnet18 = models.resnet18(pretrained=True)

# Freeze all parameters so the pretrained backbone receives no gradient updates
for param in resnet18.parameters():
    param.requires_grad = False

# Replace the final classification layer. Newly created layers have
# requires_grad=True by default, so the head stays trainable.
num_features = resnet18.fc.in_features
resnet18.fc = nn.Sequential(
    nn.Linear(num_features, 256),
    nn.ReLU(),
    nn.Dropout(0.5),
    nn.Linear(256, num_classes)
)
# Only fc parameters will be updated

# Option 2: Fine-tuning — unfreeze all layers, use small learning rate
efficientnet = models.efficientnet_b0(pretrained=True)
# classifier[1] is the final Linear layer; swap it for one with num_classes outputs
efficientnet.classifier[1] = nn.Linear(
    efficientnet.classifier[1].in_features, num_classes
)

# Use different learning rates for backbone vs head
optimizer = torch.optim.AdamW([
    {'params': efficientnet.features.parameters(), 'lr': 1e-5},  # Very small for pretrained
    {'params': efficientnet.classifier.parameters(), 'lr': 1e-3}  # Larger for new head
])

Transfer learning can often reach 90%+ accuracy on custom datasets with only a few hundred images per class — a result that is very hard to achieve when training from scratch on so little data.

Popular CNN Architectures

# ImageNet-pretrained models from torchvision, one per architecture family.

# VGG-16 — classic, simple, large
vgg16 = models.vgg16(pretrained=True)

# ResNet-50 — residual connections, workhorse
resnet50 = models.resnet50(pretrained=True)

# EfficientNet-B0 — most efficient
efficientnet_b0 = models.efficientnet_b0(pretrained=True)

# MobileNet-V3 (small variant) — mobile deployment
mobilenet_v3 = models.mobilenet_v3_small(pretrained=True)

# ViT-B/16 — Vision Transformer, state-of-art
vit_b_16 = models.vit_b_16(pretrained=True)

Architecture	Year	Params	Top-1 Acc	Use When
VGG-16	2014	138M	74%	Simple baseline, easy to understand
ResNet-50	2015	25M	80%	General purpose, reliable
EfficientNet-B0	2019	5M	77%	Limited compute/memory
MobileNet-V3 (Large)	2019	5M	75%	Mobile/edge deployment
ViT-B/16	2020	86M	85%+	State-of-the-art, large datasets

Visualizing CNN Features

import matplotlib.pyplot as plt

def visualize_filters(model_layer, n_filters=32):
    """Visualize learned convolutional filters as a grid of grayscale tiles.

    Each filter is averaged over its first axis (the input channels of a
    conv weight) and min-max scaled to [0, 1] before being drawn. The grid
    is 8 tiles wide and has as many rows as needed, so the default of 32
    filters gives a 4×8 grid.

    Args:
        model_layer: Layer exposing a ``weight`` tensor whose first axis
            indexes the filters (e.g. an ``nn.Conv2d``).
        n_filters: Maximum number of filters to draw. Fewer are drawn when
            the layer has fewer filters.
    """
    filters = model_layer.weight.detach().cpu().numpy()

    # Size the grid from what will actually be drawn, so that n_filters > 32
    # is not silently truncated to a fixed 4×8 layout.
    n_shown = max(0, min(n_filters, len(filters)))
    n_cols = 8
    n_rows = max(1, -(-n_shown // n_cols))  # ceiling division, at least one row
    # squeeze=False keeps `axes` two-dimensional even for a single row.
    _, axes = plt.subplots(
        n_rows, n_cols, figsize=(2 * n_cols, 2 * n_rows), squeeze=False
    )

    for i, ax in enumerate(axes.flat):
        if i < n_shown:
            # Take the mean across input channels for RGB
            filter_img = filters[i].mean(axis=0)
            low = filter_img.min()
            value_range = filter_img.max() - low
            if value_range > 0:
                filter_img = (filter_img - low) / value_range
            else:
                # Constant filter: skip the 0/0 division and draw it as all zeros.
                filter_img = filter_img - low
            ax.imshow(filter_img, cmap='gray', vmin=0, vmax=1)
        ax.axis('off')

    plt.suptitle('Learned Convolutional Filters')
    plt.tight_layout()
    plt.show()

# Plot up to 32 filters from the first layer of the trained model's
# feature extractor.
first_layer = model.features[0]
visualize_filters(first_layer, n_filters=32)

CNNs remain foundational to computer vision even as Vision Transformers rise — most production vision systems still use CNN-based architectures or CNN-transformer hybrids.

Next lesson: Accuracy, Precision, Recall, and F1 — choosing the right metric for your problem.