40 min · Lesson 29 of 31
Real Projects
Project 3: Image Recognition App
Project 3: Image Recognition App
Build a real-world image classification system using transfer learning. You'll fine-tune a pre-trained neural network to recognize custom categories, then wrap it in a simple API — the same architecture used in production AI systems.
What You'll Build
A complete image recognition system that:
- Classifies images into custom categories with >95% accuracy
- Uses transfer learning from EfficientNet (trained on ImageNet)
- Includes training, evaluation, and inference code
- Exposes predictions through a FastAPI endpoint
Project Setup
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, Dataset
import torchvision
import torchvision.transforms as transforms
import torchvision.models as models
from PIL import Image
import numpy as np
import matplotlib.pyplot as plt
import os
from pathlib import Path
import time
# Prefer the GPU when CUDA is usable; otherwise everything runs on the CPU.
has_cuda = torch.cuda.is_available()
device = torch.device('cuda' if has_cuda else 'cpu')
print(f"Using: {device}")
if has_cuda:
    print(f"GPU: {torch.cuda.get_device_name(0)}")
Dataset: Dog vs Cat vs Wild Animal Classifier
We'll use a subset of ImageNet with 3 classes. The same code works for any custom dataset, as long as the images are organized with one sub-folder per class inside `train/` and `val/`, as shown below.
data/
├── train/
│   ├── dogs/  (500 images)
│   ├── cats/  (500 images)
│   └── wild/  (500 images)
└── val/
    ├── dogs/  (100 images)
    ├── cats/  (100 images)
    └── wild/  (100 images)
# ImageNet channel statistics, shared by the training and validation pipelines.
IMAGENET_MEAN = [0.485, 0.456, 0.406]
IMAGENET_STD = [0.229, 0.224, 0.225]

# Data augmentation — critical for small datasets.
# Random geometric/colour perturbations run on the PIL image, before tensor conversion.
train_augmentations = [
    transforms.Resize((256, 256)),
    transforms.RandomCrop(224),
    transforms.RandomHorizontalFlip(),
    transforms.RandomRotation(15),
    transforms.ColorJitter(brightness=0.3, contrast=0.3, saturation=0.3),
    transforms.RandomAffine(degrees=0, translate=(0.1, 0.1)),
]
train_transforms = transforms.Compose(
    train_augmentations
    + [
        transforms.ToTensor(),
        transforms.Normalize(IMAGENET_MEAN, IMAGENET_STD),  # ImageNet stats
    ]
)

# Validation is deterministic: resize straight to the network input size.
val_transforms = transforms.Compose([
    transforms.Resize((224, 224)),
    transforms.ToTensor(),
    transforms.Normalize(IMAGENET_MEAN, IMAGENET_STD),
])
# Load datasets: ImageFolder derives one class per sub-folder of each split.
data_dir = Path('data')
train_dataset = torchvision.datasets.ImageFolder(
    root=data_dir / 'train',
    transform=train_transforms,
)
val_dataset = torchvision.datasets.ImageFolder(
    root=data_dir / 'val',
    transform=val_transforms,
)

# Settings common to both loaders.
loader_options = {'num_workers': 4, 'pin_memory': True}
train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True, **loader_options)
val_loader = DataLoader(val_dataset, batch_size=64, shuffle=False, **loader_options)

class_names = train_dataset.classes
num_classes = len(class_names)
print(f"Classes: {class_names}")
print(f"Train: {len(train_dataset)}, Val: {len(val_dataset)}")
# Visualize a batch
def show_batch(loader, class_names):
    """Display up to eight images from the first batch of `loader` in a 2x4 grid.

    Each shown image is titled with the class name looked up from `class_names`
    via its label index.
    """
    batch_images, batch_labels = next(iter(loader))

    # Denormalize: invert the per-channel (x - mean) / std applied by the transforms.
    channel_mean = torch.tensor([0.485, 0.456, 0.406]).view(3, 1, 1)
    channel_std = torch.tensor([0.229, 0.224, 0.225]).view(3, 1, 1)
    batch_images = (batch_images * channel_std + channel_mean).clamp(0, 1)

    _, axes = plt.subplots(2, 4, figsize=(14, 7))
    for idx, ax in enumerate(axes.flat):
        if idx < len(batch_images):
            # imshow expects channels last, so move C from the front to the back.
            ax.imshow(batch_images[idx].permute(1, 2, 0))
            ax.set_title(class_names[batch_labels[idx]])
        ax.axis('off')
    plt.suptitle('Training Batch')
    plt.tight_layout()
    plt.show()


show_batch(train_loader, class_names)
Build the Model with Transfer Learning
class ImageClassifier(nn.Module):
    """Torchvision backbone with its original head replaced by a small MLP classifier.

    Args:
        num_classes: Number of output logits produced by the classification head.
        backbone: Backbone architecture; 'efficientnet_b0' or 'resnet50'.
        freeze_backbone: If True, every backbone parameter gets
            ``requires_grad = False`` (feature-extraction mode).
        pretrained: If True (default), initialise the backbone with ImageNet
            weights. Pass False to skip the weight download, e.g. when a
            checkpoint is loaded into the model right after construction.

    Raises:
        ValueError: If `backbone` is not one of the supported architectures.
    """

    def __init__(self, num_classes, backbone='efficientnet_b0', freeze_backbone=False,
                 pretrained=True):
        super().__init__()

        # Load the backbone and strip its original head so it emits raw features.
        # `weights=` replaces the `pretrained=True` flag deprecated in torchvision 0.13;
        # IMAGENET1K_V1 is the weight set the old flag mapped to.
        if backbone == 'efficientnet_b0':
            weights = models.EfficientNet_B0_Weights.IMAGENET1K_V1 if pretrained else None
            self.backbone = models.efficientnet_b0(weights=weights)
            feature_dim = self.backbone.classifier[1].in_features
            self.backbone.classifier = nn.Identity()  # Remove original head
        elif backbone == 'resnet50':
            weights = models.ResNet50_Weights.IMAGENET1K_V1 if pretrained else None
            self.backbone = models.resnet50(weights=weights)
            feature_dim = self.backbone.fc.in_features
            self.backbone.fc = nn.Identity()
        else:
            # Without this branch `feature_dim` would be unbound further down.
            raise ValueError(
                f"Unsupported backbone {backbone!r}; "
                "expected 'efficientnet_b0' or 'resnet50'"
            )

        # Freeze backbone if doing feature extraction
        if freeze_backbone:
            for param in self.backbone.parameters():
                param.requires_grad = False

        # Custom classification head
        self.classifier = nn.Sequential(
            nn.Dropout(0.3),
            nn.Linear(feature_dim, 256),
            nn.ReLU(),
            nn.BatchNorm1d(256),
            nn.Dropout(0.2),
            nn.Linear(256, num_classes),
        )

    def forward(self, x):
        """Return class logits for a batch of images `x`."""
        features = self.backbone(x)
        return self.classifier(features)
model = ImageClassifier(num_classes=num_classes, backbone='efficientnet_b0').to(device)

# Count parameters: gather (size, trainable?) once, then aggregate both totals.
param_sizes = [(p.numel(), p.requires_grad) for p in model.parameters()]
total = sum(size for size, _ in param_sizes)
trainable = sum(size for size, needs_grad in param_sizes if needs_grad)
print(f"Total params: {total:,}")
print(f"Trainable params: {trainable:,} ({trainable/total:.1%} of total)")
Training with Best Practices
class Trainer:
    """Two-phase transfer-learning loop: head warm-up, then full fine-tuning.

    The model must expose ``backbone`` and ``classifier`` sub-modules: ``fit``
    toggles ``requires_grad`` on the former and builds separate optimizer
    parameter groups from both.
    """

    def __init__(self, model, train_loader, val_loader, device):
        self.model = model
        self.train_loader = train_loader
        self.val_loader = val_loader
        self.device = device
        # Per-epoch metrics; filled during the fine-tuning phase of fit().
        self.history = {'train_loss': [], 'val_loss': [],
                        'train_acc': [], 'val_acc': []}
        self.best_val_acc = 0
        self.best_model_path = 'best_model.pt'

    def train_epoch(self, optimizer, criterion):
        """Run one optimisation pass over the training loader.

        Returns:
            Tuple ``(mean loss per sample, accuracy)`` over the epoch.
        """
        self.model.train()
        total_loss, correct, total = 0, 0, 0
        for images, labels in self.train_loader:
            images, labels = images.to(self.device), labels.to(self.device)
            optimizer.zero_grad()
            outputs = self.model(images)
            loss = criterion(outputs, labels)
            loss.backward()
            # Gradient clipping
            nn.utils.clip_grad_norm_(self.model.parameters(), max_norm=1.0)
            optimizer.step()
            # The batch loss is weighted by batch size before accumulating, so the
            # final division by `total` yields a per-sample average.
            total_loss += loss.item() * len(labels)
            correct += (outputs.argmax(1) == labels).sum().item()
            total += len(labels)
        return total_loss / total, correct / total

    def validate(self, criterion):
        """Evaluate the model on the validation loader without gradient tracking.

        Returns:
            Tuple ``(mean loss per sample, accuracy, predictions, labels)`` where
            the last two are flat lists covering every validation sample.
        """
        self.model.eval()
        total_loss, correct, total = 0, 0, 0
        all_preds, all_labels = [], []
        with torch.no_grad():
            for images, labels in self.val_loader:
                images, labels = images.to(self.device), labels.to(self.device)
                outputs = self.model(images)
                loss = criterion(outputs, labels)
                batch_preds = outputs.argmax(1)
                total_loss += loss.item() * len(labels)
                correct += (batch_preds == labels).sum().item()
                total += len(labels)
                all_preds.extend(batch_preds.cpu().numpy())
                all_labels.extend(labels.cpu().numpy())
        return total_loss / total, correct / total, all_preds, all_labels

    def fit(self, epochs=30, lr=1e-3, warmup_epochs=5):
        """Train the head for `warmup_epochs`, then fine-tune the whole network.

        Args:
            epochs: Total number of epochs across both phases.
            lr: Base learning rate. The warm-up uses ``lr * 10``; during
                fine-tuning the head uses ``lr`` and the backbone ``lr * 0.1``.
            warmup_epochs: Number of head-only epochs run with the backbone frozen.

        Returns:
            Tuple ``(preds, labels)`` from the validation pass of the last
            fine-tuning epoch (not necessarily the best epoch).

        Raises:
            ValueError: If `warmup_epochs` is negative or leaves no epoch for
                fine-tuning. Checkpointing and the return value both depend on
                at least one fine-tuning epoch.
        """
        fine_tune_epochs = epochs - warmup_epochs
        if warmup_epochs < 0 or fine_tune_epochs < 1:
            raise ValueError(
                f"epochs ({epochs}) must be greater than warmup_epochs "
                f"({warmup_epochs}), and warmup_epochs must be non-negative"
            )

        criterion = nn.CrossEntropyLoss(label_smoothing=0.1)

        # Two-phase training:
        # Phase 1: Train only the head (frozen backbone)
        # NOTE: train_epoch() calls model.train(), so BatchNorm layers inside the
        # frozen backbone still update their running statistics in this phase.
        for param in self.model.backbone.parameters():
            param.requires_grad = False
        optimizer = optim.AdamW(
            filter(lambda p: p.requires_grad, self.model.parameters()),
            lr=lr * 10, weight_decay=0.01
        )
        print(f"Phase 1: Training head only ({warmup_epochs} epochs)")
        for epoch in range(warmup_epochs):
            train_loss, train_acc = self.train_epoch(optimizer, criterion)
            val_loss, val_acc, _, _ = self.validate(criterion)
            print(f" Epoch {epoch+1}: train_acc={train_acc:.3f}, val_acc={val_acc:.3f}")

        # Phase 2: Fine-tune entire network
        for param in self.model.backbone.parameters():
            param.requires_grad = True
        # Backbone group comes first, so param_groups[0] (logged below) is its LR.
        optimizer = optim.AdamW([
            {'params': self.model.backbone.parameters(), 'lr': lr * 0.1},
            {'params': self.model.classifier.parameters(), 'lr': lr}
        ], weight_decay=0.01)
        scheduler = optim.lr_scheduler.CosineAnnealingLR(
            optimizer, T_max=fine_tune_epochs
        )
        print(f"\nPhase 2: Fine-tuning full network ({fine_tune_epochs} epochs)")
        for epoch in range(fine_tune_epochs):
            train_loss, train_acc = self.train_epoch(optimizer, criterion)
            val_loss, val_acc, preds, labels = self.validate(criterion)
            scheduler.step()
            self.history['train_loss'].append(train_loss)
            self.history['val_loss'].append(val_loss)
            self.history['train_acc'].append(train_acc)
            self.history['val_acc'].append(val_acc)
            # Checkpoint only on strict improvement of validation accuracy.
            if val_acc > self.best_val_acc:
                self.best_val_acc = val_acc
                torch.save(self.model.state_dict(), self.best_model_path)
            if (epoch + 1) % 5 == 0:
                current_lr = optimizer.param_groups[0]['lr']
                print(f" Epoch {epoch+warmup_epochs+1}: "
                      f"train={train_acc:.3f}, val={val_acc:.3f}, "
                      f"lr={current_lr:.6f}")
        print(f"\nBest validation accuracy: {self.best_val_acc:.4f}")
        return preds, labels
# Run head warm-up followed by full fine-tuning; fit() hands back the
# predictions and labels from its last validation pass.
trainer = Trainer(
    model=model,
    train_loader=train_loader,
    val_loader=val_loader,
    device=device,
)
final_preds, final_labels = trainer.fit(epochs=30, lr=1e-3)
Evaluate and Visualize
from sklearn.metrics import classification_report, confusion_matrix
import seaborn as sns
# Load best model: restore the checkpoint written during training before scoring.
best_state = torch.load('best_model.pt')
model.load_state_dict(best_state)

eval_criterion = nn.CrossEntropyLoss()
_, final_val_acc, preds, labels = trainer.validate(eval_criterion)
print(f"Final Accuracy: {final_val_acc:.4f}")
print(classification_report(labels, preds, target_names=class_names))

# Confusion matrix (rows: true class, columns: predicted class)
cm = confusion_matrix(labels, preds)
plt.figure(figsize=(8, 6))
heatmap_options = dict(
    annot=True,
    fmt='d',
    cmap='Blues',
    xticklabels=class_names,
    yticklabels=class_names,
)
sns.heatmap(cm, **heatmap_options)
plt.title('Confusion Matrix')
plt.xlabel('Predicted')
plt.ylabel('True')
plt.show()
# Training curves: one panel for loss, one for accuracy, train vs. val in each.
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(14, 5))
curve_panels = (
    (ax1, 'train_loss', 'val_loss', 'Loss'),
    (ax2, 'train_acc', 'val_acc', 'Accuracy'),
)
for panel, train_key, val_key, panel_title in curve_panels:
    panel.plot(trainer.history[train_key], label='Train')
    panel.plot(trainer.history[val_key], label='Val')
    panel.set_title(panel_title)
    panel.legend()
plt.show()
Deploy as a FastAPI Endpoint
# app.py
from fastapi import FastAPI, File, UploadFile
from fastapi.responses import JSONResponse
import torch
import torchvision.transforms as transforms
from PIL import Image
import io
app = FastAPI(title="Image Classifier API")

# Index order must match the class order used in training (train_dataset.classes).
class_names = ['cats', 'dogs', 'wild']

# Load model at startup
# NOTE: ImageClassifier must be defined in, or imported into, app.py.
model = ImageClassifier(num_classes=3)
checkpoint = torch.load('best_model.pt', map_location='cpu')
model.load_state_dict(checkpoint)
model.eval()

# Same deterministic preprocessing as validation during training.
inference_steps = [
    transforms.Resize((224, 224)),
    transforms.ToTensor(),
    transforms.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225]),
]
val_transforms = transforms.Compose(inference_steps)
@app.post("/predict")
async def predict(file: UploadFile = File(...)):
    """Classify an uploaded image and return the top class with all probabilities.

    Args:
        file: The uploaded image file (multipart form field ``file``).

    Returns:
        JSON with ``prediction`` (class name), ``confidence`` (its probability)
        and ``probabilities`` (class name -> probability).

    Raises:
        HTTPException: 400 if the upload cannot be decoded as an image.
    """
    from fastapi import HTTPException  # local import: only this handler needs it

    # Read image
    image_data = await file.read()
    try:
        image = Image.open(io.BytesIO(image_data)).convert('RGB')
    except OSError as exc:
        # Pillow signals unreadable or truncated image data with OSError
        # (UnidentifiedImageError is a subclass). That is a client error, so
        # report 400 instead of letting it surface as a 500.
        raise HTTPException(
            status_code=400, detail="Uploaded file is not a valid image"
        ) from exc

    # Preprocess: add the batch dimension the model expects.
    tensor = val_transforms(image).unsqueeze(0)

    # Predict
    with torch.no_grad():
        logits = model(tensor)
        # Drop only the batch dimension so the result stays one entry per class.
        probas = torch.softmax(logits, dim=1).squeeze(0)

    # Format results
    best_idx = int(probas.argmax())
    results = {
        name: float(prob)
        for name, prob in zip(class_names, probas)
    }
    return JSONResponse({
        "prediction": class_names[best_idx],
        "confidence": float(probas[best_idx]),
        "probabilities": results
    })
# Run: uvicorn app:app --reload
# Test: curl -X POST -F "file=@dog.jpg" http://localhost:8000/predict
What You Learned
This project taught you:
- Transfer learning workflow — freeze → warmup head → fine-tune whole network
- Data augmentation — critical for small datasets
- Two-phase training — first train head, then fine-tune backbone with lower LR
- Evaluation — confusion matrix reveals per-class errors
- Deployment — wrapping a PyTorch model in a FastAPI endpoint
The architecture — pretrained backbone + custom head + two-phase fine-tuning — is the standard approach for any image classification task in industry.
Next: Building Your ML Portfolio — how to present your projects to employers.
📱
Get Notes Free → Get this course's notes on Telegram!
Free cheat sheets, summaries & practice exercises