Deep Learning with PyTorch | Machine Learning Fundamentals | AiTechWorlds

Deep Learning with PyTorch: Building Real Neural Networks

PyTorch is the dominant framework for deep learning research and increasingly for production. Its dynamic computation graph, intuitive Python API, and massive ecosystem make it the right tool to learn first. This lesson covers everything you need to build, train, and evaluate real neural networks.

PyTorch Fundamentals: Tensors

import torch
import torch.nn as nn
import numpy as np

# --- Tensor construction ---
x = torch.tensor([1.0, 2.0, 3.0])   # from a Python list
matrix = torch.zeros((3, 4))        # 3x4 tensor filled with zeros
random = torch.randn((2, 3))        # samples from a standard normal distribution

# --- NumPy interop ---
# from_numpy() and .numpy() reuse the same buffer, so an in-place change
# on either side is visible on the other.
arr = np.array([1, 2, 3])
tensor = torch.from_numpy(arr)
back_to_numpy = tensor.numpy()

# --- Device selection: CUDA when present, otherwise CPU ---
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')
print(f"Using device: {device}")
tensor_gpu = random.to(device)

# --- Basic operations ---
a = torch.randn((3, 4))
b = torch.randn((4, 5))
c = torch.matmul(a, b)   # (3, 4) x (4, 5) -> (3, 5)
d = torch.add(a, 1)      # the scalar is broadcast to every element
e = a.relu()             # element-wise ReLU activation

Autograd: Automatic Differentiation

PyTorch records the operations performed on tensors created with requires_grad=True and computes their gradients automatically when you call backward().

# Leaf tensors created with requires_grad=True are recorded by autograd
w = torch.tensor(2.0, requires_grad=True)
b = torch.tensor(1.0, requires_grad=True)
x = torch.tensor(3.0)  # plain input: no gradient is tracked for it

# Forward pass: linear prediction, then squared error against the target 5
y_pred = torch.add(torch.mul(w, x), b)   # 2*3 + 1 = 7
loss = torch.pow(y_pred - 5, 2)          # (7 - 5)^2 = 4

# Backward pass: fills .grad on every leaf that requires grad
loss.backward()

print(f"w.grad = {w.grad}")  # d(loss)/dw = 2*(y_pred - 5)*x = 2*2*3 = 12
print(f"b.grad = {b.grad}")  # d(loss)/db = 2*(y_pred - 5) = 4

# Gradients accumulate across backward() calls, so reset them in place
# before the next step
w.grad.zero_()
b.grad.zero_()

Building Networks with nn.Module

import torch.nn as nn
import torch.nn.functional as F

class MLP(nn.Module):
    """Multi-layer perceptron assembled from repeated hidden blocks.

    Each hidden block is Linear -> BatchNorm1d -> ReLU -> Dropout. A final
    Linear layer maps the last hidden width to ``output_dim``; no activation
    is applied to its output.

    Args:
        input_dim: Number of input features.
        hidden_dims: Iterable with the width of each hidden block, in order.
        output_dim: Number of values produced by the final layer.
        dropout: Dropout probability used inside every hidden block.
    """

    def __init__(self, input_dim, hidden_dims, output_dim, dropout=0.3):
        super().__init__()

        # Consecutive pairs of widths give the (in, out) size of each block.
        widths = [input_dim, *hidden_dims]
        stack = []
        for fan_in, fan_out in zip(widths, widths[1:]):
            stack += [
                nn.Linear(fan_in, fan_out),
                nn.BatchNorm1d(fan_out),
                nn.ReLU(),
                nn.Dropout(dropout),
            ]

        # Output head: last hidden width (or input_dim when there are no
        # hidden blocks) -> output_dim.
        stack.append(nn.Linear(widths[-1], output_dim))

        self.net = nn.Sequential(*stack)

    def forward(self, x):
        """Pass ``x`` through the layer stack and return the result."""
        return self.net(x)

# Instantiate the network, then place it on the selected device
model = MLP(input_dim=30, hidden_dims=[128, 64, 32], output_dim=2, dropout=0.3)
model = model.to(device)

# Print the layer layout and count the parameters
print(model)
total_params = sum(map(torch.numel, model.parameters()))
trainable_params = sum(
    torch.numel(p) for p in model.parameters() if p.requires_grad
)
print(f"Total parameters: {total_params:,}")
print(f"Trainable parameters: {trainable_params:,}")

Loading Data with DataLoader

from torch.utils.data import Dataset, DataLoader, TensorDataset
from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# Prepare data
cancer = load_breast_cancer()
X, y = cancer.data, cancer.target

# Stratified split keeps the class ratio the same in both parts;
# random_state makes the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=42
)

# Scale: fit the statistics on the training split only and reuse them for the
# test split, so no test-set information leaks into preprocessing.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Convert to tensors. torch.tensor with an explicit dtype and device is the
# recommended constructor (the typed torch.FloatTensor / torch.LongTensor
# classes are legacy): float32 features for the Linear layers, int64 class
# indices for CrossEntropyLoss.
X_train_t = torch.tensor(X_train_scaled, dtype=torch.float32, device=device)
y_train_t = torch.tensor(y_train, dtype=torch.long, device=device)
X_test_t = torch.tensor(X_test_scaled, dtype=torch.float32, device=device)
y_test_t = torch.tensor(y_test, dtype=torch.long, device=device)

# Create datasets and loaders. Only the training loader shuffles; evaluation
# order does not matter, so the test loader keeps a fixed order.
train_dataset = TensorDataset(X_train_t, y_train_t)
test_dataset = TensorDataset(X_test_t, y_test_t)

train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=64, shuffle=False)

The Complete Training Loop

import torch.optim as optim
from torch.optim.lr_scheduler import CosineAnnealingLR

def train_epoch(model, loader, optimizer, criterion):
    """Run one optimization pass over ``loader`` and report averaged metrics.

    Args:
        model: Network to optimize; it is switched to training mode.
        loader: Iterable yielding ``(inputs, targets)`` batches that the model
            and criterion can consume directly.
        optimizer: Optimizer that owns ``model``'s parameters.
        criterion: Loss callable ``criterion(logits, targets)``; the per-sample
            average below assumes it returns the batch mean.

    Returns:
        ``(mean_loss, accuracy)`` averaged over the samples actually processed,
        or ``(0.0, 0.0)`` when the loader yields no batches.
    """
    model.train()  # Enable dropout and batch norm training mode
    total_loss = 0.0
    correct = 0
    seen = 0

    for X_batch, y_batch in loader:
        batch_size = len(y_batch)

        # Forward pass
        logits = model(X_batch)
        loss = criterion(logits, y_batch)

        # Backward pass
        optimizer.zero_grad()  # Clear previous gradients
        loss.backward()

        # Gradient clipping (optional but good practice)
        torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)

        optimizer.step()

        # Weight the batch-mean loss by batch size so a smaller final batch
        # does not skew the epoch average.
        total_loss += loss.item() * batch_size
        correct += (logits.argmax(1) == y_batch).sum().item()
        seen += batch_size

    # Normalize by the samples seen rather than len(loader.dataset): the two
    # differ when the loader drops the last batch or samples a subset.
    if seen == 0:
        return 0.0, 0.0
    return total_loss / seen, correct / seen

def evaluate(model, loader, criterion):
    """Compute loss and accuracy over ``loader`` without updating the model.

    Args:
        model: Network to evaluate; it is switched to eval mode and left there.
        loader: Iterable yielding ``(inputs, targets)`` batches.
        criterion: Loss callable ``criterion(logits, targets)``; the per-sample
            average below assumes it returns the batch mean.

    Returns:
        ``(mean_loss, accuracy)`` averaged over the samples actually processed,
        or ``(0.0, 0.0)`` when the loader yields no batches.
    """
    model.eval()  # Disable dropout and batch norm training mode
    total_loss = 0.0
    correct = 0
    seen = 0

    with torch.no_grad():  # No gradient computation for inference
        for X_batch, y_batch in loader:
            batch_size = len(y_batch)
            logits = model(X_batch)
            loss = criterion(logits, y_batch)
            total_loss += loss.item() * batch_size
            correct += (logits.argmax(1) == y_batch).sum().item()
            seen += batch_size

    # Normalize by the samples seen rather than len(loader.dataset): the two
    # differ when the loader drops the last batch or samples a subset.
    if seen == 0:
        return 0.0, 0.0
    return total_loss / seen, correct / seen

# Setup
import copy

num_epochs = 50
model = MLP(input_dim=30, hidden_dims=[128, 64, 32], output_dim=2).to(device)
criterion = nn.CrossEntropyLoss()
optimizer = optim.AdamW(model.parameters(), lr=0.001, weight_decay=1e-4)
# T_max equals the number of epochs so the learning rate follows one full
# cosine decay over the run.
scheduler = CosineAnnealingLR(optimizer, T_max=num_epochs)

# Training
history = {'train_loss': [], 'val_loss': [], 'train_acc': [], 'val_acc': []}
best_val_acc = 0
best_model_state = None

# NOTE: test_loader doubles as the validation set here. Picking a checkpoint
# on it makes the reported accuracy optimistic; hold out a separate validation
# split in real projects.
for epoch in range(num_epochs):
    train_loss, train_acc = train_epoch(model, train_loader, optimizer, criterion)
    val_loss, val_acc = evaluate(model, test_loader, criterion)
    scheduler.step()  # Advance the learning-rate schedule once per epoch

    history['train_loss'].append(train_loss)
    history['val_loss'].append(val_loss)
    history['train_acc'].append(train_acc)
    history['val_acc'].append(val_acc)

    # Save best model. state_dict() returns references to the live parameter
    # tensors, and dict.copy() is shallow, so a deep copy is needed to freeze
    # the weights as they are right now. The first epoch always takes a
    # snapshot so there is something to restore even if accuracy stays at 0.
    if best_model_state is None or val_acc > best_val_acc:
        best_val_acc = val_acc
        best_model_state = copy.deepcopy(model.state_dict())

    if (epoch + 1) % 10 == 0:
        print(f"Epoch {epoch+1:3d}: "
              f"Train Loss={train_loss:.4f}, Acc={train_acc:.3f} | "
              f"Val Loss={val_loss:.4f}, Acc={val_acc:.3f}")

# Load best model (skipped only if no epoch ran)
if best_model_state is not None:
    model.load_state_dict(best_model_state)
print(f"\nBest Validation Accuracy: {best_val_acc:.3f}")

Saving and Loading Models

# Save model weights (recommended)
torch.save(model.state_dict(), 'model_weights.pt')

# Load model weights: rebuild the same architecture, restore the tensors, then
# move the model to the active device so it matches the data it will receive.
# weights_only=True restricts unpickling to tensors and plain containers.
model = MLP(input_dim=30, hidden_dims=[128, 64, 32], output_dim=2)
model.load_state_dict(
    torch.load('model_weights.pt', map_location=device, weights_only=True)
)
model.to(device)
model.eval()  # Inference mode: dropout off, batch norm uses running statistics

# Save entire model (architecture + weights)
torch.save(model, 'full_model.pt')
# A pickled full model can only be restored with weights_only=False (newer
# PyTorch releases default to True). This executes arbitrary pickle code, so
# only load files from a trusted source.
model_loaded = torch.load('full_model.pt', map_location=device, weights_only=False)

# Save for production with TorchScript
scripted = torch.jit.script(model)
scripted.save('model_scripted.pt')

Visualizing Training

import matplotlib.pyplot as plt

# One row, two panels: loss on the left, accuracy on the right
fig, (ax1, ax2) = plt.subplots(nrows=1, ncols=2, figsize=(14, 5))

# Left panel: per-epoch loss for the training and validation passes
ax1.plot(history['train_loss'], label='Train Loss')
ax1.plot(history['val_loss'], label='Val Loss')
ax1.set(xlabel='Epoch', ylabel='Loss', title='Training Loss')
ax1.legend()

# Right panel: per-epoch accuracy for the same two passes
ax2.plot(history['train_acc'], label='Train Accuracy')
ax2.plot(history['val_acc'], label='Val Accuracy')
ax2.set(xlabel='Epoch', ylabel='Accuracy', title='Training Accuracy')
ax2.legend()

plt.tight_layout()
plt.show()

Common Activation Functions

# ReLU — default choice for hidden layers; helps avoid vanishing gradients
nn.ReLU()

# Leaky ReLU — small slope for negative inputs mitigates the "dying ReLU" problem
nn.LeakyReLU(negative_slope=0.01)

# GELU — used in transformers (GPT, BERT)
nn.GELU()

# Sigmoid — squashes to (0, 1); binary classification output.
# Leave it out of the model when training with nn.BCEWithLogitsLoss,
# which expects raw logits.
nn.Sigmoid()

# Softmax — normalizes along dim=1 into class probabilities; multiclass output.
# Leave it out of the model when training with nn.CrossEntropyLoss,
# which expects raw logits.
nn.Softmax(dim=1)

# Tanh — zero-centered output in (-1, 1), used in RNNs
nn.Tanh()

Loss Functions

# Binary classification
nn.BCELoss()           # Binary cross-entropy; input must be probabilities in [0, 1] (e.g. after Sigmoid)
nn.BCEWithLogitsLoss() # Sigmoid + binary cross-entropy on raw logits (numerically stable, preferred)

# Multiclass classification
nn.CrossEntropyLoss()  # Combines log-softmax + NLL loss; pass raw logits, not Softmax output

# Regression
nn.MSELoss()           # Mean squared error
nn.L1Loss()            # Mean absolute error
nn.HuberLoss()         # Quadratic for small errors, linear for large ones — robust to outliers

Optimizers

# SGD with momentum — simple, reliable, good for CNNs
optim.SGD(model.parameters(), lr=0.01, momentum=0.9, weight_decay=1e-4)

# Adam — adaptive learning rates, good default
optim.Adam(model.parameters(), lr=0.001)

# AdamW — Adam with decoupled weight decay, widely used
optim.AdamW(model.parameters(), lr=0.001, weight_decay=0.01)

# RAdam (Rectified Adam) — Adam with a correction for the high variance of the
# adaptive learning rate during the first steps; available as optim.RAdam.
# Lookahead wrapped around RAdam is the combination known as Ranger.

The complete PyTorch workflow — data loading, model definition, training loop, evaluation, saving — is what you'll use for every deep learning project.

Next lesson: Convolutional Neural Networks — the architecture that made computer vision work.