30 min · Lesson 21 of 31
Neural Networks & Deep Learning
Deep Learning with PyTorch
Deep Learning with PyTorch: Building Real Neural Networks
PyTorch is the dominant framework for deep learning research and increasingly for production. Its dynamic computation graph, intuitive Python API, and massive ecosystem make it the right tool to learn first. This lesson covers everything you need to build, train, and evaluate real neural networks.
PyTorch Fundamentals: Tensors
import torch
import torch.nn as nn
import numpy as np
# --- Creating tensors ---
x = torch.tensor([1.0, 2.0, 3.0])  # 1-D float tensor built from a Python list
matrix = torch.zeros((3, 4))  # 3x4 tensor filled with zeros
random = torch.randn((2, 3))  # 2x3 samples drawn from a normal distribution

# --- NumPy interop ---
# from_numpy() shares memory with the array, so an in-place change to either
# object is visible through the other.
arr = np.array([1, 2, 3])
tensor = torch.from_numpy(arr)
back_to_numpy = tensor.numpy()

# --- Device selection ---
# Use the GPU when CUDA is available, otherwise stay on the CPU.
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')
print(f"Using device: {device}")
tensor_gpu = random.to(device)

# --- Tensor operations ---
a = torch.randn((3, 4))
b = torch.randn((4, 5))
c = torch.matmul(a, b)  # matrix multiplication: (3, 4) x (4, 5) -> (3, 5)
d = torch.add(a, 1)  # broadcasting: the scalar is added to every element
e = a.relu()  # activation function applied element-wise
Autograd: Automatic Differentiation
PyTorch tracks operations on tensors and computes gradients automatically.
# Tensors created with requires_grad=True have their operations tracked so
# that backprop can compute gradients for them.
w = torch.tensor(2.0, requires_grad=True)
b = torch.tensor(1.0, requires_grad=True)
x = torch.tensor(3.0)  # plain input; no gradient is requested for it

# Forward pass: a one-weight linear model followed by a squared error.
y_pred = torch.add(torch.mul(w, x), b)  # y = 2*3 + 1 = 7
loss = torch.pow(y_pred - 5, 2)  # (7-5)^2 = 4

# Backward pass: populates .grad on w and b.
loss.backward()
print(f"w.grad = {w.grad}")  # d(loss)/dw = 2*(y_pred-5)*x = 2*2*3 = 12
print(f"b.grad = {b.grad}")  # d(loss)/db = 2*(y_pred-5) = 4

# Reset the stored gradients in place before the next step (important!).
w.grad.zero_()
b.grad.zero_()
Building Networks with nn.Module
import torch.nn as nn
import torch.nn.functional as F
class MLP(nn.Module):
    """Multi-layer perceptron assembled from a list of hidden-layer widths.

    Every entry of ``hidden_dims`` adds one stage of
    ``Linear -> BatchNorm1d -> ReLU -> Dropout``. A final ``Linear`` layer maps
    the last width to ``output_dim``. No activation is applied to the output,
    so ``forward`` returns raw scores.

    Args:
        input_dim: Number of input features per sample.
        hidden_dims: Iterable of hidden-layer widths, in order. If it is
            empty the network is a single ``Linear(input_dim, output_dim)``.
        output_dim: Number of output units.
        dropout: Drop probability passed to every ``nn.Dropout`` layer.
    """

    def __init__(self, input_dim, hidden_dims, output_dim, dropout=0.3):
        super().__init__()
        layers = []
        prev_dim = input_dim  # width feeding the next Linear layer
        for hidden_dim in hidden_dims:
            layers.extend([
                nn.Linear(prev_dim, hidden_dim),
                nn.BatchNorm1d(hidden_dim),
                nn.ReLU(),
                nn.Dropout(dropout),
            ])
            prev_dim = hidden_dim
        # Output head: plain Linear, no normalization/activation/dropout.
        layers.append(nn.Linear(prev_dim, output_dim))
        self.net = nn.Sequential(*layers)

    def forward(self, x):
        """Run ``x`` through the stacked layers and return the output scores."""
        return self.net(x)
# Build the network, then place it on the selected device.
model = MLP(input_dim=30, hidden_dims=[128, 64, 32], output_dim=2, dropout=0.3)
model = model.to(device)

# Print the layer layout, then count parameters (all vs. trainable only).
print(model)
total_params = sum(map(torch.numel, model.parameters()))
trainable_params = sum(
    param.numel() for param in model.parameters() if param.requires_grad
)
print(f"Total parameters: {total_params:,}")
print(f"Trainable parameters: {trainable_params:,}")
Loading Data with DataLoader
from torch.utils.data import Dataset, DataLoader, TensorDataset
from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
# Load the breast-cancer dataset: feature matrix X and label vector y.
cancer = load_breast_cancer()
X = cancer.data
y = cancer.target

# Hold out 20% of the samples; stratify on y so both splits keep the same
# class proportions. random_state fixes the split.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)

# Standardize features. The scaler is fitted on the training split only and
# the same fitted statistics are then applied to the test split.
scaler = StandardScaler()
scaler.fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Convert to tensors on the target device: float32 features, int64 labels.
X_train_t = torch.tensor(X_train_scaled, dtype=torch.float32).to(device)
y_train_t = torch.tensor(y_train, dtype=torch.long).to(device)
X_test_t = torch.tensor(X_test_scaled, dtype=torch.float32).to(device)
y_test_t = torch.tensor(y_test, dtype=torch.long).to(device)

# Wrap the tensors in datasets and batch them. Only the training loader
# shuffles.
train_dataset = TensorDataset(X_train_t, y_train_t)
test_dataset = TensorDataset(X_test_t, y_test_t)
train_loader = DataLoader(dataset=train_dataset, batch_size=32, shuffle=True)
test_loader = DataLoader(dataset=test_dataset, batch_size=64, shuffle=False)
The Complete Training Loop
import torch.optim as optim
from torch.optim.lr_scheduler import CosineAnnealingLR
def train_epoch(model, loader, optimizer, criterion):
    """Train ``model`` for one full pass over ``loader``.

    Args:
        model: Network to update; it is switched to training mode.
        loader: Iterable yielding ``(X_batch, y_batch)`` pairs and exposing
            ``loader.dataset`` (its length is the averaging denominator).
        optimizer: Optimizer that owns ``model``'s parameters.
        criterion: Loss callable invoked as ``criterion(logits, y_batch)``.
            Assumed to return a per-batch mean (the loss is re-weighted by
            batch size below) — confirm if a different reduction is used.

    Returns:
        Tuple ``(mean_loss, accuracy)`` over ``len(loader.dataset)`` samples.
    """
    model.train()  # Enable dropout and batch norm training mode
    total_loss = 0.0
    correct = 0
    for X_batch, y_batch in loader:
        # Clear gradients left over from the previous step.
        optimizer.zero_grad()

        # Forward pass
        logits = model(X_batch)
        loss = criterion(logits, y_batch)

        # Backward pass
        loss.backward()
        # Gradient clipping (optional but good practice)
        torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
        optimizer.step()

        # Weight the batch loss by batch size so a smaller final batch does
        # not skew the epoch average.
        total_loss += loss.item() * len(y_batch)
        correct += (logits.argmax(1) == y_batch).sum().item()

    n_samples = len(loader.dataset)
    return total_loss / n_samples, correct / n_samples
def evaluate(model, loader, criterion):
    """Compute mean loss and accuracy of ``model`` over ``loader``.

    No parameters are updated and no gradients are computed.

    Args:
        model: Network to evaluate; it is switched to evaluation mode.
        loader: Iterable yielding ``(X_batch, y_batch)`` pairs and exposing
            ``loader.dataset`` (its length is the averaging denominator).
        criterion: Loss callable invoked as ``criterion(logits, y_batch)``.
            Assumed to return a per-batch mean (the loss is re-weighted by
            batch size below) — confirm if a different reduction is used.

    Returns:
        Tuple ``(mean_loss, accuracy)`` over ``len(loader.dataset)`` samples.
    """
    model.eval()  # Disable dropout and batch norm training mode
    total_loss = 0.0
    correct = 0
    with torch.no_grad():  # No gradient computation for inference
        for X_batch, y_batch in loader:
            logits = model(X_batch)
            loss = criterion(logits, y_batch)
            # Weight by batch size so batches of different length average
            # correctly.
            total_loss += loss.item() * len(y_batch)
            correct += (logits.argmax(1) == y_batch).sum().item()

    n_samples = len(loader.dataset)
    return total_loss / n_samples, correct / n_samples
import copy  # used to snapshot the best weights

# Setup
model = MLP(input_dim=30, hidden_dims=[128, 64, 32], output_dim=2).to(device)
criterion = nn.CrossEntropyLoss()
optimizer = optim.AdamW(model.parameters(), lr=0.001, weight_decay=1e-4)
# T_max matches the number of epochs below, so the cosine schedule completes
# exactly one half-period over the run.
scheduler = CosineAnnealingLR(optimizer, T_max=50)

# Training
history = {'train_loss': [], 'val_loss': [], 'train_acc': [], 'val_acc': []}
best_val_acc = 0
best_model_state = None
for epoch in range(50):
    train_loss, train_acc = train_epoch(model, train_loader, optimizer, criterion)
    # NOTE: the test loader doubles as the validation set here, so the "best"
    # accuracy is selected on the same data it is reported on.
    val_loss, val_acc = evaluate(model, test_loader, criterion)
    scheduler.step()  # advance the learning-rate schedule once per epoch

    history['train_loss'].append(train_loss)
    history['val_loss'].append(val_loss)
    history['train_acc'].append(train_acc)
    history['val_acc'].append(val_acc)

    # Save best model
    if val_acc > best_val_acc:
        best_val_acc = val_acc
        # state_dict() returns references to the live parameter tensors and
        # dict.copy() is shallow, so a deep copy is required; otherwise the
        # snapshot keeps changing as training continues.
        best_model_state = copy.deepcopy(model.state_dict())

    if (epoch + 1) % 10 == 0:
        print(f"Epoch {epoch+1:3d}: "
              f"Train Loss={train_loss:.4f}, Acc={train_acc:.3f} | "
              f"Val Loss={val_loss:.4f}, Acc={val_acc:.3f}")

# Load best model (skipped if no epoch ever improved on the initial 0).
if best_model_state is not None:
    model.load_state_dict(best_model_state)
print(f"\nBest Validation Accuracy: {best_val_acc:.3f}")
Saving and Loading Models
# Save model weights (recommended): only the state_dict is written, not the
# class definition.
torch.save(model.state_dict(), 'model_weights.pt')

# Load model weights: rebuild the same architecture, move it to the target
# device, then copy the stored tensors in. weights_only=True restricts
# unpickling to tensors and plain containers.
model = MLP(input_dim=30, hidden_dims=[128, 64, 32], output_dim=2).to(device)
model.load_state_dict(
    torch.load('model_weights.pt', map_location=device, weights_only=True)
)
model.eval()  # inference mode: dropout off, batch norm uses running stats

# Save entire model (architecture + weights)
torch.save(model, 'full_model.pt')
# SECURITY: a fully pickled module can execute arbitrary code when loaded, so
# only load files you trust. weights_only=False is needed here because the
# file contains a pickled MLP object rather than just tensors.
model_loaded = torch.load('full_model.pt', map_location=device, weights_only=False)

# Save for production with TorchScript
scripted = torch.jit.script(model)
scripted.save('model_scripted.pt')
Visualizing Training
import matplotlib.pyplot as plt
# Two side-by-side panels: loss on the left, accuracy on the right.
fig, (ax1, ax2) = plt.subplots(nrows=1, ncols=2, figsize=(14, 5))

# Left panel: per-epoch training and validation loss.
ax1.plot(history['train_loss'], label='Train Loss')
ax1.plot(history['val_loss'], label='Val Loss')
ax1.set(xlabel='Epoch', ylabel='Loss', title='Training Loss')
ax1.legend()

# Right panel: per-epoch training and validation accuracy.
ax2.plot(history['train_acc'], label='Train Accuracy')
ax2.plot(history['val_acc'], label='Val Accuracy')
ax2.set(xlabel='Epoch', ylabel='Accuracy', title='Training Accuracy')
ax2.legend()

plt.tight_layout()
plt.show()
Common Activation Functions
# Quick reference: each line only constructs the activation module (the
# result is discarded); in a model these are placed between layers.
# ReLU — default choice for hidden layers, avoids vanishing gradients
nn.ReLU()
# Leaky ReLU — fixes "dying ReLU" problem; negative_slope is the slope
# applied to negative inputs
nn.LeakyReLU(negative_slope=0.01)
# GELU — used in transformers (GPT, BERT)
nn.GELU()
# Sigmoid — binary classification output
nn.Sigmoid()
# Softmax — multiclass classification output; dim=1 is the dimension that is
# normalized
nn.Softmax(dim=1)
# Tanh — zero-centered output in [-1, 1], used in RNNs
nn.Tanh()
Loss Functions
# Quick reference: each line only constructs the loss module (the result is
# discarded).
# Binary classification
nn.BCELoss() # Binary cross-entropy (model output must already be in 0-1)
nn.BCEWithLogitsLoss() # Binary cross-entropy on raw logits (numerically stable, preferred)
# Multiclass classification
nn.CrossEntropyLoss() # Combines log-softmax + NLL loss, so it takes raw logits
# Regression
nn.MSELoss() # Mean squared error
nn.L1Loss() # Mean absolute error
nn.HuberLoss() # Robust to outliers (between MSE and MAE)
Optimizers
# Quick reference: each line only constructs an optimizer over
# model.parameters() (the result is discarded).
# SGD with momentum — simple, reliable, good for CNNs
optim.SGD(model.parameters(), lr=0.01, momentum=0.9, weight_decay=1e-4)
# Adam — adaptive learning rates, good default
optim.Adam(model.parameters(), lr=0.001)
# AdamW — Adam with proper weight decay, widely used
optim.AdamW(model.parameters(), lr=0.001, weight_decay=0.01)
# RAdam — Rectified Adam: corrects the variance of Adam's adaptive learning rate early in training (Lookahead + RAdam is known as Ranger)
The complete PyTorch workflow — data loading, model definition, training loop, evaluation, saving — is what you'll use for every deep learning project.
Next lesson: Convolutional Neural Networks — the architecture that made computer vision work.
📱
Get Notes Free → Get this course's notes on Telegram!
Free cheat sheets, summaries & practice exercises