Hyperparameter Tuning
Hyperparameter Tuning: Systematically Finding the Best Configuration
Every ML model has hyperparameters — settings you choose before training that control how learning happens, such as the learning rate, the number of trees, or the regularization strength. Tuning these systematically, rather than guessing, can be the difference between a mediocre model and a great one.
Grid Search: Exhaustive but Thorough
from sklearn.model_selection import GridSearchCV, StratifiedKFold
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
import numpy as np
# Load the breast-cancer dataset and hold out a stratified 20% test split
# that is not touched again until the final evaluation.
cancer = load_breast_cancer()
X_train, X_test, y_train, y_test = train_test_split(
    cancer.data,
    cancer.target,
    test_size=0.2,
    stratify=cancer.target,
    random_state=42,
)

# Scaler and model are wrapped in one pipeline so the search tunes and
# cross-validates them as a single estimator.
pipeline = Pipeline(
    steps=[
        ("scaler", StandardScaler()),
        ("model", GradientBoostingClassifier(random_state=42)),
    ]
)

# The "model__" prefix routes each setting to the pipeline step named "model".
# 3 * 3 * 3 * 2 = 54 candidate configurations.
param_grid = dict(
    model__n_estimators=[100, 200, 300],
    model__learning_rate=[0.05, 0.1, 0.2],
    model__max_depth=[3, 4, 5],
    model__subsample=[0.8, 1.0],
)

# Shuffled, stratified 5-fold splitter with a fixed seed for repeatable folds.
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# Exhaustive search over param_grid, scored by ROC AUC, using all CPU cores.
grid = GridSearchCV(
    estimator=pipeline,
    param_grid=param_grid,
    scoring="roc_auc",
    cv=cv,
    n_jobs=-1,
    verbose=1,
)
# Run the exhaustive search on the training split only.
grid.fit(X_train, y_train)

print(f"Best CV AUC: {grid.best_score_:.4f}")
print(f"Best params: {grid.best_params_}")
# grid.score() applies the search's own scorer (roc_auc) to the refit best
# estimator. best_estimator_.score() would report plain accuracy instead,
# which does not match the "Test AUC" label.
print(f"Test AUC: {grid.score(X_test, y_test):.4f}")
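Problem: The cost of grid search is the product of the number of values tried for each parameter. Here, 3 × 3 × 3 × 2 = 54 combinations × 5 folds = 270 model fits. Every additional parameter multiplies that total again — one more parameter with 3 values already means 810 fits.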
Random Search: Usually Better per Compute Budget
When you don't know which parameters matter most, random search explores more of the space.
from sklearn.model_selection import RandomizedSearchCV
from scipy.stats import randint, uniform, loguniform
# Sampling distributions for each hyperparameter of the pipeline's "model"
# step. scipy's randint(low, high) excludes `high`; uniform(loc, scale)
# covers the interval [loc, loc + scale].
param_dist = dict(
    model__n_estimators=randint(50, 500),           # integers 50..499
    model__learning_rate=loguniform(0.01, 0.5),     # Log-uniform: good for learning rates
    model__max_depth=randint(2, 8),                 # integers 2..7
    model__subsample=uniform(loc=0.6, scale=0.4),   # 0.6 to 1.0
    model__min_samples_leaf=randint(1, 20),         # integers 1..19
    model__max_features=uniform(loc=0.3, scale=0.7),  # 0.3 to 1.0
)

# Draw 100 random configurations and score each one with the shared
# 5-fold splitter; the fixed random_state makes the draws repeatable.
random_search = RandomizedSearchCV(
    estimator=pipeline,
    param_distributions=param_dist,
    n_iter=100,
    scoring="roc_auc",
    cv=cv,
    random_state=42,
    n_jobs=-1,
    verbose=1,
)
random_search.fit(X_train, y_train)

print(f"Random Search Best CV AUC: {random_search.best_score_:.4f}")
print(f"Best params: {random_search.best_params_}")
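Why random > grid for large spaces: With 100 iterations over 6 parameters, random search tries up to 100 distinct values of each parameter. A grid with 3 values per parameter only ever tests 3 distinct values of each, no matter how many of its 3^6 = 729 combinations you run. If only one or two parameters really matter, most grid evaluations are redundant: they repeat the same few values of the important parameters while varying the unimportant ones.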
Bayesian Optimization: Intelligent Search
Instead of random exploration, Bayesian optimization builds a probabilistic model of which parameter regions are promising, and focuses search there.
# Using Optuna — the most practical Bayesian optimization library
# pip install optuna
import optuna
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import cross_val_score
def objective(trial):
    """Score one Optuna trial for the gradient-boosting pipeline.

    Asks ``trial`` for six GradientBoostingClassifier hyperparameters,
    builds a scaler + model pipeline with them, and evaluates it with
    3-fold cross-validation on the module-level ``X_train`` / ``y_train``.

    Args:
        trial: The Optuna trial used to suggest hyperparameter values.

    Returns:
        The mean ROC AUC across the three folds (higher is better).
    """
    # Keyword order matches the order in which values are requested from
    # the trial, so the sampling sequence stays the same.
    gb_kwargs = dict(
        n_estimators=trial.suggest_int('n_estimators', 50, 500),
        learning_rate=trial.suggest_float('learning_rate', 0.01, 0.5, log=True),
        max_depth=trial.suggest_int('max_depth', 2, 8),
        subsample=trial.suggest_float('subsample', 0.6, 1.0),
        min_samples_leaf=trial.suggest_int('min_samples_leaf', 1, 20),
        max_features=trial.suggest_float('max_features', 0.3, 1.0),
    )

    booster = GradientBoostingClassifier(random_state=42, **gb_kwargs)
    candidate = Pipeline([('scaler', StandardScaler()), ('model', booster)])

    fold_aucs = cross_val_score(
        candidate,
        X_train,
        y_train,
        scoring='roc_auc',
        cv=3,
        n_jobs=-1,
    )
    return fold_aucs.mean()
# Run optimization
# direction='maximize' because objective() returns an AUC (higher is better).
study = optuna.create_study(direction='maximize')
study.optimize(objective, n_trials=100, show_progress_bar=True)
print(f"Best AUC: {study.best_value:.4f}")
print(f"Best params: {study.best_params}")
# Visualize optimization history (objective value per trial)
fig = optuna.visualization.plot_optimization_history(study)
fig.show()
# Hyperparameter importance (which params mattered?) - this ranks the tuned
# hyperparameters, not the input features of the dataset
fig = optuna.visualization.plot_param_importances(study)
fig.show()
Tuning XGBoost (The Practical Case)
# pip install xgboost
import xgboost as xgb
from sklearn.model_selection import cross_val_score
def tune_xgboost(X_train, y_train, n_trials=100):
    """Tune an XGBoost classifier with Optuna.

    Each trial samples a set of hyperparameters, builds an
    ``xgb.XGBClassifier`` with them and scores it by 3-fold
    cross-validated ROC AUC on the given data.

    Args:
        X_train: Training features passed to ``cross_val_score``.
        y_train: Training labels passed to ``cross_val_score``.
        n_trials: Number of Optuna trials to run.

    Returns:
        A ``(best_params, best_value)`` tuple taken from the study: the
        sampled hyperparameters of the best trial and its mean CV AUC.
        The fixed settings (``eval_metric``, ``random_state``, ...) are
        not suggested through the trial, so they are not part of
        ``best_params``.
    """
    def objective(trial):
        params = {
            'n_estimators': trial.suggest_int('n_estimators', 100, 1000),
            'max_depth': trial.suggest_int('max_depth', 3, 9),
            # Log scale: learning rate and the regularization terms span
            # several orders of magnitude.
            'learning_rate': trial.suggest_float('learning_rate', 0.005, 0.3, log=True),
            'subsample': trial.suggest_float('subsample', 0.5, 1.0),
            'colsample_bytree': trial.suggest_float('colsample_bytree', 0.5, 1.0),
            'min_child_weight': trial.suggest_int('min_child_weight', 1, 20),
            'reg_alpha': trial.suggest_float('reg_alpha', 1e-8, 10.0, log=True),
            'reg_lambda': trial.suggest_float('reg_lambda', 1e-8, 10.0, log=True),
            # NOTE(review): use_label_encoder is deprecated in recent
            # XGBoost releases and may only trigger an "unused parameter"
            # warning there - confirm against the installed version and
            # drop it if it is no longer needed.
            'use_label_encoder': False,
            'eval_metric': 'logloss',
            'random_state': 42,
            'n_jobs': -1
        }
        model = xgb.XGBClassifier(**params)
        score = cross_val_score(model, X_train, y_train, cv=3, scoring='roc_auc')
        return score.mean()

    # Lower Optuna's log level BEFORE creating the study; otherwise the
    # study-creation INFO message is still printed.
    optuna.logging.set_verbosity(optuna.logging.WARNING)
    study = optuna.create_study(direction='maximize')
    study.optimize(objective, n_trials=n_trials)
    return study.best_params, study.best_value
# Tune on the training split only (default 100 trials); best_score is the
# mean 3-fold CV AUC of the best trial, not a test-set score.
best_params, best_score = tune_xgboost(X_train, y_train)
print(f"Best XGBoost AUC: {best_score:.4f}")
print(f"Best params: {best_params}")
Tuning Neural Networks
import torch
import torch.nn as nn
import torch.optim as optim
def tune_nn(X_train, y_train, n_trials=50, val_size=0.2, random_state=42):
    """Tune a small fully connected binary classifier with Optuna.

    The data is split once into a fitting part and a stratified validation
    part. Every trial trains a network on the fitting part for 50
    full-batch epochs and is scored by ROC AUC on the validation part, so
    the search does not reward configurations that merely memorize the
    data they were trained on.

    Features are used exactly as given; scale them beforehand if the
    network should see standardized inputs.

    Args:
        X_train: 2-D array-like of features (rows are samples).
        y_train: 1-D array-like of binary labels (0/1).
        n_trials: Number of Optuna trials to run.
        val_size: Fraction of the data held out for scoring each trial.
        random_state: Seed for the fitting/validation split.

    Returns:
        A ``(best_params, best_value)`` tuple from the study: the sampled
        hyperparameters of the best trial and its validation ROC AUC.
    """
    from sklearn.metrics import roc_auc_score
    from sklearn.model_selection import train_test_split

    X_fit, X_val, y_fit, y_val = train_test_split(
        X_train, y_train,
        test_size=val_size, stratify=y_train, random_state=random_state
    )
    X_fit_t = torch.as_tensor(X_fit, dtype=torch.float32)
    y_fit_t = torch.as_tensor(y_fit, dtype=torch.float32)
    X_val_t = torch.as_tensor(X_val, dtype=torch.float32)
    n_features = X_fit_t.shape[1]

    def objective(trial):
        # Architecture hyperparameters
        n_layers = trial.suggest_int('n_layers', 1, 4)
        hidden_dims = [
            trial.suggest_int(f'hidden_{i}', 16, 256)
            for i in range(n_layers)
        ]
        dropout = trial.suggest_float('dropout', 0.1, 0.6)

        # Training hyperparameters
        lr = trial.suggest_float('lr', 1e-4, 1e-1, log=True)
        weight_decay = trial.suggest_float('weight_decay', 1e-6, 1e-2, log=True)

        # Build model: Linear/ReLU blocks with dropout between them and a
        # single sigmoid output unit.
        layers = [nn.Linear(n_features, hidden_dims[0]), nn.ReLU()]
        for i in range(1, len(hidden_dims)):
            layers.extend([
                nn.Dropout(dropout),
                nn.Linear(hidden_dims[i-1], hidden_dims[i]),
                nn.ReLU()
            ])
        layers.extend([nn.Dropout(dropout), nn.Linear(hidden_dims[-1], 1), nn.Sigmoid()])
        model = nn.Sequential(*layers)

        optimizer = optim.AdamW(model.parameters(), lr=lr, weight_decay=weight_decay)
        criterion = nn.BCELoss()

        # Train briefly (full-batch gradient steps)
        model.train()
        for epoch in range(50):
            # squeeze(-1) drops only the output dimension, so a single-row
            # batch keeps its shape (a bare squeeze() would make it 0-d).
            pred = model(X_fit_t).squeeze(-1)
            loss = criterion(pred, y_fit_t)
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

        # Score on data the network has not been trained on.
        model.eval()
        with torch.no_grad():
            val_pred = model(X_val_t).squeeze(-1).numpy()
        return roc_auc_score(y_val, val_pred)

    study = optuna.create_study(direction='maximize')
    study.optimize(objective, n_trials=n_trials)
    return study.best_params, study.best_value
Early Stopping in Tuning
# Pruning unpromising trials early — saves lots of compute
import optuna
from optuna.pruners import MedianPruner
# MedianPruner stops a trial whose intermediate value is worse than the
# median of earlier trials at the same step. The objective reports one value
# per CV fold (steps 0-4), and pruning is disabled while the step is below
# n_warmup_steps - so the warm-up must be shorter than the number of folds,
# otherwise no trial can ever be pruned. A warm-up of 1 skips only the first
# fold; the first 10 trials always run to completion to seed the median.
study = optuna.create_study(
    direction='maximize',
    pruner=MedianPruner(n_startup_trials=10, n_warmup_steps=1)
)
def objective_with_pruning(trial):
    """Score one trial fold by fold, letting the pruner stop it early.

    Trains a GradientBoostingClassifier with the sampled hyperparameters on
    each of five stratified folds of the module-level ``X_train`` /
    ``y_train``. After every fold the running mean AUC is reported to the
    trial so the pruner can compare it with earlier trials.

    Args:
        trial: The Optuna trial used to suggest values and report progress.

    Returns:
        The mean ROC AUC over all five folds.

    Raises:
        optuna.exceptions.TrialPruned: If the pruner decides to stop the trial.
    """
    params = {
        'n_estimators': trial.suggest_int('n_estimators', 50, 500),
        'max_depth': trial.suggest_int('max_depth', 2, 8),
        'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.3, log=True),
    }
    cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
    fold_scores = []
    for step, (train_idx, val_idx) in enumerate(cv.split(X_train, y_train)):
        model = GradientBoostingClassifier(**params, random_state=42)
        model.fit(X_train[train_idx], y_train[train_idx])
        fold_scores.append(
            roc_auc_score(y_train[val_idx],
                          model.predict_proba(X_train[val_idx])[:, 1])
        )
        # Report the running mean rather than the single-fold score: it is
        # less noisy and converges to the value the trial finally returns.
        running_mean = sum(fold_scores) / len(fold_scores)
        trial.report(running_mean, step)
        if trial.should_prune():  # Stop if clearly worse than median
            raise optuna.exceptions.TrialPruned()
    # Return the mean over all folds, not just the last fold's score.
    return sum(fold_scores) / len(fold_scores)


study.optimize(objective_with_pruning, n_trials=100)
The Full Tuning Workflow
# 1. Quick sanity check with defaults (5-fold CV AUC on the training split)
model_default = Pipeline([
    ('scaler', StandardScaler()),
    ('model', GradientBoostingClassifier(random_state=42))
])
base_score = cross_val_score(model_default, X_train, y_train, cv=5, scoring='roc_auc').mean()
print(f"Default: {base_score:.4f}")

# 2. Coarse random search (50 iterations, wide ranges)
# 3. Narrow down around best region (targeted grid search)
# 4. Fine-tune most important parameters (Bayesian with 100+ iterations)
# 5. Final evaluation on held-out test set - done once, after every tuning
#    decision has been made
best_model = random_search.best_estimator_
test_score = roc_auc_score(y_test, best_model.predict_proba(X_test)[:, 1])
print(f"Final Test AUC: {test_score:.4f}")

# Compare like with like: base_score is a cross-validation estimate, so
# subtracting it from a test-set score mixes two different measurements.
# Score the default model on the same test set instead.
model_default.fit(X_train, y_train)
base_test_score = roc_auc_score(y_test, model_default.predict_proba(X_test)[:, 1])
print(f"Improvement: {test_score - base_test_score:.4f}")
Next lesson: ML Project — End-to-End Tabular ML — putting it all together.
Get this course's notes on Telegram!
Free cheat sheets, summaries & practice exercises