Hyperparameter Tuning | Machine Learning Fundamentals | AiTechWorlds

Hyperparameter Tuning: Systematically Finding the Best Configuration

Every ML model has hyperparameters — settings you choose before training that control how learning happens, such as the learning rate, the number of trees, or the regularization strength. Tuning these systematically, rather than guessing, can be the difference between a mediocre model and a great one.

Grid Search: Thorough but Expensive

from sklearn.model_selection import GridSearchCV, StratifiedKFold
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
import numpy as np

# Load the breast-cancer dataset and hold out a stratified 20% test set that
# is never touched during tuning.
cancer = load_breast_cancer()
X_train, X_test, y_train, y_test = train_test_split(
    cancer.data, cancer.target,
    test_size=0.2, stratify=cancer.target, random_state=42
)

# Keys follow the "<pipeline step>__<parameter>" convention so each value is
# routed to the 'model' step of the pipeline below.
param_grid = {
    'model__n_estimators': [100, 200, 300],
    'model__learning_rate': [0.05, 0.1, 0.2],
    'model__max_depth': [3, 4, 5],
    'model__subsample': [0.8, 1.0]
}

# Scaling lives inside the pipeline so it is re-fit on each CV training fold.
pipeline = Pipeline([
    ('scaler', StandardScaler()),
    ('model', GradientBoostingClassifier(random_state=42))
])

cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

grid = GridSearchCV(
    pipeline, param_grid,
    cv=cv, scoring='roc_auc',
    n_jobs=-1, verbose=1
)
grid.fit(X_train, y_train)

print(f"Best CV AUC: {grid.best_score_:.4f}")
print(f"Best params: {grid.best_params_}")
# grid.score() evaluates the refit best estimator with the configured
# `scoring` metric (ROC AUC). best_estimator_.score() would report plain
# accuracy, which does not match the "Test AUC" label.
print(f"Test AUC: {grid.score(X_test, y_test):.4f}")

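Problem: The cost of grid search is the product of the number of candidate values for each parameter. Here, 3 × 3 × 3 × 2 = 54 combinations × 5 folds = 270 model fits. Every additional parameter multiplies that total again, so the cost grows exponentially with the number of parameters.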

Random Search: Usually Better per Compute Budget

When you don't know which parameters matter most, random search explores more of the space.

from sklearn.model_selection import RandomizedSearchCV
from scipy.stats import randint, uniform, loguniform

# One sampling distribution per hyperparameter; every iteration draws a fresh
# value from each. scipy's uniform(loc, scale) covers [loc, loc + scale].
param_dist = {
    'model__n_estimators': randint(low=50, high=500),
    # Log-uniform spreads draws evenly across orders of magnitude.
    'model__learning_rate': loguniform(a=0.01, b=0.5),
    'model__max_depth': randint(low=2, high=8),
    'model__subsample': uniform(loc=0.6, scale=0.4),       # 0.6 to 1.0
    'model__min_samples_leaf': randint(low=1, high=20),
    'model__max_features': uniform(loc=0.3, scale=0.7),    # 0.3 to 1.0
}

# 100 sampled configurations, each scored by ROC AUC over the shared CV splits.
random_search = RandomizedSearchCV(
    estimator=pipeline,
    param_distributions=param_dist,
    n_iter=100,
    scoring='roc_auc',
    cv=cv,
    random_state=42,
    n_jobs=-1,
    verbose=1,
).fit(X_train, y_train)

print(f"Random Search Best CV AUC: {random_search.best_score_:.4f}")
print(f"Best params: {random_search.best_params_}")

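Why random > grid for large spaces: With 100 iterations over 6 parameters, random search can try up to 100 different values of each parameter. A grid with 3 values per parameter only ever tests 3 distinct values of each one, no matter how many model fits it runs. When only a few parameters really matter, most grid evaluations just repeat the same values of the important parameters while varying the unimportant ones, so they are effectively redundant.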

Bayesian Optimization: Intelligent Search

Instead of random exploration, Bayesian optimization builds a probabilistic model of which parameter regions are promising, and focuses search there.

# Using Optuna — the most practical Bayesian optimization library
# pip install optuna

import optuna
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import cross_val_score

def objective(trial):
    """Score one Optuna trial of a scaled gradient-boosting pipeline.

    Samples six hyperparameters from ``trial``, builds a
    StandardScaler + GradientBoostingClassifier pipeline with them, and
    returns the mean ROC AUC of a 3-fold cross-validation on the training
    data (``X_train`` / ``y_train`` from the enclosing scope).
    """
    # Draw the candidate configuration (same names and ranges Optuna logs).
    n_estimators = trial.suggest_int('n_estimators', 50, 500)
    learning_rate = trial.suggest_float('learning_rate', 0.01, 0.5, log=True)
    max_depth = trial.suggest_int('max_depth', 2, 8)
    subsample = trial.suggest_float('subsample', 0.6, 1.0)
    min_samples_leaf = trial.suggest_int('min_samples_leaf', 1, 20)
    max_features = trial.suggest_float('max_features', 0.3, 1.0)

    booster = GradientBoostingClassifier(
        n_estimators=n_estimators,
        learning_rate=learning_rate,
        max_depth=max_depth,
        subsample=subsample,
        min_samples_leaf=min_samples_leaf,
        max_features=max_features,
        random_state=42,
    )
    candidate = Pipeline(steps=[
        ('scaler', StandardScaler()),
        ('model', booster),
    ])

    fold_aucs = cross_val_score(
        candidate, X_train, y_train,
        scoring='roc_auc', cv=3, n_jobs=-1,
    )
    return fold_aucs.mean()

# Run optimization.
# Seed the sampler so repeated runs suggest the same sequence of trials,
# matching the fixed random_state=42 used for the data split, CV and models.
study = optuna.create_study(
    direction='maximize',
    sampler=optuna.samplers.TPESampler(seed=42),
)
study.optimize(objective, n_trials=100, show_progress_bar=True)

print(f"Best AUC: {study.best_value:.4f}")
print(f"Best params: {study.best_params}")

# Visualize optimization history (objective value per trial)
fig = optuna.visualization.plot_optimization_history(study)
fig.show()

# Hyperparameter importance (which params mattered most for the score?)
fig = optuna.visualization.plot_param_importances(study)
fig.show()

Tuning XGBoost (The Practical Case)

# pip install xgboost
import xgboost as xgb
from sklearn.model_selection import cross_val_score

def tune_xgboost(X_train, y_train, n_trials=100):
    """Tune an XGBoost classifier with Optuna.

    Each trial samples tree, sampling and regularization hyperparameters and
    is scored by the mean ROC AUC of a 3-fold cross-validation.

    Args:
        X_train: Training features.
        y_train: Training labels.
        n_trials: Number of Optuna trials to run.

    Returns:
        A ``(best_params, best_value)`` tuple: the sampled hyperparameters of
        the best trial (fixed settings such as ``random_state`` are not
        included) and its mean cross-validated ROC AUC.
    """
    def objective(trial):
        params = {
            'n_estimators': trial.suggest_int('n_estimators', 100, 1000),
            'max_depth': trial.suggest_int('max_depth', 3, 9),
            'learning_rate': trial.suggest_float('learning_rate', 0.005, 0.3, log=True),
            'subsample': trial.suggest_float('subsample', 0.5, 1.0),
            'colsample_bytree': trial.suggest_float('colsample_bytree', 0.5, 1.0),
            'min_child_weight': trial.suggest_int('min_child_weight', 1, 20),
            # L1 / L2 penalties span many orders of magnitude, hence log scale.
            'reg_alpha': trial.suggest_float('reg_alpha', 1e-8, 10.0, log=True),
            'reg_lambda': trial.suggest_float('reg_lambda', 1e-8, 10.0, log=True),
            # NOTE(review): `use_label_encoder` appears to be deprecated in
            # recent XGBoost releases — confirm against the installed version.
            'use_label_encoder': False,
            'eval_metric': 'logloss',
            'random_state': 42,
            'n_jobs': -1
        }

        model = xgb.XGBClassifier(**params)
        score = cross_val_score(model, X_train, y_train, cv=3, scoring='roc_auc')
        return score.mean()

    # Lower the verbosity BEFORE creating the study; otherwise the
    # study-creation INFO message is still printed.
    optuna.logging.set_verbosity(optuna.logging.WARNING)
    study = optuna.create_study(direction='maximize')
    study.optimize(objective, n_trials=n_trials)

    return study.best_params, study.best_value

# Run the XGBoost study with the default trial budget and report the winner.
best_params, best_score = tune_xgboost(X_train=X_train, y_train=y_train)
for report_line in (
    f"Best XGBoost AUC: {best_score:.4f}",
    f"Best params: {best_params}",
):
    print(report_line)

Tuning Neural Networks

import torch
import torch.nn as nn
import torch.optim as optim

def tune_nn(X_train, y_train, n_trials=50):
    """Tune a small feed-forward binary classifier with Optuna.

    Each trial samples an architecture (1-4 hidden layers, their widths,
    dropout) and optimizer settings (learning rate, weight decay), trains the
    network for 50 full-batch epochs on a training split, and is scored by
    ROC AUC on a held-out validation split.

    Args:
        X_train: 2-D array-like of features (rows are samples).
        y_train: 1-D array-like of binary labels.
        n_trials: Number of Optuna trials to run.

    Returns:
        A ``(best_params, best_value)`` tuple: the best trial's sampled
        hyperparameters and its validation ROC AUC.
    """
    from sklearn.metrics import roc_auc_score
    from sklearn.model_selection import train_test_split

    # Score on rows the network never trained on. Scoring on the training
    # rows themselves would reward memorisation and push the search toward
    # the largest, least-regularised networks.
    X_fit, X_val, y_fit, y_val = train_test_split(
        X_train, y_train, test_size=0.2, stratify=y_train, random_state=42
    )
    X_fit_t = torch.as_tensor(X_fit, dtype=torch.float32)
    y_fit_t = torch.as_tensor(y_fit, dtype=torch.float32)
    X_val_t = torch.as_tensor(X_val, dtype=torch.float32)
    n_features = X_fit_t.shape[1]
    # NOTE(review): features are used as given; presumably the caller has
    # already scaled them — confirm.

    def objective(trial):
        # Architecture hyperparameters
        n_layers = trial.suggest_int('n_layers', 1, 4)
        hidden_dims = [
            trial.suggest_int(f'hidden_{i}', 16, 256)
            for i in range(n_layers)
        ]
        dropout = trial.suggest_float('dropout', 0.1, 0.6)

        # Training hyperparameters
        lr = trial.suggest_float('lr', 1e-4, 1e-1, log=True)
        weight_decay = trial.suggest_float('weight_decay', 1e-6, 1e-2, log=True)

        # Build model: Linear/ReLU stacks with dropout between consecutive
        # layers, ending in a single sigmoid output unit.
        layers = [nn.Linear(n_features, hidden_dims[0]), nn.ReLU()]
        for in_dim, out_dim in zip(hidden_dims, hidden_dims[1:]):
            layers.extend([
                nn.Dropout(dropout),
                nn.Linear(in_dim, out_dim),
                nn.ReLU()
            ])
        layers.extend([nn.Dropout(dropout), nn.Linear(hidden_dims[-1], 1), nn.Sigmoid()])

        model = nn.Sequential(*layers)
        optimizer = optim.AdamW(model.parameters(), lr=lr, weight_decay=weight_decay)
        criterion = nn.BCELoss()

        # Train briefly (full-batch gradient steps)
        model.train()
        for epoch in range(50):
            # squeeze(-1) drops only the output dimension, so a single-row
            # batch keeps its batch axis.
            pred = model(X_fit_t).squeeze(-1)
            loss = criterion(pred, y_fit_t)
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

        # Evaluate on the held-out split with dropout disabled.
        model.eval()
        with torch.no_grad():
            val_pred = model(X_val_t).squeeze(-1).numpy()

        return roc_auc_score(y_val, val_pred)

    study = optuna.create_study(direction='maximize')
    study.optimize(objective, n_trials=n_trials)
    return study.best_params, study.best_value

Early Stopping in Tuning

# Pruning unpromising trials early — saves lots of compute
import optuna
from optuna.pruners import MedianPruner

# The objective reports one value per CV fold, i.e. steps 0-4. The pruner
# never prunes while step < n_warmup_steps, so a warm-up of 5 would disable
# pruning completely. A warm-up of 1 protects only the first fold.
study = optuna.create_study(
    direction='maximize',
    pruner=MedianPruner(n_startup_trials=10, n_warmup_steps=1)
)

def objective_with_pruning(trial):
    """Cross-validate one gradient-boosting configuration, fold by fold.

    After every fold the running mean ROC AUC is reported to Optuna so the
    study's pruner can stop the trial before the remaining folds are fitted.

    Returns:
        The mean ROC AUC over all five folds.

    Raises:
        optuna.exceptions.TrialPruned: If the pruner decides to stop the
            trial after an intermediate report.
    """
    params = {
        'n_estimators': trial.suggest_int('n_estimators', 50, 500),
        'max_depth': trial.suggest_int('max_depth', 2, 8),
        'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.3, log=True),
    }

    cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

    fold_scores = []
    for step, (train_idx, val_idx) in enumerate(cv.split(X_train, y_train)):
        model = GradientBoostingClassifier(**params, random_state=42)
        model.fit(X_train[train_idx], y_train[train_idx])
        fold_scores.append(
            roc_auc_score(y_train[val_idx],
                          model.predict_proba(X_train[val_idx])[:, 1])
        )

        # Report the running mean rather than the single-fold score: it is
        # less noisy, and the last reported value equals the returned one.
        mean_so_far = sum(fold_scores) / len(fold_scores)
        trial.report(mean_so_far, step)
        if trial.should_prune():  # Stop if clearly worse than median
            raise optuna.exceptions.TrialPruned()

    # Score the trial on all folds, not just whichever fold ran last.
    return sum(fold_scores) / len(fold_scores)

# Run 100 trials; a trial that raises TrialPruned is recorded as pruned and
# the study moves on to the next one.
study.optimize(objective_with_pruning, n_trials=100)

The Full Tuning Workflow

# 1. Quick sanity check with defaults (cross-validated on the training set)
model_default = Pipeline([
    ('scaler', StandardScaler()),
    ('model', GradientBoostingClassifier(random_state=42))
])
base_score = cross_val_score(model_default, X_train, y_train, cv=5, scoring='roc_auc').mean()
print(f"Default: {base_score:.4f}")

# 2. Coarse random search (50 iterations, wide ranges)
# 3. Narrow down around best region (targeted grid search)
# 4. Fine-tune most important parameters (Bayesian with 100+ iterations)
# 5. Final evaluation on held-out test set

# Step 5 in code: evaluate the final tuned model (here, the random-search winner)
best_model = random_search.best_estimator_
test_score = roc_auc_score(y_test, best_model.predict_proba(X_test)[:, 1])
print(f"Final Test AUC: {test_score:.4f}")

# Compare like with like: score the default pipeline on the same test set.
# Subtracting a cross-validation AUC from a test AUC mixes two different
# evaluation sets.
model_default.fit(X_train, y_train)
base_test_score = roc_auc_score(y_test, model_default.predict_proba(X_test)[:, 1])
print(f"Improvement: {test_score - base_test_score:.4f}")

Next lesson: ML Project — End-to-End Tabular ML — putting it all together.