Cross-Validation Techniques | Machine Learning Fundamentals | AiTechWorlds

Cross-Validation Techniques: Evaluating Models Reliably on Limited Data

A model's score on a single test set is noisy — it depends heavily on which examples ended up in that set. Cross-validation gives you a stable, reliable estimate of generalization performance by systematically evaluating on multiple different splits.

Why Cross-Validation Matters

from sklearn.datasets import load_iris
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
import numpy as np

# Load the Iris data once: X is the feature matrix, y the class labels.
iris = load_iris()
X, y = iris.data, iris.target

# Score the same model on five different random 80/20 splits to show how much
# a single hold-out score depends on which rows land in the test set.
for seed in range(5):
    split = train_test_split(X, y, test_size=0.2, random_state=seed)
    X_train, X_test, y_train, y_test = split

    # The model's own seed is fixed, so only the split changes between runs.
    model = RandomForestClassifier(n_estimators=100, random_state=42)
    accuracy = model.fit(X_train, y_train).score(X_test, y_test)
    print(f"Seed {seed}: {accuracy:.3f}")

# Example output: 0.933, 0.967, 1.000, 0.967, 0.933
# No single number is the "real" score; each is one noisy sample.

Cross-validation reduces this variance by averaging the scores from multiple splits.

K-Fold Cross-Validation

from sklearn.model_selection import (cross_val_score, KFold, StratifiedKFold,
                                      cross_validate)

# One classifier definition is shared by both splitters below.
model = RandomForestClassifier(n_estimators=100, random_state=42)

# Standard K-Fold: five shuffled folds, split without looking at the labels.
kf = KFold(n_splits=5, shuffle=True, random_state=42)
scores = cross_val_score(model, X, y, scoring='accuracy', cv=kf)

mean_acc, std_acc = scores.mean(), scores.std()
print(f"CV Scores: {scores}")
print(f"Mean: {mean_acc:.3f} ± {std_acc:.3f}")
# e.g. Mean: 0.960 ± 0.021

# Stratified K-Fold: same setup, but every fold keeps the class proportions.
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
scores_strat = cross_val_score(model, X, y, scoring='accuracy', cv=skf)

strat_mean, strat_std = scores_strat.mean(), scores_strat.std()
print(f"Stratified CV: {strat_mean:.3f} ± {strat_std:.3f}")

How K-Fold works:

5-fold cross-validation on 100 samples:

Fold 1: [01-20] = test, [21-100] = train
Fold 2: [21-40] = test, [01-20, 41-100] = train
Fold 3: [41-60] = test, [01-40, 61-100] = train
Fold 4: [61-80] = test, [01-60, 81-100] = train
Fold 5: [81-100] = test, [01-80] = train

Final score = mean of 5 fold scores

Multiple Metrics at Once

from sklearn.model_selection import cross_validate

# Score several metrics in a single cross-validation run.
metric_names = ['accuracy', 'precision_macro', 'recall_macro', 'f1_macro']

cv_results = cross_validate(
    model,
    X,
    y,
    cv=StratifiedKFold(n_splits=5, shuffle=True, random_state=42),
    scoring=metric_names,
    return_train_score=True,  # train scores reveal an overfitting gap
)

# Results are keyed as 'test_<metric>' and 'train_<metric>', one value per fold.
for metric in metric_names:
    test_scores = cv_results[f'test_{metric}']
    train_scores = cv_results[f'train_{metric}']
    summary = f"{metric:20s}: {test_scores.mean():.3f} ± {test_scores.std():.3f}"
    print(f"{summary}  (train: {train_scores.mean():.3f})")

Leave-One-Out Cross-Validation (LOOCV)

Holds out each sample exactly once as a single-item test set and trains on all the others — most data-efficient but computationally expensive.

from sklearn.model_selection import LeaveOneOut, cross_val_score

# Leave-one-out: every sample takes one turn as the single-item test set.
loo = LeaveOneOut()
scores_loo = cross_val_score(model, X, y, scoring='accuracy', cv=loo)

n_fits = len(scores_loo)  # one fit per sample, i.e. n_samples fits
print(f"LOOCV: {scores_loo.mean():.3f} ± {scores_loo.std():.3f}")
print(f"Number of fits: {n_fits}")

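LOOCV has high computational cost (n fits for n samples) but minimal bias, because each model trains on n − 1 samples; the trade-off is that its estimate can have high variance. Use when data is scarce (under 100 samples) and you need the most unbiased estimate possible.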

Repeated K-Fold

Runs K-Fold multiple times with different shuffles — reduces variance from lucky/unlucky folds.

from sklearn.model_selection import RepeatedStratifiedKFold

# Ten independently shuffled rounds of stratified 5-fold CV.
rskf = RepeatedStratifiedKFold(n_repeats=10, n_splits=5, random_state=42)
scores_repeated = cross_val_score(model, X, y, scoring='accuracy', cv=rskf)

repeated_mean = scores_repeated.mean()
repeated_std = scores_repeated.std()
print(f"Repeated 5-Fold × 10: {repeated_mean:.3f} ± {repeated_std:.3f}")
print(f"Number of fits: {len(scores_repeated)}")  # 5 folds × 10 repeats = 50

More stable estimates at the cost of 10× more computation.

Time Series Cross-Validation

For time series, you must never use future data to predict the past.

from sklearn.model_selection import TimeSeriesSplit

# Expanding-window splitter: each fold trains only on samples that come
# before its test window, so no future data leaks into training.
tscv = TimeSeriesSplit(n_splits=5, gap=0)

# Illustrate the splits on 100 time-ordered sample positions
n = 100
for fold, (train_idx, test_idx) in enumerate(tscv.split(range(n)), start=1):
    print(f"Fold {fold}: Train {train_idx[0]}-{train_idx[-1]}, "
          f"Test {test_idx[0]}-{test_idx[-1]}")

# Output:
# Fold 1: Train 0-19, Test 20-35
# Fold 2: Train 0-35, Test 36-51
# Fold 3: Train 0-51, Test 52-67
# Fold 4: Train 0-67, Test 68-83
# Fold 5: Train 0-83, Test 84-99
#
# Each test window has n // (n_splits + 1) = 100 // 6 = 16 samples, and the
# windows are aligned to the END of the data, so the first 20 samples are
# only ever used for training.

Training window grows; test window is always "the future" relative to training data.

Cross-Validation for Hyperparameter Tuning

from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from scipy.stats import randint, uniform

# GridSearchCV: exhaustive search over every combination in the grid.
# 3 x 3 x 3 = 27 candidates, each scored with 5-fold CV.
param_grid = dict(
    n_estimators=[50, 100, 200],
    max_depth=[None, 5, 10],
    min_samples_leaf=[1, 3, 5],
)

grid_cv = GridSearchCV(
    estimator=RandomForestClassifier(random_state=42),
    param_grid=param_grid,
    scoring='f1_macro',
    cv=StratifiedKFold(n_splits=5, shuffle=True, random_state=42),
    n_jobs=-1,   # use all available cores
    verbose=1,
)
grid_cv.fit(X, y)

print(f"Best params: {grid_cv.best_params_}")
print(f"Best CV score: {grid_cv.best_score_:.3f}")

# RandomizedSearchCV: samples a fixed number of candidates instead of trying
# them all, which is faster when the search space is large.
param_dist = dict(
    n_estimators=randint(50, 500),      # integers 50..499 (upper bound exclusive)
    max_depth=[None, 5, 10, 20],
    min_samples_leaf=randint(1, 10),    # integers 1..9
    max_features=uniform(0.1, 0.9),     # (loc, scale): floats in [0.1, 1.0]
)

random_cv = RandomizedSearchCV(
    estimator=RandomForestClassifier(random_state=42),
    param_distributions=param_dist,
    n_iter=50,          # number of sampled candidates
    scoring='f1_macro',
    cv=5,               # integer cv: scikit-learn's default 5-fold splitter
    n_jobs=-1,
    random_state=42,    # makes the sampled candidates reproducible
)
random_cv.fit(X, y)
print(f"RandomSearch Best CV: {random_cv.best_score_:.3f}")

Interpreting CV Results

import matplotlib.pyplot as plt  # needed for plt.title / plt.show below
import pandas as pd
import seaborn as sns

# Analyze GridSearchCV results in detail: one row per parameter combination
cv_results = pd.DataFrame(grid_cv.cv_results_)

# Sort by mean test score, best configuration first
cv_results = cv_results.sort_values('mean_test_score', ascending=False)

# Top 5 configurations
print("Top 5 configurations:")
print(cv_results[['params', 'mean_test_score', 'std_test_score',
                  'rank_test_score']].head(5))

# Visualize the search.
# pandas treats max_depth=None as a missing value and would silently drop it
# as a group key, so give it an explicit "None" label before pivoting.
heatmap_data = cv_results.assign(
    param_max_depth=cv_results['param_max_depth'].map(
        lambda depth: 'None' if pd.isna(depth) else depth
    )
)

# Each cell is the mean score over all min_samples_leaf values for that
# (max_depth, n_estimators) pair (pivot_table aggregates with the mean).
pivot = heatmap_data.pivot_table(
    values='mean_test_score',
    index='param_max_depth',
    columns='param_n_estimators'
)
sns.heatmap(pivot, annot=True, fmt='.3f', cmap='viridis')
plt.title('CV Scores: max_depth vs n_estimators')
plt.show()

CV Pitfalls to Avoid

# WRONG: Feature selection before CV leaks information
from sklearn.feature_selection import SelectKBest, f_classif

# Iris has only 4 features, so k must be below 4 for the selector to drop
# anything; k=2 keeps the two features with the highest ANOVA F-score.
selector = SelectKBest(score_func=f_classif, k=2)
X_selected = selector.fit_transform(X, y)  # Sees all data including test folds!
scores_wrong = cross_val_score(model, X_selected, y, cv=5, scoring='accuracy')

# CORRECT: Include feature selection in the pipeline
from sklearn.pipeline import Pipeline

pipeline = Pipeline([
    # CV refits the selector on the training part of each fold only
    ('selector', SelectKBest(score_func=f_classif, k=2)),
    ('model', RandomForestClassifier(n_estimators=100, random_state=42))
])
scores_correct = cross_val_score(pipeline, X, y, cv=5, scoring='accuracy')

print(f"Wrong CV (with leakage): {scores_wrong.mean():.3f}")
print(f"Correct CV (pipeline): {scores_correct.mean():.3f}")
# The pipeline score is the honest one. On a small, clean dataset like Iris the
# two numbers may be almost identical; the leaky version becomes noticeably
# over-optimistic when there are many features and few samples.

Always put preprocessing inside a Pipeline so CV fits it correctly on each fold.

Choosing the Right CV Strategy

Situation	Recommended CV
Large dataset (>10K), balanced	Simple 5-fold
Imbalanced classes	Stratified K-Fold
Small dataset (under 500)	LOOCV or 10-fold
Need stable estimates	Repeated K-Fold (5×10)
Time series	TimeSeriesSplit
Medical/safety-critical	Nested CV

Next lesson: Hyperparameter Tuning — systematically finding the best model configuration.