Principal Component Analysis (PCA) | Machine Learning Fundamentals | AiTechWorlds

Principal Component Analysis: Finding the Essential Structure in Data

High-dimensional data is hard to visualize, computationally expensive, and often contains redundant information. PCA solves this by finding the directions of maximum variance in your data and projecting onto a lower-dimensional space — keeping the most important structure while discarding noise.

The Core Intuition

Imagine a cloud of data points in 3D space that happens to lie mostly in a flat plane. PCA discovers that plane and represents the data in 2D — with very little information lost.

Original 3D data (with redundancy):
  x, y, z — but z ≈ 0.9x + 0.8y + noise

PCA finds:
  PC1 = direction of most variance (accounts for ~60%)
  PC2 = direction of second most variance (accounts for ~30%)
  PC3 = direction of least variance (mostly noise, ~10%)

Project to PC1 + PC2: 90% of variance preserved in 2D

PCA doesn't select features — it creates new features (principal components) that are linear combinations of the originals.

How PCA Works Mathematically

1. Center the data (subtract mean from each feature)
2. Compute the covariance matrix
3. Find eigenvectors (principal components) and eigenvalues (variance explained)
4. Sort by eigenvalue (largest first)
5. Project data onto top k eigenvectors

You don't need to implement this — sklearn handles it. But understanding the process helps you interpret results.

Basic PCA Usage

from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
from sklearn.datasets import load_digits
import numpy as np
import matplotlib.pyplot as plt

# High-dimensional example: the digits dataset has 64 features per sample
digits = load_digits()
X = digits.data
y = digits.target
print(f"Original shape: {X.shape}")  # (1797, 64)

# Standardize the features first, because PCA is sensitive to scale
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Keep only the first two principal components so the data can be plotted
pca = PCA(n_components=2)
X_pca = pca.fit_transform(X_scaled)
print(f"Reduced shape: {X_pca.shape}")  # (1797, 2)

# Scatter plot of the 2D projection, one color per digit label
fig, ax = plt.subplots(figsize=(10, 8))
pc1, pc2 = X_pca.T  # one row per principal component after transposing
scatter = ax.scatter(pc1, pc2, c=y, cmap='tab10', alpha=0.7)
fig.colorbar(scatter, ax=ax, label='Digit')
ax.set_xlabel(f'PC1 ({pca.explained_variance_ratio_[0]:.1%} variance)')
ax.set_ylabel(f'PC2 ({pca.explained_variance_ratio_[1]:.1%} variance)')
ax.set_title('PCA of Handwritten Digits (64D → 2D)')
plt.show()

Choosing How Many Components: Explained Variance

# Fit PCA without limiting n_components so every component is kept
pca_full = PCA().fit(X_scaled)

# Running total of the explained-variance ratio as components are added
cumvar = pca_full.explained_variance_ratio_.cumsum()

# x-axis: 1, 2, ..., number of components
component_counts = np.arange(1, cumvar.size + 1)

plt.figure(figsize=(10, 6))
plt.plot(component_counts, cumvar, 'bo-')
# Horizontal guide lines for the two variance targets discussed below
for level, color, label in ((0.95, 'r', '95% threshold'), (0.90, 'g', '90% threshold')):
    plt.axhline(y=level, color=color, linestyle='--', label=label)
plt.xlabel('Number of Components')
plt.ylabel('Cumulative Explained Variance')
plt.title('How Many Components Do We Need?')
plt.legend()
plt.show()

# Smallest number of components whose cumulative variance reaches each target.
# argmax on a boolean array gives the index of the first True; +1 turns that
# zero-based index into a component count.
n_90 = np.argmax(cumvar >= 0.90) + 1
n_95 = np.argmax(cumvar >= 0.95) + 1
print(f"Components for 90% variance: {n_90}")
print(f"Components for 95% variance: {n_95}")
print(f"Original dimensions: {X.shape[1]}")
print(f"Compression: {X.shape[1]}D → {n_95}D ({n_95/X.shape[1]*100:.1f}% of original)")

PCA for Preprocessing: Speed Up ML

from sklearn.svm import SVC
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.pipeline import Pipeline
import time

# Split the raw (unscaled) features. Scaling happens inside each pipeline
# below, so the scaler is only ever fitted on the training part of every
# cross-validation fold. Fitting it on the whole dataset beforehand would leak
# statistics from the held-out samples into training.
# X_test / y_test are held out and not used by this comparison.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Without PCA: standardize, then fit the SVM on all original features
svm = Pipeline([
    ('scaler', StandardScaler()),
    ('svm', SVC(kernel='rbf')),
])
# perf_counter() is the clock intended for measuring elapsed time;
# only the cross-validation itself is timed.
start = time.perf_counter()
scores_no_pca = cross_val_score(svm, X_train, y_train, cv=3)
time_no_pca = time.perf_counter() - start

# With PCA (keep 95% variance): same steps plus a PCA stage before the SVM,
# so both timings cover the same preprocessing.
pipeline = Pipeline([
    ('scaler', StandardScaler()),
    ('pca', PCA(n_components=0.95)),  # Automatically choose to preserve 95% variance
    ('svm', SVC(kernel='rbf')),
])
start = time.perf_counter()
scores_with_pca = cross_val_score(pipeline, X_train, y_train, cv=3)
time_with_pca = time.perf_counter() - start

print(f"Without PCA: {scores_no_pca.mean():.3f} in {time_no_pca:.1f}s")
print(f"With PCA:    {scores_with_pca.mean():.3f} in {time_with_pca:.1f}s")

PCA often gives similar (sometimes better) accuracy while reducing training time; the speed-up is most noticeable on datasets with many features.

Visualizing What PCA Learned

# What do the principal components look like?
# For image data, you can visualize the "eigenfaces"

from sklearn.datasets import fetch_olivetti_faces

# Face images, each flattened into one feature row
faces = fetch_olivetti_faces()
X_faces = faces.data  # 400 images, 4096 features (64x64)

# Learn the first 50 principal components of the face images
pca_faces = PCA(n_components=50, whiten=True).fit(X_faces)

# Show the 20 leading components as 64x64 grayscale images on a 4x5 grid
fig, axes = plt.subplots(4, 5, figsize=(12, 10))
for i, (ax, component) in enumerate(zip(axes.ravel(), pca_faces.components_)):
    # Each component has one weight per pixel, so it can be drawn as an image
    ax.imshow(component.reshape(64, 64), cmap='gray')
    ax.set_title(f'PC {i+1}')
    ax.axis('off')
plt.suptitle('First 20 Principal Components (Eigenfaces)')
plt.tight_layout()
plt.show()

Image Compression with PCA

A beautiful demonstration of PCA — reconstruct images from fewer components:

# Compress one face and rebuild it from progressively more components
face = X_faces[0]
face_row = face[np.newaxis, :]  # the single sample as a one-row 2D array

fig, axes = plt.subplots(1, 5, figsize=(15, 3))
for ax, n_components in zip(axes, (1, 5, 20, 50, 150)):
    # A fresh PCA is fitted on all faces for each component budget
    pca = PCA(n_components=n_components).fit(X_faces)

    # Round trip: project down to n_components values, then map back to pixels
    face_reconstructed = pca.inverse_transform(pca.transform(face_row))

    # Share of the total variance captured by the kept components
    variance_kept = pca.explained_variance_ratio_.sum()

    ax.imshow(face_reconstructed.reshape(64, 64), cmap='gray')
    ax.set_title(f'{n_components} components\n{variance_kept:.0%} variance')
    ax.axis('off')

plt.suptitle('Face Reconstruction with Different Numbers of PCA Components')
plt.tight_layout()
plt.show()

Kernel PCA: Non-Linear Dimensionality Reduction

Standard PCA is linear. Kernel PCA extends it to non-linear structures.

from sklearn.decomposition import KernelPCA
from sklearn.datasets import make_circles

# Two noisy circles. Everything in this example uses its own names
# (y_circles, pca_circles, X_circles_pca) so the digit labels in `y` and the
# earlier PCA results are not overwritten for code that still relies on them.
X_circles, y_circles = make_circles(n_samples=400, noise=0.05, random_state=42)

# Standard PCA fails on non-linear data
pca_circles = PCA(n_components=2)
X_circles_pca = pca_circles.fit_transform(X_circles)

# Kernel PCA can separate non-linear structures
kpca = KernelPCA(n_components=2, kernel='rbf', gamma=10)
X_kpca = kpca.fit_transform(X_circles)

# Side-by-side comparison: raw data, linear PCA, kernel PCA
fig, (ax1, ax2, ax3) = plt.subplots(1, 3, figsize=(15, 4))

ax1.scatter(X_circles[:, 0], X_circles[:, 1], c=y_circles, cmap='bwr')
ax1.set_title('Original Data (2 circles)')

ax2.scatter(X_circles_pca[:, 0], X_circles_pca[:, 1], c=y_circles, cmap='bwr')
ax2.set_title('After PCA (still overlapping)')

ax3.scatter(X_kpca[:, 0], X_kpca[:, 1], c=y_circles, cmap='bwr')
ax3.set_title('After Kernel PCA (separated!)')

plt.tight_layout()
plt.show()

PCA vs t-SNE vs UMAP

For visualization specifically (2D or 3D):

from sklearn.manifold import TSNE

# t-SNE: better for visualization, reveals local structure.
# Computationally expensive; results vary between runs unless random_state is fixed.
n_subset = 500  # use a subset — t-SNE is slow
tsne = TSNE(n_components=2, random_state=42, perplexity=30)
X_tsne = tsne.fit_transform(X_scaled[:n_subset])

# Labels are read straight from digits.target for the same rows that were
# embedded, so the colors always line up with the points even if `y` has been
# rebound elsewhere in the script.
tsne_labels = digits.target[:n_subset]

plt.figure(figsize=(10, 6))
plt.scatter(X_tsne[:, 0], X_tsne[:, 1], c=tsne_labels, cmap='tab10', alpha=0.7)
plt.title('t-SNE Visualization')
plt.show()

Method	Speed	Linear	Preserves	Best For
PCA	Fast	Yes	Global structure	Preprocessing, compression
t-SNE	Slow	No	Local structure	Visualization only
UMAP	Medium	No	Both	Visualization + preprocessing

When to Use PCA

Use PCA when:

Features are correlated and redundant
You want to reduce training time
You want to visualize high-dimensional data
You want to compress data (images, audio)
You want to reduce noise before ML

Don't use PCA when:

Features have very different meanings (mixing apples and oranges)
You need interpretable features (PCA components are hard to explain)
The relationships in your data are non-linear (use Kernel PCA or UMAP)
Dataset is small — risk of losing important signal

Next lesson: Anomaly Detection — identifying unusual patterns that don't fit the expected distribution.