Project 1: House Price Prediction

This project takes you through a complete, end-to-end ML workflow. You'll solve one of the most classic regression problems — predicting house prices — while applying everything from data cleaning and feature engineering through model comparison, evaluation, and a reusable prediction function.

What You'll Build

A regression system that predicts house prices from features like size, location, and condition. By the end, you'll have:

A cleaned, feature-engineered dataset
Multiple trained and evaluated models
A final model selected through cross-validation
A prediction function ready for deployment

The Dataset

We'll use the Ames Housing dataset — richer and more realistic than the classic Boston Housing dataset.

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split, cross_val_score, KFold
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.pipeline import Pipeline
import warnings
warnings.filterwarnings('ignore')

# Load the Ames Housing data from a local CSV.
# Download from: https://www.kaggle.com/datasets/prevek18/ames-housing-dataset
df = pd.read_csv('AmesHousing.csv')

# Quick orientation: dataset dimensions, the target name, and summary
# statistics (count, mean, quartiles, ...) of the target column.
print(f"Shape: {df.shape}")
print(f"Target: SalePrice")
print("\nSalePrice stats:")
print(df['SalePrice'].describe())

Step 1: Exploratory Data Analysis

# Target distribution: raw sale prices side by side with their log1p transform
log_sale_price = np.log1p(df['SalePrice'])

fig, axes = plt.subplots(1, 2, figsize=(12, 4))
histogram_panels = [
    (df['SalePrice'], 'SalePrice Distribution', 'Price'),
    (log_sale_price, 'log(SalePrice) Distribution (More Normal)', 'log(Price)'),
]
for panel_ax, (values, title, xlabel) in zip(axes, histogram_panels):
    panel_ax.hist(values, bins=50, edgecolor='black')
    panel_ax.set_title(title)
    panel_ax.set_xlabel(xlabel)

plt.tight_layout()
plt.show()

# Key: keep the log-transformed target as its own column; the regression
# models are trained on it (makes the distribution more normal).
df['SalePrice_log'] = log_sale_price

# Rank numeric features by the absolute value of their correlation with
# SalePrice. The target itself and its log transform ('SalePrice_log', added
# to df just above) are removed from the ranking: they would otherwise take
# the top two slots and push real features out of the top 15.
numeric_cols = df.select_dtypes(include=[np.number]).columns
correlations = (
    df[numeric_cols]
    .corr()['SalePrice']
    .drop(labels=['SalePrice', 'SalePrice_log'], errors='ignore')
    .abs()
    .sort_values(ascending=False)
)
print("\nTop 15 correlations with SalePrice:")
print(correlations.head(15))

# Scatter plots of a hand-picked set of top features against the sale price,
# one panel per feature.
top_features = ['Overall Qual', 'Gr Liv Area', 'Garage Cars', 'Total Bsmt SF']
fig, axes = plt.subplots(1, 4, figsize=(16, 4))
for ax, feat in zip(axes, top_features):
    ax.scatter(df[feat], df['SalePrice'], alpha=0.3)
    ax.set(
        xlabel=feat,
        ylabel='SalePrice',
        title=f'{feat} vs Price',
    )
plt.tight_layout()
plt.show()

Step 2: Data Cleaning

# Missing values analysis: count NaNs per column, express them as a
# percentage of all rows, and list the 20 most affected columns.
missing = df.isna().sum()
missing_pct = missing.div(len(df)).mul(100).sort_values(ascending=False)
print("Columns with missing values:")
affected = missing_pct > 0
print(missing_pct[affected].head(20))

# Handle missing values
def clean_housing_data(df):
    """Return a cleaned copy of the housing data with missing values imputed.

    The input frame is not modified. Imputation rules:

    * Categorical columns describing an optional feature (pool, alley, fence,
      fireplace, garage, basement, masonry veneer, misc feature) get the
      literal string ``'None'`` where the value is missing.
    * Numeric garage / basement / veneer measurements get ``0``.
    * ``Lot Frontage`` is filled with the median of its neighborhood, falling
      back to the overall median when a neighborhood has no observed value.
    * ``Electrical`` is filled with its most frequent value.
    * The ``Order`` and ``PID`` identifier columns are dropped if present.

    Parameters
    ----------
    df : pandas.DataFrame
        Raw housing data. Must contain the columns named above (the basement
        bathroom counts and the two identifier columns are optional).

    Returns
    -------
    pandas.DataFrame
        Cleaned copy of ``df``.

    Raises
    ------
    KeyError
        If a required column is absent.
    """
    df = df.copy()

    # Categorical columns where NaN means "this house has no such feature"
    absent_as_none = [
        'Pool QC', 'Misc Feature', 'Alley', 'Fence', 'Fireplace Qu',
        'Garage Type', 'Garage Finish', 'Garage Qual', 'Garage Cond',
        'Bsmt Qual', 'Bsmt Cond', 'Bsmt Exposure',
        'BsmtFin Type 1', 'BsmtFin Type 2',
        'Mas Vnr Type',
    ]
    for col in absent_as_none:
        df[col] = df[col].fillna('None')

    # Numeric measurements of those same optional features: absent -> 0
    absent_as_zero = [
        'Garage Yr Blt', 'Garage Cars', 'Garage Area',
        'BsmtFin SF 1', 'BsmtFin SF 2', 'Bsmt Unf SF', 'Total Bsmt SF',
        'Mas Vnr Area',
    ]
    for col in absent_as_zero:
        df[col] = df[col].fillna(0)

    # Basement bathroom counts feed the Total_Bath feature later on; a NaN
    # here would propagate into it. Only touched when present so frames
    # without these columns are still accepted.
    for col in ('Bsmt Full Bath', 'Bsmt Half Bath'):
        if col in df.columns:
            df[col] = df[col].fillna(0)

    # Lot frontage: impute by neighborhood median. A neighborhood whose
    # frontage is entirely missing has no median to impute from, so fall
    # back to the median of all observed values.
    overall_frontage = df['Lot Frontage'].median()
    df['Lot Frontage'] = df.groupby('Neighborhood')['Lot Frontage'].transform(
        lambda x: x.fillna(x.median())
    )
    df['Lot Frontage'] = df['Lot Frontage'].fillna(overall_frontage)

    # Electrical: use the most frequent value. mode() is empty when the whole
    # column is missing, in which case there is nothing to impute with.
    electrical_modes = df['Electrical'].mode()
    if not electrical_modes.empty:
        df['Electrical'] = df['Electrical'].fillna(electrical_modes[0])

    # Drop row-identifier columns; they carry no predictive information
    df = df.drop(columns=['Order', 'PID'], errors='ignore')

    return df

# Clean the raw frame and report how many NaN cells are left across all columns
df_clean = clean_housing_data(df)
print(f"Missing values remaining: {df_clean.isna().sum().sum()}")

Step 3: Feature Engineering

def engineer_features(df, current_year=2024):
    """Return a copy of ``df`` with derived age, area, interaction and flag features.

    The input frame is not modified. Added columns:

    * ``House_Age``, ``Remod_Age``, ``Garage_Age`` -- years between
      ``current_year`` and the build / remodel / garage-build year.
    * ``Since_Remodel`` -- years between construction and remodel.
    * ``Total_SF``, ``Total_Bath``, ``Total_Porch_SF`` -- summed areas and
      bathroom counts (half baths count as 0.5).
    * ``Qual_x_Area``, ``Qual_x_Basement`` -- overall quality multiplied by
      living area / basement area.
    * ``Has_Pool``, ``Has_Garage``, ``Has_Basement``, ``Has_Fireplace`` --
      0/1 indicator flags.

    Parameters
    ----------
    df : pandas.DataFrame
        Housing data containing the raw columns used below.
    current_year : int, default 2024
        Reference year for the age features. Pass a year matching the data
        (for example the year the sales were recorded) to get ages relative
        to that year instead of 2024.

    Returns
    -------
    pandas.DataFrame
        Copy of ``df`` with the engineered columns appended.
    """
    df = df.copy()

    # Age features
    df['House_Age'] = current_year - df['Year Built']
    df['Remod_Age'] = current_year - df['Year Remod/Add']
    df['Since_Remodel'] = df['Year Remod/Add'] - df['Year Built']
    # A garage year of 0 (or a missing one) marks "no garage"; map it to the
    # reference year so the resulting age is 0 rather than ~2000 or NaN.
    garage_year = df['Garage Yr Blt'].replace(0, current_year).fillna(current_year)
    df['Garage_Age'] = current_year - garage_year

    # Total area features
    df['Total_SF'] = df['Total Bsmt SF'] + df['1st Flr SF'] + df['2nd Flr SF']
    # Treat a missing bath count as 0 so one NaN component does not turn the
    # whole total into NaN.
    baths = df[['Full Bath', 'Half Bath',
                'Bsmt Full Bath', 'Bsmt Half Bath']].fillna(0)
    df['Total_Bath'] = (baths['Full Bath'] + 0.5 * baths['Half Bath'] +
                        baths['Bsmt Full Bath'] + 0.5 * baths['Bsmt Half Bath'])
    df['Total_Porch_SF'] = (df['Open Porch SF'] + df['Enclosed Porch'] +
                            df['3Ssn Porch'] + df['Screen Porch'])

    # Quality × Area interactions (high-quality large homes worth much more)
    df['Qual_x_Area'] = df['Overall Qual'] * df['Gr Liv Area']
    df['Qual_x_Basement'] = df['Overall Qual'] * df['Total Bsmt SF']

    # Has features
    df['Has_Pool'] = (df['Pool Area'] > 0).astype(int)
    df['Has_Garage'] = (df['Garage Area'] > 0).astype(int)
    df['Has_Basement'] = (df['Total Bsmt SF'] > 0).astype(int)
    df['Has_Fireplace'] = (df['Fireplaces'] > 0).astype(int)

    return df

# Add the engineered columns to the cleaned data and report the column count
df_features = engineer_features(df_clean)
print(f"Features after engineering: {len(df_features.columns)}")

Step 4: Encoding Categorical Features

from sklearn.preprocessing import OrdinalEncoder

def encode_features(df):
    """Return a numeric-encoded copy of ``df``.

    Quality/condition columns are mapped onto a 0-5 ordinal scale
    (``'None'`` -> 0 ... ``'Ex'`` -> 5); values not on the scale, including
    missing ones, become 0. Quality columns absent from ``df`` are skipped.
    Every column that is still of object dtype afterwards is one-hot encoded,
    dropping the first level of each.
    """
    encoded = df.copy()

    # Ordinal scale shared by all quality/condition columns
    quality_scale = {'None': 0, 'Po': 1, 'Fa': 2, 'TA': 3, 'Gd': 4, 'Ex': 5}
    ordinal_columns = (
        'Exter Qual', 'Exter Cond', 'Bsmt Qual', 'Bsmt Cond',
        'Heating QC', 'Kitchen Qual', 'Fireplace Qu', 'Garage Qual',
        'Garage Cond', 'Pool QC',
    )
    for column in ordinal_columns:
        if column not in encoded.columns:
            continue
        ranks = encoded[column].map(quality_scale)
        # map() yields NaN for anything off the scale; score those as 0
        encoded[column] = ranks.fillna(0)

    # One-hot encode whatever is still categorical (object dtype)
    text_columns = list(encoded.select_dtypes(include=['object']).columns)
    return pd.get_dummies(encoded, columns=text_columns, drop_first=True)

# Encode the engineered frame and report the resulting column count
df_encoded = encode_features(df_features)
print(f"Features after encoding: {len(df_encoded.columns)}")

Step 5: Model Training and Comparison

from sklearn.linear_model import Ridge, Lasso, ElasticNet
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
import xgboost as xgb

# Prepare final feature matrix: every encoded column except the raw and
# log-transformed target is a feature; the log target is what we predict.
target_col = 'SalePrice_log'
drop_cols = ['SalePrice', 'SalePrice_log']
feature_cols = [c for c in df_encoded.columns if c not in drop_cols]

# Cast explicitly to float. On pandas versions where get_dummies produces
# bool columns, `.values` on a frame mixing bool and numeric columns returns
# an object-dtype array; an explicit float matrix avoids relying on each
# estimator to coerce it, and fails loudly here if a non-numeric column
# slipped through encoding.
X = df_encoded[feature_cols].to_numpy(dtype=float)
y = df_encoded[target_col].to_numpy(dtype=float)

# Hold out 20% of the rows as a test set (fixed seed for reproducibility)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Models to compare
def _scaled(estimator):
    """Put a StandardScaler step in front of ``estimator`` in a Pipeline."""
    return Pipeline([
        ('scaler', StandardScaler()),
        ('model', estimator),
    ])


# Hyperparameters shared by both boosted-tree models
_boosting_params = dict(
    n_estimators=300, learning_rate=0.05, max_depth=4, random_state=42
)

# Linear models are wrapped with feature scaling; tree ensembles are used as-is.
models = {}
models['Ridge'] = _scaled(Ridge(alpha=10))
models['Lasso'] = _scaled(Lasso(alpha=0.001))
models['Random Forest'] = RandomForestRegressor(
    n_estimators=200, max_features='sqrt', random_state=42
)
models['Gradient Boosting'] = GradientBoostingRegressor(**_boosting_params)
models['XGBoost'] = xgb.XGBRegressor(
    subsample=0.8, colsample_bytree=0.8, **_boosting_params
)

# Score every candidate with the same shuffled 5-fold split of the training
# data so the comparison is like-for-like.
kf = KFold(n_splits=5, shuffle=True, random_state=42)
results = {}

for name, model in models.items():
    cv_scores = cross_val_score(
        model, X_train, y_train,
        cv=kf, scoring='neg_root_mean_squared_error'
    )
    # The scorer reports negated RMSE, so flip the sign of the mean
    mean_rmse = -cv_scores.mean()
    rmse_spread = cv_scores.std()
    results[name] = {'CV RMSE (log)': mean_rmse, 'CV Std': rmse_spread}
    print(f"{name:22s}: CV RMSE = {mean_rmse:.4f} ± {rmse_spread:.4f}")

Step 6: Evaluate Best Model

# Train the chosen model on the full training split and score it on the
# held-out test set.
best_model = GradientBoostingRegressor(
    n_estimators=300, learning_rate=0.05, max_depth=4, random_state=42
)
best_model.fit(X_train, y_train)

y_pred_log = best_model.predict(X_test)

# The model works in log1p space; expm1 converts predictions and targets
# back to prices so the metrics below are in dollars.
y_pred = np.expm1(y_pred_log)
y_true = np.expm1(y_test)
abs_errors = np.abs(y_true - y_pred)

rmse = np.sqrt(mean_squared_error(y_true, y_pred))
r2 = r2_score(y_true, y_pred)
mae = abs_errors.mean()

print(f"Test RMSE: ${rmse:,.0f}")
print(f"Test R²: {r2:.4f}")
print(f"Test MAE: ${mae:,.0f}")
print(f"Median Error: ${np.median(abs_errors):,.0f}")

# Prediction vs Actual scatter; the dashed red diagonal marks perfect
# predictions (predicted == actual).
price_range = [y_true.min(), y_true.max()]

plt.figure(figsize=(10, 6))
plt.scatter(y_true, y_pred, alpha=0.4)
plt.plot(price_range, price_range, 'r--')
plt.xlabel('Actual Price')
plt.ylabel('Predicted Price')
plt.title(f'Predicted vs Actual Prices (R² = {r2:.3f})')
plt.show()

Step 7: Feature Importance

# Top 20 most important features: pair each importance score with its
# column name, then keep the 20 largest.
all_importances = pd.Series(best_model.feature_importances_, index=feature_cols)
importances = all_importances.sort_values(ascending=False).head(20)

plt.figure(figsize=(10, 8))
importances.plot.barh()
plt.title('Top 20 Feature Importances')
plt.xlabel('Importance')
plt.tight_layout()
plt.show()

print("Top 10 features:")
print(importances.head(10))

Step 8: Make Predictions on New Homes

def predict_house_price(model, feature_cols, scaler=None, defaults=None):
    """Interactively estimate the sale price of a single house.

    Prompts for five headline features (pressing Enter accepts the default
    shown in brackets), builds a one-row feature matrix laid out exactly like
    ``feature_cols``, and returns the model's prediction converted back from
    log space.

    Only the five prompted raw features are set from user input. Every other
    column in ``feature_cols`` -- including engineered and one-hot columns --
    takes its value from ``defaults`` when given there and is 0 otherwise, so
    pass sensible ``defaults`` (for example per-column medians of the
    training features) for realistic estimates.

    Parameters
    ----------
    model : object
        Fitted estimator with a ``predict`` method that returns log1p prices.
    feature_cols : sequence of str
        Column names, in the order the model was trained on.
    scaler : object, optional
        Fitted transformer with a ``transform`` method, applied to the row
        before prediction. Leave as ``None`` when the model needs no external
        scaling.
    defaults : mapping or pandas.Series, optional
        Feature name -> value used for columns the prompts do not cover.
        Names not in ``feature_cols`` are ignored.

    Returns
    -------
    float
        Estimated price in dollars.

    Raises
    ------
    ValueError
        If an answer cannot be converted to the expected number type.
    """
    def ask(prompt, default, cast):
        # An empty answer (just Enter) falls back to the bracketed default
        return cast(input(prompt) or default)

    print("Enter house features:")
    answers = {
        'Gr Liv Area': ask("Living area (sqft) [1500]: ", 1500, float),
        'Overall Qual': ask("Overall quality (1-10) [6]: ", 6, int),
        'Total Bsmt SF': ask("Basement sqft [900]: ", 900, float),
        'Year Built': ask("Year built [2000]: ", 2000, int),
        'Garage Cars': ask("Garage spaces [2]: ", 2, int),
    }

    # Start from the caller's defaults, then let the user's answers win
    values = {}
    if defaults is not None:
        values.update(pd.Series(defaults).items())
    values.update(answers)

    # Align to the training layout: same columns, same order, 0 for anything
    # unspecified. Without this the model would receive 5 columns instead of
    # len(feature_cols) and reject the input.
    row = pd.Series(values, dtype=float).reindex(list(feature_cols), fill_value=0.0)
    # Pass a plain 2-D array (one row, columns in feature_cols order)
    X_new = row.to_numpy(dtype=float).reshape(1, -1)

    if scaler is not None:
        X_new = scaler.transform(X_new)

    # The model predicts log1p(price); expm1 undoes the transform
    prediction_log = model.predict(X_new)
    prediction = np.expm1(prediction_log)[0]

    print(f"\nEstimated Price: ${prediction:,.0f}")
    return prediction

What You Learned

This project covered the complete ML pipeline:

EDA — understanding your data before modeling
Data cleaning — domain-specific missing value treatment
Feature engineering — creating meaningful combinations
Encoding — handling categorical variables properly
Model comparison — using CV to fairly compare algorithms
Evaluation — multiple metrics, visual analysis
Interpretation — feature importance for business insights

The same workflow applies to any tabular regression problem. Adapt it to your domain.

Next project: Email Spam Classifier — tackling classification with text data.