Project 2: Email Spam Classifier

Build a production-quality spam classifier from scratch. This project teaches you text preprocessing, feature extraction from raw text, and classification — skills that apply directly to sentiment analysis, content moderation, and document classification.

What You'll Build

A spam classifier that:

Preprocesses raw email text (cleaning, tokenizing, stemming)
Extracts features using TF-IDF and bag-of-words
Trains and compares multiple classifiers
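Achieves >98% accuracy on the SMS Spam Collection dataset (the corpus loaded in the code below; the same pipeline can be pointed at the SpamAssassin corpus)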
Works on new emails in real-time

Setup

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split, cross_val_score, StratifiedKFold
from sklearn.metrics import (classification_report, confusion_matrix, 
                              roc_auc_score, precision_recall_curve)
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
import re
import string
import warnings
warnings.filterwarnings('ignore')

# Install: pip install nltk scikit-learn pandas numpy matplotlib seaborn
import nltk
# Fetch the NLTK data packages at import time; quiet=True suppresses the
# download progress output.
# 'stopwords' backs stopwords.words('english') used by TextPreprocessor.
nltk.download('stopwords', quiet=True)
# NOTE(review): tokenization in this file uses str.split(), so 'punkt' does
# not appear to be needed by the visible code — confirm before removing.
nltk.download('punkt', quiet=True)
# 'wordnet' is the corpus WordNetLemmatizer looks words up in.
nltk.download('wordnet', quiet=True)
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer, WordNetLemmatizer

print("Libraries loaded successfully")

Load and Explore the Data

# Expects spam.csv with the label in column 'v1' (the value 'spam' marks spam)
# and the message body in column 'v2'.
# One source with SMS data: https://archive.ics.uci.edu/dataset/228/sms+spam+collection

df = (
    pd.read_csv('spam.csv', encoding='latin-1')
    .loc[:, ['v1', 'v2']]
    .rename(columns={'v1': 'label', 'v2': 'text'})
)
# Binary target: 1 for spam, 0 for everything else
df['is_spam'] = df['label'].eq('spam').astype(int)

print(f"Dataset shape: {df.shape}")
print(f"\nClass distribution:")
print(df['label'].value_counts())
print(f"\nSpam rate: {df['is_spam'].mean():.1%}")

# Show the first three messages of each class, truncated to 80 characters
for heading, flag in (("\nHam examples:", 0), ("\nSpam examples:", 1)):
    print(heading)
    for text in df.loc[df['is_spam'] == flag, 'text'].head(3):
        print(f"  {text[:80]}...")

# Character and whitespace-separated word counts per message
df['text_length'] = df['text'].str.len()
df['word_count'] = df['text'].str.split().str.len()

# One box plot per length measure, grouped by label
fig, axes = plt.subplots(1, 2, figsize=(12, 4))
panels = (
    ('text_length', 'Text Length by Label'),
    ('word_count', 'Word Count by Label'),
)
for ax, (column, title) in zip(axes, panels):
    df.boxplot(column=column, by='label', ax=ax)
    ax.set_title(title)
plt.tight_layout()
plt.show()

Text Preprocessing

class TextPreprocessor:
    """Turn raw message text into a space-joined string of normalized tokens.

    The pipeline is: lowercase -> replace emails, URLs, phone numbers and
    dollar amounts with placeholder tokens -> strip punctuation -> drop
    stopwords and tokens of length <= 2 -> stem or lemmatize.

    Args:
        use_stemming: If true, tokens are stemmed with ``PorterStemmer``.
        use_lemmatization: If true, tokens are lemmatized with
            ``WordNetLemmatizer``. Ignored when ``use_stemming`` is also
            true, because stemming takes precedence in ``tokenize``.
    """

    # Patterns are compiled once at class level instead of on every call.
    _EMAIL_RE = re.compile(r'\S+@\S+')
    # Require a real URL prefix at a word boundary; a bare "www\S+" also
    # matched inside ordinary words such as "awww".
    _URL_RE = re.compile(r'\b(?:https?://|www\.)\S+')
    _PHONE_RE = re.compile(
        r'(\d{3}[-\.\s]??\d{3}[-\.\s]??\d{4}|\(\d{3}\)\s*\d{3}[-\.\s]??\d{4})'
    )
    _MONEY_RE = re.compile(r'\$[\d,]+')
    _PUNCT_RE = re.compile(r'[^\w\s]')
    _SPACE_RE = re.compile(r'\s+')

    def __init__(self, use_stemming=False, use_lemmatization=True):
        self.stop_words = set(stopwords.words('english'))
        self.stemmer = PorterStemmer() if use_stemming else None
        self.lemmatizer = WordNetLemmatizer() if use_lemmatization else None

    def clean_text(self, text):
        """Lowercase ``text`` and normalize it to words separated by one space.

        Emails, URLs, phone numbers and dollar amounts are replaced by the
        tokens ``emailaddress``, ``urladdress``, ``phonenumber`` and
        ``moneynumber``. ``None`` and NaN yield ``''``; any other non-string
        value is converted with ``str()``.
        """
        if not isinstance(text, str):
            # Missing values from a DataFrame arrive as None or float NaN.
            if text is None or (isinstance(text, float) and text != text):
                return ''
            text = str(text)

        text = text.lower()

        # Placeholders are padded with spaces so they never fuse with the
        # neighbouring characters (e.g. "4567now" -> "phonenumber now").
        text = self._EMAIL_RE.sub(' emailaddress ', text)
        text = self._URL_RE.sub(' urladdress ', text)
        text = self._PHONE_RE.sub(' phonenumber ', text)
        text = self._MONEY_RE.sub(' moneynumber ', text)

        # Everything that is neither a word character nor whitespace becomes
        # a space, then runs of whitespace are collapsed.
        text = self._PUNCT_RE.sub(' ', text)
        return self._SPACE_RE.sub(' ', text).strip()

    def tokenize(self, text):
        """Split cleaned text on whitespace and return the filtered tokens.

        Stopwords and tokens of two characters or fewer are dropped. The
        remaining tokens are stemmed if a stemmer is configured, otherwise
        lemmatized if a lemmatizer is configured, otherwise left as they are.
        """
        tokens = [
            t for t in text.split()
            if t not in self.stop_words and len(t) > 2
        ]

        if self.stemmer:
            return [self.stemmer.stem(t) for t in tokens]
        if self.lemmatizer:
            return [self.lemmatizer.lemmatize(t) for t in tokens]
        return tokens

    def preprocess(self, text):
        """Clean and tokenize ``text``, returning the tokens joined by spaces."""
        return ' '.join(self.tokenize(self.clean_text(text)))

# Run a default-configured preprocessor over every message
preprocessor = TextPreprocessor()
df['text_processed'] = df['text'].map(preprocessor.preprocess)

# Show the first message before and after preprocessing
for banner, column in (("Before preprocessing:", 'text'),
                       ("\nAfter preprocessing:", 'text_processed')):
    print(banner)
    print(df[column].iloc[0])

Feature Engineering

# Hand-crafted spam features

# Trigger words counted by extract_spam_features (matched as whole words).
_SPAM_TRIGGER_WORDS = frozenset({
    'free', 'winner', 'won', 'prize', 'claim', 'urgent',
    'offer', 'limited', 'congratulations', 'selected',
})


def extract_spam_features(text):
    """Compute simple surface statistics of a raw (uncleaned) message.

    Args:
        text: The original message string, before any preprocessing.

    Returns:
        A dict with, in this order:
        ``upper_ratio`` (uppercase characters / length),
        ``exclamation_count``, ``question_count``, ``dollar_count``,
        ``digit_ratio`` (digit characters / length),
        ``spam_word_count`` (number of distinct trigger words present),
        ``text_length`` (characters) and ``word_count`` (whitespace-split).
        Ratios use a denominator of at least 1, so ``''`` yields 0.
    """
    # Guard the ratios against division by zero on empty input.
    denominator = max(len(text), 1)

    # Match trigger words as whole words. Plain substring tests counted
    # "won" inside "wonderful"/"won't" and "free" inside "freedom".
    words = set(re.findall(r"[a-z]+(?:'[a-z]+)*", text.lower()))

    return {
        # Uppercase ratio (YELLING is common in spam)
        'upper_ratio': sum(1 for c in text if c.isupper()) / denominator,
        'exclamation_count': text.count('!'),
        'question_count': text.count('?'),
        'dollar_count': text.count('$'),
        'digit_ratio': sum(1 for c in text if c.isdigit()) / denominator,
        'spam_word_count': len(words & _SPAM_TRIGGER_WORDS),
        'text_length': len(text),
        'word_count': len(text.split()),
    }

# One row of hand-crafted features per raw message
feature_rows = [extract_spam_features(message) for message in df['text']]
manual_features = pd.DataFrame(feature_rows)
print("Manual features:")
print(manual_features.head())
print(f"\nCorrelations with spam:")
# Correlation of each feature column with the 0/1 spam label
for col, values in manual_features.items():
    corr = values.corr(df['is_spam'])
    print(f"  {col}: {corr:.3f}")

Model Training: Multiple Approaches

from sklearn.naive_bayes import MultinomialNB, ComplementNB
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.svm import LinearSVC
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.preprocessing import MinMaxScaler
from scipy.sparse import hstack
import scipy.sparse as sp

# Inputs are the preprocessed strings; the target is the 0/1 spam flag
X_text, y = df['text_processed'], df['is_spam']

# Hold out 20% as the final test set, stratified on the label
X_train_text, X_test_text, y_train, y_test = train_test_split(
    X_text,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)

# Approach 1: TF-IDF + Naive Bayes (classic spam filtering)
nb_tfidf = TfidfVectorizer(
    ngram_range=(1, 2),    # Unigrams and bigrams
    max_features=10000,
    min_df=2,
    max_df=0.95,
    sublinear_tf=True,     # Apply log normalization to term frequencies
)
nb_pipeline = Pipeline(steps=[
    ('tfidf', nb_tfidf),
    ('clf', ComplementNB(alpha=0.1)),  # Complement NB works better for imbalanced
])

# Approach 2: TF-IDF + Logistic Regression (often beats NB)
lr_tfidf = TfidfVectorizer(
    ngram_range=(1, 2),
    max_features=20000,
    min_df=2,
    max_df=0.95,
    sublinear_tf=True,
)
lr_pipeline = Pipeline(steps=[
    ('tfidf', lr_tfidf),
    ('clf', LogisticRegression(C=5.0, max_iter=1000)),
])

# Approach 3: TF-IDF + Linear SVM (no min_df / max_df pruning here)
svm_tfidf = TfidfVectorizer(
    ngram_range=(1, 2),
    max_features=20000,
    sublinear_tf=True,
)
svm_pipeline = Pipeline(steps=[
    ('tfidf', svm_tfidf),
    ('clf', LinearSVC(C=1.0, max_iter=2000)),
])

# cross_validate scores several metrics from a single set of fits; the
# original called cross_val_score once per metric and refit every model 3x.
from sklearn.model_selection import cross_validate

# Evaluate with stratified cross-validation (fixed seed -> reproducible folds)
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
cv_metrics = ('accuracy', 'f1', 'roc_auc')

print("Model Comparison (CV):")
print(f"{'Model':30s} {'Accuracy':>10} {'F1':>10} {'ROC-AUC':>10}")

for name, pipeline in [('Complement Naive Bayes', nb_pipeline),
                        ('Logistic Regression', lr_pipeline),
                        ('Linear SVM', svm_pipeline)]:
    # Results are keyed 'test_<metric>', one score per fold.
    cv_scores = cross_validate(pipeline, X_train_text, y_train,
                               cv=cv, scoring=cv_metrics)
    acc = cv_scores['test_accuracy']
    f1 = cv_scores['test_f1']
    auc = cv_scores['test_roc_auc']

    # Mean ± standard deviation across the five folds
    print(f"{name:30s} {acc.mean():.4f}±{acc.std():.3f}"
          f" {f1.mean():.4f}±{f1.std():.3f}"
          f" {auc.mean():.4f}±{auc.std():.3f}")

Final Model Evaluation

# Fit the selected pipeline on the whole training split
best_pipeline = lr_pipeline
best_pipeline.fit(X_train_text, y_train)

# Hard predictions plus the probability of the spam class (column 1)
y_pred = best_pipeline.predict(X_test_text)
y_proba = best_pipeline.predict_proba(X_test_text)[:, 1]

class_names = ['Ham', 'Spam']
print("Final Test Set Results:")
print(classification_report(y_test, y_pred, target_names=class_names))

# Confusion matrix heatmap, titled with the test accuracy
cm = confusion_matrix(y_test, y_pred)
test_accuracy = (y_pred == y_test).mean()
sns.heatmap(
    cm,
    annot=True,
    fmt='d',
    cmap='Blues',
    xticklabels=class_names,
    yticklabels=class_names,
)
plt.title(f'Confusion Matrix\nAccuracy: {test_accuracy:.4f}')
plt.xlabel('Predicted')
plt.ylabel('Actual')
plt.show()

# Error analysis: collect the test rows with the model's confidence, i.e. the
# probability it assigned to whichever class it considered more likely.
# Note that 'text' here is the preprocessed text the model saw.
test_df = pd.DataFrame({
    'text': X_test_text.values,
    'true': y_test.values,
    'pred': y_pred,
    'confidence': np.maximum(y_proba, 1 - y_proba)
})

# Three most confident mistakes of each kind
ham_flagged = (test_df['true'] == 0) & (test_df['pred'] == 1)
spam_missed = (test_df['true'] == 1) & (test_df['pred'] == 0)
fp = test_df[ham_flagged].nlargest(3, 'confidence')
fn = test_df[spam_missed].nlargest(3, 'confidence')

error_reports = (
    ("\nHigh-confidence false positives (ham classified as spam):", fp),
    ("\nHigh-confidence false negatives (spam that got through):", fn),
)
for heading, mistakes in error_reports:
    print(heading)
    for _, row in mistakes.iterrows():
        print(f"  [{row['confidence']:.2f}] {row['text'][:80]}...")

Production-Ready Predictor

import pickle

def save_model(pipeline, filepath='spam_classifier.pkl'):
    """Pickle ``pipeline`` into ``filepath`` and print a confirmation line.

    The file is opened in binary write mode, so an existing file at
    ``filepath`` is overwritten.
    """
    with open(filepath, 'wb') as sink:
        pickle.Pickler(sink).dump(pipeline)
    print(f"Model saved to {filepath}")

def load_model(filepath='spam_classifier.pkl'):
    """Return the object unpickled from ``filepath``.

    Only load files you trust: unpickling can execute arbitrary code.
    """
    with open(filepath, 'rb') as source:
        model = pickle.Unpickler(source).load()
    return model

def classify_email(text, model, preprocessor, threshold=0.5):
    """Label one raw message as 'SPAM' or 'HAM'.

    Args:
        text: Raw message text.
        model: Object whose ``predict_proba`` takes a list of processed
            strings and returns a 2-D array; column 1 is read as the spam
            probability.
        preprocessor: Object with a ``preprocess(text)`` method.
        threshold: Spam probability at or above which the label is 'SPAM'.

    Returns:
        A dict with keys ``label``, ``spam_probability``, ``confidence``
        (the larger of the spam and non-spam probabilities) and
        ``processed_text``.
    """
    processed = preprocessor.preprocess(text)
    # Single-item batch: take row 0, column 1
    spam_probability = model.predict_proba([processed])[0, 1]

    if spam_probability >= threshold:
        label = 'SPAM'
    else:
        label = 'HAM'

    result = {'label': label, 'spam_probability': spam_probability}
    result['confidence'] = max(spam_probability, 1 - spam_probability)
    result['processed_text'] = processed
    return result

# Persist the fitted pipeline, then classify a few hand-written messages
save_model(best_pipeline)

test_emails = [
    "Hello, are we still on for lunch tomorrow? Let me know!",
    "CONGRATULATIONS! You've WON a FREE iPhone!!! CLAIM NOW: click here",
    "Meeting rescheduled to 3pm. Please update your calendar.",
    "URGENT: Your account will be SUSPENDED unless you verify NOW!!!"
]

for email in test_emails:
    result = classify_email(email, best_pipeline, preprocessor)
    verdict = result['label']
    spam_probability = result['spam_probability']
    preview = email[:60]  # first 60 characters of the message
    print(f"[{verdict} {spam_probability:.1%}] {preview}...")

Key Takeaways

This project demonstrates text classification fundamentals that transfer to any NLP problem:

Text preprocessing matters enormously — cleaning removes noise; stemming/lemmatization reduces vocabulary size
TF-IDF is a powerful, simple baseline for text features — better than raw counts
Logistic Regression often beats Naive Bayes on text despite NB's theoretical advantages
Error analysis reveals what the model doesn't understand — look at misclassified examples
Business context matters — in spam filtering, false negatives (letting spam through) are more acceptable than false positives (blocking real email)

Next project: Image Recognition App — computer vision with CNNs.