30 min · Lesson 28 of 31
Real Projects
Project 2: Email Spam Classifier
Project 2: Email Spam Classifier
Build a production-quality spam classifier from scratch. This project teaches you text preprocessing, feature extraction from raw text, and classification — skills that apply directly to sentiment analysis, content moderation, and document classification.
What You'll Build
A spam classifier that:
- Preprocesses raw email text (cleaning, tokenizing, stemming)
- Extracts features using TF-IDF and bag-of-words
- Trains and compares multiple classifiers
- Achieves >98% accuracy on the SMS Spam Collection dataset (the data loaded below)
- Works on new emails in real-time
Setup
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split, cross_val_score, StratifiedKFold
from sklearn.metrics import (classification_report, confusion_matrix,
roc_auc_score, precision_recall_curve)
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
import re
import string
import warnings
warnings.filterwarnings('ignore')
# Install: pip install nltk scikit-learn pandas numpy matplotlib seaborn
import nltk
nltk.download('stopwords', quiet=True)
nltk.download('punkt', quiet=True)
nltk.download('wordnet', quiet=True)
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer, WordNetLemmatizer
print("Libraries loaded successfully")
Load and Explore the Data
# Load the labelled messages. spam.csv is expected to carry the label in
# column 'v1' ('spam' for spam rows) and the raw message text in column 'v2'.
# SMS Spam Collection: https://archive.ics.uci.edu/dataset/228/sms+spam+collection
df = (
    pd.read_csv('spam.csv', encoding='latin-1')[['v1', 'v2']]
    .rename(columns={'v1': 'label', 'v2': 'text'})
)
# Binary target: 1 for spam, 0 for everything else
df['is_spam'] = df['label'].eq('spam').astype(int)

print(f"Dataset shape: {df.shape}")
print("\nClass distribution:")
print(df['label'].value_counts())
print(f"\nSpam rate: {df['is_spam'].mean():.1%}")

# Peek at the first three messages of each class (truncated to 80 characters)
for heading, flag in (("\nHam examples:", 0), ("\nSpam examples:", 1)):
    print(heading)
    for message in df.loc[df['is_spam'] == flag, 'text'].head(3):
        print(f" {message[:80]}...")

# Length statistics per message, in characters and in whitespace-separated words
df['text_length'] = df['text'].str.len()
df['word_count'] = df['text'].str.split().str.len()

# One boxplot per statistic, side by side, grouped by label
fig, axes = plt.subplots(1, 2, figsize=(12, 4))
panels = (
    ('text_length', 'Text Length by Label'),
    ('word_count', 'Word Count by Label'),
)
for ax, (column, title) in zip(axes, panels):
    df.boxplot(column=column, by='label', ax=ax)
    ax.set_title(title)
plt.tight_layout()
plt.show()
Text Preprocessing
class TextPreprocessor:
    """Turn a raw message into a normalized, space-joined string of tokens.

    The pipeline is ``clean_text`` -> ``tokenize`` -> ``' '.join``.
    Addresses, URLs, phone numbers and dollar amounts are not discarded:
    each is replaced by a fixed placeholder word so the classifier can
    still learn from its presence.

    Args:
        use_stemming: If true, tokens are stemmed with ``PorterStemmer``.
        use_lemmatization: If true (and stemming is off), tokens are
            lemmatized with ``WordNetLemmatizer``. Stemming takes
            precedence when both flags are set.
    """

    # Compiled once at class-definition time instead of on every call.
    _EMAIL_RE = re.compile(r'\S+@\S+')
    _URL_RE = re.compile(r'http\S+|www\S+')
    _PHONE_RE = re.compile(
        r'(\d{3}[-\.\s]??\d{3}[-\.\s]??\d{4}|\(\d{3}\)\s*\d{3}[-\.\s]??\d{4})'
    )
    _MONEY_RE = re.compile(r'\$[\d,]+')
    _PUNCT_RE = re.compile(r'[^\w\s]')
    _SPACE_RE = re.compile(r'\s+')

    def __init__(self, use_stemming=False, use_lemmatization=True):
        self.stop_words = set(stopwords.words('english'))
        self.stemmer = PorterStemmer() if use_stemming else None
        self.lemmatizer = WordNetLemmatizer() if use_lemmatization else None

    def clean_text(self, text):
        """Lowercase *text*, substitute placeholders, strip punctuation.

        Placeholders are inserted with a space on each side so they always
        form standalone tokens. Without the padding, a match followed by
        more characters (an 11-digit number, ``$5k``) would yield fused
        tokens such as ``phonenumber1`` or ``moneynumberk``. The extra
        spaces are collapsed again by the final whitespace pass.

        Returns:
            The cleaned string, single-space separated, with no leading or
            trailing whitespace.
        """
        text = text.lower()
        # Replace e-mail addresses with a placeholder token
        text = self._EMAIL_RE.sub(' emailaddress ', text)
        # Replace URLs with a placeholder token
        text = self._URL_RE.sub(' urladdress ', text)
        # Replace phone numbers with a placeholder token
        text = self._PHONE_RE.sub(' phonenumber ', text)
        # Replace dollar amounts with a placeholder token
        text = self._MONEY_RE.sub(' moneynumber ', text)
        # Every character that is neither a word character nor whitespace
        # becomes a space (underscores and digits are word characters and stay)
        text = self._PUNCT_RE.sub(' ', text)
        # Collapse whitespace runs and trim the ends
        return self._SPACE_RE.sub(' ', text).strip()

    def tokenize(self, text):
        """Split on whitespace, filter, then stem or lemmatize.

        Stopwords and tokens of one or two characters are dropped before
        the optional stemming/lemmatization step.

        Returns:
            A list of token strings.
        """
        tokens = [
            t for t in text.split()
            if t not in self.stop_words and len(t) > 2
        ]
        # Stemming wins if both a stemmer and a lemmatizer are configured
        if self.stemmer:
            tokens = [self.stemmer.stem(t) for t in tokens]
        elif self.lemmatizer:
            tokens = [self.lemmatizer.lemmatize(t) for t in tokens]
        return tokens

    def preprocess(self, text):
        """Run ``clean_text`` then ``tokenize`` and join tokens with spaces."""
        return ' '.join(self.tokenize(self.clean_text(text)))
# Run the clean -> tokenize -> join pipeline over every message
preprocessor = TextPreprocessor()
df['text_processed'] = df['text'].apply(preprocessor.preprocess)

# Show the first message in both forms so the effect is visible
for heading, column in (
    ("Before preprocessing:", 'text'),
    ("\nAfter preprocessing:", 'text_processed'),
):
    print(heading)
    print(df[column].iloc[0])
Feature Engineering
# Hand-crafted spam features

# Words whose presence is treated as a spam signal by extract_spam_features
_SPAM_TRIGGER_WORDS = frozenset([
    'free', 'winner', 'won', 'prize', 'claim', 'urgent',
    'offer', 'limited', 'congratulations', 'selected',
])


def extract_spam_features(text):
    """Compute simple surface statistics of a raw (uncleaned) message.

    Args:
        text: The original message string, before any preprocessing, so
            that case, punctuation and digits are still present.

    Returns:
        A dict with these keys, in this order:
        ``upper_ratio``, ``exclamation_count``, ``question_count``,
        ``dollar_count``, ``digit_ratio``, ``spam_word_count``,
        ``text_length``, ``word_count``.
        ``spam_word_count`` is the number of distinct trigger words that
        occur in the message as whole words (case-insensitive).
    """
    # Guard the ratios against division by zero on an empty message
    n_chars = max(len(text), 1)

    # Whole-word matching: a substring test would count "wonder" as "won",
    # "freedom" as "free" and "offered" as "offer". Apostrophes are kept
    # inside tokens so "won't" is not mistaken for "won".
    words = set(re.findall(r"[a-z]+(?:'[a-z]+)*", text.lower()))

    return {
        # Uppercase ratio (YELLING is common in spam)
        'upper_ratio': sum(1 for c in text if c.isupper()) / n_chars,
        'exclamation_count': text.count('!'),
        'question_count': text.count('?'),
        'dollar_count': text.count('$'),
        'digit_ratio': sum(1 for c in text if c.isdigit()) / n_chars,
        'spam_word_count': len(_SPAM_TRIGGER_WORDS & words),
        'text_length': len(text),
        'word_count': len(text.split()),
    }
# One row of hand-crafted features per message, columns in dict-key order
feature_rows = df['text'].apply(extract_spam_features).tolist()
manual_features = pd.DataFrame(feature_rows)

print("Manual features:")
print(manual_features.head())

# Correlation of each feature with the binary spam target
print("\nCorrelations with spam:")
for col, values in manual_features.items():
    print(f" {col}: {values.corr(df['is_spam']):.3f}")
Model Training: Multiple Approaches
from sklearn.naive_bayes import MultinomialNB, ComplementNB
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.svm import LinearSVC
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.preprocessing import MinMaxScaler
from scipy.sparse import hstack
import scipy.sparse as sp
# cross_validate scores several metrics from a single fit per fold
from sklearn.model_selection import cross_validate

X_text = df['text_processed']
y = df['is_spam']

# Stratified hold-out split: the test part is only touched in the final evaluation
X_train_text, X_test_text, y_train, y_test = train_test_split(
    X_text, y, test_size=0.2, stratify=y, random_state=42
)

# Approach 1: TF-IDF + Naive Bayes (classic spam filtering)
nb_pipeline = Pipeline([
    ('tfidf', TfidfVectorizer(
        ngram_range=(1, 2),   # Unigrams and bigrams
        max_features=10000,
        min_df=2,
        max_df=0.95,
        sublinear_tf=True     # Use 1 + log(tf) instead of raw term frequency
    )),
    ('clf', ComplementNB(alpha=0.1))  # Complement NB works better for imbalanced classes
])

# Approach 2: TF-IDF + Logistic Regression (often beats NB)
lr_pipeline = Pipeline([
    ('tfidf', TfidfVectorizer(
        ngram_range=(1, 2), max_features=20000,
        min_df=2, max_df=0.95, sublinear_tf=True
    )),
    ('clf', LogisticRegression(C=5.0, max_iter=1000))
])

# Approach 3: TF-IDF + Linear SVM
svm_pipeline = Pipeline([
    ('tfidf', TfidfVectorizer(
        ngram_range=(1, 2), max_features=20000, sublinear_tf=True
    )),
    ('clf', LinearSVC(C=1.0, max_iter=2000))
])

# Evaluate with stratified cross-validation. The splitter is seeded, so every
# model is scored on the same five folds.
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
metrics = ('accuracy', 'f1', 'roc_auc')

print("Model Comparison (CV):")
# Each "mean±std" cell is 12 characters wide; the header uses the same width
print(f"{'Model':30s} {'Accuracy':>12} {'F1':>12} {'ROC-AUC':>12}")
for name, pipeline in [('Complement Naive Bayes', nb_pipeline),
                       ('Logistic Regression', lr_pipeline),
                       ('Linear SVM', svm_pipeline)]:
    # A single call fits each fold once and scores all three metrics, rather
    # than refitting the pipeline once per metric.
    scores = cross_validate(pipeline, X_train_text, y_train, cv=cv, scoring=metrics)
    acc, f1, auc = (scores[f'test_{metric}'] for metric in metrics)
    print(f"{name:30s} {acc.mean():.4f}±{acc.std():.3f}"
          f" {f1.mean():.4f}±{f1.std():.3f}"
          f" {auc.mean():.4f}±{auc.std():.3f}")
Final Model Evaluation
# The logistic-regression pipeline is taken as the final model and refit on
# the whole training split before being scored on the held-out test split.
best_pipeline = lr_pipeline
best_pipeline.fit(X_train_text, y_train)
y_pred = best_pipeline.predict(X_test_text)
y_proba = best_pipeline.predict_proba(X_test_text)[:, 1]  # second column of predict_proba

print("Final Test Set Results:")
print(classification_report(y_test, y_pred, target_names=['Ham', 'Spam']))

# Confusion matrix: rows are actual labels, columns are predictions
cm = confusion_matrix(y_test, y_pred)
test_accuracy = (y_pred == y_test).mean()
sns.heatmap(
    cm,
    annot=True,
    fmt='d',
    cmap='Blues',
    xticklabels=['Ham', 'Spam'],
    yticklabels=['Ham', 'Spam'],
)
plt.title(f'Confusion Matrix\nAccuracy: {test_accuracy:.4f}')
plt.xlabel('Predicted')
plt.ylabel('Actual')
plt.show()

# Error analysis: which test messages were misclassified, and how sure was the model?
# NOTE: 'text' here is the preprocessed message, not the raw one.
test_df = pd.DataFrame({
    'text': X_test_text.values,
    'true': y_test.values,
    'pred': y_pred,
    # Probability of whichever class the model favoured
    'confidence': np.maximum(y_proba, 1 - y_proba)
})

# Keep only the three most confident mistakes of each kind
fp = test_df[(test_df['true'] == 0) & (test_df['pred'] == 1)].nlargest(3, 'confidence')
fn = test_df[(test_df['true'] == 1) & (test_df['pred'] == 0)].nlargest(3, 'confidence')

error_reports = (
    ("\nHigh-confidence false positives (ham classified as spam):", fp),
    ("\nHigh-confidence false negatives (spam that got through):", fn),
)
for heading, mistakes in error_reports:
    print(heading)
    for _, row in mistakes.iterrows():
        print(f" [{row['confidence']:.2f}] {row['text'][:80]}...")
Production-Ready Predictor
import pickle
def save_model(pipeline, filepath='spam_classifier.pkl'):
    """Pickle *pipeline* to *filepath* and print a confirmation line.

    Args:
        pipeline: The object to serialize (any picklable object).
        filepath: Destination path; an existing file is overwritten.
    """
    payload = pickle.dumps(pipeline)
    with open(filepath, 'wb') as out:
        out.write(payload)
    print(f"Model saved to {filepath}")
def load_model(filepath='spam_classifier.pkl'):
    """Read the pickle file at *filepath* and return the stored object.

    Unpickling can execute arbitrary code, so only load files that come
    from a trusted source.
    """
    with open(filepath, 'rb') as src:
        raw = src.read()
    return pickle.loads(raw)
def classify_email(text, model, preprocessor, threshold=0.5):
    """Classify one raw message as spam or ham.

    Args:
        text: Raw message text.
        model: Object with ``predict_proba``; row 0, column 1 of its result
            is used as the spam probability.
        preprocessor: Object with a ``preprocess(text)`` method.
        threshold: Minimum spam probability for the 'SPAM' label.

    Returns:
        A dict with keys ``label`` ('SPAM' or 'HAM'), ``spam_probability``,
        ``confidence`` (the larger of the spam and non-spam probabilities)
        and ``processed_text``.
    """
    cleaned = preprocessor.preprocess(text)
    spam_probability = model.predict_proba([cleaned])[0, 1]

    if spam_probability >= threshold:
        verdict = 'SPAM'
    else:
        verdict = 'HAM'

    result = {}
    result['label'] = verdict
    result['spam_probability'] = spam_probability
    result['confidence'] = max(spam_probability, 1 - spam_probability)
    result['processed_text'] = cleaned
    return result
# Persist the fitted pipeline, then try it on a few hand-written messages
save_model(best_pipeline)

test_emails = [
    "Hello, are we still on for lunch tomorrow? Let me know!",
    "CONGRATULATIONS! You've WON a FREE iPhone!!! CLAIM NOW: click here",
    "Meeting rescheduled to 3pm. Please update your calendar.",
    "URGENT: Your account will be SUSPENDED unless you verify NOW!!!",
]

# One line per message: predicted label, spam probability, first 60 characters
for email in test_emails:
    result = classify_email(email, best_pipeline, preprocessor)
    label, spam_probability = result['label'], result['spam_probability']
    print(f"[{label} {spam_probability:.1%}] {email[:60]}...")
Key Takeaways
This project demonstrates text classification fundamentals that transfer to any NLP problem:
- Text preprocessing matters enormously — cleaning removes noise; stemming/lemmatization reduces vocabulary size
- TF-IDF is a powerful, simple baseline for text features — better than raw counts
- Logistic Regression often beats Naive Bayes on text despite NB's theoretical advantages
- Error analysis reveals what the model doesn't understand — look at misclassified examples
- Business context matters — in spam filtering, false negatives (letting spam through) are more acceptable than false positives (blocking real email)
Next project: Image Recognition App — computer vision with CNNs.
📱
Get Notes Free →Get this course's notes on Telegram!
Free cheat sheets, summaries & practice exercises