35 minLesson 32 of 34
Final Projects
Project: CLI Data Analysis Tool
Project: CLI Data Analysis Tool
This project ties together everything you've learned — file I/O, pandas, data visualization, argparse, and clean Python architecture. You'll build a command-line tool that accepts a CSV file, computes statistics, generates charts, and exports a summary report.
What You'll Build
$ python analyze.py sales_data.csv --report --chart --output results/
✓ Loaded 1,247 rows × 8 columns
✓ Computed statistics for 5 numeric columns
✓ Generated 3 charts → results/charts/
✓ Exported report → results/summary_report.md
=== QUICK STATS ===
revenue: mean=$4,821 median=$3,900 max=$28,400
units: mean=47.3 median=38 max=312
Project Structure
data_analyzer/
├── analyze.py # CLI entry point
├── loader.py # Data loading & validation
├── stats.py # Statistical computations
├── visualizer.py # Chart generation
├── reporter.py # Markdown report generation
├── requirements.txt
└── sample_data.csv
Step 1: Data Loader
# loader.py
import pandas as pd
import sys
from pathlib import Path
SUPPORTED_FORMATS = {'.csv', '.xlsx', '.json', '.parquet'}
def load_data(filepath: str) -> pd.DataFrame:
"""Load data from CSV, Excel, JSON, or Parquet."""
path = Path(filepath)
if not path.exists():
print(f"Error: File not found: {filepath}")
sys.exit(1)
if path.suffix not in SUPPORTED_FORMATS:
print(f"Error: Unsupported format '{path.suffix}'. Use: {SUPPORTED_FORMATS}")
sys.exit(1)
loaders = {
'.csv': pd.read_csv,
'.xlsx': pd.read_excel,
'.json': pd.read_json,
'.parquet': pd.read_parquet
}
try:
df = loaders[path.suffix](filepath)
print(f"✓ Loaded {len(df):,} rows × {len(df.columns)} columns")
return df
except Exception as e:
print(f"Error loading file: {e}")
sys.exit(1)
def validate_and_profile(df: pd.DataFrame) -> dict:
"""Return a profile dict summarizing the dataset."""
numeric_cols = df.select_dtypes(include='number').columns.tolist()
categorical_cols = df.select_dtypes(include='object').columns.tolist()
datetime_cols = df.select_dtypes(include='datetime').columns.tolist()
missing = df.isnull().sum()
missing_pct = (missing / len(df) * 100).round(2)
return {
'rows': len(df),
'columns': len(df.columns),
'numeric_cols': numeric_cols,
'categorical_cols': categorical_cols,
'datetime_cols': datetime_cols,
'missing': missing[missing > 0].to_dict(),
'missing_pct': missing_pct[missing_pct > 0].to_dict(),
'duplicates': df.duplicated().sum()
}
Step 2: Statistical Analysis
# stats.py
import pandas as pd
import numpy as np
from scipy import stats as scipy_stats
def compute_numeric_stats(df: pd.DataFrame) -> pd.DataFrame:
"""Compute comprehensive stats for all numeric columns."""
numeric = df.select_dtypes(include='number')
if numeric.empty:
return pd.DataFrame()
summary = numeric.agg([
'count', 'mean', 'median', 'std', 'min', 'max',
lambda x: x.quantile(0.25),
lambda x: x.quantile(0.75),
'skew', 'kurt'
])
summary.index = ['count', 'mean', 'median', 'std', 'min', 'max',
'q25', 'q75', 'skewness', 'kurtosis']
return summary.round(3)
def detect_outliers(series: pd.Series) -> dict:
"""Detect outliers using IQR method."""
Q1 = series.quantile(0.25)
Q3 = series.quantile(0.75)
IQR = Q3 - Q1
lower = Q1 - 1.5 * IQR
upper = Q3 + 1.5 * IQR
outliers = series[(series < lower) | (series > upper)]
return {
'count': len(outliers),
'pct': round(len(outliers) / len(series) * 100, 2),
'lower_bound': round(lower, 3),
'upper_bound': round(upper, 3)
}
def compute_correlations(df: pd.DataFrame) -> pd.DataFrame:
"""Pearson correlation matrix for numeric columns."""
numeric = df.select_dtypes(include='number')
return numeric.corr().round(3)
def top_correlations(corr_matrix: pd.DataFrame, n: int = 5) -> list:
"""Return top N strongest correlations (excluding self-correlations)."""
pairs = []
cols = corr_matrix.columns
for i in range(len(cols)):
for j in range(i + 1, len(cols)):
pairs.append({
'col1': cols[i],
'col2': cols[j],
'correlation': corr_matrix.iloc[i, j]
})
pairs.sort(key=lambda x: abs(x['correlation']), reverse=True)
return pairs[:n]
def value_counts_summary(df: pd.DataFrame, top_n: int = 5) -> dict:
"""Value counts for categorical columns."""
cat_cols = df.select_dtypes(include='object').columns
result = {}
for col in cat_cols:
result[col] = df[col].value_counts().head(top_n).to_dict()
return result
Step 3: Visualization
# visualizer.py
import matplotlib.pyplot as plt
import matplotlib.gridspec as gridspec
import seaborn as sns
import pandas as pd
from pathlib import Path
sns.set_theme(style='whitegrid', palette='Set2')
def save_charts(df: pd.DataFrame, output_dir: str) -> list:
"""Generate and save all charts. Returns list of saved paths."""
output_path = Path(output_dir) / 'charts'
output_path.mkdir(parents=True, exist_ok=True)
saved = []
numeric = df.select_dtypes(include='number')
# Chart 1: Distribution plots for numeric columns
if not numeric.empty:
n_cols = min(len(numeric.columns), 4)
fig, axes = plt.subplots(1, n_cols, figsize=(5 * n_cols, 4))
if n_cols == 1:
axes = [axes]
for ax, col in zip(axes, numeric.columns[:n_cols]):
sns.histplot(numeric[col].dropna(), kde=True, ax=ax)
ax.set_title(f'Distribution: {col}', fontsize=10)
plt.tight_layout()
path = output_path / 'distributions.png'
plt.savefig(path, dpi=150, bbox_inches='tight')
plt.close()
saved.append(str(path))
# Chart 2: Correlation heatmap
if len(numeric.columns) > 1:
fig, ax = plt.subplots(figsize=(8, 6))
corr = numeric.corr()
sns.heatmap(corr, annot=True, fmt='.2f', cmap='RdBu_r',
center=0, ax=ax, square=True)
ax.set_title('Correlation Matrix')
plt.tight_layout()
path = output_path / 'correlation.png'
plt.savefig(path, dpi=150, bbox_inches='tight')
plt.close()
saved.append(str(path))
# Chart 3: Box plots for outlier visualization
if not numeric.empty:
fig, ax = plt.subplots(figsize=(max(8, len(numeric.columns) * 1.5), 5))
numeric.boxplot(ax=ax)
ax.set_title('Outlier Overview (Box Plots)')
ax.tick_params(axis='x', rotation=45)
plt.tight_layout()
path = output_path / 'boxplots.png'
plt.savefig(path, dpi=150, bbox_inches='tight')
plt.close()
saved.append(str(path))
print(f"✓ Generated {len(saved)} charts → {output_path}/")
return saved
Step 4: Report Generator
# reporter.py
from datetime import datetime
from pathlib import Path
import pandas as pd
def generate_report(
filepath: str,
profile: dict,
stats: pd.DataFrame,
correlations: list,
value_counts: dict,
chart_paths: list,
output_dir: str
) -> str:
"""Generate a Markdown summary report."""
lines = [
f"# Data Analysis Report",
f"**File:** `{filepath}`",
f"**Generated:** {datetime.now().strftime('%Y-%m-%d %H:%M')}",
f"**Rows:** {profile['rows']:,} | **Columns:** {profile['columns']}",
"",
"---",
"",
"## Dataset Overview",
"",
f"| Type | Columns |",
f"|------|---------|",
f"| Numeric | {', '.join(profile['numeric_cols']) or 'None'} |",
f"| Categorical | {', '.join(profile['categorical_cols']) or 'None'} |",
"",
]
# Missing data
if profile['missing']:
lines += ["## Missing Values", ""]
for col, count in profile['missing'].items():
pct = profile['missing_pct'][col]
lines.append(f"- **{col}**: {count:,} missing ({pct}%)")
lines.append("")
if profile['duplicates'] > 0:
lines += [f"> ⚠️ **{profile['duplicates']:,} duplicate rows detected**", ""]
# Numeric stats
if not stats.empty:
lines += ["## Numeric Statistics", ""]
lines.append(stats.to_markdown())
lines.append("")
# Top correlations
if correlations:
lines += ["## Strongest Correlations", ""]
for c in correlations[:5]:
direction = "+" if c['correlation'] > 0 else "-"
lines.append(f"- **{c['col1']}** ↔ **{c['col2']}**: `{c['correlation']:.3f}` ({direction})")
lines.append("")
# Categorical summaries
if value_counts:
lines += ["## Categorical Summaries", ""]
for col, counts in value_counts.items():
lines.append(f"### {col}")
for val, cnt in counts.items():
lines.append(f"- {val}: {cnt:,}")
lines.append("")
# Charts
if chart_paths:
lines += ["## Charts", ""]
for path in chart_paths:
name = Path(path).stem
lines.append(f"")
lines.append("")
report = "\n".join(lines)
output_path = Path(output_dir) / 'summary_report.md'
output_path.parent.mkdir(parents=True, exist_ok=True)
output_path.write_text(report, encoding='utf-8')
print(f"✓ Exported report → {output_path}")
return str(output_path)
Step 5: The CLI Entry Point
# analyze.py
import argparse
import sys
from pathlib import Path
from loader import load_data, validate_and_profile
from stats import compute_numeric_stats, compute_correlations, top_correlations, value_counts_summary, detect_outliers
from visualizer import save_charts
from reporter import generate_report
def print_quick_stats(df, stats):
"""Print a compact summary to the terminal."""
numeric = df.select_dtypes(include='number')
if numeric.empty:
return
print("\n=== QUICK STATS ===")
for col in numeric.columns[:5]:
s = stats[col]
print(f"{col:15s} mean={s['mean']:<10.2f} median={s['median']:<10.2f} max={s['max']:.2f}")
def main():
parser = argparse.ArgumentParser(
description='Analyze any CSV/Excel/JSON dataset from the command line.',
formatter_class=argparse.RawDescriptionHelpFormatter,
epilog="""
Examples:
python analyze.py data.csv
python analyze.py data.csv --report --chart --output results/
python analyze.py data.xlsx --col revenue units --top-corr 10
"""
)
parser.add_argument('file', help='Path to the data file (CSV, Excel, JSON, Parquet)')
parser.add_argument('--output', '-o', default='output/', help='Output directory (default: output/)')
parser.add_argument('--report', '-r', action='store_true', help='Generate Markdown report')
parser.add_argument('--chart', '-c', action='store_true', help='Generate charts')
parser.add_argument('--col', nargs='+', help='Analyze only these columns')
parser.add_argument('--top-corr', type=int, default=5, help='Top N correlations to show')
parser.add_argument('--no-outliers', action='store_true', help='Skip outlier detection')
args = parser.parse_args()
# Load data
df = load_data(args.file)
# Filter columns if specified
if args.col:
missing_cols = [c for c in args.col if c not in df.columns]
if missing_cols:
print(f"Warning: columns not found: {missing_cols}")
df = df[[c for c in args.col if c in df.columns]]
# Profile
profile = validate_and_profile(df)
# Stats
stats = compute_numeric_stats(df)
print_quick_stats(df, stats)
# Outliers
if not args.no_outliers and profile['numeric_cols']:
print("\n=== OUTLIERS ===")
for col in profile['numeric_cols']:
o = detect_outliers(df[col].dropna())
if o['count'] > 0:
print(f"{col:15s}: {o['count']} outliers ({o['pct']}%)")
# Correlations
corr_matrix = compute_correlations(df)
top_corrs = top_correlations(corr_matrix, n=args.top_corr)
if top_corrs:
print("\n=== TOP CORRELATIONS ===")
for c in top_corrs[:3]:
print(f" {c['col1']} ↔ {c['col2']}: {c['correlation']:.3f}")
# Charts
chart_paths = []
if args.chart:
chart_paths = save_charts(df, args.output)
# Report
if args.report:
val_counts = value_counts_summary(df)
generate_report(
filepath=args.file,
profile=profile,
stats=stats,
correlations=top_corrs,
value_counts=val_counts,
chart_paths=chart_paths,
output_dir=args.output
)
print(f"\n✅ Analysis complete.")
if __name__ == '__main__':
main()
Sample Data Generator
# generate_sample_data.py — run this to create test data
import pandas as pd
import numpy as np
np.random.seed(42)
n = 500
df = pd.DataFrame({
'month': pd.date_range('2023-01-01', periods=n, freq='D').strftime('%Y-%m'),
'region': np.random.choice(['North', 'South', 'East', 'West'], n),
'product': np.random.choice(['Widget A', 'Widget B', 'Widget C'], n),
'revenue': np.random.normal(5000, 1500, n).clip(500).round(2),
'units': np.random.randint(10, 200, n),
'discount_pct': np.random.uniform(0, 0.3, n).round(3),
'customer_id': np.random.randint(1000, 9999, n),
'returned': np.random.choice([0, 1], n, p=[0.92, 0.08])
})
# Add some missing values
df.loc[np.random.choice(n, 20), 'revenue'] = np.nan
df.to_csv('sales_data.csv', index=False)
print(f"Generated sales_data.csv with {n} rows")
Running the Tool
# Install dependencies
pip install pandas matplotlib seaborn scipy tabulate
# Generate sample data
python generate_sample_data.py
# Basic analysis
python analyze.py sales_data.csv
# Full analysis with charts and report
python analyze.py sales_data.csv --report --chart --output results/
# Analyze specific columns only
python analyze.py sales_data.csv --col revenue units discount_pct --chart
# View the report
cat results/summary_report.md
Extension Ideas
Once your tool works, try adding:
# Add to analyze.py — time series detection
def detect_time_column(df):
for col in df.columns:
try:
pd.to_datetime(df[col])
return col
except:
continue
return None
# Add trend line for time series
if time_col := detect_time_column(df):
df[time_col] = pd.to_datetime(df[time_col])
df_sorted = df.sort_values(time_col)
# Plot monthly trend automatically...
This project demonstrates the full Python development cycle: modular architecture, CLI design with argparse, data processing with pandas, visualization with matplotlib/seaborn, and file output. These are exactly the patterns used in production data engineering tools.
Next project: REST API with FastAPI — building a production-quality backend service.
📱
Get Notes Free →Get this course's notes on Telegram!
Free cheat sheets, summaries & practice exercises