Project: CLI Data Analysis Tool

This project ties together everything you've learned — file I/O, pandas, data visualization, argparse, and clean Python architecture. You'll build a command-line tool that accepts a CSV file, computes statistics, generates charts, and exports a summary report.

What You'll Build

$ python analyze.py sales_data.csv --report --chart --output results/

✓ Loaded 1,247 rows × 8 columns
✓ Computed statistics for 5 numeric columns
✓ Generated 3 charts → results/charts/
✓ Exported report → results/summary_report.md

=== QUICK STATS ===
revenue:   mean=$4,821  median=$3,900  max=$28,400
units:     mean=47.3    median=38      max=312

Project Structure

data_analyzer/
├── analyze.py          # CLI entry point
├── loader.py           # Data loading & validation
├── stats.py            # Statistical computations
├── visualizer.py       # Chart generation
├── reporter.py         # Markdown report generation
├── requirements.txt
└── sample_data.csv

Step 1: Data Loader

# loader.py
import pandas as pd
import sys
from pathlib import Path

SUPPORTED_FORMATS = {'.csv', '.xlsx', '.json', '.parquet'}

def load_data(filepath: str) -> pd.DataFrame:
    """Load data from CSV, Excel, JSON, or Parquet."""
    path = Path(filepath)
    
    if not path.exists():
        print(f"Error: File not found: {filepath}")
        sys.exit(1)
    
    if path.suffix not in SUPPORTED_FORMATS:
        print(f"Error: Unsupported format '{path.suffix}'. Use: {SUPPORTED_FORMATS}")
        sys.exit(1)
    
    loaders = {
        '.csv': pd.read_csv,
        '.xlsx': pd.read_excel,
        '.json': pd.read_json,
        '.parquet': pd.read_parquet
    }
    
    try:
        df = loaders[path.suffix](filepath)
        print(f"✓ Loaded {len(df):,} rows × {len(df.columns)} columns")
        return df
    except Exception as e:
        print(f"Error loading file: {e}")
        sys.exit(1)

def validate_and_profile(df: pd.DataFrame) -> dict:
    """Return a profile dict summarizing the dataset."""
    numeric_cols = df.select_dtypes(include='number').columns.tolist()
    categorical_cols = df.select_dtypes(include='object').columns.tolist()
    datetime_cols = df.select_dtypes(include='datetime').columns.tolist()
    
    missing = df.isnull().sum()
    missing_pct = (missing / len(df) * 100).round(2)
    
    return {
        'rows': len(df),
        'columns': len(df.columns),
        'numeric_cols': numeric_cols,
        'categorical_cols': categorical_cols,
        'datetime_cols': datetime_cols,
        'missing': missing[missing > 0].to_dict(),
        'missing_pct': missing_pct[missing_pct > 0].to_dict(),
        'duplicates': df.duplicated().sum()
    }

Step 2: Statistical Analysis

# stats.py
import pandas as pd
import numpy as np
from scipy import stats as scipy_stats

def compute_numeric_stats(df: pd.DataFrame) -> pd.DataFrame:
    """Compute comprehensive stats for all numeric columns."""
    numeric = df.select_dtypes(include='number')
    
    if numeric.empty:
        return pd.DataFrame()
    
    summary = numeric.agg([
        'count', 'mean', 'median', 'std', 'min', 'max',
        lambda x: x.quantile(0.25),
        lambda x: x.quantile(0.75),
        'skew', 'kurt'
    ])
    summary.index = ['count', 'mean', 'median', 'std', 'min', 'max', 
                     'q25', 'q75', 'skewness', 'kurtosis']
    return summary.round(3)

def detect_outliers(series: pd.Series) -> dict:
    """Detect outliers using IQR method."""
    Q1 = series.quantile(0.25)
    Q3 = series.quantile(0.75)
    IQR = Q3 - Q1
    lower = Q1 - 1.5 * IQR
    upper = Q3 + 1.5 * IQR
    
    outliers = series[(series < lower) | (series > upper)]
    return {
        'count': len(outliers),
        'pct': round(len(outliers) / len(series) * 100, 2),
        'lower_bound': round(lower, 3),
        'upper_bound': round(upper, 3)
    }

def compute_correlations(df: pd.DataFrame) -> pd.DataFrame:
    """Pearson correlation matrix for numeric columns."""
    numeric = df.select_dtypes(include='number')
    return numeric.corr().round(3)

def top_correlations(corr_matrix: pd.DataFrame, n: int = 5) -> list:
    """Return top N strongest correlations (excluding self-correlations)."""
    pairs = []
    cols = corr_matrix.columns
    for i in range(len(cols)):
        for j in range(i + 1, len(cols)):
            pairs.append({
                'col1': cols[i],
                'col2': cols[j],
                'correlation': corr_matrix.iloc[i, j]
            })
    
    pairs.sort(key=lambda x: abs(x['correlation']), reverse=True)
    return pairs[:n]

def value_counts_summary(df: pd.DataFrame, top_n: int = 5) -> dict:
    """Value counts for categorical columns."""
    cat_cols = df.select_dtypes(include='object').columns
    result = {}
    for col in cat_cols:
        result[col] = df[col].value_counts().head(top_n).to_dict()
    return result

Step 3: Visualization

# visualizer.py
import matplotlib.pyplot as plt
import matplotlib.gridspec as gridspec
import seaborn as sns
import pandas as pd
from pathlib import Path

sns.set_theme(style='whitegrid', palette='Set2')

def save_charts(df: pd.DataFrame, output_dir: str) -> list:
    """Generate and save all charts. Returns list of saved paths."""
    output_path = Path(output_dir) / 'charts'
    output_path.mkdir(parents=True, exist_ok=True)
    saved = []
    
    numeric = df.select_dtypes(include='number')
    
    # Chart 1: Distribution plots for numeric columns
    if not numeric.empty:
        n_cols = min(len(numeric.columns), 4)
        fig, axes = plt.subplots(1, n_cols, figsize=(5 * n_cols, 4))
        if n_cols == 1:
            axes = [axes]
        
        for ax, col in zip(axes, numeric.columns[:n_cols]):
            sns.histplot(numeric[col].dropna(), kde=True, ax=ax)
            ax.set_title(f'Distribution: {col}', fontsize=10)
        
        plt.tight_layout()
        path = output_path / 'distributions.png'
        plt.savefig(path, dpi=150, bbox_inches='tight')
        plt.close()
        saved.append(str(path))
    
    # Chart 2: Correlation heatmap
    if len(numeric.columns) > 1:
        fig, ax = plt.subplots(figsize=(8, 6))
        corr = numeric.corr()
        sns.heatmap(corr, annot=True, fmt='.2f', cmap='RdBu_r',
                    center=0, ax=ax, square=True)
        ax.set_title('Correlation Matrix')
        plt.tight_layout()
        path = output_path / 'correlation.png'
        plt.savefig(path, dpi=150, bbox_inches='tight')
        plt.close()
        saved.append(str(path))
    
    # Chart 3: Box plots for outlier visualization
    if not numeric.empty:
        fig, ax = plt.subplots(figsize=(max(8, len(numeric.columns) * 1.5), 5))
        numeric.boxplot(ax=ax)
        ax.set_title('Outlier Overview (Box Plots)')
        ax.tick_params(axis='x', rotation=45)
        plt.tight_layout()
        path = output_path / 'boxplots.png'
        plt.savefig(path, dpi=150, bbox_inches='tight')
        plt.close()
        saved.append(str(path))
    
    print(f"✓ Generated {len(saved)} charts → {output_path}/")
    return saved

Step 4: Report Generator

# reporter.py
from datetime import datetime
from pathlib import Path
import pandas as pd

def generate_report(
    filepath: str,
    profile: dict,
    stats: pd.DataFrame,
    correlations: list,
    value_counts: dict,
    chart_paths: list,
    output_dir: str
) -> str:
    """Generate a Markdown summary report."""
    
    lines = [
        f"# Data Analysis Report",
        f"**File:** `{filepath}`",
        f"**Generated:** {datetime.now().strftime('%Y-%m-%d %H:%M')}",
        f"**Rows:** {profile['rows']:,}  |  **Columns:** {profile['columns']}",
        "",
        "---",
        "",
        "## Dataset Overview",
        "",
        f"| Type | Columns |",
        f"|------|---------|",
        f"| Numeric | {', '.join(profile['numeric_cols']) or 'None'} |",
        f"| Categorical | {', '.join(profile['categorical_cols']) or 'None'} |",
        "",
    ]
    
    # Missing data
    if profile['missing']:
        lines += ["## Missing Values", ""]
        for col, count in profile['missing'].items():
            pct = profile['missing_pct'][col]
            lines.append(f"- **{col}**: {count:,} missing ({pct}%)")
        lines.append("")
    
    if profile['duplicates'] > 0:
        lines += [f"> ⚠️ **{profile['duplicates']:,} duplicate rows detected**", ""]
    
    # Numeric stats
    if not stats.empty:
        lines += ["## Numeric Statistics", ""]
        lines.append(stats.to_markdown())
        lines.append("")
    
    # Top correlations
    if correlations:
        lines += ["## Strongest Correlations", ""]
        for c in correlations[:5]:
            direction = "+" if c['correlation'] > 0 else "-"
            lines.append(f"- **{c['col1']}** ↔ **{c['col2']}**: `{c['correlation']:.3f}` ({direction})")
        lines.append("")
    
    # Categorical summaries
    if value_counts:
        lines += ["## Categorical Summaries", ""]
        for col, counts in value_counts.items():
            lines.append(f"### {col}")
            for val, cnt in counts.items():
                lines.append(f"- {val}: {cnt:,}")
            lines.append("")
    
    # Charts
    if chart_paths:
        lines += ["## Charts", ""]
        for path in chart_paths:
            name = Path(path).stem
            lines.append(f"![{name}]({path})")
            lines.append("")
    
    report = "\n".join(lines)
    
    output_path = Path(output_dir) / 'summary_report.md'
    output_path.parent.mkdir(parents=True, exist_ok=True)
    output_path.write_text(report, encoding='utf-8')
    print(f"✓ Exported report → {output_path}")
    
    return str(output_path)

Step 5: The CLI Entry Point

# analyze.py
import argparse
import sys
from pathlib import Path

from loader import load_data, validate_and_profile
from stats import compute_numeric_stats, compute_correlations, top_correlations, value_counts_summary, detect_outliers
from visualizer import save_charts
from reporter import generate_report

def print_quick_stats(df, stats):
    """Print a compact summary to the terminal."""
    numeric = df.select_dtypes(include='number')
    if numeric.empty:
        return
    
    print("\n=== QUICK STATS ===")
    for col in numeric.columns[:5]:
        s = stats[col]
        print(f"{col:15s}  mean={s['mean']:<10.2f}  median={s['median']:<10.2f}  max={s['max']:.2f}")

def main():
    parser = argparse.ArgumentParser(
        description='Analyze any CSV/Excel/JSON dataset from the command line.',
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog="""
Examples:
  python analyze.py data.csv
  python analyze.py data.csv --report --chart --output results/
  python analyze.py data.xlsx --col revenue units --top-corr 10
        """
    )
    
    parser.add_argument('file', help='Path to the data file (CSV, Excel, JSON, Parquet)')
    parser.add_argument('--output', '-o', default='output/', help='Output directory (default: output/)')
    parser.add_argument('--report', '-r', action='store_true', help='Generate Markdown report')
    parser.add_argument('--chart', '-c', action='store_true', help='Generate charts')
    parser.add_argument('--col', nargs='+', help='Analyze only these columns')
    parser.add_argument('--top-corr', type=int, default=5, help='Top N correlations to show')
    parser.add_argument('--no-outliers', action='store_true', help='Skip outlier detection')
    
    args = parser.parse_args()
    
    # Load data
    df = load_data(args.file)
    
    # Filter columns if specified
    if args.col:
        missing_cols = [c for c in args.col if c not in df.columns]
        if missing_cols:
            print(f"Warning: columns not found: {missing_cols}")
        df = df[[c for c in args.col if c in df.columns]]
    
    # Profile
    profile = validate_and_profile(df)
    
    # Stats
    stats = compute_numeric_stats(df)
    print_quick_stats(df, stats)
    
    # Outliers
    if not args.no_outliers and profile['numeric_cols']:
        print("\n=== OUTLIERS ===")
        for col in profile['numeric_cols']:
            o = detect_outliers(df[col].dropna())
            if o['count'] > 0:
                print(f"{col:15s}: {o['count']} outliers ({o['pct']}%)")
    
    # Correlations
    corr_matrix = compute_correlations(df)
    top_corrs = top_correlations(corr_matrix, n=args.top_corr)
    
    if top_corrs:
        print("\n=== TOP CORRELATIONS ===")
        for c in top_corrs[:3]:
            print(f"  {c['col1']} ↔ {c['col2']}: {c['correlation']:.3f}")
    
    # Charts
    chart_paths = []
    if args.chart:
        chart_paths = save_charts(df, args.output)
    
    # Report
    if args.report:
        val_counts = value_counts_summary(df)
        generate_report(
            filepath=args.file,
            profile=profile,
            stats=stats,
            correlations=top_corrs,
            value_counts=val_counts,
            chart_paths=chart_paths,
            output_dir=args.output
        )
    
    print(f"\n✅ Analysis complete.")

if __name__ == '__main__':
    main()

Sample Data Generator

# generate_sample_data.py — run this to create test data
import pandas as pd
import numpy as np

np.random.seed(42)
n = 500

df = pd.DataFrame({
    'month': pd.date_range('2023-01-01', periods=n, freq='D').strftime('%Y-%m'),
    'region': np.random.choice(['North', 'South', 'East', 'West'], n),
    'product': np.random.choice(['Widget A', 'Widget B', 'Widget C'], n),
    'revenue': np.random.normal(5000, 1500, n).clip(500).round(2),
    'units': np.random.randint(10, 200, n),
    'discount_pct': np.random.uniform(0, 0.3, n).round(3),
    'customer_id': np.random.randint(1000, 9999, n),
    'returned': np.random.choice([0, 1], n, p=[0.92, 0.08])
})

# Add some missing values
df.loc[np.random.choice(n, 20), 'revenue'] = np.nan

df.to_csv('sales_data.csv', index=False)
print(f"Generated sales_data.csv with {n} rows")

Running the Tool

# Install dependencies
pip install pandas matplotlib seaborn scipy tabulate

# Generate sample data
python generate_sample_data.py

# Basic analysis
python analyze.py sales_data.csv

# Full analysis with charts and report
python analyze.py sales_data.csv --report --chart --output results/

# Analyze specific columns only
python analyze.py sales_data.csv --col revenue units discount_pct --chart

# View the report
cat results/summary_report.md

Extension Ideas

Once your tool works, try adding:

# Add to analyze.py — time series detection
def detect_time_column(df):
    for col in df.columns:
        try:
            pd.to_datetime(df[col])
            return col
        except:
            continue
    return None

# Add trend line for time series
if time_col := detect_time_column(df):
    df[time_col] = pd.to_datetime(df[time_col])
    df_sorted = df.sort_values(time_col)
    # Plot monthly trend automatically...

This project demonstrates the full Python development cycle: modular architecture, CLI design with argparse, data processing with pandas, visualization with matplotlib/seaborn, and file output. These are exactly the patterns used in production data engineering tools.

Next project: REST API with FastAPI — building a production-quality backend service.