26 minLesson 26 of 34
Python for Data Science
Pandas: DataFrames & Series
Pandas: DataFrames & Series — The Data Analysis Powerhouse
Pandas transforms raw data into answers. Whether you're cleaning messy CSVs, aggregating sales data, or preparing features for machine learning, Pandas is the tool. It's used by data scientists, analysts, and engineers everywhere.
Core Data Structures
import pandas as pd
import numpy as np
# Series: 1D labeled array
prices = pd.Series([10.5, 23.0, 15.75, 8.0],
index=['apple', 'banana', 'cherry', 'date'])
print(prices['banana']) # 23.0
print(prices[prices > 10]) # apple and banana
# DataFrame: 2D labeled table
df = pd.DataFrame({
'name': ['Alice', 'Bob', 'Carol', 'Dave'],
'age': [30, 25, 35, 28],
'salary': [95000, 72000, 105000, 88000],
'department': ['Engineering', 'Marketing', 'Engineering', 'Data']
})
print(df.head()) # First 5 rows
print(df.info()) # Column types and nulls
print(df.describe()) # Statistics
print(df.shape) # (4, 4)
Reading and Writing Data
# CSV
df = pd.read_csv('data.csv', encoding='utf-8')
df.to_csv('output.csv', index=False)
# Excel
df = pd.read_excel('report.xlsx', sheet_name='Sales')
df.to_excel('output.xlsx', index=False, sheet_name='Results')
# JSON
df = pd.read_json('data.json')
df.to_json('output.json', orient='records', indent=2)
# SQL
import sqlalchemy
engine = sqlalchemy.create_engine('sqlite:///app.db')
df = pd.read_sql("SELECT * FROM users WHERE active = 1", engine)
df.to_sql('processed_users', engine, if_exists='replace', index=False)
# From Python dict/list
df = pd.DataFrame([{'name': 'Alice', 'score': 88},
{'name': 'Bob', 'score': 92}])
# Parquet (fast columnar format for large datasets)
df.to_parquet('data.parquet', index=False)
df = pd.read_parquet('data.parquet')
Selection and Filtering
df = pd.DataFrame({
'name': ['Alice', 'Bob', 'Carol', 'Dave', 'Eve'],
'age': [30, 25, 35, 28, 32],
'salary': [95000, 72000, 105000, 88000, 91000],
'dept': ['Eng', 'Mkt', 'Eng', 'Data', 'Eng']
})
# Select columns
names = df['name'] # Series
subset = df[['name', 'salary']] # DataFrame
# Row selection
first_row = df.iloc[0] # By position
alice = df.loc[0] # By label/index
rows_2_to_4 = df.iloc[2:5]
# Filtering
engineers = df[df['dept'] == 'Eng']
high_earners = df[df['salary'] > 90000]
senior_eng = df[(df['dept'] == 'Eng') & (df['age'] > 28)]
selected_depts = df[df['dept'].isin(['Eng', 'Data'])]
# Query method (often cleaner)
result = df.query("dept == 'Eng' and salary > 90000")
# Where and mask
high_salary = df['salary'].where(df['dept'] == 'Eng', other=0)
Data Cleaning
# Handling missing values
df_messy = pd.DataFrame({
'name': ['Alice', None, 'Carol', 'Dave'],
'age': [30, 25, None, 28],
'city': ['NYC', 'LA', 'NYC', None]
})
print(df_messy.isnull().sum()) # Count NaN per column
# Drop rows/columns
df_clean = df_messy.dropna() # Drop any row with NaN
df_clean = df_messy.dropna(subset=['name']) # Only require name
df_clean = df_messy.dropna(thresh=2) # Keep rows with 2+ non-NaN
# Fill missing values
df_filled = df_messy.fillna({'name': 'Unknown', 'age': df_messy['age'].median(), 'city': 'Unknown'})
df_messy['age'].fillna(df_messy['age'].mean(), inplace=True)
# Forward/backward fill (for time series)
df_ts = df_messy.ffill() # Use previous value
df_ts = df_messy.bfill() # Use next value
# Duplicates
print(df.duplicated().sum()) # Count duplicate rows
df_unique = df.drop_duplicates()
df_unique = df.drop_duplicates(subset=['name', 'dept'])
# Data type conversion
df['age'] = df['age'].astype(int)
df['date'] = pd.to_datetime(df['date'], format='%Y-%m-%d')
df['salary'] = pd.to_numeric(df['salary'], errors='coerce') # NaN for invalid values
# String cleaning
df['name'] = df['name'].str.strip().str.title()
df['email'] = df['email'].str.lower()
df['phone'] = df['phone'].str.replace(r'\D', '', regex=True) # Remove non-digits
GroupBy: Aggregating Data
# Basic groupby
by_dept = df.groupby('dept')['salary'].mean()
print(by_dept)
# Multiple aggregations
summary = df.groupby('dept').agg({
'salary': ['mean', 'min', 'max', 'count'],
'age': 'mean'
})
# Named aggregations (clean syntax)
summary = df.groupby('dept').agg(
avg_salary=('salary', 'mean'),
max_salary=('salary', 'max'),
headcount=('name', 'count'),
avg_age=('age', 'mean')
).round(0)
print(summary)
# Apply custom function
def salary_band(group):
return 'High' if group.mean() > 90000 else 'Low'
band = df.groupby('dept')['salary'].apply(salary_band)
Merging and Joining
employees = pd.DataFrame({
'emp_id': [1, 2, 3, 4],
'name': ['Alice', 'Bob', 'Carol', 'Dave'],
'dept_id': [10, 20, 10, 30]
})
departments = pd.DataFrame({
'dept_id': [10, 20, 30],
'dept_name': ['Engineering', 'Marketing', 'Data']
})
# Join types
inner = pd.merge(employees, departments, on='dept_id') # Keep matching only
left = pd.merge(employees, departments, on='dept_id', how='left') # Keep all employees
outer = pd.merge(employees, departments, on='dept_id', how='outer')# Keep everything
# Concatenate DataFrames
q1 = pd.DataFrame({'month': ['Jan', 'Feb', 'Mar'], 'revenue': [100, 120, 110]})
q2 = pd.DataFrame({'month': ['Apr', 'May', 'Jun'], 'revenue': [130, 125, 140]})
full_year = pd.concat([q1, q2], ignore_index=True)
# Pivot tables
pivot = df.pivot_table(
values='salary',
index='dept',
columns='age', # Or another categorical column
aggfunc='mean',
fill_value=0
)
Time Series with Pandas
# Date range and resampling
dates = pd.date_range('2024-01-01', periods=365, freq='D')
ts = pd.Series(np.random.randn(365), index=dates)
# Resample to monthly
monthly = ts.resample('ME').mean()
print(monthly.head())
# Rolling statistics
ts_df = ts.to_frame(name='value')
ts_df['rolling_mean_7'] = ts.rolling(7).mean()
ts_df['rolling_std_30'] = ts.rolling(30).std()
# Shifting for lag features
ts_df['lag_1'] = ts.shift(1)
ts_df['lag_7'] = ts.shift(7)
Apply and Map
# Apply a function to each element
df['salary_k'] = df['salary'].apply(lambda x: f"${x/1000:.0f}K")
# Apply to rows
def classify_employee(row):
if row['dept'] == 'Eng' and row['salary'] > 90000:
return 'Senior Engineer'
return 'Other'
df['role'] = df.apply(classify_employee, axis=1)
# Map values
dept_map = {'Eng': 'Engineering', 'Mkt': 'Marketing', 'Data': 'Data Science'}
df['dept_full'] = df['dept'].map(dept_map)
Pandas is indispensable for any data work in Python. The key patterns here — groupby, merge, pivot, apply — handle 90% of real-world data analysis tasks.
Next lesson: Data Visualization with Matplotlib & Seaborn — turning data into insights.
📱
Get Notes Free →Get this course's notes on Telegram!
Free cheat sheets, summaries & practice exercises