18 min · Lesson 15 of 34
Advanced Python
File I/O & Working with CSV, JSON
File I/O & Working with CSV, JSON: Reading and Writing Data
Almost every real Python program reads or writes files. Configuration files, data exports, logs, CSVs from spreadsheets, JSON from APIs — file I/O is everywhere. This lesson covers everything you need for production file handling.
Basic File Operations
# Writing a file ("w" creates the file or overwrites an existing one)
with open("output.txt", "w", encoding="utf-8") as f:
    f.write("Hello, World!\n")
    f.write("Second line\n")

# Reading a file — whole content
with open("output.txt", "r", encoding="utf-8") as f:
    content = f.read()
print(content)

# Reading line by line (memory efficient for large files)
with open("output.txt", "r", encoding="utf-8") as f:
    for line in f:
        # strip() removes the trailing \n (and any other surrounding whitespace)
        print(line.strip())

# Reading all lines as a list
# Always pass encoding explicitly; otherwise the platform default is used,
# which may not match the UTF-8 the file was written with.
with open("output.txt", "r", encoding="utf-8") as f:
    lines = f.readlines()

# Appending to existing file
with open("output.txt", "a", encoding="utf-8") as f:
    f.write("Appended line\n")
File modes:
| Mode | Meaning |
|---|---|
| `'r'` | Read (default). Error if file doesn't exist. |
| `'w'` | Write. Creates file or overwrites existing. |
| `'a'` | Append. Creates or adds to end. |
| `'x'` | Exclusive create. Error if file exists. |
| `'rb'`, `'wb'` | Binary mode (for images, PDFs, etc.) |
Working with Paths: pathlib
from pathlib import Path
# Create path objects
current_dir = Path(".")
home = Path.home()
data_dir = Path("data")

# Build paths
config_file = data_dir / "config.json"  # Works on Windows and Unix
print(config_file)  # data/config.json (or data\config.json on Windows)

# Check existence
print(config_file.exists())
print(config_file.is_file())
print(data_dir.is_dir())

# Create directories
output_dir = Path("output") / "results"
output_dir.mkdir(parents=True, exist_ok=True)  # Like mkdir -p

# File info
p = Path("output.txt")
print(p.name)    # 'output.txt'
print(p.stem)    # 'output'
print(p.suffix)  # '.txt'
print(p.parent)  # '.'

# Glob patterns
for py_file in Path(".").glob("**/*.py"):  # All .py files recursively
    print(py_file)

# Read and write directly
# write_text() does not create missing parent directories, so make sure
# data/ exists first; otherwise this raises FileNotFoundError.
data_dir.mkdir(parents=True, exist_ok=True)
config_file.write_text('{"debug": true}', encoding="utf-8")
content = config_file.read_text(encoding="utf-8")

# Find all CSV files in a directory
csv_files = list(Path("data").glob("*.csv"))
print(f"Found {len(csv_files)} CSV files")
CSV Files: The Universal Data Format
import csv
# Writing CSV
employees = [
    {"name": "Alice", "department": "Engineering", "salary": 95000},
    {"name": "Bob", "department": "Marketing", "salary": 72000},
    {"name": "Carol", "department": "Engineering", "salary": 105000},
]

# newline="" hands line-ending control to the csv module; without it,
# extra blank rows can appear on Windows.
with open("employees.csv", "w", newline="", encoding="utf-8") as f:
    fieldnames = ["name", "department", "salary"]
    writer = csv.DictWriter(f, fieldnames=fieldnames)
    writer.writeheader()
    writer.writerows(employees)

# Reading CSV
# newline="" matters for reading too: it keeps line breaks embedded in
# quoted fields intact.
with open("employees.csv", "r", newline="", encoding="utf-8") as f:
    reader = csv.DictReader(f)
    for row in reader:
        # Every value read from a CSV is a string, hence the int() conversion
        print(f"{row['name']}: ${int(row['salary']):,}")

# Reading with regular csv.reader (gives lists, not dicts)
with open("employees.csv", "r", newline="", encoding="utf-8") as f:
    reader = csv.reader(f)
    header = next(reader)  # Skip header
    for row in reader:
        name, dept, salary = row
        print(f"{name} in {dept}")
CSV Processing: Real-World Patterns
from pathlib import Path
import csv
from collections import defaultdict
from statistics import mean
def analyze_sales_data(csv_path):
    """Analyze a sales CSV file.

    The file must have a header row containing the columns ``product``,
    ``region``, ``revenue`` (parsed as float) and ``units`` (parsed as int).

    Args:
        csv_path: Path of the CSV file to read (UTF-8).

    Returns:
        A dict with three keys:
        ``total_revenue`` – sum of the revenue column;
        ``by_region`` – dict mapping region to revenue, highest revenue first;
        ``top_products`` – list of up to five ``(product, units)`` tuples,
        highest unit count first.

    Raises:
        KeyError: If a required column is missing.
        ValueError: If ``revenue`` or ``units`` cannot be parsed as a number.
    """
    records = []
    # newline='' lets the csv module handle line endings itself, so line
    # breaks embedded in quoted fields are read back unchanged.
    with open(csv_path, 'r', encoding='utf-8', newline='') as f:
        reader = csv.DictReader(f)
        for row in reader:
            records.append({
                'product': row['product'],
                'region': row['region'],
                'revenue': float(row['revenue']),
                'units': int(row['units'])
            })

    # Total revenue
    total_revenue = sum(r['revenue'] for r in records)

    # Revenue by region and units sold by product, gathered in one pass
    by_region = defaultdict(float)
    by_product = defaultdict(int)
    for r in records:
        by_region[r['region']] += r['revenue']
        by_product[r['product']] += r['units']

    return {
        'total_revenue': total_revenue,
        'by_region': dict(sorted(by_region.items(), key=lambda x: -x[1])),
        'top_products': sorted(by_product.items(), key=lambda x: -x[1])[:5]
    }
JSON Files: Config and API Data
import json
# Writing JSON
config = {
    "database": {
        "host": "localhost",
        "port": 5432,
        "name": "myapp"
    },
    "debug": False,
    "max_connections": 100,
    "features": ["search", "analytics", "notifications"]
}

# json.dump() serializes straight into an open file object
with open("config.json", "w", encoding="utf-8") as f:
    json.dump(config, f, indent=2)  # indent=2 for pretty printing

# Reading JSON
# json.load() parses from an open file object
with open("config.json", "r", encoding="utf-8") as f:
    loaded_config = json.load(f)

print(loaded_config["database"]["host"])  # localhost
print(loaded_config["features"])  # ['search', 'analytics', 'notifications']

# JSON from/to strings (note the trailing "s": dumps / loads)
json_string = json.dumps(config, indent=2)
parsed = json.loads(json_string)
# Handling non-serializable types
from datetime import datetime, date
import uuid
data = {
    "id": uuid.uuid4(),  # UUID is not JSON-serializable by default
    "created_at": datetime.now(),  # datetime either
    "amount": 99.99
}


# Custom encoder
class CustomEncoder(json.JSONEncoder):
    """JSON encoder that also handles dates, datetimes and UUIDs.

    ``datetime``/``date`` values are written as ISO 8601 strings and
    ``uuid.UUID`` values as their canonical string form.
    """

    def default(self, obj):
        """Return a JSON-serializable stand-in for ``obj``.

        The json module calls this only for objects it cannot serialize
        itself. Unsupported types are passed to the base class, which
        raises ``TypeError``.
        """
        if isinstance(obj, (datetime, date)):
            return obj.isoformat()
        if isinstance(obj, uuid.UUID):
            return str(obj)
        return super().default(obj)


json_str = json.dumps(data, cls=CustomEncoder, indent=2)
print(json_str)
JSONL: JSON Lines Format
JSONL (one JSON object per line) is great for streaming large datasets:
# Writing JSONL
logs = [
    {"time": "2024-01-01T10:00:00", "level": "INFO", "message": "Server started"},
    {"time": "2024-01-01T10:01:00", "level": "ERROR", "message": "Connection failed"},
    {"time": "2024-01-01T10:02:00", "level": "INFO", "message": "Recovered"},
]

# One compact JSON document per line; explicit UTF-8 keeps the file portable
with open("logs.jsonl", "w", encoding="utf-8") as f:
    for log in logs:
        f.write(json.dumps(log) + "\n")


# Reading JSONL — memory efficient
def read_jsonl(filepath):
    """Yield one parsed JSON object per non-blank line of ``filepath``.

    This is a generator: the file is read lazily, one line at a time, so
    only a single record is held in memory. Blank lines are skipped.

    Raises:
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    with open(filepath, 'r', encoding='utf-8') as f:
        for line in f:
            line = line.strip()
            if line:
                yield json.loads(line)


error_logs = [log for log in read_jsonl("logs.jsonl") if log["level"] == "ERROR"]
Error Handling for Files
from pathlib import Path
import json
def safe_read_config(config_path, defaults=None):
    """Safely read a config file with fallback defaults.

    Args:
        config_path: Path of the JSON config file (UTF-8).
        defaults: Dict of fallback settings; an empty dict when omitted.

    Returns:
        ``defaults`` merged with the file's settings (file values win) when
        the file holds a JSON object. ``defaults`` itself is returned when
        the file is missing, unreadable due to permissions, not valid JSON,
        or its top-level value is not an object.
    """
    if defaults is None:
        defaults = {}

    path = Path(config_path)

    # Try to open the file and handle failure, instead of checking
    # exists() first: the file could disappear between check and open.
    try:
        with open(path, 'r', encoding='utf-8') as f:
            config = json.load(f)
    except FileNotFoundError:
        print(f"Config file not found: {config_path}. Using defaults.")
        return defaults
    except json.JSONDecodeError as e:
        print(f"Invalid JSON in {config_path}: {e}")
        return defaults
    except PermissionError:
        print(f"Permission denied reading {config_path}")
        return defaults

    # Valid JSON may still be a list, string or number, which cannot be
    # merged with the defaults dict.
    if not isinstance(config, dict):
        print(f"Invalid JSON in {config_path}: expected an object at the top level")
        return defaults

    return {**defaults, **config}  # Merge: file config overrides defaults


config = safe_read_config("config.json", defaults={"debug": False, "port": 8000})
Large Files: Memory-Efficient Processing
def process_large_csv(input_path, output_path, transform_func):
    """Process a large CSV without loading it all into memory.

    Rows are streamed from ``input_path``, passed through ``transform_func``
    and written to ``output_path`` one at a time.

    Args:
        input_path: Source CSV file (UTF-8, with a header row).
        output_path: Destination CSV file; overwritten if it exists.
        transform_func: Called with each row dict. Return a dict to write
            it, or a falsy value (e.g. ``None``) to drop the row.

    The output header is the input header plus any new keys found in the
    first row that is written, so a transform may add columns. Keys that
    first appear in a later row still raise ``ValueError`` from
    ``csv.DictWriter``. If no row is written, only the input header is
    emitted; an entirely empty input produces an empty output.
    """
    with open(input_path, 'r', encoding='utf-8', newline='') as infile, \
            open(output_path, 'w', encoding='utf-8', newline='') as outfile:
        reader = csv.DictReader(infile)
        # reader.fieldnames is None when the input file is completely empty
        fieldnames = list(reader.fieldnames or [])
        writer = None  # created lazily, once the output columns are known

        # Count from 1 so the progress message reports rows actually processed
        for i, row in enumerate(reader, start=1):
            transformed = transform_func(row)
            if transformed:  # Skip if transform returns None/False
                if writer is None:
                    # Extend the header with columns the transform added.
                    # The None key (DictReader's bucket for surplus cells)
                    # is deliberately not turned into a column.
                    fieldnames += [
                        key for key in transformed
                        if key is not None and key not in fieldnames
                    ]
                    writer = csv.DictWriter(outfile, fieldnames=fieldnames)
                    writer.writeheader()
                writer.writerow(transformed)
            if i % 10000 == 0:
                print(f"Processed {i:,} rows...")

        # Nothing was kept: still emit the header so the output is valid CSV
        if writer is None and fieldnames:
            csv.DictWriter(outfile, fieldnames=fieldnames).writeheader()

    print(f"Done: {input_path} → {output_path}")
# Usage
def filter_and_transform(row):
    """Keep only high-value orders, add computed column.

    Args:
        row: Dict for one CSV row; ``revenue`` is read as a number.

    Returns:
        ``None`` for orders with revenue below 1000 (including rows whose
        revenue is missing or blank). Otherwise a copy of ``row`` with an
        extra ``revenue_category`` key: ``'high'`` above 10000, else
        ``'medium'``.

    Raises:
        ValueError: If ``revenue`` is present but not numeric.
    """
    raw = row.get('revenue')
    # CSV cells can be empty strings, and short rows are padded with None;
    # both are treated as "no revenue" rather than crashing float().
    if raw is None or (isinstance(raw, str) and not raw.strip()):
        revenue = 0.0
    else:
        revenue = float(raw)

    if revenue < 1000:
        return None  # Skip small orders

    return {**row, 'revenue_category': 'high' if revenue > 10000 else 'medium'}
Next lesson: Error Handling & Custom Exceptions — writing resilient Python code.
📱
Get Notes Free → Get this course's notes on Telegram!
Free cheat sheets, summaries & practice exercises