Follow AiTechWorlds on LinkedIn for professional AI content! Follow Now →
18 min · Lesson 15 of 34
Advanced Python

File I/O & Working with CSV, JSON

File I/O & Working with CSV, JSON: Reading and Writing Data

Almost every real Python program reads or writes files. Configuration files, data exports, logs, CSVs from spreadsheets, JSON from APIs — file I/O is everywhere. This lesson covers everything you need for production file handling.

Basic File Operations

# Writing a file ("w" creates the file or overwrites an existing one)
with open("output.txt", "w", encoding="utf-8") as f:
    f.write("Hello, World!\n")
    f.write("Second line\n")

# Reading a file — whole content
with open("output.txt", "r", encoding="utf-8") as f:
    content = f.read()
print(content)

# Reading line by line (memory efficient for large files)
with open("output.txt", "r", encoding="utf-8") as f:
    for line in f:
        # strip() trims surrounding whitespace, including the trailing \n
        print(line.strip())

# Reading all lines as a list (each item keeps its trailing \n).
# Pass encoding explicitly: the default depends on the platform and locale.
with open("output.txt", "r", encoding="utf-8") as f:
    lines = f.readlines()

# Appending to existing file
with open("output.txt", "a", encoding="utf-8") as f:
    f.write("Appended line\n")

File modes:

| Mode | Meaning |
|------|---------|
| 'r' | Read (default). Error if file doesn't exist. |
| 'w' | Write. Creates file or overwrites existing. |
| 'a' | Append. Creates or adds to end. |
| 'x' | Exclusive create. Error if file exists. |
| 'rb', 'wb' | Binary mode (for images, PDFs, etc.) |

Working with Paths: pathlib

from pathlib import Path

# Create path objects
current_dir = Path(".")
home = Path.home()
data_dir = Path("data")

# Build paths
config_file = data_dir / "config.json"  # Works on Windows and Unix
print(config_file)  # data/config.json  (or data\config.json on Windows)

# Check existence
print(config_file.exists())
print(config_file.is_file())
print(data_dir.is_dir())

# Create directories
output_dir = Path("output") / "results"
output_dir.mkdir(parents=True, exist_ok=True)  # Like mkdir -p

# File info
p = Path("output.txt")
print(p.name)          # 'output.txt'
print(p.stem)          # 'output'
print(p.suffix)        # '.txt'
print(p.parent)        # '.'

# Glob patterns
for py_file in Path(".").glob("**/*.py"):  # All .py files recursively
    print(py_file)

# Read and write directly
# write_text() does not create missing parent directories, so make sure
# data/ exists first; otherwise this raises FileNotFoundError.
config_file.parent.mkdir(parents=True, exist_ok=True)
config_file.write_text('{"debug": true}', encoding="utf-8")
content = config_file.read_text(encoding="utf-8")

# Find all CSV files in a directory (non-recursive)
csv_files = list(data_dir.glob("*.csv"))
print(f"Found {len(csv_files)} CSV files")

CSV Files: The Universal Data Format

import csv

# Writing CSV
employees = [
    {"name": "Alice", "department": "Engineering", "salary": 95000},
    {"name": "Bob", "department": "Marketing", "salary": 72000},
    {"name": "Carol", "department": "Engineering", "salary": 105000},
]

# newline="" hands line-ending control to the csv module (avoids blank
# rows on Windows and keeps newlines inside quoted fields intact).
with open("employees.csv", "w", newline="", encoding="utf-8") as f:
    fieldnames = ["name", "department", "salary"]
    writer = csv.DictWriter(f, fieldnames=fieldnames)

    writer.writeheader()
    writer.writerows(employees)

# Reading CSV — every value comes back as a string, so convert explicitly
with open("employees.csv", "r", newline="", encoding="utf-8") as f:
    reader = csv.DictReader(f)
    for row in reader:
        print(f"{row['name']}: ${int(row['salary']):,}")

# Reading with regular csv.reader (gives lists, not dicts)
# Use the same newline/encoding settings as the writer above.
with open("employees.csv", "r", newline="", encoding="utf-8") as f:
    reader = csv.reader(f)
    header = next(reader)  # Skip header
    for row in reader:
        name, dept, salary = row
        print(f"{name} in {dept}")

CSV Processing: Real-World Patterns

from pathlib import Path
import csv
from collections import defaultdict
from statistics import mean

def analyze_sales_data(csv_path):
    """Analyze a sales CSV file.

    The file must have a header row with the columns ``product``,
    ``region``, ``revenue`` and ``units``.

    Args:
        csv_path: Path of the UTF-8 encoded CSV file to read.

    Returns:
        A dict with:
          - ``total_revenue``: sum of the revenue column.
          - ``by_region``: region -> revenue, ordered highest first.
          - ``top_products``: up to five ``(product, units)`` pairs,
            ordered by units sold, highest first.

    Raises:
        KeyError: If a required column is missing.
        ValueError: If a revenue/units cell is not numeric.
    """
    # newline='' lets the csv module handle line endings itself, so
    # newlines embedded in quoted fields are parsed correctly.
    with open(csv_path, 'r', encoding='utf-8', newline='') as f:
        records = [
            {
                'product': row['product'],
                'region': row['region'],
                'revenue': float(row['revenue']),
                'units': int(row['units']),
            }
            for row in csv.DictReader(f)
        ]

    # Total revenue
    total_revenue = sum(r['revenue'] for r in records)

    # Revenue by region and units by product, gathered in a single pass
    by_region = defaultdict(float)
    by_product = defaultdict(int)
    for r in records:
        by_region[r['region']] += r['revenue']
        by_product[r['product']] += r['units']

    # reverse=True keeps the sort stable, so ties stay in first-seen order
    ranked_regions = sorted(by_region.items(), key=lambda kv: kv[1], reverse=True)
    ranked_products = sorted(by_product.items(), key=lambda kv: kv[1], reverse=True)

    return {
        'total_revenue': total_revenue,
        'by_region': dict(ranked_regions),
        'top_products': ranked_products[:5],
    }

JSON Files: Config and API Data

import json

# Writing JSON: start from an ordinary (nested) Python dict
config = {
    "database": {"host": "localhost", "port": 5432, "name": "myapp"},
    "debug": False,
    "max_connections": 100,
    "features": ["search", "analytics", "notifications"],
}

# indent=2 pretty-prints the output so the file is easy to read and diff
with open("config.json", mode="w", encoding="utf-8") as config_out:
    json.dump(config, config_out, indent=2)

# Reading JSON: json.load turns the file back into Python objects
with open("config.json", mode="r", encoding="utf-8") as config_in:
    loaded_config = json.load(config_in)

print(loaded_config["database"]["host"])  # localhost
print(loaded_config["features"])          # ['search', 'analytics', 'notifications']

# JSON from/to strings: dumps/loads are the in-memory counterparts of dump/load
json_string = json.dumps(config, indent=2)
parsed = json.loads(json_string)

# Handling non-serializable types
from datetime import datetime, date
import uuid

# Sample record: the stock json module cannot encode the UUID or the
# datetime value below, only the plain float.
data = dict(
    id=uuid.uuid4(),
    created_at=datetime.now(),
    amount=99.99,
)

# Custom encoder
class CustomEncoder(json.JSONEncoder):
    """JSON encoder that also accepts dates, datetimes and UUIDs."""

    def default(self, obj):
        """Return a JSON-serializable stand-in for ``obj``.

        The json module calls this only for values it cannot encode itself.
        UUIDs become their string form, dates and datetimes become ISO 8601
        strings, and anything else is passed to the base class, which raises
        TypeError.
        """
        if isinstance(obj, uuid.UUID):
            return str(obj)
        # datetime is a subclass of date, so this one check covers both
        if isinstance(obj, date):
            return obj.isoformat()
        return super().default(obj)

# cls= tells json.dumps to build the output with the custom encoder
json_str = json.dumps(
    data,
    indent=2,
    cls=CustomEncoder,
)
print(json_str)

JSONL: JSON Lines Format

JSONL (one JSON object per line) is great for streaming large datasets:

# Writing JSONL
logs = [
    {"time": "2024-01-01T10:00:00", "level": "INFO", "message": "Server started"},
    {"time": "2024-01-01T10:01:00", "level": "ERROR", "message": "Connection failed"},
    {"time": "2024-01-01T10:02:00", "level": "INFO", "message": "Recovered"},
]

# One compact JSON document per line. JSON text is UTF-8, so set the
# encoding explicitly instead of relying on the platform default.
with open("logs.jsonl", "w", encoding="utf-8") as f:
    for log in logs:
        f.write(json.dumps(log) + "\n")

# Reading JSONL — memory efficient
def read_jsonl(filepath):
    """Yield one parsed JSON value per non-blank line of a JSON Lines file.

    This is a generator: the file is read lazily, one line at a time, and
    is closed once the generator is exhausted or closed.

    Args:
        filepath: Path of the UTF-8 encoded ``.jsonl`` file.

    Yields:
        The decoded Python object for each non-blank line.

    Raises:
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    # JSON text is UTF-8; do not depend on the platform's default encoding.
    with open(filepath, 'r', encoding='utf-8') as f:
        for line in f:
            line = line.strip()
            if line:  # tolerate blank lines, e.g. a trailing newline
                yield json.loads(line)

# Keep only the ERROR entries while streaming the file line by line
error_logs = [
    entry
    for entry in read_jsonl("logs.jsonl")
    if entry["level"] == "ERROR"
]

Error Handling for Files

from pathlib import Path
import json

def safe_read_config(config_path, defaults=None):
    """Safely read a JSON config file with fallback defaults.

    Args:
        config_path: Path of the JSON file. Its top-level value must be an
            object (a dict once decoded).
        defaults: Optional dict of fallback settings. It is never modified
            and never returned directly; the result is always a new dict.

    Returns:
        ``defaults`` overlaid with the values from the file, or a copy of
        ``defaults`` when the file is missing, unreadable, or does not hold
        a JSON object. A message is printed in each fallback case.
    """
    fallback = {} if defaults is None else dict(defaults)

    # Try the read and handle failure (EAFP). Checking exists() first leaves
    # a window in which the file can vanish before open() runs.
    try:
        with open(config_path, 'r', encoding='utf-8') as f:
            loaded = json.load(f)

    except FileNotFoundError:
        print(f"Config file not found: {config_path}. Using defaults.")
        return fallback

    except json.JSONDecodeError as e:
        print(f"Invalid JSON in {config_path}: {e}")
        return fallback

    except PermissionError:
        print(f"Permission denied reading {config_path}")
        return fallback

    except (OSError, UnicodeDecodeError) as e:
        # e.g. the path is a directory, or the file is not valid UTF-8
        print(f"Could not read {config_path}: {e}")
        return fallback

    if not isinstance(loaded, dict):
        # Valid JSON, but a list/number/string cannot be merged into settings
        print(f"Config in {config_path} is not a JSON object. Using defaults.")
        return fallback

    return {**fallback, **loaded}  # Merge: file config overrides defaults

config = safe_read_config("config.json", defaults={"debug": False, "port": 8000})

Large Files: Memory-Efficient Processing

def process_large_csv(input_path, output_path, transform_func, fieldnames=None):
    """Process a large CSV without loading it all into memory.

    Rows are read, transformed and written one at a time.

    Args:
        input_path: CSV file to read; its first row is used as the header.
        output_path: CSV file to create or overwrite.
        transform_func: Called with each input row (a dict). Return a dict
            to write, or a falsy value (None, False, {}) to drop the row.
        fieldnames: Optional output column names. When omitted, the columns
            are the input columns followed by any new keys found in the
            first row that is kept, so a transform may add columns.

    Raises:
        ValueError: If a written row contains a key that is not one of the
            output columns.
    """
    with open(input_path, 'r', encoding='utf-8', newline='') as infile, \
         open(output_path, 'w', encoding='utf-8', newline='') as outfile:

        reader = csv.DictReader(infile)
        # reader.fieldnames is None when the input file is completely empty
        input_columns = list(reader.fieldnames or [])
        columns = list(fieldnames) if fieldnames is not None else None
        writer = None  # created lazily, once the output columns are known

        for count, row in enumerate(reader, start=1):
            transformed = transform_func(row)
            if transformed:  # Skip if transform returns None/False
                if writer is None:
                    if columns is None:
                        added = [
                            key for key in transformed
                            if key is not None and key not in input_columns
                        ]
                        columns = input_columns + added
                    writer = csv.DictWriter(outfile, fieldnames=columns)
                    writer.writeheader()
                writer.writerow(transformed)

            # count starts at 1, so this reports rows actually processed
            if count % 10000 == 0:
                print(f"Processed {count:,} rows...")

        if writer is None:
            # No row was kept: still emit the header when one is known
            header = columns if columns is not None else input_columns
            if header:
                csv.DictWriter(outfile, fieldnames=header).writeheader()

    print(f"Done: {input_path} → {output_path}")

# Usage
def filter_and_transform(row):
    """Keep only high-value orders, add computed column.

    Args:
        row: Mapping for one CSV row; ``row['revenue']`` holds the order
            revenue as a string or a number.

    Returns:
        None for orders under 1000, and for rows whose revenue is missing
        or blank. Otherwise a copy of ``row`` with ``revenue_category`` set
        to ``'high'`` (revenue above 10000) or ``'medium'``.

    Raises:
        ValueError: If revenue is present but not numeric.
    """
    raw = row.get('revenue')
    # Empty cells are common in CSV files; treat them like a missing value
    # instead of letting float('') raise.
    if raw is None or (isinstance(raw, str) and not raw.strip()):
        return None  # No revenue recorded: nothing to keep

    revenue = float(raw)
    if revenue < 1000:
        return None  # Skip small orders
    return {**row, 'revenue_category': 'high' if revenue > 10000 else 'medium'}

Next lesson: Error Handling & Custom Exceptions — writing resilient Python code.

📱

Get this course's notes on Telegram!

Free cheat sheets, summaries & practice exercises

Get Notes Free →
!