Generators & Iterators

Generators & Iterators: Memory-Efficient Python

Generators are one of Python's most elegant features. Instead of computing all values at once and storing them in memory, generators compute values on demand — one at a time. This makes them essential for processing large files, streams, and infinite sequences.

The Problem Generators Solve

# Memory-hungry approach — loads ALL values at once
def get_all_numbers(n):
    """Return a fully built list of the doubled values 0, 2, ..., 2*(n-1).

    Every element is computed and stored before the function returns,
    which is exactly the cost the generator version avoids.
    """
    doubled = [2 * value for value in range(n)]
    return doubled

numbers = get_all_numbers(10_000_000)  # ~80MB for the list's slots alone (64-bit CPython), plus the int objects themselves
print(numbers[0])  # Just wanted the first element!

# Generator approach — computes on demand
def generate_numbers(n):
    """Lazily produce the doubled values 0, 2, ..., 2*(n-1), one per request."""
    for index in range(n):
        doubled = index * 2
        # 'yield' hands back a single value and pauses here until the next request
        yield doubled

gen = generate_numbers(10_000_000)  # Instant, uses <1KB of RAM
print(next(gen))   # 0 — compute first value
print(next(gen))   # 2 — compute second value
# Only processes what you actually use

Generator Functions: How `yield` Works

When a function body contains yield, Python treats that function as a generator function. Calling it returns a generator object without running any of the function body; the body only starts executing on the first next() call.

def countdown(n):
    """Yield n, n-1, ..., 1, printing a banner before and "Liftoff!" after.

    Nothing is printed until the first value is requested; "Liftoff!" is
    printed when the caller asks for a value after the last one.
    """
    print(f"Starting countdown from {n}")
    remaining = n
    while remaining > 0:
        yield remaining   # Execution pauses here until the next next() call
        remaining -= 1
    print("Liftoff!")

gen = countdown(3)        # Function body doesn't run yet
print(type(gen))          # <class 'generator'>

print(next(gen))          # Prints "Starting countdown from 3", then yields 3
print(next(gen))          # Resumes after the yield, yields 2
print(next(gen))          # Yields 1
# A fourth next(gen) would print "Liftoff!" and then raise StopIteration

# Using in a for loop (handles StopIteration automatically)
for n in countdown(3):
    print(n)
# Starting countdown from 3, 3, 2, 1, Liftoff!

Practical Generator Patterns

# 1. Processing large files line by line
def read_large_csv(filepath):
    """Read a CSV file one row at a time — no matter how large.

    The first row is used as the header. Each following row is yielded as a
    dict mapping header names to that row's values. Parsing is done by the
    standard ``csv`` module, so quoted fields containing commas or line
    breaks are handled correctly; field text is returned exactly as written.

    Args:
        filepath: Path to a UTF-8 encoded CSV file.

    Yields:
        dict: One ``{column_name: value}`` mapping per non-blank data row.
        If a row has more or fewer fields than the header, the extras are
        dropped (``zip`` stops at the shorter sequence).
    """
    import csv  # local import keeps this snippet self-contained

    # newline='' lets the csv module do its own line-ending handling
    with open(filepath, 'r', encoding='utf-8', newline='') as f:
        reader = csv.reader(f)
        header = next(reader, None)
        if header is None:  # completely empty file: nothing to yield
            return
        for values in reader:
            if not values:  # csv.reader returns [] for a blank line
                continue
            yield dict(zip(header, values))

# Process without loading file into memory
for row in read_large_csv('10gb_file.csv'):
    if row['status'] == 'error':
        print(row)

# 2. Infinite sequences
def fibonacci():
    """Yield the Fibonacci numbers 0, 1, 1, 2, 3, ... without ever stopping."""
    current, upcoming = 0, 1
    while True:
        yield current
        # Slide the window forward: the next pair is (upcoming, current + upcoming)
        current, upcoming = upcoming, current + upcoming

def take(gen, n):
    """Take first n values from a generator.

    Stops early, without error, if ``gen`` runs out before n values have
    been produced. Calling ``next(gen)`` directly inside a generator would
    instead surface exhaustion as ``RuntimeError`` (PEP 479).

    Args:
        gen: The iterator (or any iterable) to draw values from.
        n: Maximum number of values to yield; zero or negative yields nothing.

    Yields:
        Up to n values from ``gen``, in order.
    """
    # range(n) comes first so zip checks the counter before touching gen:
    # once n values are out, no extra item is consumed from the source.
    for _, value in zip(range(n), gen):
        yield value

fib = fibonacci()
print(list(take(fib, 10)))  # [0, 1, 1, 2, 3, 5, 8, 13, 21, 34]

# 3. Data pipeline
def parse_records(lines):
    """Turn each text line into a list of its comma-separated fields.

    Surrounding whitespace (including the trailing newline) is removed from
    the line before it is split.
    """
    for raw_line in lines:
        trimmed = raw_line.strip()
        fields = trimmed.split(',')
        yield fields

def filter_active(records):
    """Yield only the records whose third field is exactly 'active'.

    Records with fewer than three fields (for example the single empty
    field produced by a blank line) are skipped rather than raising
    ``IndexError`` and aborting the whole pipeline.

    Args:
        records: Iterable of indexable records (e.g. lists of strings).

    Yields:
        Each record, unchanged, whose field at index 2 equals 'active'.
    """
    for record in records:
        # Length check first so short records never reach record[2]
        if len(record) > 2 and record[2] == 'active':
            yield record

def format_output(records):
    """Convert each record into a dict holding its first two fields.

    Field 0 is stored under "id" and field 1 under "name".
    """
    for rec in records:
        identifier = rec[0]
        name = rec[1]
        yield {"id": identifier, "name": name}

# Chain generators — each operates one record at a time
with open('data.csv') as f:
    pipeline = format_output(filter_active(parse_records(f)))
    for record in pipeline:
        print(record)
# Memory usage: constant regardless of file size

Generator Expressions: Inline Generators

# List comprehension: creates list in memory
squares_list = [x**2 for x in range(1000000)]  # ~8MB for the list's slots alone (64-bit CPython), plus the int objects

# Generator expression: computes on demand
squares_gen = (x**2 for x in range(1000000))   # ~200 bytes

# Useful for one-pass operations
total = sum(x**2 for x in range(1000000))     # Never stores the list
max_val = max(abs(x) for x in [-3, 1, -7, 4])  # 7
found = any(x > 100 for x in range(200))      # Short-circuits at 101

# One generator expression with two for clauses (read left to right, like nested loops)
flat = list(x for row in [[1,2],[3,4]] for x in row)
print(flat)  # [1, 2, 3, 4]

The Iterator Protocol

A generator is an iterator. Iterators are objects that implement __iter__ and __next__. You can build custom iterators with classes.

class Range:
    """Custom range iterator — like built-in range but our own.

    Counts from ``start`` towards ``stop`` (exclusive) in increments of
    ``step``. A negative step counts downwards, and a zero step is rejected,
    matching the built-in ``range``.

    The object is a one-shot iterator: once exhausted it stays exhausted.

    Args:
        start: First value produced.
        stop: Exclusive bound at which iteration ends.
        step: Amount added after each value; must not be zero. Defaults to 1.

    Raises:
        ValueError: If ``step`` is zero.
    """

    def __init__(self, start, stop, step=1):
        if step == 0:
            # A zero step would never move towards stop
            raise ValueError("Range() step must not be zero")
        self.current = start
        self.stop = stop
        self.step = step

    def __iter__(self):
        return self  # An iterator is its own iterator

    def __next__(self):
        # The direction of travel decides which side of stop means "done"
        if self.step > 0:
            exhausted = self.current >= self.stop
        else:
            exhausted = self.current <= self.stop
        if exhausted:
            raise StopIteration
        value = self.current
        self.current += self.step
        return value

for n in Range(0, 5):
    print(n)  # 0 1 2 3 4

# Any object with __iter__ is "iterable"
class NumberRange:
    """Iterable covering start, start + 1, ... up to (not including) stop.

    Because __iter__ is a generator function, every for-loop gets a fresh
    generator, so the same object can be iterated more than once.
    """

    def __init__(self, start, stop):
        self.start, self.stop = start, stop

    def __iter__(self):
        value = self.start
        while True:
            if not value < self.stop:
                return
            yield value
            value += 1

for n in NumberRange(1, 5):
    print(n)  # 1 2 3 4

`itertools`: Generator Powerhouse

import itertools

# chain: flatten multiple iterables
combined = list(itertools.chain([1,2], [3,4], [5,6]))
print(combined)  # [1, 2, 3, 4, 5, 6]

# islice: take N items from any iterable (even infinite)
first_10_fibs = list(itertools.islice(fibonacci(), 10))  # [0, 1, 1, 2, 3, 5, 8, 13, 21, 34]

# zip_longest: zip with fill value for unequal lengths
for a, b in itertools.zip_longest([1,2,3], ['a','b'], fillvalue=None):
    print(a, b)  # 1 a, 2 b, 3 None

# product: Cartesian product
for combo in itertools.product([1,2], ['a','b']):
    print(combo)  # (1,'a'), (1,'b'), (2,'a'), (2,'b')

# groupby: group consecutive items
from itertools import groupby
data = [("A", 1), ("A", 2), ("B", 3), ("A", 4)]
for key, group in groupby(data, key=lambda x: x[0]):
    print(key, list(group))
# A [('A', 1), ('A', 2)]
# B [('B', 3)]
# A [('A', 4)]   <- "A" shows up twice: only adjacent items are grouped, so sort by the key first to merge them

# accumulate: running totals
from itertools import accumulate
import operator
running_total = list(accumulate([1, 2, 3, 4, 5]))
print(running_total)  # [1, 3, 6, 10, 15]

running_product = list(accumulate([1, 2, 3, 4, 5], operator.mul))
print(running_product)  # [1, 2, 6, 24, 120]

`yield from`: Delegating to Sub-Generators

def flatten(nested):
    """Flatten a nested list structure of arbitrary depth.

    Walks the structure with an explicit stack of iterators instead of
    recursion, so nesting deeper than the interpreter's recursion limit is
    handled and each item is yielded without passing through a chain of
    delegating generators.

    Args:
        nested: An iterable whose items are either values or (possibly
            nested) lists. Only ``list`` instances are descended into.

    Yields:
        Every non-list item, in left-to-right, depth-first order.
    """
    pending = [iter(nested)]  # innermost list being walked is on top
    while pending:
        for item in pending[-1]:
            if isinstance(item, list):
                # Descend; the partially consumed parent stays underneath
                pending.append(iter(item))
                break
            yield item
        else:
            # Top iterator exhausted: resume the parent where it left off
            pending.pop()

deep_list = [1, [2, [3, [4]], 5], 6]
print(list(flatten(deep_list)))  # [1, 2, 3, 4, 5, 6]

# Combine multiple generators
def read_all_logs(log_files):
    """Yield every line from each log file, one file after another.

    Each file is opened only when iteration reaches it and is closed as soon
    as its last line has been yielded (or when the generator itself is
    closed), so open handles never pile up across a long list of files.

    Args:
        log_files: Iterable of file paths to read, in order.

    Yields:
        str: Each line, including its trailing newline where present.
    """
    for filepath in log_files:
        with open(filepath, 'r') as log_file:
            yield from log_file  # Yield lines from each file

Memory Comparison

import sys
import tracemalloc

# List approach
tracemalloc.start()
result = [x**2 for x in range(100000)]
current, peak = tracemalloc.get_traced_memory()
tracemalloc.stop()
print(f"List: peak memory = {peak / 1024:.1f} KB")

# Generator approach
tracemalloc.start()
result_gen = (x**2 for x in range(100000))
total = sum(result_gen)
current, peak = tracemalloc.get_traced_memory()
tracemalloc.stop()
print(f"Generator: peak memory = {peak / 1024:.1f} KB")
# The generator's peak is typically orders of magnitude lower — run it to see the exact ratio on your interpreter

Generators are the right choice whenever you're processing sequences larger than a few thousand items, dealing with I/O streams, or implementing lazy pipelines.

Next lesson: Async Python with asyncio — handling thousands of concurrent operations efficiently.