Generators & Iterators
Generators & Iterators: Memory-Efficient Python
Generators are one of Python's most elegant features. Instead of computing all values at once and storing them in memory, generators compute values on demand — one at a time. This makes them essential for processing large files, streams, and infinite sequences.
The Problem Generators Solve
# Memory-hungry approach — loads ALL values at once
def get_all_numbers(n):
    """Return a list with the doubles of 0, 1, ..., n-1, built eagerly.

    The complete list is materialized before the function returns, so its
    memory footprint grows linearly with ``n``.
    """
    # Stepping by 2 up to 2*n produces exactly i * 2 for every i in range(n).
    return list(range(0, 2 * n, 2))
numbers = get_all_numbers(10_000_000) # All 10 million values are built up front (roughly 80MB for the list's references alone on 64-bit CPython)
print(numbers[0]) # Only the first element is used; everything else was computed for nothing
# Generator approach — computes on demand
def generate_numbers(n):
    """Lazily yield the doubles of 0, 1, ..., n-1, one value per request.

    This is a generator function: calling it only creates a generator
    object, and each value is produced when the caller asks for it.
    """
    # Stepping by 2 up to 2*n visits exactly i * 2 for every i in range(n).
    for doubled in range(0, 2 * n, 2):
        yield doubled
gen = generate_numbers(10_000_000) # Returns immediately: only a small generator object is created
print(next(gen)) # 0 — the first value is computed on this call
print(next(gen)) # 2 — the second value is computed on this call
# Values that are never requested are never computed
Generator Functions: How yield Works
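When a function's body contains yield, Python treats that function as a generator function. Calling a generator function does not run its body; it returns a generator object, and the body executes only as values are requested (for example by next() or a for loop).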
def countdown(n):
    """Yield n, n-1, ..., 1, printing a banner before and after.

    The opening message is printed when the first value is requested, not
    when ``countdown`` is called; "Liftoff!" is printed once the loop ends,
    just before the generator finishes.
    """
    print(f"Starting countdown from {n}")
    remaining = n
    while remaining > 0:
        # Execution pauses at the yield and resumes on the next request.
        yield remaining
        remaining -= 1
    print("Liftoff!")
gen = countdown(3) # Function body doesn't run yet
print(type(gen)) # <class 'generator'>
print(next(gen)) # Body starts: prints "Starting countdown from 3", then the yielded 3 is printed
print(next(gen)) # Resumes after the yield and prints 2
print(next(gen)) # Resumes after the yield and prints 1
# One more next(gen) would leave the loop, print "Liftoff!", and only then raise StopIteration
# A for loop drives the generator and handles the final StopIteration for you
for remaining in countdown(3):
    print(remaining)
# Output, one item per line: Starting countdown from 3, 3, 2, 1, Liftoff!
Practical Generator Patterns
# 1. Processing large files line by line
def read_large_csv(filepath):
    """Read a CSV file one row at a time — no matter how large.

    The first row is treated as the header. Every following row is yielded
    as a dict mapping header names to that row's string values. Parsing is
    done by the standard ``csv`` module, so quoted fields that contain
    commas or line breaks are handled correctly.

    Args:
        filepath: Path of a UTF-8 encoded CSV file.

    Yields:
        dict: One ``{column_name: value}`` mapping per non-blank data row.
        An empty file yields nothing; blank lines are skipped.
    """
    import csv  # local import so the function is self-contained

    # newline='' lets the csv module do its own line-ending handling, which
    # is required for quoted fields that span several lines.
    with open(filepath, 'r', encoding='utf-8', newline='') as f:
        reader = csv.reader(f)
        header = next(reader, None)
        if header is None:
            return  # empty file: there is no header and no rows
        for values in reader:
            if not values:
                continue  # csv.reader returns [] for a blank line
            yield dict(zip(header, values))
# Process without loading file into memory
for row in read_large_csv('10gb_file.csv'):
    # Rows are streamed one at a time; anything that is not an error is skipped
    if not row['status'] == 'error':
        continue
    print(row)
# 2. Infinite sequences
def fibonacci():
    """Yield the Fibonacci numbers 0, 1, 1, 2, 3, ... without ever stopping.

    The generator is infinite, so callers must bound consumption themselves
    (for example by taking only the first N values).
    """
    current, upcoming = 0, 1
    while True:
        yield current
        # Slide the window forward: the next pair is (upcoming, sum of both).
        current, upcoming = upcoming, current + upcoming
def take(gen, n):
    """Take first n values from a generator.

    Args:
        gen: A generator, iterator, or any other iterable to draw from.
        n: Maximum number of values to yield; values <= 0 yield nothing.

    Yields:
        Up to ``n`` values from ``gen``, in order. If ``gen`` runs out
        early, iteration simply ends after the values that were available.
    """
    iterator = iter(gen)  # no-op for generators; lets plain iterables work too
    for _ in range(n):
        try:
            value = next(iterator)
        except StopIteration:
            # A StopIteration escaping a generator body is converted to
            # RuntimeError (PEP 479), so end the generator explicitly.
            return
        yield value
fib = fibonacci()  # infinite generator; nothing is computed yet
print(list(take(fib, 10))) # [0, 1, 1, 2, 3, 5, 8, 13, 21, 34] — only ten values are ever computed
# 3. Data pipeline
def parse_records(lines):
    """Yield each input line as a list of its comma-separated fields.

    Surrounding whitespace (including the trailing newline) is removed from
    the line before it is split.
    """
    for raw_line in lines:
        fields = raw_line.strip().split(',')
        yield fields
def filter_active(records):
    """Yield only the records whose third field equals 'active'.

    Each record must be indexable at position 2.
    """
    for entry in records:
        is_active = entry[2] == 'active'
        if is_active:
            yield entry
def format_output(records):
    """Yield an ``{"id": ..., "name": ...}`` dict for each record.

    The id is taken from the record's first field and the name from its
    second field.
    """
    for entry in records:
        identifier, name = entry[0], entry[1]
        yield {"id": identifier, "name": name}
# Chain generators — each operates one record at a time
# Explicit encoding avoids depending on the platform default (read_large_csv does the same)
with open('data.csv', encoding='utf-8') as f:
    # Nothing is read yet: each stage pulls one record at a time from the stage before it
    pipeline = format_output(filter_active(parse_records(f)))
    # The loop must stay inside the with block — the file has to be open while records are pulled
    for record in pipeline:
        print(record)
# Memory usage: constant regardless of file size
Generator Expressions: Inline Generators
# List comprehension: builds the whole list in memory
squares_list = [x**2 for x in range(1000000)] # roughly 8MB for the list's references alone on 64-bit CPython; the int objects add more
# Generator expression: computes on demand
squares_gen = (x**2 for x in range(1000000)) # a small, fixed-size generator object regardless of the range length
# Useful for one-pass operations
total = sum(x**2 for x in range(1000000)) # Never stores the list
max_val = max(abs(x) for x in [-3, 1, -7, 4]) # 7
found = any(x > 100 for x in range(200)) # Short-circuits at 101
# Nested generator expressions: the for clauses read left to right, outer loop first
flat = list(x for row in [[1,2],[3,4]] for x in row)
print(flat) # [1, 2, 3, 4]
The Iterator Protocol
A generator is an iterator. Iterators are objects that implement __iter__ and __next__. You can build custom iterators with classes.
class Range:
    """Custom range iterator — like built-in range but our own.

    Produces ``start``, ``start + step``, ... and stops before reaching
    ``stop``. As with the built-in ``range``, a negative ``step`` counts
    downward and a zero ``step`` is rejected.

    The object is its own iterator, so it can be consumed only once.

    Args:
        start: First value to produce.
        stop: Exclusive bound at which iteration ends.
        step: Increment between values; must not be zero. Defaults to 1.

    Raises:
        ValueError: If ``step`` is zero (iteration could never end).
    """

    def __init__(self, start, stop, step=1):
        if step == 0:
            raise ValueError("Range() step must not be zero")
        self.current = start
        self.stop = stop
        self.step = step

    def __iter__(self):
        return self  # An iterator is its own iterator

    def __next__(self):
        # The end test depends on direction: counting up ends once current
        # reaches stop from below, counting down once it reaches it from above.
        if self.step > 0:
            exhausted = self.current >= self.stop
        else:
            exhausted = self.current <= self.stop
        if exhausted:
            raise StopIteration
        value = self.current
        self.current += self.step
        return value
# The for loop calls iter() once and then next() until StopIteration is raised
for value in Range(0, 5):
    print(value)  # 0 1 2 3 4
# Any object with __iter__ is "iterable"
class NumberRange:
    """Re-iterable span of values from ``start`` up to, but excluding, ``stop``.

    ``__iter__`` is a generator method, so every ``for`` loop over the same
    object gets a fresh iterator that begins again at ``start``.
    """

    def __init__(self, start, stop):
        self.start, self.stop = start, stop

    def __iter__(self):
        position = self.start
        while True:
            if not position < self.stop:
                return  # reached the exclusive upper bound
            yield position
            position += 1
# Iterating calls NumberRange.__iter__, which returns a new generator each time
for value in NumberRange(1, 5):
    print(value)  # 1 2 3 4
itertools: Generator Powerhouse
import itertools
# chain: walk several iterables back to back as a single stream
chained = itertools.chain([1,2], [3,4], [5,6])
combined = list(chained)
print(combined) # [1, 2, 3, 4, 5, 6]
# islice: take N items from any iterable (even an infinite one)
first_10_fibs = list(itertools.islice(fibonacci(), 10))
# zip_longest: zip unequal lengths, padding the shorter side (None is the default fill value)
for left, right in itertools.zip_longest([1,2,3], ['a','b']):
    print(left, right)
# product: Cartesian product
for pair in itertools.product([1,2], ['a','b']):
    print(pair) # (1,'a'), (1,'b'), (2,'a'), (2,'b')
# groupby: group consecutive items
from itertools import groupby
data = [("A", 1), ("A", 2), ("B", 3), ("A", 4)]
# Only *consecutive* items with equal keys are grouped, so "A" appears as two
# separate groups here; sort by the same key first to get one group per key.
for label, members in groupby(data, key=lambda pair: pair[0]):
    print(label, list(members))
# accumulate: running totals
from itertools import accumulate
import operator
# With no function argument, accumulate adds: each output is the sum so far
running_total = list(accumulate([1, 2, 3, 4, 5]))
print(running_total) # [1, 3, 6, 10, 15]
# Passing operator.mul switches the running operation to multiplication
running_product = list(accumulate([1, 2, 3, 4, 5], operator.mul))
print(running_product) # [1, 2, 6, 24, 120]
yield from: Delegating to Sub-Generators
def flatten(nested):
    """Yield the non-list items of an arbitrarily nested list, depth-first.

    Only ``list`` instances are descended into; every other item (including
    tuples and strings) is yielded unchanged.
    """
    for element in nested:
        if not isinstance(element, list):
            yield element
            continue
        # Hand iteration over to the recursive generator for this sub-list.
        yield from flatten(element)
deep_list = [1, [2, [3, [4]], 5], 6]  # lists nested up to four levels deep
print(list(flatten(deep_list))) # [1, 2, 3, 4, 5, 6]
# Combine multiple generators
def read_all_logs(log_files):
    """Yield every line of every file in ``log_files``, in order.

    Files are opened lazily, one at a time, and each is closed before the
    next one is opened.

    Args:
        log_files: Iterable of paths to text files.

    Yields:
        str: Each line, including its trailing newline where present.
    """
    for filepath in log_files:
        # The with block guarantees the handle is closed when the file is
        # exhausted, and also if the consumer stops early or an error occurs.
        with open(filepath, 'r') as f:
            yield from f  # Yield lines from each file
Memory Comparison
import sys
import tracemalloc
# List approach: the comprehension materializes all 100000 squares while tracing is on
tracemalloc.start()
result = [x**2 for x in range(100000)]
current, peak = tracemalloc.get_traced_memory()
tracemalloc.stop()
print(f"List: peak memory = {peak / 1024:.1f} KB")
# Generator approach: sum() consumes the squares one at a time, so no list is ever built
tracemalloc.start()
result_gen = (x**2 for x in range(100000))
total = sum(result_gen)
current, peak = tracemalloc.get_traced_memory()
tracemalloc.stop()
print(f"Generator: peak memory = {peak / 1024:.1f} KB")
# Expect the generator's peak to be a small fraction of the list's; the exact ratio varies by Python version and platform
Generators are the right choice whenever you're processing sequences larger than a few thousand items, dealing with I/O streams, or implementing lazy pipelines.
Next lesson: Async Python with asyncio — handling thousands of concurrent operations efficiently.
Get this course's notes on Telegram!
Free cheat sheets, summaries & practice exercises