Web Scraping with BeautifulSoup

Web scraping extracts data from websites programmatically. You'll use it to gather datasets, monitor prices, aggregate news, and automate data collection. BeautifulSoup is the most beginner-friendly tool; Playwright handles dynamic JavaScript-rendered content.

The Basic Workflow

# pip install requests beautifulsoup4 lxml
import requests
from bs4 import BeautifulSoup
import time

def scrape_page(url, delay=1.0):
    """Fetch a web page and parse it into a BeautifulSoup tree.

    Args:
        url: Address of the page to download.
        delay: Seconds to pause after a successful fetch so that repeated
            calls don't hammer the server.

    Returns:
        The parsed ``BeautifulSoup`` document, or ``None`` if the request
        failed (connection error, timeout, or an HTTP error status).
    """
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36"
    }

    # Only the network call can raise RequestException, so keep the try small.
    try:
        response = requests.get(url, headers=headers, timeout=15)
        response.raise_for_status()
    except requests.RequestException as e:
        print(f"Failed to fetch {url}: {e}")
        return None

    # Parse the raw bytes rather than response.text: when the server sends no
    # charset, requests decodes text/html as ISO-8859-1, which turns UTF-8
    # characters such as "£" into mojibake ("Â£"). Given bytes, BeautifulSoup
    # works out the encoding itself (including from the page's <meta> tag).
    soup = BeautifulSoup(response.content, 'lxml')  # lxml is the fastest parser

    time.sleep(delay)  # Be respectful — don't hammer servers

    return soup

# Fetch and parse the demo site's home page. scrape_page returns None when the
# request fails, so check `soup` before using it in real code.
soup = scrape_page("https://books.toscrape.com")

Finding Elements

# NOTE: these examples assume the fetch above succeeded. If `soup` is None
# (scrape_page failed), the first call below raises AttributeError.

# Find a single element (the first match)
title = soup.find('h1')
print(title.text)

# Find all elements matching a tag name and CSS class
all_books = soup.find_all('article', class_='product_pod')
print(f"Found {len(all_books)} books")

# CSS selectors (most powerful and readable); select() returns a list
prices = soup.select('.price_color')
ratings = soup.select('p.star-rating')
links = soup.select('article.product_pod h3 a')

# Get attributes
for link in links[:3]:
    print(link['href'])           # href attribute (KeyError if the attribute is missing)
    print(link.get('title', ''))  # get with default

# Get text content
for price in prices[:5]:
    print(price.text.strip())     # .strip() removes leading/trailing whitespace

# Navigate the tree: run selectors relative to one element instead of the page.
# all_books[0] raises IndexError if no books were found above.
first_book = all_books[0]
book_title = first_book.select_one('h3 a')['title']
book_price = first_book.select_one('.price_color').text
# ['class'] is a list of class names; index 1 is the rating word.
book_rating = first_book.select_one('.star-rating')['class'][1]  # "Three", "Four", etc.
print(f"{book_title}: {book_price} ({book_rating} stars)")

Complete Scraper: Books to Scrape

def scrape_books_catalog():
    """Scrape every book listed on books.toscrape.com.

    Starts at the home page and follows each page's "next" link until there
    is none, or until a page fails to download (scrape_page returns None).

    Returns:
        A list of dicts, one per book, with keys ``title`` (str), ``price``
        (float), ``rating`` (int 1-5, or 0 if the rating word is not
        recognised), ``availability`` (str) and ``url`` (absolute link to the
        book's detail page).
    """
    # Local import so this snippet does not depend on a separate import block.
    from urllib.parse import urljoin

    RATING_MAP = {"One": 1, "Two": 2, "Three": 3, "Four": 4, "Five": 5}

    url = "https://books.toscrape.com/"
    books = []
    page = 1

    while True:
        soup = scrape_page(url)

        if not soup:
            break

        for article in soup.select('article.product_pod'):
            link = article.select_one('h3 a')
            price_text = article.select_one('.price_color').text
            book = {
                'title': link['title'],
                # Keep only digits and the decimal point. Slicing off "the
                # first character" breaks when the currency sign was decoded
                # as two characters ("Â£") or is surrounded by whitespace.
                'price': float(''.join(ch for ch in price_text if ch in '0123456789.')),
                'rating': RATING_MAP.get(article.select_one('.star-rating')['class'][1], 0),
                'availability': article.select_one('.availability').text.strip(),
                # hrefs are relative to the page they appear on: the home page
                # links to "catalogue/<book>/...", later pages to "<book>/...".
                # Resolving against the current page URL handles both.
                'url': urljoin(url, link['href']),
            }
            books.append(book)

        # Report before looking for the next link so the last page is logged too.
        print(f"Scraped page {page}: {len(books)} books total")

        # Check for next page
        next_btn = soup.select_one('li.next a')
        if not next_btn:
            break

        url = urljoin(url, next_btn['href'])
        page += 1

    return books

# Run the full crawl. Every catalogue page is fetched through scrape_page, which
# pauses for its default 1-second delay after each successful request.
books = scrape_books_catalog()
print(f"Total books scraped: {len(books)}")

Saving Scraped Data

import csv
import json
from pathlib import Path

def save_to_csv(data, filepath):
    """Write a list of dict records to *filepath* as a UTF-8 CSV file.

    The header row and column order come from the keys of the first record.
    Missing parent directories are created. When *data* is empty nothing is
    written and no directory is created.
    """
    if data:
        target = Path(filepath)
        target.parent.mkdir(parents=True, exist_ok=True)
        columns = list(data[0])
        # newline='' lets the csv module control line endings itself.
        with target.open('w', newline='', encoding='utf-8') as handle:
            out = csv.DictWriter(handle, fieldnames=columns)
            out.writeheader()
            for record in data:
                out.writerow(record)
        print(f"Saved {len(data)} records to {filepath}")

def save_to_json(data, filepath):
    """Write *data* to *filepath* as indented UTF-8 JSON.

    Missing parent directories are created. Non-ASCII characters are written
    as-is rather than as \\uXXXX escapes.
    """
    target = Path(filepath)
    target.parent.mkdir(parents=True, exist_ok=True)
    with target.open('w', encoding='utf-8') as handle:
        json.dump(data, handle, ensure_ascii=False, indent=2)
    print(f"Saved {len(data)} records to {filepath}")

# Persist the results in both formats. The output/ directory is created if it
# does not exist; save_to_csv writes nothing when `books` is empty.
save_to_csv(books, "output/books.csv")
save_to_json(books, "output/books.json")

Dynamic Content: Playwright for JavaScript Pages

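Many modern websites load their content with JavaScript after the initial page arrives. BeautifulSoup only parses the HTML the server sends, so it never sees that content. Playwright automates a real browser, runs the page's scripts, and gives you the fully rendered HTML.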

# pip install playwright
# python -m playwright install chromium
from playwright.sync_api import sync_playwright

def scrape_with_playwright(url):
    """Render *url* in headless Chromium and return its parsed HTML.

    The function also demonstrates scrolling, clicking and form filling. The
    selectors used ('.product-list', 'button.load-more',
    'input[name="search"]') are presumably placeholders — adapt them to the
    target site.

    Args:
        url: Address of the page to open.

    Returns:
        A ``BeautifulSoup`` tree of the page as rendered after the scroll
        step. The snapshot is taken before the click and search steps, so
        their effects are not part of the returned tree.

    Raises:
        Playwright errors (for example a timeout) if the page or an expected
        element does not appear in time. The browser is closed either way.
    """
    with sync_playwright() as p:
        browser = p.chromium.launch(headless=True)  # False to watch it run
        try:
            page = browser.new_page()

            # Navigate and wait for content to load
            page.goto(url)
            page.wait_for_selector('.product-list', timeout=10000)

            # Execute JavaScript to scroll/interact
            page.evaluate("window.scrollTo(0, document.body.scrollHeight)")
            page.wait_for_timeout(1000)  # Wait for lazy-loaded content

            # Get the rendered HTML
            content = page.content()
            soup = BeautifulSoup(content, 'lxml')

            # Click a button
            page.click('button.load-more')
            page.wait_for_timeout(500)

            # Fill a form
            page.fill('input[name="search"]', "python books")
            page.press('input[name="search"]', "Enter")
            page.wait_for_load_state("networkidle")

            return soup
        finally:
            # Close the browser even when a wait or click above raises, so a
            # failed scrape does not leave the browser open.
            browser.close()

Rate Limiting and Responsible Scraping

import time
import random

class RespectfulScraper:
    """HTTP fetcher that paces its requests and retries failed ones.

    Uses one ``requests.Session`` with an identifying User-Agent, waits a
    random interval after each successful request, honours HTTP 429
    ``Retry-After`` hints, and backs off exponentially on request errors.

    NOTE: robots.txt is NOT checked here. In production, consult it first
    (for example with ``urllib.robotparser``) before calling ``fetch``.

    Args:
        delay_range: ``(min, max)`` seconds to wait after a successful request.
        max_retries: Maximum number of attempts per URL.
    """

    def __init__(self, delay_range=(1, 3), max_retries=3):
        self.delay_range = delay_range
        self.max_retries = max_retries
        self.session = requests.Session()
        self.session.headers["User-Agent"] = "My Research Bot 1.0 (contact@email.com)"

    @staticmethod
    def _retry_after_seconds(value, default=60):
        """Convert a Retry-After header value into whole seconds to wait.

        The header may hold a number of seconds or an HTTP date. Returns
        *default* when the header is missing or unparseable; never negative.
        """
        from datetime import datetime, timezone
        from email.utils import parsedate_to_datetime

        if value is None:
            return default
        try:
            return max(0, int(value))
        except (TypeError, ValueError):
            pass  # Not a plain number of seconds; try the HTTP-date form.
        try:
            retry_at = parsedate_to_datetime(value)
        except (TypeError, ValueError):
            return default
        if retry_at.tzinfo is None:
            # A date without a usable zone parses as naive; treat it as UTC.
            retry_at = retry_at.replace(tzinfo=timezone.utc)
        remaining = (retry_at - datetime.now(timezone.utc)).total_seconds()
        return max(0, int(remaining))

    def fetch(self, url):
        """Download *url* and return it parsed, retrying on failure.

        Returns:
            A ``BeautifulSoup`` tree on success, or ``None`` if every attempt
            was answered with HTTP 429 (or ``max_retries`` is 0).

        Raises:
            requests.RequestException: if the final attempt fails with a
                connection error, timeout, or HTTP error status.
        """
        last_attempt = self.max_retries - 1

        for attempt in range(self.max_retries):
            try:
                response = self.session.get(url, timeout=15)

                if response.status_code == 429:  # Too Many Requests
                    # Waiting is pointless when no retry follows.
                    if attempt < last_attempt:
                        wait = self._retry_after_seconds(
                            response.headers.get('Retry-After')
                        )
                        print(f"Rate limited. Waiting {wait}s")
                        time.sleep(wait)
                    continue

                response.raise_for_status()
            except requests.RequestException:
                if attempt == last_attempt:
                    raise
                time.sleep(2 ** attempt)  # Exponential backoff: 1s, 2s, 4s, ...
                continue

            # Random delay between requests
            time.sleep(random.uniform(*self.delay_range))

            # Pass raw bytes so BeautifulSoup detects the encoding itself;
            # response.text can mis-decode pages served without a charset.
            return BeautifulSoup(response.content, 'lxml')

        return None

Always check a site's robots.txt and Terms of Service before scraping. Never scrape login-protected content. Don't overload servers with too many requests.

Next lesson: Working with SQL Databases — storing and querying structured data.