Follow AiTechWorlds on LinkedIn for professional AI content! Follow Now →
24 min · Lesson 23 of 34
Python for Web & APIs

Web Scraping with BeautifulSoup

Web Scraping with BeautifulSoup

Web scraping extracts data from websites programmatically. You'll use it to gather datasets, monitor prices, aggregate news, and automate data collection. BeautifulSoup is the most beginner-friendly tool; Playwright handles dynamic JavaScript-rendered content.

The Basic Workflow

# pip install requests beautifulsoup4 lxml
import requests
from bs4 import BeautifulSoup
import time

def scrape_page(url, delay=1.0):
    """Fetch a web page and parse it into a BeautifulSoup tree.

    Args:
        url: Address of the page to download.
        delay: Seconds to pause after a successful fetch, so that callers
            looping over many pages do not hammer the server.

    Returns:
        The parsed BeautifulSoup document, or None if the request failed
        (connection problem, timeout, or a 4xx/5xx status). The failure
        is printed rather than raised.
    """
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36"
    }

    # Only the network call can raise RequestException, so keep the try small.
    try:
        response = requests.get(url, headers=headers, timeout=15)
        response.raise_for_status()
    except requests.RequestException as e:
        print(f"Failed to fetch {url}: {e}")
        return None

    # Hand BeautifulSoup the raw bytes instead of response.text: requests
    # decodes text/* responses as ISO-8859-1 when the Content-Type header
    # declares no charset, which corrupts characters such as "£" ("Â£").
    # Given bytes, BeautifulSoup detects the encoding from the document itself.
    soup = BeautifulSoup(response.content, 'lxml')  # lxml is the fastest parser

    time.sleep(delay)  # Be respectful — don't hammer servers

    return soup

# Fetch the demo site's landing page; `soup` is None if the request failed.
soup = scrape_page("https://books.toscrape.com")

Finding Elements

# The walkthrough below assumes `soup` is a parsed page. scrape_page()
# returns None when the fetch fails, in which case the first call here
# raises AttributeError.

# Find a single element
title = soup.find('h1')
print(title.text)

# Find all elements with a given tag name and CSS class
all_books = soup.find_all('article', class_='product_pod')
print(f"Found {len(all_books)} books")

# CSS selectors (most powerful and readable); select() returns a list
prices = soup.select('.price_color')
ratings = soup.select('p.star-rating')
links = soup.select('article.product_pod h3 a')

# Get attributes (only the first three links are shown)
for link in links[:3]:
    print(link['href'])           # href attribute; indexing requires it to exist
    print(link.get('title', ''))  # get with default

# Get text content (only the first five prices are shown)
for price in prices[:5]:
    print(price.text.strip())     # .strip() removes whitespace

# Navigate the tree: run selectors relative to one element instead of the page
first_book = all_books[0]
book_title = first_book.select_one('h3 a')['title']
book_price = first_book.select_one('.price_color').text
# The second entry of the class list is the rating word
book_rating = first_book.select_one('.star-rating')['class'][1]  # "Three", "Four", etc.
print(f"{book_title}: {book_price} ({book_rating} stars)")

Complete Scraper: Books to Scrape

def scrape_books_catalog():
    """Scrape every book listed on books.toscrape.com.

    Walks the paginated catalogue, starting at the site root, and stops
    when a page has no "next" link or a page cannot be fetched.

    Returns:
        A list of dicts, one per book, with the keys 'title' (str),
        'price' (float), 'rating' (int 1-5, or 0 if the rating word is
        not recognised), 'availability' (str) and 'url' (absolute URL).

    Raises:
        ValueError: If a price element contains no parsable number.
    """
    import re
    from urllib.parse import urljoin

    base_url = "https://books.toscrape.com/catalogue"
    books = []
    page = 1

    RATING_MAP = {"One": 1, "Two": 2, "Three": 3, "Four": 4, "Five": 5}

    while True:
        url = f"{base_url}/page-{page}.html" if page > 1 else "https://books.toscrape.com"
        soup = scrape_page(url)

        if not soup:
            break

        for article in soup.select('article.product_pod'):
            link = article.select_one('h3 a')
            # Drop everything except digits and the decimal point. Slicing
            # off exactly one character breaks when the currency symbol is
            # preceded by whitespace or was mis-decoded into two characters.
            price_text = re.sub(r'[^\d.]', '', article.select_one('.price_color').text)
            book = {
                'title': link['title'],
                'price': float(price_text),
                'rating': RATING_MAP.get(article.select_one('.star-rating')['class'][1], 0),
                'availability': article.select_one('.availability').text.strip(),
                # hrefs are relative to the page they appear on: the root
                # page already prefixes them with "catalogue/", the
                # catalogue pages do not. Resolve against the current page
                # URL so both produce the correct absolute link.
                'url': urljoin(url, link['href']),
            }
            books.append(book)

        # Report before the exit check so the final page is logged too.
        print(f"Scraped page {page}: {len(books)} books total")

        # Check for next page
        next_btn = soup.select_one('li.next a')
        if not next_btn:
            break

        page += 1

    return books

# Crawl the whole catalogue (one HTTP request per page) and report the total.
books = scrape_books_catalog()
print(f"Total books scraped: {len(books)}")

Saving Scraped Data

import csv
import json
from pathlib import Path

def save_to_csv(data, filepath):
    """Write a list of dict records to a UTF-8 CSV file with a header row.

    Args:
        data: Sequence of dicts, one per row. Rows may have different
            keys; the header is the union of all keys in first-seen
            order, and missing values are written as empty cells.
        filepath: Destination path. Missing parent directories are created.

    Returns:
        None. Nothing is written (and no file is created) when *data* is
        empty.
    """
    if not data:
        return

    # Using only the first row's keys makes DictWriter raise ValueError as
    # soon as a later row carries an extra key, so collect every key.
    # dict.fromkeys de-duplicates while preserving first-seen order.
    fieldnames = list(dict.fromkeys(key for row in data for key in row))

    Path(filepath).parent.mkdir(parents=True, exist_ok=True)
    # newline='' lets the csv module control line endings itself.
    with open(filepath, 'w', newline='', encoding='utf-8') as f:
        writer = csv.DictWriter(f, fieldnames=fieldnames, restval='')
        writer.writeheader()
        writer.writerows(data)
    print(f"Saved {len(data)} records to {filepath}")

def save_to_json(data, filepath):
    """Serialize *data* to *filepath* as indented, UTF-8 encoded JSON.

    Parent directories are created when missing. Non-ASCII characters are
    written verbatim instead of being escaped.
    """
    destination = Path(filepath)
    destination.parent.mkdir(exist_ok=True, parents=True)

    with destination.open('w', encoding='utf-8') as out_file:
        json.dump(data, out_file, ensure_ascii=False, indent=2)

    print(f"Saved {len(data)} records to {filepath}")

# Persist the scraped records in both formats; each helper creates the
# output/ directory if needed. save_to_csv writes nothing when `books` is empty.
save_to_csv(books, "output/books.csv")
save_to_json(books, "output/books.json")

Dynamic Content: Playwright for JavaScript Pages

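Many modern websites load their content with JavaScript after the initial HTML arrives. requests downloads only that initial HTML, so BeautifulSoup never sees the dynamically added content. Playwright automates a real browser, so you can parse the page after its scripts have run.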

# pip install playwright
# python -m playwright install chromium
from playwright.sync_api import sync_playwright

def scrape_with_playwright(url):
    """Render a page in headless Chromium and return its HTML as a BeautifulSoup tree.

    The selectors used below ('.product-list', 'button.load-more' and
    'input[name="search"]') are hard-coded, so this only fits pages that
    contain those elements.

    NOTE(review): the HTML snapshot is taken after the scroll but BEFORE
    the click and search steps, so the returned soup does not contain
    anything those later interactions load. Confirm whether that is
    intended or whether the snapshot should move to the end.

    Args:
        url: Address of the page to open.

    Returns:
        BeautifulSoup tree of the rendered page as it was right after the
        scroll-and-wait step.
    """
    with sync_playwright() as p:
        browser = p.chromium.launch(headless=True)  # False to watch it run
        page = browser.new_page()

        # Navigate and wait for content to load
        page.goto(url)
        # Timeout is in milliseconds (10 s); presumably raises if the
        # element never appears — confirm against the Playwright docs.
        page.wait_for_selector('.product-list', timeout=10000)

        # Execute JavaScript to scroll/interact
        page.evaluate("window.scrollTo(0, document.body.scrollHeight)")
        page.wait_for_timeout(1000)  # Wait for lazy-loaded content

        # Get the rendered HTML — this snapshot is what the function returns
        content = page.content()
        soup = BeautifulSoup(content, 'lxml')

        # Click a button (happens after the snapshot, so not reflected in `soup`)
        page.click('button.load-more')
        page.wait_for_timeout(500)

        # Fill a form and submit it with Enter (also after the snapshot)
        page.fill('input[name="search"]', "python books")
        page.press('input[name="search"]', "Enter")
        page.wait_for_load_state("networkidle")

        # Reached only if every step above succeeded.
        browser.close()
        return soup

Rate Limiting and Responsible Scraping

import time
import random

class RespectfulScraper:
    """HTTP fetcher that throttles itself and retries failed requests.

    A random pause is inserted after every successful request, 429
    responses are honoured via their Retry-After header, and network or
    HTTP errors are retried with exponential backoff.

    NOTE: robots.txt is not checked here. In production, consult it
    (for example with urllib.robotparser) before calling fetch().
    """

    def __init__(self, delay_range=(1, 3), max_retries=3):
        """Create a scraper.

        Args:
            delay_range: (min, max) seconds for the random pause that
                follows each successful request.
            max_retries: Total number of attempts fetch() makes per URL.
        """
        self.delay_range = delay_range
        self.max_retries = max_retries
        self.session = requests.Session()
        self.session.headers["User-Agent"] = "My Research Bot 1.0 (contact@email.com)"

    @staticmethod
    def _retry_after_seconds(header_value, default=60):
        """Convert a Retry-After header value into a non-negative wait in seconds.

        The header may hold either a number of seconds or an HTTP-date.
        Missing or unparsable values fall back to *default*.
        """
        if header_value is None:
            return default
        try:
            return max(0, int(header_value))
        except (TypeError, ValueError):
            pass  # Not a plain integer; try the HTTP-date form below.

        from datetime import datetime, timezone
        from email.utils import parsedate_to_datetime

        try:
            retry_at = parsedate_to_datetime(header_value)
        except (TypeError, ValueError):
            return default
        if retry_at.tzinfo is None:
            retry_at = retry_at.replace(tzinfo=timezone.utc)
        remaining = (retry_at - datetime.now(timezone.utc)).total_seconds()
        # A date in the past means "retry now"; time.sleep rejects negatives.
        return max(0, round(remaining))

    def fetch(self, url):
        """Download *url* and return it parsed, retrying on failure.

        Args:
            url: Address of the page to download.

        Returns:
            The parsed BeautifulSoup document, or None if every attempt
            was answered with 429 Too Many Requests.

        Raises:
            requests.RequestException: If the last attempt fails with a
                network error or a non-429 error status.
        """
        for attempt in range(self.max_retries):
            is_last_attempt = attempt == self.max_retries - 1
            try:
                response = self.session.get(url, timeout=15)

                if response.status_code == 429:  # Too Many Requests
                    if is_last_attempt:
                        # No attempt follows, so waiting would be pointless.
                        print(f"Rate limited. Giving up after {self.max_retries} attempts")
                        break
                    wait = self._retry_after_seconds(response.headers.get('Retry-After'))
                    print(f"Rate limited. Waiting {wait}s")
                    time.sleep(wait)
                    continue

                response.raise_for_status()

            except requests.RequestException:
                # Only network/HTTP failures are worth retrying; anything
                # else is a programming error and should surface at once.
                if is_last_attempt:
                    raise
                time.sleep(2 ** attempt)  # Exponential backoff: 1s, 2s, 4s, ...
                continue

            # Random delay between requests
            time.sleep(random.uniform(*self.delay_range))

            # Parse the raw bytes so BeautifulSoup detects the encoding
            # itself. response.text falls back to ISO-8859-1 when the
            # server declares no charset, which garbles non-ASCII text.
            return BeautifulSoup(response.content, 'lxml')

        return None

Always check a site's robots.txt and Terms of Service before scraping. Never scrape login-protected content. Don't overload servers with too many requests.

Next lesson: Working with SQL Databases — storing and querying structured data.

📱

Get this course's notes on Telegram!

Free cheat sheets, summaries & practice exercises

Get Notes Free →
!