24 min · Lesson 23 of 34
Python for Web & APIs
Web Scraping with BeautifulSoup
Web Scraping with BeautifulSoup
Web scraping extracts data from websites programmatically. You'll use it to gather datasets, monitor prices, aggregate news, and automate data collection. BeautifulSoup is the most beginner-friendly tool; Playwright handles dynamic JavaScript-rendered content.
The Basic Workflow
# pip install requests beautifulsoup4 lxml
import requests
from bs4 import BeautifulSoup
import time
def scrape_page(url, delay=1.0):
    """Fetch a web page and parse it into a BeautifulSoup tree.

    Args:
        url: Address of the page to download.
        delay: Seconds to pause after a successful download so that
            back-to-back calls do not hammer the server.

    Returns:
        The parsed ``BeautifulSoup`` document, or ``None`` when the request
        fails (connection error, timeout, or a 4xx/5xx status).
    """
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36"
    }
    # Only the network call can raise RequestException, so keep the try small.
    try:
        response = requests.get(url, headers=headers, timeout=15)
        response.raise_for_status()
    except requests.RequestException as e:
        print(f"Failed to fetch {url}: {e}")
        return None

    # Pass raw bytes rather than response.text: when the Content-Type header
    # carries no charset, Requests decodes text/* bodies as ISO-8859-1, which
    # garbles characters such as "£". Given bytes, BeautifulSoup detects the
    # encoding from the document itself.
    soup = BeautifulSoup(response.content, 'lxml')  # lxml is the fastest parser
    time.sleep(delay)  # Be respectful — don't hammer servers
    return soup
# Download and parse the demo site's front page; scrape_page() returns None on failure.
soup = scrape_page("https://books.toscrape.com")
Finding Elements
# scrape_page() returns None when the download fails, so stop with a clear
# message instead of crashing on the first attribute access below.
if soup is None:
    raise SystemExit("Could not download the page - nothing to parse.")

# Find a single element (find() returns None when nothing matches)
title = soup.find('h1')
if title is not None:
    print(title.text)

# Find all elements matching a tag name and class
all_books = soup.find_all('article', class_='product_pod')
print(f"Found {len(all_books)} books")

# CSS selectors (most powerful and readable)
prices = soup.select('.price_color')
ratings = soup.select('p.star-rating')
links = soup.select('article.product_pod h3 a')

# Get attributes
for link in links[:3]:
    print(link['href'])           # href attribute (KeyError if absent)
    print(link.get('title', ''))  # get with default

# Get text content
for price in prices[:5]:
    print(price.text.strip())     # .strip() removes surrounding whitespace

# Navigate the tree, starting from one book's <article> element.
# Guarded so an empty result list does not raise IndexError.
if all_books:
    first_book = all_books[0]
    book_title = first_book.select_one('h3 a')['title']
    book_price = first_book.select_one('.price_color').text
    # The class attribute is a list; its second entry is the rating word.
    book_rating = first_book.select_one('.star-rating')['class'][1]  # "Three", "Four", etc.
    print(f"{book_title}: {book_price} ({book_rating} stars)")
Complete Scraper: Books to Scrape
def scrape_books_catalog():
    """Scrape every book listed on books.toscrape.com.

    Starts at the home page and keeps requesting ``page-N.html`` for as long
    as the current page contains a "next" link. Stops early if a page fails
    to download.

    Returns:
        A list of dicts, one per book, with the keys ``title`` (str),
        ``price`` (float), ``rating`` (int, 0 when the rating word is not
        recognised), ``availability`` (str) and ``url`` (absolute URL).
    """
    # Local import so this function stays self-contained.
    from urllib.parse import urljoin

    base_url = "https://books.toscrape.com/catalogue"
    RATING_MAP = {"One": 1, "Two": 2, "Three": 3, "Four": 4, "Five": 5}
    books = []
    page = 1

    while True:
        url = f"{base_url}/page-{page}.html" if page > 1 else "https://books.toscrape.com"
        soup = scrape_page(url)
        if not soup:
            break

        for article in soup.select('article.product_pod'):
            link = article.select_one('h3 a')
            price_text = article.select_one('.price_color').text
            rating_word = article.select_one('.star-rating')['class'][1]
            books.append({
                'title': link['title'],
                # Keep only digits and the decimal point, so the currency
                # symbol (or stray characters from a mis-decoded page) is
                # dropped no matter how many characters it occupies.
                'price': float(''.join(ch for ch in price_text if ch in '0123456789.')),
                'rating': RATING_MAP.get(rating_word, 0),
                'availability': article.select_one('.availability').text.strip(),
                # hrefs are relative to the page they appear on, and the home
                # page lives in a different directory than the page-N.html
                # files, so resolve against the URL that was actually fetched.
                'url': urljoin(url, link['href']),
            })

        # Report before the next-page check so the last page is logged too.
        print(f"Scraped page {page}: {len(books)} books total")

        # Check for next page
        if not soup.select_one('li.next a'):
            break
        page += 1

    return books
# Run the full crawl (one scrape_page() call per catalogue page) and report the record count.
books = scrape_books_catalog()
print(f"Total books scraped: {len(books)}")
Saving Scraped Data
import csv
import json
from pathlib import Path
def save_to_csv(data, filepath):
    """Write a list of dict records to a CSV file with a header row.

    Args:
        data: Sequence of dicts. Records may have different keys; the header
            is the union of all keys in first-seen order, and a record that
            lacks a column gets an empty cell.
        filepath: Destination path. Missing parent directories are created.

    Returns:
        None. Nothing is written when ``data`` is empty.
    """
    if not data:
        return

    # Build the header from every record, not just the first one: DictWriter
    # raises ValueError when a row contains a key missing from fieldnames.
    fieldnames = list(dict.fromkeys(key for row in data for key in row))

    Path(filepath).parent.mkdir(parents=True, exist_ok=True)
    # newline='' lets the csv module control line endings itself.
    with open(filepath, 'w', newline='', encoding='utf-8') as f:
        writer = csv.DictWriter(f, fieldnames=fieldnames)
        writer.writeheader()
        writer.writerows(data)
    print(f"Saved {len(data)} records to {filepath}")
def save_to_json(data, filepath):
    """Serialize ``data`` to a UTF-8 JSON file, indented by two spaces.

    Missing parent directories of ``filepath`` are created first. Non-ASCII
    characters are written as-is rather than as ``\\uXXXX`` escapes.
    """
    destination = Path(filepath)
    destination.parent.mkdir(parents=True, exist_ok=True)
    with destination.open('w', encoding='utf-8') as out:
        json.dump(data, out, ensure_ascii=False, indent=2)
    record_count = len(data)
    print(f"Saved {record_count} records to {filepath}")
# Persist the scraped records in both formats under output/.
for export, destination in (
    (save_to_csv, "output/books.csv"),
    (save_to_json, "output/books.json"),
):
    export(books, destination)
Dynamic Content: Playwright for JavaScript Pages
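Many modern websites load their content with JavaScript after the initial HTML arrives. BeautifulSoup only parses the HTML it is given, so that content is missing from a plain requests response. Playwright automates a real browser, which runs the JavaScript and exposes the fully rendered page.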
# pip install playwright
# python -m playwright install chromium
from playwright.sync_api import sync_playwright
def scrape_with_playwright(url):
    """Render a JavaScript-driven page in headless Chromium and parse it.

    Args:
        url: Address of the page to open.

    Returns:
        A ``BeautifulSoup`` tree of the DOM as it stood after the initial
        load and one scroll to the bottom. The click and search steps that
        follow are interaction demos; the snapshot is taken before them, so
        their effects are NOT in the returned tree. Call ``page.content()``
        again after interacting if you need the updated DOM.

    Raises:
        Any Playwright error (for example a timeout when ``.product-list``
        never appears) propagates to the caller.
    """
    with sync_playwright() as p:
        browser = p.chromium.launch(headless=True)  # False to watch it run
        # try/finally so the browser is closed even when a wait, click or
        # fill below raises.
        try:
            page = browser.new_page()

            # Navigate and wait for content to load
            page.goto(url)
            page.wait_for_selector('.product-list', timeout=10000)

            # Execute JavaScript to scroll/interact
            page.evaluate("window.scrollTo(0, document.body.scrollHeight)")
            page.wait_for_timeout(1000)  # Wait for lazy-loaded content

            # Get the rendered HTML
            content = page.content()
            soup = BeautifulSoup(content, 'lxml')

            # Click a button
            page.click('button.load-more')
            page.wait_for_timeout(500)

            # Fill a form
            page.fill('input[name="search"]', "python books")
            page.press('input[name="search"]', "Enter")
            page.wait_for_load_state("networkidle")
        finally:
            browser.close()
    return soup
Rate Limiting and Responsible Scraping
import time
import random
class RespectfulScraper:
    """HTTP fetcher that paces its requests and backs off when told to.

    Uses one ``requests.Session`` with an identifying User-Agent, sleeps a
    random interval after every successful request, honours HTTP 429
    responses, and retries failed requests with exponential backoff.

    Args:
        delay_range: ``(min, max)`` seconds; a random pause in this range
            follows each successful request.
        max_retries: Maximum number of attempts per URL.
    """

    def __init__(self, delay_range=(1, 3), max_retries=3):
        self.delay_range = delay_range
        self.max_retries = max_retries
        self.session = requests.Session()
        self.session.headers["User-Agent"] = "My Research Bot 1.0 (contact@email.com)"

    @staticmethod
    def _retry_after_seconds(header_value, default=60):
        """Convert a Retry-After header value into a non-negative wait in seconds.

        The header may hold either a number of seconds or an HTTP date.
        Returns ``default`` when the header is absent or unparseable.
        """
        if header_value is None:
            return default
        try:
            # Clamp at 0: time.sleep() rejects negative values.
            return max(0, int(header_value))
        except (TypeError, ValueError):
            pass

        # Not a plain integer - try the HTTP-date form.
        from datetime import datetime, timezone
        from email.utils import parsedate_to_datetime

        try:
            retry_at = parsedate_to_datetime(header_value)
        except (TypeError, ValueError):
            return default
        if retry_at.tzinfo is None:
            retry_at = retry_at.replace(tzinfo=timezone.utc)
        remaining = (retry_at - datetime.now(timezone.utc)).total_seconds()
        return max(0, int(remaining))

    def fetch(self, url):
        """Download ``url`` and return it parsed, retrying on failure.

        Returns:
            A ``BeautifulSoup`` tree, or ``None`` when the server is still
            answering 429 (Too Many Requests) on the final attempt.

        Raises:
            requests.RequestException: If the final attempt fails with a
                network error or a non-429 error status.
        """
        for attempt in range(self.max_retries):
            is_last_attempt = attempt == self.max_retries - 1
            try:
                # NOTE: robots.txt is not checked here; in production,
                # consult urllib.robotparser before fetching.
                response = self.session.get(url, timeout=15)
                if response.status_code == 429:  # Too Many Requests
                    if is_last_attempt:
                        # No retry would follow, so waiting is pointless.
                        break
                    wait = self._retry_after_seconds(response.headers.get('Retry-After'))
                    print(f"Rate limited. Waiting {wait}s")
                    time.sleep(wait)
                    continue
                response.raise_for_status()
            except requests.RequestException:
                if is_last_attempt:
                    raise
                time.sleep(2 ** attempt)  # Exponential backoff: 1s, 2s, 4s, ...
                continue

            # Random delay between requests
            time.sleep(random.uniform(*self.delay_range))
            # Raw bytes let the parser detect the page's own encoding.
            return BeautifulSoup(response.content, 'lxml')
        return None
Always check a site's robots.txt and Terms of Service before scraping. Never scrape login-protected content. Don't overload servers with too many requests.
Next lesson: Working with SQL Databases — storing and querying structured data.
📱
Get Notes Free → Get this course's notes on Telegram!
Free cheat sheets, summaries & practice exercises