Web scraping involves extracting data from websites using automated tools or scripts.
Commonly used Python libraries:

- `requests`: Fetches web pages.
- `BeautifulSoup`: Parses HTML and XML.
- `lxml`: Faster HTML parsing.
- `selenium`: Automates browser interaction.
- `scrapy`: Advanced web scraping framework (see the sketch after this list).

The examples below start with `requests` and `BeautifulSoup`.
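As a rough illustration of what a `scrapy` spider looks like (the spider name, start URL, and selector are hypothetical, not from the examples below):

```python
import scrapy

class ExampleSpider(scrapy.Spider):
    name = "example"
    start_urls = ["https://example.com"]  # Hypothetical starting page

    def parse(self, response):
        # Yield every link on the page as a scraped item
        for href in response.css("a::attr(href)").getall():
            yield {"link": href}
```

Saved as `spider.py`, this could be run with `scrapy runspider spider.py -o links.json` to dump the scraped items to a file.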
```python
import requests
from bs4 import BeautifulSoup

url = "https://example.com"
response = requests.get(url)
html = response.text

soup = BeautifulSoup(html, 'html.parser')

title = soup.title.text  # Page title
links = [a['href'] for a in soup.find_all('a', href=True)]  # All links

print("Page Title:", title)
print("Links:", links)
```
```python
table = soup.find('table')        # Find the table
rows = table.find_all('tr')       # Find all rows

for row in rows:
    columns = row.find_all('td')  # Find columns in each row
    data = [col.text for col in columns]
    print(data)
```
```python
from selenium import webdriver
from selenium.webdriver.common.by import By

driver = webdriver.Chrome()
driver.get("https://example.com")

search_box = driver.find_element(By.NAME, "q")
search_box.send_keys("Python web scraping")
search_box.submit()

results = driver.find_elements(By.CSS_SELECTOR, "h3")
for result in results:
    print(result.text)

driver.quit()
```
CSS selectors let you match several tag types in one call:

```python
headlines = soup.select('h1, h2, h3')  # All headline tags
```
For more precise selection, use XPath (Selenium supports it directly; the `lxml` library supports it for static HTML):

```python
element = driver.find_element(By.XPATH, '//div[@class="example-class"]')
```
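For static pages, a minimal sketch of the same XPath query with `lxml` (assuming `html` still holds the page source fetched earlier):

```python
from lxml import html as lxml_html

tree = lxml_html.fromstring(html)  # Parse the fetched page source
divs = tree.xpath('//div[@class="example-class"]')
for div in divs:
    print(div.text_content())  # Text of each matching element
```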
Find the link to the next page:

```python
next_page = soup.find('a', {'rel': 'next'})['href']
```

When looping over pages, pause between requests with `time.sleep()` to avoid overloading the server:

```python
import time

for page in range(1, 5):
    response = requests.get(f"https://example.com?page={page}")
    time.sleep(2)  # Pause for 2 seconds
```
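Combining the two, a sketch that follows `rel="next"` links until none remain (the starting URL and the 5-page safety cap are assumptions):

```python
import time
import requests
from bs4 import BeautifulSoup

url = "https://example.com"  # Hypothetical starting URL
for _ in range(5):  # Safety cap so the loop always terminates
    soup = BeautifulSoup(requests.get(url).text, 'html.parser')
    print(url)  # Process the current page here
    next_link = soup.find('a', {'rel': 'next'})
    if next_link is None:
        break  # No further pages
    url = requests.compat.urljoin(url, next_link['href'])  # Resolve relative links
    time.sleep(2)  # Be polite between requests
```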
```python
prices = soup.find_all('span', class_='price')
for price in prices:
    print(price.text)
```
```python
articles = soup.find_all('div', class_='article')
for article in articles:
    title = article.find('h2').text
    link = article.find('a')['href']
    print(f"Title: {title}, Link: {link}")
```
Download images with `requests`:
```python
images = soup.find_all('img', src=True)
for i, img in enumerate(images):
    img_url = img['src']
    with open(f"image_{i}.jpg", 'wb') as f:  # Unique filename per image
        f.write(requests.get(img_url).content)
```
```python
jobs = soup.find_all('div', class_='job-listing')
for job in jobs:
    title = job.find('h2').text
    company = job.find('h3').text
    print(f"Job: {title}, Company: {company}")
```
Some sites expose data through a JSON API, which `requests` can consume directly:

```python
response = requests.get("https://api.example.com/data")
data = response.json()
print(data)
```
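A slightly more defensive sketch of the same call, checking the status code before parsing (the `params` value is a hypothetical query parameter):

```python
response = requests.get(
    "https://api.example.com/data",
    params={"page": 1},  # Hypothetical query parameter
    timeout=10,          # Fail fast on unresponsive servers
)
response.raise_for_status()  # Raise an error on 4xx/5xx responses
data = response.json()
```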
Set a `User-Agent` header so requests look like they come from a browser:

```python
headers = {'User-Agent': 'Mozilla/5.0'}
response = requests.get(url, headers=headers)
```

Save scraped data to a CSV file:

```python
import csv

with open('data.csv', 'w', newline='') as file:
    writer = csv.writer(file)
    writer.writerow(["Title", "Link"])
    writer.writerow(["Example Title", "https://example.com"])
```
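To write many scraped rows at once, `writer.writerows()` accepts a list of tuples; a minimal sketch reusing the article selectors from earlier (the class names are the same assumptions as above):

```python
import csv

rows = [(article.find('h2').text, article.find('a')['href'])
        for article in soup.find_all('div', class_='article')]

with open('articles.csv', 'w', newline='') as file:
    writer = csv.writer(file)
    writer.writerow(["Title", "Link"])  # Header row
    writer.writerows(rows)              # All scraped rows at once
```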