Curso-lenguaje-python/30-days-of-python/22_Web_scraping/01_web_scraping.py

"""
01_web_scraping.py
"""
import requests
from bs4 import BeautifulSoup
import json


# 1. Scrape the following website and store the data in a JSON file
# (URL = 'http://www.bu.edu/president/boston-university-facts-stats/').
#
# Output: bu_stats.json, a mapping {section title: {fact label: fact value}}.


url = 'http://www.bu.edu/president/boston-university-facts-stats/'
# Without a timeout requests can block forever on an unresponsive server.
response = requests.get(url, timeout=30)
# Fail loudly on 4xx/5xx instead of parsing an error page and "saving" it.
response.raise_for_status()
soup = BeautifulSoup(response.text, 'html.parser')

data = {}

# Every <section class="facts-categories"> is one category, titled by the
# first <h5> found inside it.
for section in soup.find_all('section', {'class': 'facts-categories'}):
    heading = section.find('h5')
    wrappers = section.find_all('div', {'class': 'facts-wrapper'})
    # A section is only recorded when it has at least one facts wrapper;
    # a section without a title cannot be keyed, so it is skipped as well.
    if heading is None or not wrappers:
        continue

    section_data = {}
    for wrapper in wrappers:
        for li in wrapper.find_all('li'):
            label = li.find('p', {'class': 'text'})
            figure = li.find('span', {'class': 'value'})
            # Skip list items that do not carry both a label and a value
            # rather than crashing on a missing tag.
            if label is None or figure is None:
                continue
            section_data[label.get_text().strip()] = figure.get_text().strip()

    data[heading.get_text().strip()] = section_data

# Explicit UTF-8 plus ensure_ascii=False keeps any non-ASCII text from the
# page readable in the file, independent of the platform default encoding.
with open('bu_stats.json', 'w', encoding='utf-8') as f:
    json.dump(data, f, indent=2, ensure_ascii=False)

print("Datos guardados en bu_stats.json")

# 2. Extract the table from this URL
# (https://archive.ics.uci.edu/ml/datasets.php)
# and convert it into a JSON file.
#
# The page is fetched through a Google web-cache URL rather than directly.
# Output: uci_datasets.json, a list with one dict per dataset box.

url = 'https://webcache.googleusercontent.com/search?q=cache:tT4BY9X5RxAJ:https://archive.ics.uci.edu/datasets&cd=8&hl=ca&ct=clnk&gl=es'
# Without a timeout requests can block forever on an unresponsive server.
response = requests.get(url, timeout=30)
# Fail loudly on 4xx/5xx instead of writing an empty list and reporting
# success.
response.raise_for_status()
soup = BeautifulSoup(response.text, 'html.parser')

datasets = []

for div in soup.find_all('div', class_='rounded-box'):
    title = div.find('h2')
    link = title.find('a') if title is not None else None
    summary = div.find('p')
    # A box without a linked title or a description is not a dataset entry.
    if link is None or summary is None:
        continue

    dataset = {
        'name': link.text.strip(),
        'description': summary.text.strip(),
    }

    for metadata_div in div.find_all('div', class_='col-span-3'):
        icon_holder = metadata_div.find('div')
        svg = icon_holder.find('svg') if icon_holder is not None else None
        value_tag = metadata_div.find('span')
        if svg is None or value_tag is None:
            continue
        # html.parser lower-cases attribute names, so the SVG "viewBox"
        # attribute is stored as "viewbox"; the mixed-case lookup is kept
        # as a fallback for parsers that preserve the original case.
        icon = svg.get('viewbox', svg.get('viewBox'))
        if icon is None:
            continue
        # NOTE(review): the viewBox value is used as the metadata key, so
        # entries whose icons share the same viewBox overwrite one another.
        dataset[icon] = value_tag.text.strip()

    datasets.append(dataset)

# Explicit UTF-8 plus ensure_ascii=False keeps any non-ASCII text from the
# page readable in the file, independent of the platform default encoding.
with open('uci_datasets.json', 'w', encoding='utf-8') as f:
    json.dump(datasets, f, indent=2, ensure_ascii=False)

print("Datos guardados en uci_datasets.json")


# 3. Scrape the table of presidents and store the data as JSON
# (https://en.wikipedia.org/wiki/List_of_presidents_of_the_United_States).
# The table is not very well structured and scraping it may take a long time.
#
# Output: us_presidents.json, a list with one dict per matching table row.


url = 'https://en.wikipedia.org/wiki/List_of_presidents_of_the_United_States'
# Without a timeout requests can block forever on an unresponsive server.
response = requests.get(url, timeout=30)
# Fail loudly on 4xx/5xx instead of parsing an error page.
response.raise_for_status()
soup = BeautifulSoup(response.text, 'html.parser')

# Only the first table with the "wikitable" class is processed.
table = soup.find('table', {'class': 'wikitable'})
if table is None:
    raise RuntimeError("No se encontró ninguna tabla 'wikitable' en la página")

# Column label (footnote markers such as "[a]" removed) -> output key.
# NOTE(review): these are the labels the script has always looked for;
# confirm them against the live table, otherwise the records stay empty.
FIELD_BY_HEADER = {
    'President': 'name',
    'Party': 'party',
    'State': 'state',
    'Took office': 'took_office',
    'Left office': 'left_office',
}

table_rows = table.find_all('tr')

# The header row is the first row made of <th> cells only.  Collecting
# every <th> in the table would also pick up <th> cells used inside data
# rows and make the column count wrong.
header_row = None
for tr in table_rows:
    if tr.find('th') is not None and tr.find('td') is None:
        header_row = tr
        break

headers = []
if header_row is not None:
    for th in header_row.find_all('th'):
        label = th.get_text().strip()
        # A header spanning several columns is repeated so that header
        # positions line up with the individual cells of a data row.
        span = str(th.get('colspan', '1')).strip()
        headers.extend([label] * (int(span) if span.isdigit() else 1))

rows = []
for tr in table_rows:
    # Skip the header row and any other row without data cells.
    if tr is header_row or tr.find('td') is None:
        continue
    # Count <th> and <td> alike: a data row may start with a <th> cell.
    cells = tr.find_all(['th', 'td'])
    # Rows whose width differs from the header (e.g. continuation rows
    # produced by rowspan) cannot be aligned by position and are skipped.
    if len(cells) == len(headers):
        rows.append([cell.get_text().strip() for cell in cells])

data = []
for row in rows:
    president = {}
    for header, cell_text in zip(headers, row):
        # Drop trailing footnote markers ("State[a]" -> "State") before
        # looking the label up.
        field = FIELD_BY_HEADER.get(header.split('[')[0].strip())
        if field is not None:
            president[field] = cell_text
    data.append(president)

# Explicit UTF-8 plus ensure_ascii=False keeps non-ASCII characters from
# the page readable in the file, independent of the platform default.
with open('us_presidents.json', 'w', encoding='utf-8') as f:
    json.dump(data, f, indent=2, ensure_ascii=False)

print("Datos guardados en us_presidents.json")
Add exercises 21 - Not solved Signed-off-by: Manuel Vergara <manuel@vergaracarmona.es> 2023-10-03 23:50:51 +02:00			`"""`
			`01_web_scraping.py`
			`"""`
			`import requests`
			`from bs4 import BeautifulSoup`
			`import json`


			`# 1. Realiza un raspado web del siguiente sitio web`
			`# y guarda los datos en un archivo JSON`
			`# (URL = 'http://www.bu.edu/president/boston-university-facts-stats/').`


			`url = 'http://www.bu.edu/president/boston-university-facts-stats/'`
			`response = requests.get(url)`
			`soup = BeautifulSoup(response.text, 'html.parser')`

			`data = {}`
			`current_section = None`

			`for section in soup.find_all('section', {'class': 'facts-categories'}):`

			`section_data = {}`

			`for item in section.find_all('div', {'class': 'facts-wrapper'}):`
			`section_name = section.find('h5').get_text().strip()`
			`for li in item.find_all('li'):`
			`key = li.find('p', {'class': 'text'}).get_text().strip()`
			`value = li.find('span', {'class': 'value'}).get_text().strip()`
			`section_data[key] = value`

			`data[section_name] = section_data`

			`with open('bu_stats.json', 'w') as f:`
			`json.dump(data, f, indent=2)`

			`print("Datos guardados en bu_stats.json")`

			`# 2. Extrae la tabla de esta URL`
			`# (https://archive.ics.uci.edu/ml/datasets.php)`
			`# y conviértela en un archivo JSON.`

			`url = 'https://webcache.googleusercontent.com/search?q=cache:tT4BY9X5RxAJ:https://archive.ics.uci.edu/datasets&cd=8&hl=ca&ct=clnk&gl=es'`
			`response = requests.get(url)`
			`soup = BeautifulSoup(response.text, 'html.parser')`

			`datasets = []`

			`for div in soup.find_all('div', class_='rounded-box'):`
			`dataset = {`
			`'name': div.find('h2').find('a').text.strip(),`
			`'description': div.find('p').text.strip(),`
			`}`

			`metadata_divs = div.find_all('div', class_='col-span-3')`
			`for metadata_div in metadata_divs:`
			`icon = metadata_div.find('div').find('svg')['viewBox']`
			`value = metadata_div.find('span').text.strip()`
			`dataset[icon] = value`

			`datasets.append(dataset)`

			`with open('uci_datasets.json', 'w') as f:`
			`json.dump(datasets, f, indent=2)`

			`print("Datos guardados en uci_datasets.json")`


			`# 3. Realiza un raspado web de la tabla de presidentes`
			`# y guarda los datos como JSON`
			`# (https://en.wikipedia.org/wiki/List_of_presidents_of_the_United_States).`
			`# La tabla no está muy estructurada`
			`# y el proceso de raspado puede llevar mucho tiempo.`


			`url = 'https://en.wikipedia.org/wiki/List_of_presidents_of_the_United_States'`
			`response = requests.get(url)`
			`soup = BeautifulSoup(response.text, 'html.parser')`

			`table = soup.find('table', {'class': 'wikitable'})`
			`headers = [header.get_text().strip() for header in table.find_all('th')]`
			`rows = []`

			`for row in table.find_all('tr'):`
			`cells = row.find_all('td')`
			`if len(cells) == len(headers):`
			`rows.append([cell.get_text().strip() for cell in cells])`

			`data = []`
			`for row in rows:`
			`president = {}`
			`for i, header in enumerate(headers):`
			`if i < len(row): # Verificar si hay celdas suficientes en la fila`
			`if header == 'President':`
			`president['name'] = row[i]`
			`elif header == 'Party':`
			`president['party'] = row[i]`
			`elif header == 'State[a]':`
			`president['state'] = row[i]`
			`elif header == 'Took office':`
			`president['took_office'] = row[i]`
			`elif header == 'Left office':`
			`president['left_office'] = row[i]`
			`data.append(president)`

			`with open('us_presidents.json', 'w') as f:`
			`json.dump(data, f, indent=2)`

			`print("Datos guardados en us_presidents.json")`