Add exercises 21 - Solved 1
Signed-off-by: Manuel Vergara <manuel@vergaracarmona.es>
@@ -1,6 +1,3 @@
"""
01_web_scraping.py
"""

import requests
from bs4 import BeautifulSoup
import json
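
# The solutions below use third-party packages; if they are missing, they can be
# installed with, for example: pip install requests beautifulsoup4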
@@ -16,94 +13,20 @@ response = requests.get(url)
soup = BeautifulSoup(response.text, 'html.parser')

data = {}

# Walk every facts category; each wrapper holds an h5 title and a list of key/value items.
for section in soup.find_all('section', {'class': 'facts-categories'}):
    for item in section.find_all('div', {'class': 'facts-wrapper'}):
        section_name = item.find('h5').get_text().strip()
        section_data = {}

        for li in item.find_all('li'):
            key = li.find('p', {'class': 'text'}).get_text().strip()
            value = li.find('span', {'class': 'value'}).get_text().strip()
            section_data[key] = value

        data[section_name] = section_data

with open('bu_stats.json', 'w') as f:
    json.dump(data, f, indent=2)

print("Data saved to bu_stats.json")
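
# Optional sanity check: reload the file just written and list the section names it
# captured (assumes the request above succeeded and bu_stats.json was created).
with open('bu_stats.json') as f:
    saved = json.load(f)
print(list(saved.keys()))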


# 2. Extract the table from this URL
# (https://archive.ics.uci.edu/ml/datasets.php)
# and convert it into a JSON file.

url = 'https://webcache.googleusercontent.com/search?q=cache:tT4BY9X5RxAJ:https://archive.ics.uci.edu/datasets&cd=8&hl=ca&ct=clnk&gl=es'
response = requests.get(url)
soup = BeautifulSoup(response.text, 'html.parser')

datasets = []

for div in soup.find_all('div', class_='rounded-box'):
    dataset = {
        'name': div.find('h2').find('a').text.strip(),
        'description': div.find('p').text.strip(),
    }

    # Each metadata column is identified by its icon's SVG viewBox and holds a value span.
    metadata_divs = div.find_all('div', class_='col-span-3')
    for metadata_div in metadata_divs:
        icon = metadata_div.find('div').find('svg')['viewBox']
        value = metadata_div.find('span').text.strip()
        dataset[icon] = value

    datasets.append(dataset)

with open('uci_datasets.json', 'w') as f:
    json.dump(datasets, f, indent=2)

print("Data saved to uci_datasets.json")
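
# The viewBox string above is only a stand-in key for each metadata field. If readable
# names are preferred, a mapping like the hypothetical one below (the viewBox values are
# illustrative, not taken from the real page) can rename the keys after scraping.
VIEWBOX_LABELS = {'0 0 24 24': 'instances'}  # hypothetical mapping entry

for entry in datasets:
    for raw_key, label in VIEWBOX_LABELS.items():
        if raw_key in entry:
            entry[label] = entry.pop(raw_key)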


# 3. Scrape the presidents table and save the data as JSON
# (https://en.wikipedia.org/wiki/List_of_presidents_of_the_United_States).
# The table is not very well structured
# and the scraping process can take a long time.

url = 'https://en.wikipedia.org/wiki/List_of_presidents_of_the_United_States'
response = requests.get(url)
soup = BeautifulSoup(response.text, 'html.parser')

table = soup.find('table', {'class': 'wikitable'})
headers = [header.get_text().strip() for header in table.find_all('th')]
rows = []

# Keep only rows whose cell count matches the header row.
for row in table.find_all('tr'):
    cells = row.find_all('td')
    if len(cells) == len(headers):
        rows.append([cell.get_text().strip() for cell in cells])

data = []
for row in rows:
    president = {}
    for i, header in enumerate(headers):
        if i < len(row):  # Check that the row has enough cells
            if header == 'President':
                president['name'] = row[i]
            elif header == 'Party':
                president['party'] = row[i]
            elif header == 'State[a]':
                president['state'] = row[i]
            elif header == 'Took office':
                president['took_office'] = row[i]
            elif header == 'Left office':
                president['left_office'] = row[i]
    data.append(president)

with open('us_presidents.json', 'w') as f:
    json.dump(data, f, indent=2)

print("Data saved to us_presidents.json")