Add exercises 21 - Solved 1

Signed-off-by: Manuel Vergara <manuel@vergaracarmona.es>
Manuel Vergara 2023-10-04 23:17:38 +02:00
parent 2cd9b703b2
commit 6677000878
3 changed files with 37 additions and 93 deletions

01_web_scraping.py

@@ -1,6 +1,3 @@
-"""
-01_web_scraping.py
-"""
 import requests
 from bs4 import BeautifulSoup
 import json
@@ -16,14 +13,12 @@ response = requests.get(url)
 soup = BeautifulSoup(response.text, 'html.parser')
 data = {}
-current_section = None
-for section in soup.find_all('section', {'class': 'facts-categories'}):
+for item in soup.find_all('div', {'class': 'facts-wrapper'}):
+    section_name = item.find('h5').get_text().strip()
     section_data = {}
-    for item in section.find_all('div', {'class': 'facts-wrapper'}):
-        section_name = section.find('h5').get_text().strip()
     for li in item.find_all('li'):
         key = li.find('p', {'class': 'text'}).get_text().strip()
         value = li.find('span', {'class': 'value'}).get_text().strip()
@@ -35,75 +30,3 @@ with open('bu_stats.json', 'w') as f:
     json.dump(data, f, indent=2)
 print("Datos guardados en bu_stats.json")
-
-# 2. Extract the table from this URL
-# (https://archive.ics.uci.edu/ml/datasets.php)
-# and convert it into a JSON file.
-url = 'https://webcache.googleusercontent.com/search?q=cache:tT4BY9X5RxAJ:https://archive.ics.uci.edu/datasets&cd=8&hl=ca&ct=clnk&gl=es'
-response = requests.get(url)
-soup = BeautifulSoup(response.text, 'html.parser')
-datasets = []
-for div in soup.find_all('div', class_='rounded-box'):
-    dataset = {
-        'name': div.find('h2').find('a').text.strip(),
-        'description': div.find('p').text.strip(),
-    }
-    metadata_divs = div.find_all('div', class_='col-span-3')
-    for metadata_div in metadata_divs:
-        icon = metadata_div.find('div').find('svg')['viewBox']
-        value = metadata_div.find('span').text.strip()
-        dataset[icon] = value
-    datasets.append(dataset)
-with open('uci_datasets.json', 'w') as f:
-    json.dump(datasets, f, indent=2)
-print("Datos guardados en uci_datasets.json")
-
-# 3. Scrape the presidents table
-# and save the data as JSON
-# (https://en.wikipedia.org/wiki/List_of_presidents_of_the_United_States).
-# The table is not very well structured
-# and the scraping process can take a long time.
-url = 'https://en.wikipedia.org/wiki/List_of_presidents_of_the_United_States'
-response = requests.get(url)
-soup = BeautifulSoup(response.text, 'html.parser')
-table = soup.find('table', {'class': 'wikitable'})
-headers = [header.get_text().strip() for header in table.find_all('th')]
-rows = []
-for row in table.find_all('tr'):
-    cells = row.find_all('td')
-    if len(cells) == len(headers):
-        rows.append([cell.get_text().strip() for cell in cells])
-data = []
-for row in rows:
-    president = {}
-    for i, header in enumerate(headers):
-        if i < len(row):  # check that the row has enough cells
-            if header == 'President':
-                president['name'] = row[i]
-            elif header == 'Party':
-                president['party'] = row[i]
-            elif header == 'State[a]':
-                president['state'] = row[i]
-            elif header == 'Took office':
-                president['took_office'] = row[i]
-            elif header == 'Left office':
-                president['left_office'] = row[i]
-    data.append(president)
-with open('us_presidents.json', 'w') as f:
-    json.dump(data, f, indent=2)
-print("Datos guardados en us_presidents.json")

README.md

@@ -6,10 +6,14 @@ Original document in English: [Web Scraping](https://github.com/Asabeneh/30-Day
 1. Scrape the following website and save the data as a JSON file (URL = 'http://www.bu.edu/president/boston-university-facts-stats/').
+[Solution](01_web_scraping.py)
 2. Extract the table from this URL (https://archive.ics.uci.edu/ml/datasets.php) and convert it into a JSON file.
+[Solution](02_web_scraping.py)
 3. Scrape the presidents table and save the data as JSON (https://en.wikipedia.org/wiki/List_of_presidents_of_the_United_States). The table is not very well structured and the scraping process can take a long time.
-[Solution](01_web_scraping.py)
+[Solution](03_web_scraping.py)
 [<< Day 21](../21_Clases_y_objetos/README.md) | [Day 23 >>](../23_Entorno_virtual/README.md)

bu_stats.json

@@ -6,24 +6,35 @@
     "Faculty": "4,309",
     "Nondegree Students": "1,337",
     "Graduate & Professional Students": "18,476",
-    "Undergraduate Students": "17,744",
+    "Undergraduate Students": "17,744"
+  },
+  "Campus": {
     "Classrooms": "848",
     "Buildings": "343",
     "Laboratories": "1,481",
     "Libraries": "13",
-    "Campus Area (acres)": "140",
+    "Campus Area (acres)": "140"
+  },
+  "Academics": {
     "Study Abroad Programs": "80+",
     "Average Class Size": "30",
-    "Faculty": "4,309",
     "Student/Faculty Ratio": "11:1",
     "Schools and Colleges": "17",
-    "Programs of Study": "300+",
+    "Programs of Study": "300+"
+  },
+  "Grant & Contract Awards": {
     "Research Expenditures (FY22)": "$630.7M",
     "Research Awards": "$674M",
-    "BMC Clinical Research Grants (FY22)": "$82M",
+    "BMC Clinical Research Grants (FY22)": "$82M"
+  },
+  "Undergraduate Financial Aid & Scholarships": {
     "Average Total Need-Based Financial Aid": "$57,237",
     "Average Need-Based Grant/Scholarship": "$53,029",
     "Grants & Scholarships (need-based)": "$388.4M",
-    "Grants & Scholarships (non-need-based)": "$26.5M",
+    "Grants & Scholarships (non-need-based)": "$26.5M"
+  },
+  "Student Life": {
     "Community Service Hours": "130,000+",
     "Alternative Service Breaks Participants": "65+",
     "BU on Social": "new accounts daily",
@@ -32,14 +43,20 @@
     "Academic & Professional Organizations": "140+",
     "Art & Performance Organizations": "60+",
     "Student Organizations": "450+",
-    "First-Year Student Outreach Project Volunteers": "400+",
+    "First-Year Student Outreach Project Volunteers": "400+"
+  },
+  "Research": {
     "Faculty Publications": "7,000+",
     "Student UROP Participants": "450+",
-    "Centers & Institutes": "130+",
+    "Centers & Institutes": "130+"
+  },
+  "International Community": {
     "Global Initiatives": "300+",
     "Cultural Student Groups": "60+",
     "Alumni Countries": "180+",
-    "International Students": "10,000+",
+    "International Students": "10,000+"
+  },
+  "Athletics": {
     "Intramural Sports & Tournaments": "12+",
     "Club and Intramural Sports Participants": "7,000+",
     "Club Sports": "36",