Add exercises 22 - Not solved
Signed-off-by: Manuel Vergara <manuel@vergaracarmona.es>
This commit is contained in:
parent
d34da87ee0
commit
2cd9b703b2
109
30-days-of-python/22_Web_scraping/01_web_scraping.py
Normal file
109
30-days-of-python/22_Web_scraping/01_web_scraping.py
Normal file
@ -0,0 +1,109 @@
|
|||||||
|
"""
|
||||||
|
01_web_scraping.py
|
||||||
|
"""
|
||||||
|
import requests
|
||||||
|
from bs4 import BeautifulSoup
|
||||||
|
import json
|
||||||
|
|
||||||
|
|
||||||
|
# 1. Scrape the following website and save the data in a JSON file
# (URL = 'http://www.bu.edu/president/boston-university-facts-stats/').

url = 'http://www.bu.edu/president/boston-university-facts-stats/'
# Timeout so a hung server cannot block the script forever; fail fast on HTTP errors.
response = requests.get(url, timeout=10)
response.raise_for_status()
soup = BeautifulSoup(response.text, 'html.parser')

# Maps section heading -> {fact label: fact value}.
data = {}

for section in soup.find_all('section', {'class': 'facts-categories'}):
    heading = section.find('h5')
    if heading is None:
        # Section without a heading cannot be keyed; skip it instead of crashing.
        continue
    # Hoisted out of the inner loop: the section name is invariant per section.
    section_name = heading.get_text().strip()
    section_data = {}
    for item in section.find_all('div', {'class': 'facts-wrapper'}):
        for li in item.find_all('li'):
            key_tag = li.find('p', {'class': 'text'})
            value_tag = li.find('span', {'class': 'value'})
            # Guard against malformed list items (missing label or value).
            if key_tag is not None and value_tag is not None:
                section_data[key_tag.get_text().strip()] = value_tag.get_text().strip()
    data[section_name] = section_data

# ensure_ascii=False keeps non-ASCII characters readable in the output file.
with open('bu_stats.json', 'w', encoding='utf-8') as f:
    json.dump(data, f, indent=2, ensure_ascii=False)

print("Datos guardados en bu_stats.json")
||||||
|
# 2. Extract the table from this URL
# (https://archive.ics.uci.edu/ml/datasets.php)
# and convert it into a JSON file.
# NOTE(review): this is a Google cache URL and is likely stale/dead — the
# committed uci_datasets.json is empty; confirm the selectors against the
# live archive.ics.uci.edu markup.

url = 'https://webcache.googleusercontent.com/search?q=cache:tT4BY9X5RxAJ:https://archive.ics.uci.edu/datasets&cd=8&hl=ca&ct=clnk&gl=es'
response = requests.get(url, timeout=10)
soup = BeautifulSoup(response.text, 'html.parser')

# One dict per dataset card found on the page.
datasets = []

for div in soup.find_all('div', class_='rounded-box'):
    title = div.find('h2')
    link = title.find('a') if title is not None else None
    if link is None:
        # Card without a title link: nothing usable to record.
        continue
    description = div.find('p')
    dataset = {
        'name': link.text.strip(),
        'description': description.text.strip() if description is not None else '',
    }
    # Each metadata column holds an svg icon (identified by its viewBox) and a value.
    for metadata_div in div.find_all('div', class_='col-span-3'):
        icon_holder = metadata_div.find('div')
        svg = icon_holder.find('svg') if icon_holder is not None else None
        value_tag = metadata_div.find('span')
        if svg is not None and svg.has_attr('viewBox') and value_tag is not None:
            dataset[svg['viewBox']] = value_tag.text.strip()
    datasets.append(dataset)

with open('uci_datasets.json', 'w', encoding='utf-8') as f:
    json.dump(datasets, f, indent=2, ensure_ascii=False)

print("Datos guardados en uci_datasets.json")
|
# 3. Scrape the presidents table and save the data as JSON
# (https://en.wikipedia.org/wiki/List_of_presidents_of_the_United_States).
# The table is not well structured and scraping can take a long time.

# Wikipedia column header -> JSON key. Headers not listed here are ignored.
HEADER_TO_KEY = {
    'President': 'name',
    'Party': 'party',
    'State[a]': 'state',
    'Took office': 'took_office',
    'Left office': 'left_office',
}

url = 'https://en.wikipedia.org/wiki/List_of_presidents_of_the_United_States'
response = requests.get(url, timeout=10)
response.raise_for_status()
soup = BeautifulSoup(response.text, 'html.parser')

table = soup.find('table', {'class': 'wikitable'})
headers = [header.get_text().strip() for header in table.find_all('th')]

# Keep only rows whose cell count matches the header count; other rows are
# spanned/irregular and cannot be zipped against the headers.
rows = []
for row in table.find_all('tr'):
    cells = row.find_all('td')
    if len(cells) == len(headers):
        rows.append([cell.get_text().strip() for cell in cells])

data = []
for row in rows:
    # Dict comprehension replaces the original if/elif chain over headers.
    president = {
        HEADER_TO_KEY[header]: row[i]
        for i, header in enumerate(headers)
        if i < len(row) and header in HEADER_TO_KEY
    }
    data.append(president)

with open('us_presidents.json', 'w', encoding='utf-8') as f:
    json.dump(data, f, indent=2, ensure_ascii=False)

print("Datos guardados en us_presidents.json")
|
@ -10,4 +10,6 @@ Documento original en inglés: [Web Scraping](https://github.com/Asabeneh/30-Day
|
|||||||
|
|
||||||
3. Realiza un raspado web de la tabla de presidentes y guarda los datos como JSON (https://en.wikipedia.org/wiki/List_of_presidents_of_the_United_States). La tabla no está muy estructurada y el proceso de raspado puede llevar mucho tiempo.
|
3. Realiza un raspado web de la tabla de presidentes y guarda los datos como JSON (https://en.wikipedia.org/wiki/List_of_presidents_of_the_United_States). La tabla no está muy estructurada y el proceso de raspado puede llevar mucho tiempo.
|
||||||
|
|
||||||
|
[Solución](01_web_scraping.py)
|
||||||
|
|
||||||
[<< Day 21](../21_Clases_y_objetos/README.md) | [Day 23 >>](../23_Entorno_virtual/README.md)
|
[<< Day 21](../21_Clases_y_objetos/README.md) | [Day 23 >>](../23_Entorno_virtual/README.md)
|
||||||
|
48
30-days-of-python/22_Web_scraping/bu_stats.json
Normal file
48
30-days-of-python/22_Web_scraping/bu_stats.json
Normal file
@ -0,0 +1,48 @@
|
|||||||
|
{
|
||||||
|
"Community": {
|
||||||
|
"Student Body": "37,557",
|
||||||
|
"Living Alumni": "431,000+",
|
||||||
|
"Total Employees": "10,674",
|
||||||
|
"Faculty": "4,309",
|
||||||
|
"Nondegree Students": "1,337",
|
||||||
|
"Graduate & Professional Students": "18,476",
|
||||||
|
"Undergraduate Students": "17,744",
|
||||||
|
"Classrooms": "848",
|
||||||
|
"Buildings": "343",
|
||||||
|
"Laboratories": "1,481",
|
||||||
|
"Libraries": "13",
|
||||||
|
"Campus Area (acres)": "140",
|
||||||
|
"Study Abroad Programs": "80+",
|
||||||
|
"Average Class Size": "30",
|
||||||
|
"Student/Faculty Ratio": "11:1",
|
||||||
|
"Schools and Colleges": "17",
|
||||||
|
"Programs of Study": "300+",
|
||||||
|
"Research Expenditures (FY22)": "$630.7M",
|
||||||
|
"Research Awards": "$674M",
|
||||||
|
"BMC Clinical Research Grants (FY22)": "$82M",
|
||||||
|
"Average Total Need-Based Financial Aid": "$57,237",
|
||||||
|
"Average Need-Based Grant/Scholarship": "$53,029",
|
||||||
|
"Grants & Scholarships (need-based)": "$388.4M",
|
||||||
|
"Grants & Scholarships (non-need-based)": "$26.5M",
|
||||||
|
"Community Service Hours": "130,000+",
|
||||||
|
"Alternative Service Breaks Participants": "65+",
|
||||||
|
"BU on Social": "new accounts daily",
|
||||||
|
"Cultural & Religious Organizations": "80+",
|
||||||
|
"Community Service & Justice Organizations": "70+",
|
||||||
|
"Academic & Professional Organizations": "140+",
|
||||||
|
"Art & Performance Organizations": "60+",
|
||||||
|
"Student Organizations": "450+",
|
||||||
|
"First-Year Student Outreach Project Volunteers": "400+",
|
||||||
|
"Faculty Publications": "7,000+",
|
||||||
|
"Student UROP Participants": "450+",
|
||||||
|
"Centers & Institutes": "130+",
|
||||||
|
"Global Initiatives": "300+",
|
||||||
|
"Cultural Student Groups": "60+",
|
||||||
|
"Alumni Countries": "180+",
|
||||||
|
"International Students": "10,000+",
|
||||||
|
"Intramural Sports & Tournaments": "12+",
|
||||||
|
"Club and Intramural Sports Participants": "7,000+",
|
||||||
|
"Club Sports": "36",
|
||||||
|
"Varsity Sports": "24"
|
||||||
|
}
|
||||||
|
}
|
1
30-days-of-python/22_Web_scraping/uci_datasets.json
Normal file
1
30-days-of-python/22_Web_scraping/uci_datasets.json
Normal file
@ -0,0 +1 @@
|
|||||||
|
[]
|
1
30-days-of-python/22_Web_scraping/us_presidents.json
Normal file
1
30-days-of-python/22_Web_scraping/us_presidents.json
Normal file
@ -0,0 +1 @@
|
|||||||
|
[]
|
Loading…
Reference in New Issue
Block a user