From 6677000878f03d01f6b97c4a5dcfbba495d14349 Mon Sep 17 00:00:00 2001 From: Manuel Vergara Date: Wed, 4 Oct 2023 23:17:38 +0200 Subject: [PATCH] Add exercises 21 - Solved 1 Signed-off-by: Manuel Vergara --- .../22_Web_scraping/01_web_scraping.py | 91 ++----------------- 30-days-of-python/22_Web_scraping/README.md | 6 +- .../22_Web_scraping/bu_stats.json | 33 +++++-- 3 files changed, 37 insertions(+), 93 deletions(-) diff --git a/30-days-of-python/22_Web_scraping/01_web_scraping.py b/30-days-of-python/22_Web_scraping/01_web_scraping.py index 042e70a..3dbee5d 100644 --- a/30-days-of-python/22_Web_scraping/01_web_scraping.py +++ b/30-days-of-python/22_Web_scraping/01_web_scraping.py @@ -1,6 +1,3 @@ -""" -01_web_scraping.py -""" import requests from bs4 import BeautifulSoup import json @@ -16,94 +13,20 @@ response = requests.get(url) soup = BeautifulSoup(response.text, 'html.parser') data = {} -current_section = None -for section in soup.find_all('section', {'class': 'facts-categories'}): +for item in soup.find_all('div', {'class': 'facts-wrapper'}): + section_name = item.find('h5').get_text().strip() section_data = {} - for item in section.find_all('div', {'class': 'facts-wrapper'}): - section_name = section.find('h5').get_text().strip() - for li in item.find_all('li'): - key = li.find('p', {'class': 'text'}).get_text().strip() - value = li.find('span', {'class': 'value'}).get_text().strip() - section_data[key] = value + for li in item.find_all('li'): + key = li.find('p', {'class': 'text'}).get_text().strip() + value = li.find('span', {'class': 'value'}).get_text().strip() + section_data[key] = value - data[section_name] = section_data + data[section_name] = section_data with open('bu_stats.json', 'w') as f: json.dump(data, f, indent=2) print("Datos guardados en bu_stats.json") - -# 2. Extrae la tabla de esta URL -# (https://archive.ics.uci.edu/ml/datasets.php) -# y conviértela en un archivo JSON. - -url = 'https://webcache.googleusercontent.com/search?q=cache:tT4BY9X5RxAJ:https://archive.ics.uci.edu/datasets&cd=8&hl=ca&ct=clnk&gl=es' -response = requests.get(url) -soup = BeautifulSoup(response.text, 'html.parser') - -datasets = [] - -for div in soup.find_all('div', class_='rounded-box'): - dataset = { - 'name': div.find('h2').find('a').text.strip(), - 'description': div.find('p').text.strip(), - } - - metadata_divs = div.find_all('div', class_='col-span-3') - for metadata_div in metadata_divs: - icon = metadata_div.find('div').find('svg')['viewBox'] - value = metadata_div.find('span').text.strip() - dataset[icon] = value - - datasets.append(dataset) - -with open('uci_datasets.json', 'w') as f: - json.dump(datasets, f, indent=2) - -print("Datos guardados en uci_datasets.json") - - -# 3. Realiza un raspado web de la tabla de presidentes -# y guarda los datos como JSON -# (https://en.wikipedia.org/wiki/List_of_presidents_of_the_United_States). -# La tabla no está muy estructurada -# y el proceso de raspado puede llevar mucho tiempo. - - -url = 'https://en.wikipedia.org/wiki/List_of_presidents_of_the_United_States' -response = requests.get(url) -soup = BeautifulSoup(response.text, 'html.parser') - -table = soup.find('table', {'class': 'wikitable'}) -headers = [header.get_text().strip() for header in table.find_all('th')] -rows = [] - -for row in table.find_all('tr'): - cells = row.find_all('td') - if len(cells) == len(headers): - rows.append([cell.get_text().strip() for cell in cells]) - -data = [] -for row in rows: - president = {} - for i, header in enumerate(headers): - if i < len(row): # Verificar si hay celdas suficientes en la fila - if header == 'President': - president['name'] = row[i] - elif header == 'Party': - president['party'] = row[i] - elif header == 'State[a]': - president['state'] = row[i] - elif header == 'Took office': - president['took_office'] = row[i] - elif header == 'Left office': - president['left_office'] = row[i] - data.append(president) - -with open('us_presidents.json', 'w') as f: - json.dump(data, f, indent=2) - -print("Datos guardados en us_presidents.json") diff --git a/30-days-of-python/22_Web_scraping/README.md b/30-days-of-python/22_Web_scraping/README.md index 54790ac..13fdaf5 100644 --- a/30-days-of-python/22_Web_scraping/README.md +++ b/30-days-of-python/22_Web_scraping/README.md @@ -6,10 +6,14 @@ Documento original en inglés: [Web Scraping](https://github.com/Asabeneh/30-Day 1. Realiza un raspado web del siguiente sitio web y guarda los datos en un archivo JSON (URL = 'http://www.bu.edu/president/boston-university-facts-stats/'). +[Solución](01_web_scraping.py) + 2. Extrae la tabla de esta URL (https://archive.ics.uci.edu/ml/datasets.php) y conviértela en un archivo JSON. +[Solución](02_web_scraping.py) + 3. Realiza un raspado web de la tabla de presidentes y guarda los datos como JSON (https://en.wikipedia.org/wiki/List_of_presidents_of_the_United_States). La tabla no está muy estructurada y el proceso de raspado puede llevar mucho tiempo. -[Solución](01_web_scraping.py) +[Solución](03_web_scraping.py) [<< Day 21](../21_Clases_y_objetos/README.md) | [Day 23 >>](../23_Entorno_virtual/README.md) diff --git a/30-days-of-python/22_Web_scraping/bu_stats.json b/30-days-of-python/22_Web_scraping/bu_stats.json index 86023a1..b64c1ae 100644 --- a/30-days-of-python/22_Web_scraping/bu_stats.json +++ b/30-days-of-python/22_Web_scraping/bu_stats.json @@ -6,24 +6,35 @@ "Faculty": "4,309", "Nondegree Students": "1,337", "Graduate & Professional Students": "18,476", - "Undergraduate Students": "17,744", + "Undergraduate Students": "17,744" + }, + "Campus": { "Classrooms": "848", "Buildings": "343", "Laboratories": "1,481", "Libraries": "13", - "Campus Area (acres)": "140", + "Campus Area (acres)": "140" + }, + "Academics": { "Study Abroad Programs": "80+", "Average Class Size": "30", + "Faculty": "4,309", "Student/Faculty Ratio": "11:1", "Schools and Colleges": "17", - "Programs of Study": "300+", + "Programs of Study": "300+" + }, + "Grant & Contract Awards": { "Research Expenditures (FY22)": "$630.7M", "Research Awards": "$674M", - "BMC Clinical Research Grants (FY22)": "$82M", + "BMC Clinical Research Grants (FY22)": "$82M" + }, + "Undergraduate Financial Aid & Scholarships": { "Average Total Need-Based Financial Aid": "$57,237", "Average Need-Based Grant/Scholarship": "$53,029", "Grants & Scholarships (need-based)": "$388.4M", - "Grants & Scholarships (non-need-based)": "$26.5M", + "Grants & Scholarships (non-need-based)": "$26.5M" + }, + "Student Life": { "Community Service Hours": "130,000+", "Alternative Service Breaks Participants": "65+", "BU on Social": "new accounts daily", @@ -32,14 +43,20 @@ "Academic & Professional Organizations": "140+", "Art & Performance Organizations": "60+", "Student Organizations": "450+", - "First-Year Student Outreach Project Volunteers": "400+", + "First-Year Student Outreach Project Volunteers": "400+" + }, + "Research": { "Faculty Publications": "7,000+", "Student UROP Participants": "450+", - "Centers & Institutes": "130+", + "Centers & Institutes": "130+" + }, + "International Community": { "Global Initiatives": "300+", "Cultural Student Groups": "60+", "Alumni Countries": "180+", - "International Students": "10,000+", + "International Students": "10,000+" + }, + "Athletics": { "Intramural Sports & Tournaments": "12+", "Club and Intramural Sports Participants": "7,000+", "Club Sports": "36",