Add exercises 21 - Solved 1

Signed-off-by: Manuel Vergara <manuel@vergaracarmona.es>
2023-10-04 23:17:38 +02:00
parent 2cd9b703b2
commit 6677000878
3 changed files with 37 additions and 93 deletions
--- a/30-days-of-python/22_Web_scraping/01_web_scraping.py
+++ b/30-days-of-python/22_Web_scraping/01_web_scraping.py
@@ -1,6 +1,3 @@
 """
 01_web_scraping.py
 """
 import requests
 from bs4 import BeautifulSoup
 import json
@@ -16,94 +13,20 @@ response = requests.get(url)
 soup = BeautifulSoup(response.text, 'html.parser')
 data = {}
 current_section = None
-for section in soup.find_all('section', {'class': 'facts-categories'}):
+for item in soup.find_all('div', {'class': 'facts-wrapper'}):
    section_name = item.find('h5').get_text().strip()
    section_data = {}
-    for item in section.find_all('div', {'class': 'facts-wrapper'}):
+    for li in item.find_all('li'):
-        section_name = section.find('h5').get_text().strip()
+        key = li.find('p', {'class': 'text'}).get_text().strip()
-        for li in item.find_all('li'):
+        value = li.find('span', {'class': 'value'}).get_text().strip()
-            key = li.find('p', {'class': 'text'}).get_text().strip()
+        section_data[key] = value
            value = li.find('span', {'class': 'value'}).get_text().strip()
            section_data[key] = value
-        data[section_name] = section_data
+    data[section_name] = section_data
 with open('bu_stats.json', 'w') as f:
    json.dump(data, f, indent=2)
 print("Datos guardados en bu_stats.json")
 # 2. Extract the table from this URL
 # (https://archive.ics.uci.edu/ml/datasets.php)
 # and convert it into a JSON file.
 # The page is fetched through the cached copy referenced below.
 url = 'https://webcache.googleusercontent.com/search?q=cache:tT4BY9X5RxAJ:https://archive.ics.uci.edu/datasets&cd=8&hl=ca&ct=clnk&gl=es'
 response = requests.get(url)
 soup = BeautifulSoup(response.text, 'html.parser')
 datasets = []
 # Every 'rounded-box' div is treated as one dataset entry.
 for box in soup.find_all('div', class_='rounded-box'):
    title_link = box.find('h2').find('a')
    summary = box.find('p')
    entry = {
        'name': title_link.text.strip(),
        'description': summary.text.strip(),
    }
    # Each 'col-span-3' div contributes one extra field: the key is the
    # viewBox attribute of its <svg> icon, the value is the <span> text.
    for cell in box.find_all('div', class_='col-span-3'):
        view_box = cell.find('div').find('svg')['viewBox']
        entry[view_box] = cell.find('span').text.strip()
    datasets.append(entry)
 # Persist the collected entries as pretty-printed JSON.
 with open('uci_datasets.json', 'w') as out_file:
    json.dump(datasets, out_file, indent=2)
 print("Datos guardados en uci_datasets.json")
 # 3. Scrape the table of presidents
 # and save the data as JSON
 # (https://en.wikipedia.org/wiki/List_of_presidents_of_the_United_States).
 # The table is not very structured
 # and the scraping process can take a long time.
 url = 'https://en.wikipedia.org/wiki/List_of_presidents_of_the_United_States'
 # A timeout keeps the script from hanging forever on a stalled connection,
 # and raise_for_status() stops us from parsing an HTTP error page as data.
 response = requests.get(url, timeout=30)
 response.raise_for_status()
 soup = BeautifulSoup(response.text, 'html.parser')
 table = soup.find('table', {'class': 'wikitable'})
 # find() returns None when nothing matches; fail with a clear message
 # instead of an AttributeError on the next line.
 if table is None:
    raise RuntimeError(f"No se encontró una tabla 'wikitable' en {url}")
 # NOTE(review): this collects every <th> in the table, not only those of the
 # header row; if body rows also contain <th> cells, the length check below
 # will reject those rows - verify against the live page markup.
 headers = [header.get_text().strip() for header in table.find_all('th')]
 # Column captions that are kept, mapped to the JSON key each is stored under.
 field_names = {
    'President': 'name',
    'Party': 'party',
    'State[a]': 'state',
    'Took office': 'took_office',
    'Left office': 'left_office',
 }
 data = []
 for row in table.find_all('tr'):
    cells = row.find_all('td')
    # Only rows with exactly one <td> per collected header are kept.
    if len(cells) != len(headers):
        continue
    values = [cell.get_text().strip() for cell in cells]
    # The length check above guarantees headers and values line up, so no
    # per-index bounds test is needed when pairing them.
    president = {
        field_names[header]: value
        for header, value in zip(headers, values)
        if header in field_names
    }
    data.append(president)
 # json.dump escapes non-ASCII by default; the explicit encoding just makes
 # the output independent of the platform's default.
 with open('us_presidents.json', 'w', encoding='utf-8') as f:
    json.dump(data, f, indent=2)
 print("Datos guardados en us_presidents.json")
--- a/30-days-of-python/22_Web_scraping/README.md
+++ b/30-days-of-python/22_Web_scraping/README.md
@@ -6,10 +6,14 @@ Documento original en inglés: [Web Scraping](https://github.com/Asabeneh/30-Day
 1. Realiza un raspado web del siguiente sitio web y guarda los datos en un archivo JSON (URL = 'http://www.bu.edu/president/boston-university-facts-stats/').
 [Solución](01_web_scraping.py)
 2. Extrae la tabla de esta URL (https://archive.ics.uci.edu/ml/datasets.php) y conviértela en un archivo JSON.
 [Solución](02_web_scraping.py)
 3. Realiza un raspado web de la tabla de presidentes y guarda los datos como JSON (https://en.wikipedia.org/wiki/List_of_presidents_of_the_United_States). La tabla no está muy estructurada y el proceso de raspado puede llevar mucho tiempo.
-[Solución](01_web_scraping.py)
+[Solución](03_web_scraping.py)
 [<< Day 21](../21_Clases_y_objetos/README.md) | [Day 23 >>](../23_Entorno_virtual/README.md)
--- a/30-days-of-python/22_Web_scraping/bu_stats.json
+++ b/30-days-of-python/22_Web_scraping/bu_stats.json
@@ -6,24 +6,35 @@
    "Faculty": "4,309",
    "Nondegree Students": "1,337",
    "Graduate & Professional Students": "18,476",
-    "Undergraduate Students": "17,744",
+    "Undergraduate Students": "17,744"
  },
  "Campus": {
    "Classrooms": "848",
    "Buildings": "343",
    "Laboratories": "1,481",
    "Libraries": "13",
-    "Campus Area (acres)": "140",
+    "Campus Area (acres)": "140"
  },
  "Academics": {
    "Study Abroad Programs": "80+",
    "Average Class Size": "30",
    "Faculty": "4,309",
    "Student/Faculty Ratio": "11:1",
    "Schools and Colleges": "17",
-    "Programs of Study": "300+",
+    "Programs of Study": "300+"
  },
  "Grant & Contract Awards": {
    "Research Expenditures (FY22)": "$630.7M",
    "Research Awards": "$674M",
-    "BMC Clinical Research Grants (FY22)": "$82M",
+    "BMC Clinical Research Grants (FY22)": "$82M"
  },
  "Undergraduate Financial Aid & Scholarships": {
    "Average Total Need-Based Financial Aid": "$57,237",
    "Average Need-Based Grant/Scholarship": "$53,029",
    "Grants & Scholarships (need-based)": "$388.4M",
-    "Grants & Scholarships (non-need-based)": "$26.5M",
+    "Grants & Scholarships (non-need-based)": "$26.5M"
  },
  "Student Life": {
    "Community Service Hours": "130,000+",
    "Alternative Service Breaks Participants": "65+",
    "BU on Social": "new accounts daily",
@@ -32,14 +43,20 @@
    "Academic & Professional Organizations": "140+",
    "Art & Performance Organizations": "60+",
    "Student Organizations": "450+",
-    "First-Year Student Outreach Project Volunteers": "400+",
+    "First-Year Student Outreach Project Volunteers": "400+"
  },
  "Research": {
    "Faculty Publications": "7,000+",
    "Student UROP Participants": "450+",
-    "Centers & Institutes": "130+",
+    "Centers & Institutes": "130+"
  },
  "International Community": {
    "Global Initiatives": "300+",
    "Cultural Student Groups": "60+",
    "Alumni Countries": "180+",
-    "International Students": "10,000+",
+    "International Students": "10,000+"
  },
  "Athletics": {
    "Intramural Sports & Tournaments": "12+",
    "Club and Intramural Sports Participants": "7,000+",
    "Club Sports": "36",