Add exercises 21 - Not solved

Signed-off-by: Manuel Vergara <manuel@vergaracarmona.es>
2023-10-03 23:50:51 +02:00
parent d34da87ee0
commit 2cd9b703b2
5 changed files with 161 additions and 0 deletions
@@ -0,0 +1,109 @@
+"""
+01_web_scraping.py
+"""
+import requests
+from bs4 import BeautifulSoup
+import json
+
+
+# 1. Realiza un raspado web del siguiente sitio web
+# y guarda los datos en un archivo JSON
+# (URL = 'http://www.bu.edu/president/boston-university-facts-stats/').
+
+
+url = 'http://www.bu.edu/president/boston-university-facts-stats/'
+response = requests.get(url)
+soup = BeautifulSoup(response.text, 'html.parser')
+
+data = {}
+current_section = None
+
+for section in soup.find_all('section', {'class': 'facts-categories'}):
+
+    section_data = {}
+
+    for item in section.find_all('div', {'class': 'facts-wrapper'}):
+        section_name = section.find('h5').get_text().strip()
+        for li in item.find_all('li'):
+            key = li.find('p', {'class': 'text'}).get_text().strip()
+            value = li.find('span', {'class': 'value'}).get_text().strip()
+            section_data[key] = value
+
+        data[section_name] = section_data
+
+with open('bu_stats.json', 'w') as f:
+    json.dump(data, f, indent=2)
+
+print("Datos guardados en bu_stats.json")
+
+# 2. Extrae la tabla de esta URL
+# (https://archive.ics.uci.edu/ml/datasets.php)
+# y conviértela en un archivo JSON.
+
+url = 'https://webcache.googleusercontent.com/search?q=cache:tT4BY9X5RxAJ:https://archive.ics.uci.edu/datasets&cd=8&hl=ca&ct=clnk&gl=es'
+response = requests.get(url)
+soup = BeautifulSoup(response.text, 'html.parser')
+
+datasets = []
+
+for div in soup.find_all('div', class_='rounded-box'):
+    dataset = {
+        'name': div.find('h2').find('a').text.strip(),
+        'description': div.find('p').text.strip(),
+    }
+
+    metadata_divs = div.find_all('div', class_='col-span-3')
+    for metadata_div in metadata_divs:
+        icon = metadata_div.find('div').find('svg')['viewBox']
+        value = metadata_div.find('span').text.strip()
+        dataset[icon] = value
+
+    datasets.append(dataset)
+
+with open('uci_datasets.json', 'w') as f:
+    json.dump(datasets, f, indent=2)
+
+print("Datos guardados en uci_datasets.json")
+
+
+# 3. Realiza un raspado web de la tabla de presidentes
+# y guarda los datos como JSON
+# (https://en.wikipedia.org/wiki/List_of_presidents_of_the_United_States).
+# La tabla no está muy estructurada
+# y el proceso de raspado puede llevar mucho tiempo.
+
+
+url = 'https://en.wikipedia.org/wiki/List_of_presidents_of_the_United_States'
+response = requests.get(url)
+soup = BeautifulSoup(response.text, 'html.parser')
+
+table = soup.find('table', {'class': 'wikitable'})
+headers = [header.get_text().strip() for header in table.find_all('th')]
+rows = []
+
+for row in table.find_all('tr'):
+    cells = row.find_all('td')
+    if len(cells) == len(headers):
+        rows.append([cell.get_text().strip() for cell in cells])
+
+data = []
+for row in rows:
+    president = {}
+    for i, header in enumerate(headers):
+        if i < len(row):  # Verificar si hay celdas suficientes en la fila
+            if header == 'President':
+                president['name'] = row[i]
+            elif header == 'Party':
+                president['party'] = row[i]
+            elif header == 'State[a]':
+                president['state'] = row[i]
+            elif header == 'Took office':
+                president['took_office'] = row[i]
+            elif header == 'Left office':
+                president['left_office'] = row[i]
+    data.append(president)
+
+with open('us_presidents.json', 'w') as f:
+    json.dump(data, f, indent=2)
+
+print("Datos guardados en us_presidents.json")
@@ -10,4 +10,6 @@ Documento original en inglés: [Web Scraping](https://github.com/Asabeneh/30-Day

 3. Realiza un raspado web de la tabla de presidentes y guarda los datos como JSON (https://en.wikipedia.org/wiki/List_of_presidents_of_the_United_States). La tabla no está muy estructurada y el proceso de raspado puede llevar mucho tiempo.

+[Solución](01_web_scraping.py)
+
 [<< Day 21](../21_Clases_y_objetos/README.md) | [Day 23 >>](../23_Entorno_virtual/README.md)
@@ -0,0 +1,48 @@
+{
+  "Community": {
+    "Student Body": "37,557",
+    "Living Alumni": "431,000+",
+    "Total Employees": "10,674",
+    "Faculty": "4,309",
+    "Nondegree Students": "1,337",
+    "Graduate & Professional Students": "18,476",
+    "Undergraduate Students": "17,744",
+    "Classrooms": "848",
+    "Buildings": "343",
+    "Laboratories": "1,481",
+    "Libraries": "13",
+    "Campus Area (acres)": "140",
+    "Study Abroad Programs": "80+",
+    "Average Class Size": "30",
+    "Student/Faculty Ratio": "11:1",
+    "Schools and Colleges": "17",
+    "Programs of Study": "300+",
+    "Research Expenditures (FY22)": "$630.7M",
+    "Research Awards": "$674M",
+    "BMC Clinical Research Grants (FY22)": "$82M",
+    "Average Total Need-Based Financial Aid": "$57,237",
+    "Average Need-Based Grant/Scholarship": "$53,029",
+    "Grants & Scholarships (need-based)": "$388.4M",
+    "Grants & Scholarships (non-need-based)": "$26.5M",
+    "Community Service Hours": "130,000+",
+    "Alternative Service Breaks Participants": "65+",
+    "BU on Social": "new accounts daily",
+    "Cultural & Religious Organizations": "80+",
+    "Community Service & Justice Organizations": "70+",
+    "Academic & Professional Organizations": "140+",
+    "Art & Performance Organizations": "60+",
+    "Student Organizations": "450+",
+    "First-Year Student Outreach Project Volunteers": "400+",
+    "Faculty Publications": "7,000+",
+    "Student UROP Participants": "450+",
+    "Centers & Institutes": "130+",
+    "Global Initiatives": "300+",
+    "Cultural Student Groups": "60+",
+    "Alumni Countries": "180+",
+    "International Students": "10,000+",
+    "Intramural Sports & Tournaments": "12+",
+    "Club and Intramural Sports Participants": "7,000+",
+    "Club Sports": "36",
+    "Varsity Sports": "24"
+  }
+}
@@ -0,0 +1 @@
+[]
@@ -0,0 +1 @@
+[]