Add exercises 21 - Solved 1
Signed-off-by: Manuel Vergara <manuel@vergaracarmona.es>
@@ -1,6 +1,3 @@
"""
01_web_scraping.py
"""

import requests
from bs4 import BeautifulSoup
import json
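
# The solutions below use third-party packages; if they are missing, they can be
# installed with, for example: pip install requests beautifulsoup4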
@@ -16,94 +13,20 @@ response = requests.get(url)
soup = BeautifulSoup(response.text, 'html.parser')

data = {}

# Walk every facts category; each wrapper holds an h5 title and a list of key/value items.
for section in soup.find_all('section', {'class': 'facts-categories'}):
    for item in section.find_all('div', {'class': 'facts-wrapper'}):
        section_name = item.find('h5').get_text().strip()
        section_data = {}

        for li in item.find_all('li'):
            key = li.find('p', {'class': 'text'}).get_text().strip()
            value = li.find('span', {'class': 'value'}).get_text().strip()
            section_data[key] = value

        data[section_name] = section_data

with open('bu_stats.json', 'w') as f:
    json.dump(data, f, indent=2)

print("Data saved to bu_stats.json")
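
# Optional sanity check: reload the file just written and list the section names it
# captured (assumes the request above succeeded and bu_stats.json was created).
with open('bu_stats.json') as f:
    saved = json.load(f)
print(list(saved.keys()))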


# 2. Extract the table from this URL
# (https://archive.ics.uci.edu/ml/datasets.php)
# and convert it into a JSON file.

url = 'https://webcache.googleusercontent.com/search?q=cache:tT4BY9X5RxAJ:https://archive.ics.uci.edu/datasets&cd=8&hl=ca&ct=clnk&gl=es'
response = requests.get(url)
soup = BeautifulSoup(response.text, 'html.parser')

datasets = []

for div in soup.find_all('div', class_='rounded-box'):
    dataset = {
        'name': div.find('h2').find('a').text.strip(),
        'description': div.find('p').text.strip(),
    }

    # Each metadata column is identified by its icon's SVG viewBox and holds a value span.
    metadata_divs = div.find_all('div', class_='col-span-3')
    for metadata_div in metadata_divs:
        icon = metadata_div.find('div').find('svg')['viewBox']
        value = metadata_div.find('span').text.strip()
        dataset[icon] = value

    datasets.append(dataset)

with open('uci_datasets.json', 'w') as f:
    json.dump(datasets, f, indent=2)

print("Data saved to uci_datasets.json")
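
# The viewBox string above is only a stand-in key for each metadata field. If readable
# names are preferred, a mapping like the hypothetical one below (the viewBox values are
# illustrative, not taken from the real page) can rename the keys after scraping.
VIEWBOX_LABELS = {'0 0 24 24': 'instances'}  # hypothetical mapping entry

for entry in datasets:
    for raw_key, label in VIEWBOX_LABELS.items():
        if raw_key in entry:
            entry[label] = entry.pop(raw_key)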


# 3. Scrape the presidents table and save the data as JSON
# (https://en.wikipedia.org/wiki/List_of_presidents_of_the_United_States).
# The table is not very well structured
# and the scraping process can take a long time.

url = 'https://en.wikipedia.org/wiki/List_of_presidents_of_the_United_States'
response = requests.get(url)
soup = BeautifulSoup(response.text, 'html.parser')

table = soup.find('table', {'class': 'wikitable'})
headers = [header.get_text().strip() for header in table.find_all('th')]
rows = []

# Keep only rows whose cell count matches the header row.
for row in table.find_all('tr'):
    cells = row.find_all('td')
    if len(cells) == len(headers):
        rows.append([cell.get_text().strip() for cell in cells])

data = []
for row in rows:
    president = {}
    for i, header in enumerate(headers):
        if i < len(row):  # Check that the row has enough cells
            if header == 'President':
                president['name'] = row[i]
            elif header == 'Party':
                president['party'] = row[i]
            elif header == 'State[a]':
                president['state'] = row[i]
            elif header == 'Took office':
                president['took_office'] = row[i]
            elif header == 'Left office':
                president['left_office'] = row[i]
    data.append(president)

with open('us_presidents.json', 'w') as f:
    json.dump(data, f, indent=2)

print("Data saved to us_presidents.json")