Add exercises 21 - Solved 2

Signed-off-by: Manuel Vergara <manuel@vergaracarmona.es>
2023-10-05 00:01:26 +02:00 · 2023-10-05 00:01:26 +02:00 · 7b2cd6c376
commit 7b2cd6c376
parent 6677000878
2 changed files with 52 additions and 1 deletions
--- a/30-days-of-python/22_Web_scraping/02_web_scraping.py
+++ b/30-days-of-python/22_Web_scraping/02_web_scraping.py
@ -0,0 +1,34 @@
+"""
+02_web_scraping.py
+"""
+import requests
+from bs4 import BeautifulSoup
+import json
+
+# 2. Extract the table from this URL
+# (https://archive.ics.uci.edu/ml/datasets.php)
+# and convert it into a JSON file.
+
+# NOTE(review): the landing page is scraped, not datasets.php - confirm intent.
+url = 'https://archive.ics.uci.edu/'
+response = requests.get(url, timeout=30)  # never hang on a stalled server
+response.raise_for_status()  # do not parse an HTTP error page as data
+soup = BeautifulSoup(response.text, 'html.parser')
+
+data = {}  # section title -> {dataset name: description}
+
+for section in soup.find_all('section', class_='rounded-box'):
+    if section.h1 is None:
+        continue  # a box without a heading has no usable section name
+    section_data = {}
+    for div in section.find_all('div', class_='rounded-box'):
+        link = div.find('a', {'class': 'link-hover'})
+        summary = div.find('p', {'class': 'truncate'})
+        if link is not None and summary is not None:  # skip partial cards
+            section_data[link.get_text().strip()] = summary.get_text().strip()
+    data[section.h1.get_text().strip()] = section_data
+
+with open('uci_datasets.json', 'w', encoding='utf-8') as f:
+    json.dump(data, f, indent=2)
+
+print("Datos guardados en uci_datasets.json")
--- a/30-days-of-python/22_Web_scraping/uci_datasets.json
+++ b/30-days-of-python/22_Web_scraping/uci_datasets.json
@ -1 +1,18 @@
-[]
+{
+  "Popular Datasets": {
+    "Iris": "A small classic dataset from Fisher, 1936. One of the earliest known datasets used for evaluating classification methods.",
+    "Heart Disease": "4 databases: Cleveland, Hungary, Switzerland, and the VA Long Beach",
+    "Adult": "Predict whether income exceeds $50K/yr based on census data. Also known as \"Census Income\" dataset.",
+    "Wine": "Using chemical analysis to determine the origin of wines",
+    "Dry Bean Dataset": "Images of 13,611 grains of 7 different registered dry beans were taken with a high-resolution camera. A total of 16 features; 12 dimensions and 4 shape forms, were obtained from the grains.",
+    "Diabetes": "This diabetes dataset is from AIM '94"
+  },
+  "New Datasets": {
+    "TCGA Kidney Cancers": "The TCGA Kidney Cancers Dataset is a bulk RNA-seq dataset that contains transcriptome profiles of patients diagnosed with three different subtypes of kidney cancers. This dataset can be used to make predictions about the specific subtype of kidney cancers given the normalized transcriptome profile data, as well as providing a hands-on experience on large and sparse genomic information.",
+    "CDC Diabetes Health Indicators": "The Diabetes Health Indicators Dataset contains healthcare statistics and lifestyle survey information about people in general along with their diagnosis of diabetes. The 35 features consist of some demographics, lab test results, and answers to survey questions for each patient. The target variable for classification is whether a patient has diabetes, is pre-diabetic, or healthy.",
+    "AIDS Clinical Trials Group Study 175": "The AIDS Clinical Trials Group Study 175 Dataset contains healthcare statistics and categorical information about patients who have been diagnosed with AIDS. This dataset was initially published in 1996. The prediction task is to predict whether or not each patient died within a certain window of time or not.",
+    "National Health and Nutrition Health Survey 2013-2014 (NHANES) Age Prediction Subset": "The National Health and Nutrition Examination Survey (NHANES), administered by the Centers for Disease Control and Prevention (CDC), collects extensive health and nutritional information from a diverse U.S. population. Though expansive, the dataset is often too broad for specific analytical purposes. In this sub-dataset, we narrow our focus to predicting respondents' age by extracting a subset of features from the larger NHANES dataset. These selected features include physiological measurements, lifestyle choices, and biochemical markers, which were hypothesized to have strong correlations with age.",
+    "Large-scale Wave Energy Farm": "Wave energy is a rapidly advancing and promising renewable energy source that holds great potential for addressing the challenges of global warming and climate change. However, optimizing energy output in large wave farms presents a complex problem due to the expensive calculations required to account for hydrodynamic interactions between wave energy converters (WECs). Developing a fast and accurate surrogate model is crucial to overcome these challenges. In light of this, we have compiled an extensive WEC dataset that includes 54,000 and 9,600 configurations involving 49 and 100 WECs, coordination, power, q-factor, and total farm power output. The dataset was derived from a study published at the GECCO conference and received the prestigious Best Paper award. We want to acknowledge the support of the University of Adelaide Phoenix HPC service in conducting this research. For more details, please refer to the following link: https://dl.acm.org/doi/abs/10.1145/3377930.3390235.",
+    "SUPPORT2": "This dataset comprises 9105 individual critically ill patients across 5 United States medical centers, accessioned throughout 1989-1991 and 1992-1994.\nEach row concerns hospitalized patient records who met the inclusion and exclusion criteria for nine disease categories: acute respiratory failure, chronic obstructive pulmonary disease, congestive heart failure, liver disease, coma, colon cancer, lung cancer, multiple organ system failure with malignancy, and multiple organ system failure with sepsis. The goal is to determine these patients' 2- and 6-month survival rates based on several physiologic, demographics, and disease severity information. \nIt is an important problem because it addresses the growing national concern over patients' loss of control near the end of life. It enables earlier decisions and planning to reduce the frequency of a mechanical, painful, and prolonged dying process."
+  }
+}