From 2cd9b703b2f8e7ca5bb94364a733165e3ee0dfe1 Mon Sep 17 00:00:00 2001
From: Manuel Vergara
Date: Tue, 3 Oct 2023 23:50:51 +0200
Subject: [PATCH] Add exercises 21 - Not solved

Signed-off-by: Manuel Vergara
---
 .../22_Web_scraping/01_web_scraping.py       | 109 ++++++++++++++++++
 30-days-of-python/22_Web_scraping/README.md  |   2 +
 .../22_Web_scraping/bu_stats.json            |  48 ++++++++
 .../22_Web_scraping/uci_datasets.json        |   1 +
 .../22_Web_scraping/us_presidents.json       |   1 +
 5 files changed, 161 insertions(+)
 create mode 100644 30-days-of-python/22_Web_scraping/01_web_scraping.py
 create mode 100644 30-days-of-python/22_Web_scraping/bu_stats.json
 create mode 100644 30-days-of-python/22_Web_scraping/uci_datasets.json
 create mode 100644 30-days-of-python/22_Web_scraping/us_presidents.json

diff --git a/30-days-of-python/22_Web_scraping/01_web_scraping.py b/30-days-of-python/22_Web_scraping/01_web_scraping.py
new file mode 100644
index 0000000..042e70a
--- /dev/null
+++ b/30-days-of-python/22_Web_scraping/01_web_scraping.py
@@ -0,0 +1,109 @@
+"""
+01_web_scraping.py
+"""
+import requests
+from bs4 import BeautifulSoup
+import json
+
+
+# 1. Scrape the following website and save the data
+# as a JSON file
+# (URL = 'http://www.bu.edu/president/boston-university-facts-stats/').
+
+
+url = 'http://www.bu.edu/president/boston-university-facts-stats/'
+response = requests.get(url)
+soup = BeautifulSoup(response.text, 'html.parser')
+
+data = {}
+
+for section in soup.find_all('section', {'class': 'facts-categories'}):
+
+    # Read the section heading once per section; looking it up inside
+    # the inner loop left section_name undefined whenever a section had
+    # no 'facts-wrapper' div.
+    section_name = section.find('h5').get_text().strip()
+    section_data = {}
+
+    for item in section.find_all('div', {'class': 'facts-wrapper'}):
+        for li in item.find_all('li'):
+            label = li.find('p', {'class': 'text'})
+            value = li.find('span', {'class': 'value'})
+            # Skip list items missing either the label or the value.
+            if label is None or value is None:
+                continue
+            section_data[label.get_text().strip()] = value.get_text().strip()
+
+    data[section_name] = section_data
+
+with open('bu_stats.json', 'w') as f:
+    json.dump(data, f, indent=2)
+
+print("Data saved to bu_stats.json")
+
+# 2. Extract the table from this URL
+# (https://archive.ics.uci.edu/ml/datasets.php)
+# and convert it into a JSON file.
+
+# The UCI repository now renders its dataset list client-side, so a
+# plain GET returns no table; this Google-cache snapshot was used as a
+# workaround, but it is fragile and may match nothing (the committed
+# uci_datasets.json below is an empty list).
+url = 'https://webcache.googleusercontent.com/search?q=cache:tT4BY9X5RxAJ:https://archive.ics.uci.edu/datasets&cd=8&hl=ca&ct=clnk&gl=es'
+response = requests.get(url)
+soup = BeautifulSoup(response.text, 'html.parser')
+
+datasets = []
+
+for div in soup.find_all('div', class_='rounded-box'):
+    dataset = {
+        'name': div.find('h2').find('a').text.strip(),
+        'description': div.find('p').text.strip(),
+    }
+
+    # Each metadata column pairs an svg icon with a value; the svg's
+    # viewBox attribute is used as a crude stand-in for the metric name.
+    metadata_divs = div.find_all('div', class_='col-span-3')
+    for metadata_div in metadata_divs:
+        icon = metadata_div.find('div').find('svg')['viewBox']
+        value = metadata_div.find('span').text.strip()
+        dataset[icon] = value
+
+    datasets.append(dataset)
+
+with open('uci_datasets.json', 'w') as f:
+    json.dump(datasets, f, indent=2)
+
+print("Data saved to uci_datasets.json")
+
+
+# 3. Scrape the presidents table and save the data as JSON
+# (https://en.wikipedia.org/wiki/List_of_presidents_of_the_United_States).
+# The table is not very structured, so the scraping
+# process can take a long time.
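+# A looser alternative (a sketch only, not part of the exercise; it
+# assumes pandas and lxml are installed, with url set to the Wikipedia
+# address defined below): pandas.read_html copes with the rowspan and
+# colspan cells that defeat manual <tr>/<td> parsing.
+#
+#     import pandas as pd
+#     tables = pd.read_html(url)  # one DataFrame per <table> on the page
+#     # pick the right index if the presidents table is not first
+#     tables[0].to_json('us_presidents.json', orient='records')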
+
+
+url = 'https://en.wikipedia.org/wiki/List_of_presidents_of_the_United_States'
+response = requests.get(url)
+soup = BeautifulSoup(response.text, 'html.parser')
+
+table = soup.find('table', {'class': 'wikitable'})
+# Take the header labels from the first row only; collecting every <th>
+# in the table also swept up the row-header cells (the presidents'
+# names), so the length check below could never match.
+headers = [th.get_text().strip() for th in table.find('tr').find_all('th')]
+rows = []
+
+for row in table.find_all('tr')[1:]:
+    # Presidents' names are <th> cells, so gather both cell types; rows
+    # whose cell count still differs (because of rowspans) are skipped.
+    cells = row.find_all(['th', 'td'])
+    if len(cells) == len(headers):
+        rows.append([cell.get_text().strip() for cell in cells])
+
+data = []
+for row in rows:
+    president = {}
+    for i, header in enumerate(headers):
+        if i < len(row):  # make sure the row has enough cells
+            # These labels must match the live table markup, which
+            # Wikipedia edits from time to time.
+            if header == 'President':
+                president['name'] = row[i]
+            elif header == 'Party':
+                president['party'] = row[i]
+            elif header == 'State[a]':
+                president['state'] = row[i]
+            elif header == 'Took office':
+                president['took_office'] = row[i]
+            elif header == 'Left office':
+                president['left_office'] = row[i]
+    data.append(president)
+
+with open('us_presidents.json', 'w') as f:
+    json.dump(data, f, indent=2)
+
+print("Data saved to us_presidents.json")
diff --git a/30-days-of-python/22_Web_scraping/README.md b/30-days-of-python/22_Web_scraping/README.md
index 6b3d2d0..54790ac 100644
--- a/30-days-of-python/22_Web_scraping/README.md
+++ b/30-days-of-python/22_Web_scraping/README.md
@@ -10,4 +10,6 @@ Original document in English: [Web Scraping](https://github.com/Asabeneh/30-Day
 3. Scrape the presidents table and save the data as JSON (https://en.wikipedia.org/wiki/List_of_presidents_of_the_United_States). The table is not very structured and the scraping process can take a long time.
 
+[Solution](01_web_scraping.py)
+
 [<< Day 21](../21_Clases_y_objetos/README.md) | [Day 23 >>](../23_Entorno_virtual/README.md)
diff --git a/30-days-of-python/22_Web_scraping/bu_stats.json b/30-days-of-python/22_Web_scraping/bu_stats.json
new file mode 100644
index 0000000..86023a1
--- /dev/null
+++ b/30-days-of-python/22_Web_scraping/bu_stats.json
@@ -0,0 +1,48 @@
+{
+  "Community": {
+    "Student Body": "37,557",
+    "Living Alumni": "431,000+",
+    "Total Employees": "10,674",
+    "Faculty": "4,309",
+    "Nondegree Students": "1,337",
+    "Graduate & Professional Students": "18,476",
+    "Undergraduate Students": "17,744",
+    "Classrooms": "848",
+    "Buildings": "343",
+    "Laboratories": "1,481",
+    "Libraries": "13",
+    "Campus Area (acres)": "140",
+    "Study Abroad Programs": "80+",
+    "Average Class Size": "30",
+    "Student/Faculty Ratio": "11:1",
+    "Schools and Colleges": "17",
+    "Programs of Study": "300+",
+    "Research Expenditures (FY22)": "$630.7M",
+    "Research Awards": "$674M",
+    "BMC Clinical Research Grants (FY22)": "$82M",
+    "Average Total Need-Based Financial Aid": "$57,237",
+    "Average Need-Based Grant/Scholarship": "$53,029",
+    "Grants & Scholarships (need-based)": "$388.4M",
+    "Grants & Scholarships (non-need-based)": "$26.5M",
+    "Community Service Hours": "130,000+",
+    "Alternative Service Breaks Participants": "65+",
+    "BU on Social": "new accounts daily",
+    "Cultural & Religious Organizations": "80+",
+    "Community Service & Justice Organizations": "70+",
+    "Academic & Professional Organizations": "140+",
+    "Art & Performance Organizations": "60+",
+    "Student Organizations": "450+",
+    "First-Year Student Outreach Project Volunteers": "400+",
+    "Faculty Publications": "7,000+",
+    "Student UROP Participants": "450+",
+    "Centers & Institutes": "130+",
+    "Global Initiatives": "300+",
+    "Cultural Student Groups": "60+",
+    "Alumni Countries": "180+",
+    "International Students": "10,000+",
"10,000+", + "Intramural Sports & Tournaments": "12+", + "Club and Intramural Sports Participants": "7,000+", + "Club Sports": "36", + "Varsity Sports": "24" + } +} \ No newline at end of file diff --git a/30-days-of-python/22_Web_scraping/uci_datasets.json b/30-days-of-python/22_Web_scraping/uci_datasets.json new file mode 100644 index 0000000..0637a08 --- /dev/null +++ b/30-days-of-python/22_Web_scraping/uci_datasets.json @@ -0,0 +1 @@ +[] \ No newline at end of file diff --git a/30-days-of-python/22_Web_scraping/us_presidents.json b/30-days-of-python/22_Web_scraping/us_presidents.json new file mode 100644 index 0000000..0637a08 --- /dev/null +++ b/30-days-of-python/22_Web_scraping/us_presidents.json @@ -0,0 +1 @@ +[] \ No newline at end of file