library(readxl)
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.4 ✔ readr 2.1.6
## ✔ forcats 1.0.1 ✔ stringr 1.6.0
## ✔ ggplot2 4.0.1 ✔ tibble 3.3.0
## ✔ lubridate 1.9.4 ✔ tidyr 1.3.1
## ✔ purrr 1.2.0
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
Explanation: Load packages for reading Excel files and data manipulation.
github_url <- "https://raw.githubusercontent.com/DACSS-601/hw5-2/main/HDR25_Statistical_Annex_HDI_Table.xlsx"
download.file(github_url, "HDI_Table.xlsx", mode = "wb", quiet = TRUE)
hdi_data <- read_excel("HDI_Table.xlsx")
## New names:
## • `` -> `...1`
## • `` -> `...3`
## • `` -> `...4`
## • `` -> `...5`
## • `` -> `...6`
## • `` -> `...7`
## • `` -> `...8`
## • `` -> `...9`
## • `` -> `...10`
## • `` -> `...11`
## • `` -> `...12`
## • `` -> `...13`
## • `` -> `...14`
## • `` -> `...15`
Explanation: Download HDI Excel file from GitHub and read it into R.
hdi_data <- read_excel("HDI_Table.xlsx", skip = 5)
## New names:
## • `` -> `...4`
## • `(years)` -> `(years)...5`
## • `` -> `...6`
## • `(years)` -> `(years)...7`
## • `` -> `...8`
## • `(years)` -> `(years)...9`
## • `` -> `...10`
## • `` -> `...12`
## • `` -> `...13`
## • `` -> `...14`
## • `` -> `...15`
head(hdi_data, 15)
## # A tibble: 15 × 15
## `HDI rank` Country Value ...4 `(years)...5` ...6 `(years)...7` ...8
## <dbl> <chr> <chr> <lgl> <dbl> <chr> <dbl> <chr>
## 1 NA <NA> 2023 NA 2023 <NA> 2023 a
## 2 NA Very high hum… <NA> NA NA <NA> NA <NA>
## 3 1 Iceland 0.97… NA 82.7 <NA> 18.9 c
## 4 2 Norway 0.97 NA 83.3 <NA> 18.8 c
## 5 2 Switzerland 0.97 NA 84.0 <NA> 16.7 <NA>
## 6 4 Denmark 0.96… NA 81.9 <NA> 18.7 c
## 7 5 Germany 0.95… NA 81.4 <NA> 17.3 <NA>
## 8 5 Sweden 0.95… NA 83.3 <NA> 19.0 c
## 9 7 Australia 0.95… NA 83.9 <NA> 20.7 c
## 10 8 Hong Kong, Ch… 0.95… NA 85.5 g 16.9 <NA>
## 11 8 Netherlands 0.95… NA 82.2 <NA> 18.6 c
## 12 10 Belgium 0.95… NA 82.1 <NA> 19.0 c
## 13 11 Ireland 0.94… NA 82.4 <NA> 19.2 c
## 14 12 Finland 0.94… NA 81.9 <NA> 19.5 c
## 15 13 Singapore 0.94… NA 83.7 <NA> 16.7 <NA>
## # ℹ 7 more variables: `(years)...9` <chr>, ...10 <chr>, `(2021 PPP $)` <chr>,
## # ...12 <chr>, ...13 <chr>, ...14 <chr>, ...15 <chr>
colnames(hdi_data)
## [1] "HDI rank" "Country" "Value" "...4" "(years)...5"
## [6] "...6" "(years)...7" "...8" "(years)...9" "...10"
## [11] "(2021 PPP $)" "...12" "...13" "...14" "...15"
Explanation: Skip first 5 rows to get past the header formatting, then view the data structure.
hdi_data[1:20, ]
## # A tibble: 20 × 15
## `HDI rank` Country Value ...4 `(years)...5` ...6 `(years)...7` ...8
## <dbl> <chr> <chr> <lgl> <dbl> <chr> <dbl> <chr>
## 1 NA <NA> 2023 NA 2023 <NA> 2023 a
## 2 NA Very high hum… <NA> NA NA <NA> NA <NA>
## 3 1 Iceland 0.97… NA 82.7 <NA> 18.9 c
## 4 2 Norway 0.97 NA 83.3 <NA> 18.8 c
## 5 2 Switzerland 0.97 NA 84.0 <NA> 16.7 <NA>
## 6 4 Denmark 0.96… NA 81.9 <NA> 18.7 c
## 7 5 Germany 0.95… NA 81.4 <NA> 17.3 <NA>
## 8 5 Sweden 0.95… NA 83.3 <NA> 19.0 c
## 9 7 Australia 0.95… NA 83.9 <NA> 20.7 c
## 10 8 Hong Kong, Ch… 0.95… NA 85.5 g 16.9 <NA>
## 11 8 Netherlands 0.95… NA 82.2 <NA> 18.6 c
## 12 10 Belgium 0.95… NA 82.1 <NA> 19.0 c
## 13 11 Ireland 0.94… NA 82.4 <NA> 19.2 c
## 14 12 Finland 0.94… NA 81.9 <NA> 19.5 c
## 15 13 Singapore 0.94… NA 83.7 <NA> 16.7 <NA>
## 16 13 United Kingdom 0.94… NA 81.3 <NA> 17.8 <NA>
## 17 15 United Arab E… 0.94 NA 82.9 <NA> 15.6 <NA>
## 18 16 Canada 0.93… NA 82.6 <NA> 15.9 <NA>
## 19 17 Liechtenstein 0.93… NA 83.6 <NA> 15.4 <NA>
## 20 17 New Zealand 0.93… NA 82.1 <NA> 19.3 c
## # ℹ 7 more variables: `(years)...9` <chr>, ...10 <chr>, `(2021 PPP $)` <chr>,
## # ...12 <chr>, ...13 <chr>, ...14 <chr>, ...15 <chr>
str(hdi_data)
## tibble [273 × 15] (S3: tbl_df/tbl/data.frame)
## $ HDI rank : num [1:273] NA NA 1 2 2 4 5 5 7 8 ...
## $ Country : chr [1:273] NA "Very high human development" "Iceland" "Norway" ...
## $ Value : chr [1:273] "2023" NA "0.97199999999999998" "0.97" ...
## $ ...4 : logi [1:273] NA NA NA NA NA NA ...
## $ (years)...5 : num [1:273] 2023 NA 82.7 83.3 84 ...
## $ ...6 : chr [1:273] NA NA NA NA ...
## $ (years)...7 : num [1:273] 2023 NA 18.9 18.8 16.7 ...
## $ ...8 : chr [1:273] "a" NA "c" "c" ...
## $ (years)...9 : chr [1:273] "2023" NA "13.908926279999999" "13.117962179999999" ...
## $ ...10 : chr [1:273] "a" NA "d" "e" ...
## $ (2021 PPP $): chr [1:273] "2023" NA "69116.937359999996" "112710.0211" ...
## $ ...12 : chr [1:273] NA NA NA "f" ...
## $ ...13 : chr [1:273] "2023" NA "12" "0" ...
## $ ...14 : chr [1:273] "b" NA NA NA ...
## $ ...15 : chr [1:273] "2022" NA "3" "1" ...
Explanation: View more rows and data types to understand which columns correspond to which indicators.
hdi_clean <- hdi_data %>%
select(Country, "(years)...5", "(years)...7", "(years)...9", "(2021 PPP $)") %>%
rename(Life_Expectancy = "(years)...5",
Expected_Schooling = "(years)...7",
Mean_Schooling = "(years)...9",
GNI_per_Capita = "(2021 PPP $)")
Explanation: Select the 4 required columns and rename them with underscores for clarity.
hdi_clean <- hdi_clean %>%
filter(!is.na(as.numeric(Life_Expectancy, na.action = na.pass))) %>%
mutate(Life_Expectancy = as.numeric(Life_Expectancy),
Expected_Schooling = as.numeric(Expected_Schooling),
Mean_Schooling = as.numeric(Mean_Schooling),
GNI_per_Capita = as.numeric(GNI_per_Capita))
## Warning: There were 2 warnings in `mutate()`.
## The first warning was:
## ℹ In argument: `Mean_Schooling = as.numeric(Mean_Schooling)`.
## Caused by warning:
## ! NAs introduced by coercion
## ℹ Run `dplyr::last_dplyr_warnings()` to see the 1 remaining warning.
Explanation: Keep only rows with numeric data and convert all columns to numeric type.
hdi_clean <- hdi_clean %>% drop_na()
head(hdi_clean)
## # A tibble: 6 × 5
## Country Life_Expectancy Expected_Schooling Mean_Schooling GNI_per_Capita
## <chr> <dbl> <dbl> <dbl> <dbl>
## 1 Iceland 82.7 18.9 13.9 69117.
## 2 Norway 83.3 18.8 13.1 112710.
## 3 Switzerland 84.0 16.7 13.9 81949.
## 4 Denmark 81.9 18.7 13.0 76008.
## 5 Germany 81.4 17.3 14.3 64053.
## 6 Sweden 83.3 19.0 12.7 66102.
Explanation: Remove any rows with missing values in any column, then view the cleaned data.
means <- hdi_clean %>% summarise(
Mean_Life_Expectancy = mean(Life_Expectancy, na.rm = TRUE),
Mean_Expected_Schooling = mean(Expected_Schooling, na.rm = TRUE),
Mean_Mean_Schooling = mean(Mean_Schooling, na.rm = TRUE),
Mean_GNI_per_Capita = mean(GNI_per_Capita, na.rm = TRUE))
means
## # A tibble: 1 × 4
## Mean_Life_Expectancy Mean_Expected_Schooling Mean_Mean_Schooling
## <dbl> <dbl> <dbl>
## 1 73.1 13.5 9.11
## # ℹ 1 more variable: Mean_GNI_per_Capita <dbl>
Explanation: Calculate the mean of each of the four variables across all countries.
write.csv(hdi_clean, "hdi_cleaned.csv", row.names = FALSE)
Explanation: Save the cleaned dataset as CSV file.
ggplot(hdi_clean, aes(x = Life_Expectancy)) + geom_histogram(bins = 20, fill = "steelblue", color = "black") +
labs(title = "Distribution of Life Expectancy at Birth", x = "Years", y = "Number of Countries") + theme_minimal()
Explanation: Create histogram showing distribution of life expectancy across countries.
ggplot(hdi_clean, aes(x = GNI_per_Capita, y = Life_Expectancy)) + geom_point(color = "darkred", size = 3) +
labs(title = "Life Expectancy vs GNI per Capita", x = "GNI per Capita (2021 PPP $)", y = "Life Expectancy (years)") + theme_minimal()
Explanation: Create scatter plot showing relationship between GNI and life expectancy.
ggplot(hdi_clean, aes(x = Expected_Schooling, y = Mean_Schooling)) + geom_point(color = "darkgreen", size = 3) +
labs(title = "Expected Years of Schooling vs Mean Years of Schooling", x = "Expected Years", y = "Mean Years") + theme_minimal()
Explanation: Create scatter plot comparing expected and actual years of schooling.