library(readxl)
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.4     ✔ readr     2.1.6
## ✔ forcats   1.0.1     ✔ stringr   1.6.0
## ✔ ggplot2   4.0.1     ✔ tibble    3.3.0
## ✔ lubridate 1.9.4     ✔ tidyr     1.3.1
## ✔ purrr     1.2.0     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors

Explanation: Load packages for reading Excel files and data manipulation.

github_url <- "https://raw.githubusercontent.com/DACSS-601/hw5-2/main/HDR25_Statistical_Annex_HDI_Table.xlsx"
download.file(github_url, "HDI_Table.xlsx", mode = "wb", quiet = TRUE)
hdi_data <- read_excel("HDI_Table.xlsx")
## New names:
## • `` -> `...1`
## • `` -> `...3`
## • `` -> `...4`
## • `` -> `...5`
## • `` -> `...6`
## • `` -> `...7`
## • `` -> `...8`
## • `` -> `...9`
## • `` -> `...10`
## • `` -> `...11`
## • `` -> `...12`
## • `` -> `...13`
## • `` -> `...14`
## • `` -> `...15`

Explanation: Download HDI Excel file from GitHub and read it into R.

hdi_data <- read_excel("HDI_Table.xlsx", skip = 5)
## New names:
## • `` -> `...4`
## • `(years)` -> `(years)...5`
## • `` -> `...6`
## • `(years)` -> `(years)...7`
## • `` -> `...8`
## • `(years)` -> `(years)...9`
## • `` -> `...10`
## • `` -> `...12`
## • `` -> `...13`
## • `` -> `...14`
## • `` -> `...15`
head(hdi_data, 15)
## # A tibble: 15 × 15
##    `HDI rank` Country        Value ...4  `(years)...5` ...6  `(years)...7` ...8 
##         <dbl> <chr>          <chr> <lgl>         <dbl> <chr>         <dbl> <chr>
##  1         NA <NA>           2023  NA           2023   <NA>         2023   a    
##  2         NA Very high hum… <NA>  NA             NA   <NA>           NA   <NA> 
##  3          1 Iceland        0.97… NA             82.7 <NA>           18.9 c    
##  4          2 Norway         0.97  NA             83.3 <NA>           18.8 c    
##  5          2 Switzerland    0.97  NA             84.0 <NA>           16.7 <NA> 
##  6          4 Denmark        0.96… NA             81.9 <NA>           18.7 c    
##  7          5 Germany        0.95… NA             81.4 <NA>           17.3 <NA> 
##  8          5 Sweden         0.95… NA             83.3 <NA>           19.0 c    
##  9          7 Australia      0.95… NA             83.9 <NA>           20.7 c    
## 10          8 Hong Kong, Ch… 0.95… NA             85.5 g              16.9 <NA> 
## 11          8 Netherlands    0.95… NA             82.2 <NA>           18.6 c    
## 12         10 Belgium        0.95… NA             82.1 <NA>           19.0 c    
## 13         11 Ireland        0.94… NA             82.4 <NA>           19.2 c    
## 14         12 Finland        0.94… NA             81.9 <NA>           19.5 c    
## 15         13 Singapore      0.94… NA             83.7 <NA>           16.7 <NA> 
## # ℹ 7 more variables: `(years)...9` <chr>, ...10 <chr>, `(2021 PPP $)` <chr>,
## #   ...12 <chr>, ...13 <chr>, ...14 <chr>, ...15 <chr>
colnames(hdi_data)
##  [1] "HDI rank"     "Country"      "Value"        "...4"         "(years)...5" 
##  [6] "...6"         "(years)...7"  "...8"         "(years)...9"  "...10"       
## [11] "(2021 PPP $)" "...12"        "...13"        "...14"        "...15"

Explanation: Skip first 5 rows to get past the header formatting, then view the data structure.

hdi_data[1:20, ]
## # A tibble: 20 × 15
##    `HDI rank` Country        Value ...4  `(years)...5` ...6  `(years)...7` ...8 
##         <dbl> <chr>          <chr> <lgl>         <dbl> <chr>         <dbl> <chr>
##  1         NA <NA>           2023  NA           2023   <NA>         2023   a    
##  2         NA Very high hum… <NA>  NA             NA   <NA>           NA   <NA> 
##  3          1 Iceland        0.97… NA             82.7 <NA>           18.9 c    
##  4          2 Norway         0.97  NA             83.3 <NA>           18.8 c    
##  5          2 Switzerland    0.97  NA             84.0 <NA>           16.7 <NA> 
##  6          4 Denmark        0.96… NA             81.9 <NA>           18.7 c    
##  7          5 Germany        0.95… NA             81.4 <NA>           17.3 <NA> 
##  8          5 Sweden         0.95… NA             83.3 <NA>           19.0 c    
##  9          7 Australia      0.95… NA             83.9 <NA>           20.7 c    
## 10          8 Hong Kong, Ch… 0.95… NA             85.5 g              16.9 <NA> 
## 11          8 Netherlands    0.95… NA             82.2 <NA>           18.6 c    
## 12         10 Belgium        0.95… NA             82.1 <NA>           19.0 c    
## 13         11 Ireland        0.94… NA             82.4 <NA>           19.2 c    
## 14         12 Finland        0.94… NA             81.9 <NA>           19.5 c    
## 15         13 Singapore      0.94… NA             83.7 <NA>           16.7 <NA> 
## 16         13 United Kingdom 0.94… NA             81.3 <NA>           17.8 <NA> 
## 17         15 United Arab E… 0.94  NA             82.9 <NA>           15.6 <NA> 
## 18         16 Canada         0.93… NA             82.6 <NA>           15.9 <NA> 
## 19         17 Liechtenstein  0.93… NA             83.6 <NA>           15.4 <NA> 
## 20         17 New Zealand    0.93… NA             82.1 <NA>           19.3 c    
## # ℹ 7 more variables: `(years)...9` <chr>, ...10 <chr>, `(2021 PPP $)` <chr>,
## #   ...12 <chr>, ...13 <chr>, ...14 <chr>, ...15 <chr>
str(hdi_data)
## tibble [273 × 15] (S3: tbl_df/tbl/data.frame)
##  $ HDI rank    : num [1:273] NA NA 1 2 2 4 5 5 7 8 ...
##  $ Country     : chr [1:273] NA "Very high human development" "Iceland" "Norway" ...
##  $ Value       : chr [1:273] "2023" NA "0.97199999999999998" "0.97" ...
##  $ ...4        : logi [1:273] NA NA NA NA NA NA ...
##  $ (years)...5 : num [1:273] 2023 NA 82.7 83.3 84 ...
##  $ ...6        : chr [1:273] NA NA NA NA ...
##  $ (years)...7 : num [1:273] 2023 NA 18.9 18.8 16.7 ...
##  $ ...8        : chr [1:273] "a" NA "c" "c" ...
##  $ (years)...9 : chr [1:273] "2023" NA "13.908926279999999" "13.117962179999999" ...
##  $ ...10       : chr [1:273] "a" NA "d" "e" ...
##  $ (2021 PPP $): chr [1:273] "2023" NA "69116.937359999996" "112710.0211" ...
##  $ ...12       : chr [1:273] NA NA NA "f" ...
##  $ ...13       : chr [1:273] "2023" NA "12" "0" ...
##  $ ...14       : chr [1:273] "b" NA NA NA ...
##  $ ...15       : chr [1:273] "2022" NA "3" "1" ...

Explanation: View more rows and data types to understand which columns correspond to which indicators.

hdi_clean <- hdi_data %>% 
  select(Country, "(years)...5", "(years)...7", "(years)...9", "(2021 PPP $)") %>%
  rename(Life_Expectancy = "(years)...5",
         Expected_Schooling = "(years)...7",
         Mean_Schooling = "(years)...9",
         GNI_per_Capita = "(2021 PPP $)")

Explanation: Select the 4 required columns and rename them with underscores for clarity.

hdi_clean <- hdi_clean %>% 
  filter(!is.na(as.numeric(Life_Expectancy, na.action = na.pass))) %>%
  mutate(Life_Expectancy = as.numeric(Life_Expectancy),
         Expected_Schooling = as.numeric(Expected_Schooling),
         Mean_Schooling = as.numeric(Mean_Schooling),
         GNI_per_Capita = as.numeric(GNI_per_Capita))
## Warning: There were 2 warnings in `mutate()`.
## The first warning was:
## ℹ In argument: `Mean_Schooling = as.numeric(Mean_Schooling)`.
## Caused by warning:
## ! NAs introduced by coercion
## ℹ Run `dplyr::last_dplyr_warnings()` to see the 1 remaining warning.

Explanation: Keep only rows with numeric data and convert all columns to numeric type.

hdi_clean <- hdi_clean %>% drop_na()
head(hdi_clean)
## # A tibble: 6 × 5
##   Country     Life_Expectancy Expected_Schooling Mean_Schooling GNI_per_Capita
##   <chr>                 <dbl>              <dbl>          <dbl>          <dbl>
## 1 Iceland                82.7               18.9           13.9         69117.
## 2 Norway                 83.3               18.8           13.1        112710.
## 3 Switzerland            84.0               16.7           13.9         81949.
## 4 Denmark                81.9               18.7           13.0         76008.
## 5 Germany                81.4               17.3           14.3         64053.
## 6 Sweden                 83.3               19.0           12.7         66102.

Explanation: Remove any rows with missing values in any column, then view the cleaned data.

means <- hdi_clean %>% summarise(
  Mean_Life_Expectancy = mean(Life_Expectancy, na.rm = TRUE),
  Mean_Expected_Schooling = mean(Expected_Schooling, na.rm = TRUE),
  Mean_Mean_Schooling = mean(Mean_Schooling, na.rm = TRUE),
  Mean_GNI_per_Capita = mean(GNI_per_Capita, na.rm = TRUE))
means
## # A tibble: 1 × 4
##   Mean_Life_Expectancy Mean_Expected_Schooling Mean_Mean_Schooling
##                  <dbl>                   <dbl>               <dbl>
## 1                 73.1                    13.5                9.11
## # ℹ 1 more variable: Mean_GNI_per_Capita <dbl>

Explanation: Calculate the mean of each of the four variables across all countries.

write.csv(hdi_clean, "hdi_cleaned.csv", row.names = FALSE)

Explanation: Save the cleaned dataset as CSV file.

ggplot(hdi_clean, aes(x = Life_Expectancy)) + geom_histogram(bins = 20, fill = "steelblue", color = "black") +
  labs(title = "Distribution of Life Expectancy at Birth", x = "Years", y = "Number of Countries") + theme_minimal()

Explanation: Create histogram showing distribution of life expectancy across countries.

ggplot(hdi_clean, aes(x = GNI_per_Capita, y = Life_Expectancy)) + geom_point(color = "darkred", size = 3) +
  labs(title = "Life Expectancy vs GNI per Capita", x = "GNI per Capita (2021 PPP $)", y = "Life Expectancy (years)") + theme_minimal()

Explanation: Create scatter plot showing relationship between GNI and life expectancy.

ggplot(hdi_clean, aes(x = Expected_Schooling, y = Mean_Schooling)) + geom_point(color = "darkgreen", size = 3) +
  labs(title = "Expected Years of Schooling vs Mean Years of Schooling", x = "Expected Years", y = "Mean Years") + theme_minimal()

Explanation: Create scatter plot comparing expected and actual years of schooling.